提交 b1397592 编写于 作者: Y yejianwu

fix conflix

......@@ -6,6 +6,7 @@
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
namespace mace {
namespace kernels {
......@@ -33,8 +34,6 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
built_options.emplace("-DINPUT_NUM=" + ToString(input_tensors.size()));
auto addn_kernel = runtime->BuildKernel("addn", "addn", built_options);
const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(addn_kernel);
uint32_t idx = 0;
for (auto input : input_tensors) {
addn_kernel.setArg(idx++,
......@@ -42,12 +41,47 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
}
addn_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
const uint32_t gws[2] = {
static_cast<uint32_t>(width_pixels),
static_cast<uint32_t>(batch_height_pixels)
};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(addn_kernel);
std::vector<uint32_t> lws = {64, 16};
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
uint32_t local_ws[2];
local_ws[0] = std::min<uint32_t>(width_pixels, kwg_size);
local_ws[1] = std::min<uint32_t>(batch_height_pixels, kwg_size / local_ws[0]);
return {{local_ws[0], local_ws[1]},
{kwg_size / 16, 16},
{kwg_size / 32, 32},
{kwg_size / 64, 64},
{kwg_size / 128, 128},
{kwg_size / 256, 256},
{kwg_size, 1},
{1, kwg_size}
};
};
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
addn_kernel, cl::NullRange,
cl::NDRange(width_pixels, batch_height_pixels),
cl::NDRange(64, 16), // TODO fix this
nullptr, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS) << "error code: " << error;
cl::NDRange(gws[0], gws[1]),
cl::NDRange(params[0], params[1]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
};
std::stringstream ss;
ss << "addn_opencl_kernel_"
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
}
template <typename T>
......
......@@ -48,8 +48,13 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
static_cast<uint32_t>(height * batch)};
const std::vector<uint32_t> lws = {8, 16, 8};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);
auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
return {{8, 128, 1}, //SNPE size
{local_ws[0], local_ws[1], local_ws[2]},
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
{kwg_size / 32, 8, 4},
......
......@@ -6,6 +6,7 @@
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
namespace mace {
namespace kernels {
......@@ -41,21 +42,57 @@ static void Concat2(const Tensor *input0,
concat_kernel.setArg(idx++, static_cast<int32_t>(input0->dim(3)));
concat_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
const uint32_t gws[3] = {
static_cast<uint32_t>(channel_blk),
static_cast<uint32_t>(width),
static_cast<uint32_t>(batch * height),
};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(concat_kernel);
uint32_t lws[3] = {8, 16, 8};
// lws[0] = std::min<uint32_t>(channel_blk, kwg_size);
// lws[1] = std::min<uint32_t>(width, kwg_size / lws[0]);
// lws[2] = std::min<uint32_t>(height * batch, kwg_size / (lws[0] * lws[1]));
std::vector<uint32_t> lws = {8, 16, 8};
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(channel_blk, kwg_size);
local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
return {{4, 15, 8}, //SNPE size
{local_ws[0], local_ws[1], local_ws[2]},
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
{kwg_size / 32, 8, 4},
{kwg_size / 64, 8, 8},
{kwg_size / 64, 16, 4},
{kwg_size / 128, 8, 16},
{kwg_size / 128, 16, 8},
{kwg_size / 128, 32, 4},
{1, kwg_size / 32, 32},
{1, kwg_size / 64, 64},
{1, kwg_size / 128, 128},
{3, 15, 9},
{7, 15, 9},
{9, 7, 15},
{15, 7, 9},
{1, kwg_size, 1}};
};
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
concat_kernel, cl::NullRange,
cl::NDRange(static_cast<uint32_t>(channel_blk),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)),
cl::NDRange(lws[0], lws[1], lws[2]),
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
};
std::stringstream ss;
ss << "concat_opencl_kernel_"
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
}
template<typename T>
......
......@@ -68,8 +68,13 @@ void Conv1x1(const Tensor *input,
static_cast<uint32_t>(height * batch)};
const std::vector<uint32_t> lws = {8, 15, 8};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
auto params_generator = [&kwg_size]()->std::vector<std::vector<uint32_t>> {
auto params_generator = [&]()->std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
return {{4, 15, 8}, //SNPE size
{local_ws[0], local_ws[1], local_ws[2]},
{kwg_size/16, 4, 4},
{kwg_size/32, 4, 8},
{kwg_size/32, 8, 4},
......
......@@ -60,8 +60,13 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
static_cast<uint32_t>(height * batch)};
const std::vector<uint32_t> lws = {4, 15, 8};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
return {{4, 15, 8}, //SNPE size
{local_ws[0], local_ws[1], local_ws[2]},
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
{kwg_size / 32, 8, 4},
......
......@@ -62,8 +62,13 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
static_cast<uint32_t>(height * batch)};
const std::vector<uint32_t> lws = {8, 16, 8};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
return {{4, 15, 8}, //SNPE size
{local_ws[0], local_ws[1], local_ws[2]},
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
{kwg_size / 32, 8, 4},
......
......@@ -6,6 +6,7 @@
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
namespace mace {
namespace kernels {
......@@ -23,11 +24,6 @@ static void Pooling(const Tensor *input,
index_t channels = output->dim(3);
index_t channel_blocks = (channels + 3) / 4;
const uint32_t gws[3] = {
static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(out_width),
static_cast<uint32_t>(batch * out_height),
};
auto runtime = OpenCLRuntime::Get();
std::set<std::string> built_options;
......@@ -44,13 +40,6 @@ static void Pooling(const Tensor *input,
}
auto pooling_kernel = runtime->BuildKernel("pooling", "pooling", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel);
uint32_t lws[3];
lws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
lws[1] = std::min<uint32_t>(out_width, kwg_size / lws[0]);
lws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (lws[0] * lws[1]));
uint32_t idx = 0;
pooling_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(1)));
......@@ -62,12 +51,60 @@ static void Pooling(const Tensor *input,
pooling_kernel.setArg(idx++, pooling_size);
pooling_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
const uint32_t gws[3] = {
static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(out_width),
static_cast<uint32_t>(batch * out_height),
};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel);
std::vector<uint32_t> lws(3, 0);
lws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
lws[1] = std::min<uint32_t>(out_width, kwg_size / lws[0]);
lws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (lws[0] * lws[1]));
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (local_ws[0] * local_ws[1]));
return {{4, 15, 8}, //SNPE size
{local_ws[0], local_ws[1], local_ws[2]},
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
{kwg_size / 32, 8, 4},
{kwg_size / 64, 8, 8},
{kwg_size / 64, 16, 4},
{kwg_size / 128, 8, 16},
{kwg_size / 128, 16, 8},
{kwg_size / 128, 32, 4},
{1, kwg_size / 32, 32},
{1, kwg_size / 64, 64},
{1, kwg_size / 128, 128},
{3, 15, 9},
{7, 15, 9},
{9, 7, 15},
{15, 7, 9},
{1, kwg_size, 1}};
};
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
pooling_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]),
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS) << error;
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
};
std::stringstream ss;
ss << "pooling_opencl_kernel_"
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
}
template<typename T>
......
......@@ -50,8 +50,13 @@ void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
static_cast<uint32_t>(height * batch)};
const std::vector<uint32_t> lws = {8, 16, 8};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
return {{4, 15, 8}, //SNPE size
{local_ws[0], local_ws[1], local_ws[2]},
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
{kwg_size / 32, 8, 4},
......
......@@ -7,6 +7,7 @@
#include "mace/kernels/resize_bilinear.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
namespace mace {
namespace kernels {
......@@ -44,8 +45,6 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
auto rb_kernel = runtime->BuildKernel("resize_bilinear", "resize_bilinear_nocache", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel);
uint32_t idx = 0;
rb_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
rb_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
......@@ -55,17 +54,52 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
rb_kernel.setArg(idx++, static_cast<int32_t>(in_width));
rb_kernel.setArg(idx++, static_cast<int32_t>(out_height));
auto command_queue = runtime->command_queue();
cl_int error = command_queue.enqueueNDRangeKernel(
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(out_width),
static_cast<uint32_t>(out_height * batch)};
const std::vector<uint32_t> lws = {8, 16, 8};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel);
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (local_ws[0] * local_ws[1]));
return {{4, 15, 8}, //SNPE size
{local_ws[0], local_ws[1], local_ws[2]},
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
{kwg_size / 32, 8, 4},
{kwg_size / 64, 8, 8},
{kwg_size / 64, 16, 4},
{kwg_size / 128, 8, 16},
{kwg_size / 128, 16, 8},
{kwg_size / 128, 32, 4},
{1, kwg_size / 32, 32},
{1, kwg_size / 64, 64},
{1, kwg_size / 128, 128},
{1, kwg_size, 1}};
};
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
rb_kernel, cl::NullRange,
cl::NDRange(static_cast<int32_t>(channel_blocks),
static_cast<int32_t>(out_width),
static_cast<int32_t>(out_height * batch)),
// TODO tuning
cl::NDRange(1, static_cast<int32_t>(out_width > kwg_size ? kwg_size : out_width), 1),
nullptr, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS, error);
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
};
std::stringstream ss;
ss << "resize_bilinear_opencl_kernel_"
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
}
template struct ResizeBilinearFunctor<DeviceType::OPENCL, float>;
......
......@@ -8,6 +8,7 @@ py_library(
],
srcs_version = "PY2AND3",
deps = [
":memory_optimizer",
"//mace/proto:mace_py",
],
)
......
......@@ -65,7 +65,7 @@ class MemoryOptimizer(object):
raise Exception('ref count is less than 0')
for mem in self.mem_block:
arena = net_def.mem_arena
arena = self.net_def.mem_arena
block = arena.mem_block.add()
block.mem_id = mem
block.x = self.mem_block[mem][0]
......@@ -83,20 +83,7 @@ class MemoryOptimizer(object):
print('origin mem: %d, optimized mem: %d', origin_mem_size, optimized_mem_size)
if __name__ == '__main__':
model_file = sys.argv[1]
opt_model_file = sys.argv[2]
with open(model_file, "rb") as f:
net_def = mace_pb2.NetDef()
net_def.ParseFromString(f.read())
optimizer = MemoryOptimizer(net_def)
optimizer.optimize()
with open(opt_model_file, "wb") as f:
f.write(net_def.SerializeToString())
with open(opt_model_file + '_txt', "wb") as f:
net_def.ClearField('tensors')
f.write(str(net_def))
def optimize_memory(net_def):
mem_optimizer = MemoryOptimizer(net_def)
mem_optimizer.optimize()
\ No newline at end of file
from mace.proto import mace_pb2
import tensorflow as tf
import numpy as np
from mace.python.tools.convert_util import tf_dtype_2_mace_dtype
from mace.python.tools import memory_optimizer
# TODO: support NCHW formt, now only support NHWC.
padding_mode = {
......@@ -25,22 +25,10 @@ data_type_map = {
'DT_FLOAT': mace_pb2.DT_FLOAT
}
def convert_tensor(op, tensor):
tf_tensor = op.outputs[0].eval()
tensor.name = op.outputs[0].name
BATCH_NORM_ORDER = ["Add", "Rsqrt", "Mul", "Mul", "Mul", "Sub", "Add"]
shape = list(tf_tensor.shape)
tensor.dims.extend(shape)
tf_dt = op.get_attr('dtype')
if tf_dt == tf.float32:
tensor.data_type = mace_pb2.DT_FLOAT
tensor.float_data.extend(tf_tensor.astype(float).flat)
elif tf_dt == tf.int32:
tensor.data_type = mace_pb2.DT_INT32
tensor.int32_data.extend(tf_tensor.astype(np.int32).flat)
else:
raise Exception("Not supported tensor type: " + tf_dt.name)
MACE_INPUT_NODE_NAME = "mace_input_node"
MACE_OUTPUT_NODE_NAME = "mace_output_node"
def get_input_tensor(op, index):
input_tensor = op.inputs[index]
......@@ -48,9 +36,26 @@ def get_input_tensor(op, index):
input_tensor = get_input_tensor(input_tensor.op, 0)
return input_tensor
def add_buffer_to_image(input_name, input_type, dt, net_def):
class TFConverter(object):
def __init__(self, tf_ops, net_def, dt, device):
self.net_def = net_def
self.tf_ops = tf_ops
self.dt = dt
self.device = device
self.tf_graph = {}
self.resolved_ops = {}
for op in tf_ops:
self.resolved_ops[op.name] = 0
for input in op.inputs:
input_name = input.name[:-2]
if input_name not in self.tf_graph:
self.tf_graph[input_name] = []
self.tf_graph[input_name].append(op)
def add_buffer_to_image(self, input_name, input_type):
output_name = input_name[:-2] + "_b2i" + input_name[-2:]
op_def = net_def.op.add()
op_def = self.net_def.op.add()
op_def.name = output_name[:-2]
op_def.type = 'BufferToImage'
op_def.input.extend([input_name])
......@@ -64,28 +69,12 @@ def add_buffer_to_image(input_name, input_type, dt, net_def):
arg.i = 0
arg = op_def.arg.add()
arg.name = 'T'
arg.i = dt
arg.i = self.dt
return output_name
def add_image_to_buffer(input_name, input_type, dt, net_def):
output_name = input_name[:-2] + "_i2b" + input_name[-2:]
op_def = net_def.op.add()
op_def.name = output_name[:-2]
op_def.type = 'ImageToBuffer'
op_def.input.extend([input_name])
op_def.output.extend([output_name])
arg = op_def.arg.add()
arg.name = 'buffer_type'
arg.i = buffer_type_map[input_type]
arg = op_def.arg.add()
arg.name = 'T'
arg.i = dt
return output_name
def add_input_transform(name, dt, net_def):
new_input_name = "mace_input_node:0"
op_def = net_def.op.add()
def add_input_transform(self, name):
new_input_name = MACE_INPUT_NODE_NAME + ":0"
op_def = self.net_def.op.add()
op_def.name = name
op_def.type = 'BufferToImage'
op_def.input.extend([new_input_name])
......@@ -97,11 +86,11 @@ def add_input_transform(name, dt, net_def):
arg = op_def.arg.add()
arg.name = 'T'
arg.i = dt
arg.i = self.dt
def add_output_transform(name, net_def):
output_name = "mace_output_node:0"
op_def = net_def.op.add()
def add_output_transform(self, name):
output_name = MACE_OUTPUT_NODE_NAME + ":0"
op_def = self.net_def.op.add()
op_def.name = output_name[:-2]
op_def.type = 'ImageToBuffer'
op_def.input.extend([name+':0'])
......@@ -111,197 +100,322 @@ def add_output_transform(name, net_def):
epsilon_arg.name = 'buffer_type'
epsilon_arg.i = buffer_type_map['IN_OUT']
def convert_op_outputs(mace_op_def, tf_op):
mace_op_def.output.extend([output.name for output in tf_op.outputs])
mace_op_def.output_type.extend([tf_dtype_2_mace_dtype(output.dtype)
for output in tf_op.outputs])
@staticmethod
def add_output_shape(outputs, op):
output_shapes = []
for output in tf_op.outputs:
for output in outputs:
if output.shape.num_elements() is not None:
output_shape = mace_pb2.OutputShape()
output_shape.dims.extend(output.shape.as_list())
output_shapes.append(output_shape)
mace_op_def.output_shape.extend(output_shapes)
op.output_shape.extend(output_shapes)
def convert_tensor(self, op):
tensor = self.net_def.tensors.add()
tf_tensor = op.outputs[0].eval()
tensor.name = op.outputs[0].name
def convert_ops(unresolved_ops, dt, net_def, device):
ops_count = len(unresolved_ops)
resolved_count = 1
first_op = unresolved_ops[0]
shape = list(tf_tensor.shape)
tensor.dims.extend(shape)
if first_op.type in ['Placeholder', 'Reshape', 'Identity']:
pass
elif first_op.type == 'Const':
tensor = net_def.tensors.add()
convert_tensor(first_op, tensor)
tf_dt = op.get_attr('dtype')
if tf_dt == tf.float32:
tensor.data_type = mace_pb2.DT_FLOAT
tensor.float_data.extend(tf_tensor.astype(np.float32).flat)
elif tf_dt == tf.int32:
tensor.data_type = mace_pb2.DT_INT32
tensor.int32_data.extend(tf_tensor.astype(np.int32).flat)
else:
op_def = net_def.op.add()
raise Exception("Not supported tensor type: " + tf_dt.name)
self.resolved_ops[op.name] = 1
def convert_conv2d(self, op):
op_def = mace_pb2.OperatorDef()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = dt
if first_op.type == 'Conv2D' or first_op.type == 'DepthwiseConv2dNative':
op_def.name = first_op.name
if first_op.type == 'DepthwiseConv2dNative':
arg.i = self.dt
op_def.name = op.name
if op.type == 'DepthwiseConv2dNative':
op_def.type = 'DepthwiseConv2d'
else:
op_def.type = first_op.type
if device == 'gpu':
op_def.input.extend([first_op.inputs[0].name])
output_name = add_buffer_to_image(first_op.inputs[1].name, "FILTER", dt, net_def)
op_def.type = op.type
if self.device == 'gpu':
op_def.input.extend([op.inputs[0].name])
output_name = self.add_buffer_to_image(op.inputs[1].name, "FILTER")
op_def.input.extend([output_name])
else:
op_def.input.extend([input.name for input in first_op.inputs])
op_def.input.extend([input.name for input in op.inputs])
padding_arg = op_def.arg.add()
padding_arg.name = 'padding'
padding_arg.i = padding_mode[first_op.get_attr('padding')]
padding_arg.i = padding_mode[op.get_attr('padding')]
strides_arg = op_def.arg.add()
strides_arg.name = 'strides'
strides_arg.ints.extend(first_op.get_attr('strides')[1:3])
strides_arg.ints.extend(op.get_attr('strides')[1:3])
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
data_format_arg.s = 'NHWC'
final_op = first_op
if ops_count >= 3 and unresolved_ops[1].type == 'Const' and unresolved_ops[2].type == 'BiasAdd' :
bias_tensor = unresolved_ops[1]
tensor = net_def.tensors.add()
convert_tensor(bias_tensor, tensor)
final_op = op
self.resolved_ops[op.name] = 1
bias_add_op = unresolved_ops[2]
if device == 'gpu':
output_name = add_buffer_to_image(bias_add_op.inputs[1].name, "ARGUMENT", dt, net_def)
if len(self.tf_graph[op.name]) == 1 and self.tf_graph[op.name][0].type == 'BiasAdd' :
bias_add_op = self.tf_graph[op.name][0]
if self.device == 'gpu':
output_name = self.add_buffer_to_image(bias_add_op.inputs[1].name, "ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([bias_add_op.inputs[1].name])
final_op = bias_add_op
resolved_count = 3
self.resolved_ops[bias_add_op.name] = 1
if ops_count >= 4 and unresolved_ops[3].type == 'Relu':
relu_op = unresolved_ops[3];
if len(self.tf_graph[final_op.name]) == 1 \
and self.tf_graph[final_op.name][0].type == 'Relu':
relu_op = self.tf_graph[final_op.name][0]
op_def.type = "FusedConv2D"
final_op = relu_op
resolved_count = 4
self.resolved_ops[relu_op.name] = 1
convert_op_outputs(op_def, final_op)
op_def.output.extend([output.name for output in final_op.outputs])
self.add_output_shape(final_op.outputs, op_def)
self.net_def.op.extend([op_def])
elif first_op.type == 'FusedBatchNorm':
op_def.name = first_op.name
def convert_fused_batchnorm(self, op):
op_def = mace_pb2.OperatorDef()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = 'BatchNorm'
if device == 'gpu':
op_def.input.extend([first_op.inputs[0].name])
for i in range(1, len(first_op.inputs)):
output_name = add_buffer_to_image(first_op.inputs[i].name, "ARGUMENT", dt, net_def)
if self.device == 'gpu':
op_def.input.extend([op.inputs[0].name])
for i in range(1, len(op.inputs)):
output_name = self.add_buffer_to_image(op.inputs[i].name, "ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([input.name for input in first_op.inputs])
op_def.output.extend([first_op.outputs[0].name])
op_def.input.extend([input.name for input in op.inputs])
op_def.output.extend([op.outputs[0].name])
output_shape = mace_pb2.OutputShape()
output_shape.dims.extend(first_op.outputs[0].shape.as_list())
op_def.output_shape.extend([output_shape])
self.add_output_shape(op.outputs, op_def)
epsilon_arg = op_def.arg.add()
epsilon_arg.name = 'epsilon'
epsilon_arg.f = first_op.get_attr('epsilon')
epsilon_arg.f = op.get_attr('epsilon')
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
data_format_arg.s = 'NHWC'
elif first_op.type == 'Add' and first_op.name.endswith(
'batchnorm/add') and ops_count > 7:
add_op = first_op
mul_op = unresolved_ops[2]
mul_1_op = unresolved_ops[3]
mul_2_op = unresolved_ops[4]
sub_op = unresolved_ops[5]
add_1_op = unresolved_ops[6]
# print (mul_op.type, mul_2_op.type, mul_1_op.type, sub_op.type)
if mul_op.type != 'Mul' or mul_2_op.type != 'Mul' or \
mul_1_op.type != 'Mul' or sub_op.type != 'Sub' or add_1_op.type != 'Add':
self.resolved_ops[op.name] = 1
self.net_def.op.extend([op_def])
def convert_batchnorm(self, op):
bn_ops = []
bn_ops.append(op)
for i in range(1, 3):
if len(self.tf_graph[bn_ops[i-1].name]) == 1 \
and self.tf_graph[bn_ops[i-1].name][0].type == BATCH_NORM_ORDER[i]:
bn_ops.append(self.tf_graph[bn_ops[i-1].name][0])
else:
raise Exception('Invalid BatchNorm Op')
if len(self.tf_graph[bn_ops[2].name]) == 2 \
and self.tf_graph[bn_ops[2].name][0].type == BATCH_NORM_ORDER[3] \
and self.tf_graph[bn_ops[2].name][1].type == BATCH_NORM_ORDER[4]:
bn_ops.append(self.tf_graph[bn_ops[2].name][0])
bn_ops.append(self.tf_graph[bn_ops[2].name][1])
else:
raise Exception('Invalid BatchNorm Op')
bn_ops.append(self.tf_graph[bn_ops[4].name][0])
bn_ops.append(self.tf_graph[bn_ops[3].name][0])
op_def = mace_pb2.OperatorDef()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
get_input_tensor(mul_1_op, 0)
input_name = get_input_tensor(mul_1_op, 0).name
gamma = get_input_tensor(mul_op, 1).name
beta = get_input_tensor(sub_op, 0).name
mean = get_input_tensor(mul_2_op, 0).name
variance = get_input_tensor(add_op, 0).name
epsilon = get_input_tensor(add_op, 1).name
input_name = get_input_tensor(bn_ops[3], 0).name
gamma = get_input_tensor(bn_ops[2], 1).name
beta = get_input_tensor(bn_ops[5], 0).name
mean = get_input_tensor(bn_ops[4], 0).name
variance = get_input_tensor(bn_ops[0], 0).name
op_def.name = first_op.name[:-4] # remove /add
op_def.name = op.name[:-4] # remove /add
op_def.type = 'BatchNorm'
op_def.input.extend([input_name, gamma, beta, mean, variance, epsilon])
convert_op_outputs(op_def, add_1_op)
if self.device == 'gpu':
op_def.input.extend([input_name])
for tensor_name in [gamma, beta, mean, variance]:
output_name = self.add_buffer_to_image(tensor_name, "ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([input_name, gamma, beta, mean, variance])
op_def.output.extend([output.name for output in bn_ops[6].outputs])
self.add_output_shape(bn_ops[6].outputs, op_def)
epsilon_arg = op_def.arg.add()
epsilon_arg.name = 'epsilon'
epsilon_arg.f = get_input_tensor(op, 1).eval().astype(np.float)
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
data_format_arg.s = 'NHWC'
resolved_count = 7
elif first_op.type == 'Relu6':
op_def.name = first_op.name
op_def.type = 'Relu'
op_def.input.extend([input.name for input in first_op.inputs])
convert_op_outputs(op_def, first_op)
self.net_def.op.extend([op_def])
for i in range(0, 7):
self.resolved_ops[bn_ops[i].name] = 1
max_limit_arg = op_def.arg.add()
max_limit_arg.name = 'max_limit'
max_limit_arg.f = 6
elif first_op.type == 'AvgPool' or first_op.type == 'MaxPool':
op_def.name = first_op.name
def convert_pooling(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = 'Pooling'
op_def.input.extend([input.name for input in first_op.inputs])
convert_op_outputs(op_def, first_op)
op_def.input.extend([input.name for input in op.inputs])
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
pooling_type_arg = op_def.arg.add()
pooling_type_arg.name = 'pooling_type'
pooling_type_arg.i = pooling_type_mode[first_op.type]
pooling_type_arg.i = pooling_type_mode[op.type]
padding_arg = op_def.arg.add()
padding_arg.name = 'padding'
padding_arg.i = padding_mode[first_op.get_attr('padding')]
padding_arg.i = padding_mode[op.get_attr('padding')]
strides_arg = op_def.arg.add()
strides_arg.name = 'strides'
strides_arg.ints.extend(first_op.get_attr('strides')[1:3])
strides_arg.ints.extend(op.get_attr('strides')[1:3])
kernels_arg = op_def.arg.add()
kernels_arg.name = 'kernels'
kernels_arg.ints.extend(first_op.get_attr('ksize')[1:3])
kernels_arg.ints.extend(op.get_attr('ksize')[1:3])
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
data_format_arg.s = 'NHWC'
elif first_op.type == 'Add':
op_def.name = first_op.name
self.resolved_ops[op.name] = 1
def convert_relu6(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = 'Relu'
op_def.input.extend([input.name for input in op.inputs])
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
max_limit_arg = op_def.arg.add()
max_limit_arg.name = 'max_limit'
max_limit_arg.f = 6
self.resolved_ops[op.name] = 1
def convert_add(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = "AddN"
op_def.input.extend([input.name for input in first_op.inputs])
convert_op_outputs(op_def, first_op)
elif first_op.type == 'ConcatV2':
op_def.name = first_op.name
op_def.input.extend([input.name for input in op.inputs])
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
def convert_concat(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = "Concat"
op_def.input.extend([first_op.inputs[i].name for i in xrange(2)])
op_def.input.extend([op.inputs[i].name for i in xrange(2)])
op_def.output.extend([output.name for output in op.outputs])
axis_arg = op_def.arg.add()
axis_arg.name = 'axis'
axis_arg.i = get_input_tensor(first_op, 2).eval().astype(np.int32)
convert_op_outputs(op_def, first_op)
elif first_op.type == 'ResizeBilinear':
op_def.name = first_op.name
axis_arg.i = get_input_tensor(op, 2).eval().astype(np.int32)
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
def convert_resize_bilinear(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = "ResizeBilinear"
op_def.input.extend([first_op.inputs[0].name])
op_def.input.extend([op.inputs[0].name])
op_def.output.extend([output.name for output in op.outputs])
size_arg = op_def.arg.add()
size_arg.name = 'size'
size_arg.ints.extend(get_input_tensor(first_op, 1).eval().astype(np.int32).flat)
size_arg.ints.extend(get_input_tensor(op, 1).eval().astype(np.int32).flat)
size_arg = op_def.arg.add()
size_arg.name = 'align_corners'
size_arg.i = first_op.get_attr('align_corners')
convert_op_outputs(op_def, first_op)
elif first_op.type in ['Relu', 'SpaceToBatchND', 'BatchToSpaceND', 'BiasAdd']:
op_def.name = first_op.name
op_def.type = first_op.type
op_def.input.extend([input.name for input in first_op.inputs])
convert_op_outputs(op_def, first_op)
size_arg.i = op.get_attr('align_corners')
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
def convert_bias_add(self, op):
op_def = mace_pb2.OperatorDef()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = "BiasAdd"
op_def.input.extend([op.inputs[0].name])
if self.device == 'gpu':
output_name = self.add_buffer_to_image(op.inputs[1].name, "ARGUMENT")
op_def.input.extend([output_name])
else:
raise Exception('Unknown Op: %s, type: %s' % (first_op.name, first_op.type))
op_def.input.extend([op.inputs[1].name])
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
self.net_def.op.extend([op_def])
self.resolved_ops[op.name] = 1
def convert_normal_op(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = op.type
op_def.input.extend([input.name for input in op.inputs])
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
def convert(self, input_node, output_node):
if self.device == 'gpu':
self.add_input_transform(input_node)
for op in self.tf_ops:
if self.resolved_ops[op.name] == 1:
continue
if op.type in ['Placeholder', 'Reshape', 'Identity']:
self.resolved_ops[op.name] = 1
pass
elif op.type == 'Const':
self.convert_tensor(op)
elif op.type == 'Conv2D' or op.type == 'DepthwiseConv2dNative':
self.convert_conv2d(op)
elif op.type == 'FusedBatchNorm':
self.convert_fused_batchnorm(op)
elif op.type == 'Add' and op.name.endswith('batchnorm/add'):
self.convert_batchnorm(op)
elif op.type == 'AvgPool' or op.type == 'MaxPool':
self.convert_pooling(op)
elif op.type == 'Relu6':
self.convert_relu6(op)
elif op.type == 'Add':
self.convert_add(op)
elif op.type == 'ConcatV2':
self.convert_concat(op)
elif op.type == 'ResizeBilinear':
self.convert_resize_bilinear(op)
elif op.type == 'BiasAdd':
self.convert_bias_add(op)
elif op.type in ['Relu', 'SpaceToBatchND', 'BatchToSpaceND']:
self.convert_normal_op(op)
else:
raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type))
for i in range(resolved_count):
del unresolved_ops[0]
if self.device == 'gpu':
self.add_output_transform(output_node)
for key in self.resolved_ops:
if self.resolved_ops[key] != 1:
print 'Unresolve Op: %s' % key
def convert_to_mace_pb(input_graph_def, input_node, output_node, data_type, device):
net_def = mace_pb2.NetDef()
......@@ -311,14 +425,11 @@ def convert_to_mace_pb(input_graph_def, input_node, output_node, data_type, devi
with session.graph.as_default() as graph:
tf.import_graph_def(input_graph_def, name="")
ops = graph.get_operations()
unresolved_ops = ops
if device == 'gpu':
add_input_transform(input_node, dt, net_def)
while len(unresolved_ops) > 0:
convert_ops(unresolved_ops, dt, net_def, device)
if device == 'gpu':
add_output_transform(output_node, net_def)
print "PB Parsed."
converter = TFConverter(ops, net_def, dt, device)
converter.convert(input_node, output_node)
print "PB Converted, start optimize memory."
mem_optimizer = memory_optimizer.MemoryOptimizer(net_def)
mem_optimizer.optimize()
print "Memory optimization done."
return net_def
......@@ -149,6 +149,7 @@ def convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops):
elif is_node_flatten_reshape(first_op):
op_def.type = 'Flatten'
op_def.input.extend([t.name for t in first_op.inputs])
op_def.out_max_byte_size.extend([max_elem_size(out) for out in first_op.outputs])
convert_op_outputs(op_def, first_op)
elif dsp_ops.has_op(first_op.type):
op_def.input.extend([t.name for t in first_op.inputs])
......
......@@ -4,6 +4,7 @@ import os
import os.path
import tensorflow as tf
import numpy as np
from scipy import spatial
from tensorflow import gfile
......@@ -34,9 +35,12 @@ def load_data(file):
def valid_output(out_shape, mace_out_file, tf_out_value):
mace_out_value = load_data(mace_out_file)
if mace_out_value.size != 0:
similarity = (1 - spatial.distance.cosine(tf_out_value.flat, mace_out_value))
print 'MACE VS TF similarity: ', similarity
if similarity > 0.999:
print '=======================Passed! Haha======================'
mace_out_value = mace_out_value.reshape(out_shape)
np.testing.assert_allclose(mace_out_value, tf_out_value, rtol=0.05)
print '=======================Passed! Haha======================'
else:
print '=======================Skip empty node==================='
......@@ -62,7 +66,7 @@ def run_model(input_shape):
input_value = input_value.reshape(input_shape)
output_value = session.run(output_node, feed_dict={input_node: [input_value]})
# output_value.astype(np.float32).tofile( os.path.dirname(FLAGS.input_file) + '/tf_weight')
output_value.astype(np.float32).tofile( os.path.dirname(FLAGS.input_file) + '/tf_out')
return output_value
def main(unused_args):
......
......@@ -2,10 +2,10 @@
# Must run at root dir of mace project.
set +x
Usage() {
echo 'Usage: bash tools/validate_gcn.sh tf_model_file'
echo 'Usage: bash tools/validate_gcn.sh tf_model_path image_size'
}
if [ $# != 1 ];then
if [ $# != 2 ];then
Usage
exit -1
fi
......@@ -13,18 +13,18 @@ fi
TF_MODEL_FILE_PATH=$1
MODEL_DIR=$(dirname ${TF_MODEL_FILE_PATH})
MACE_MODEL_NAME='mace_model.pb'
MACE_OPT_MODEL_NAME='mace_opt_model.pb'
INPUT_FILE_NAME='model_input'
OUTPUT_FILE_NAME='gcn.out'
OUTPUT_LIST_FILE='gcn.list'
PHONE_DATA_DIR="/data/local/tmp/${MACE_MODEL_NAME}"
KERNEL_DIR="${PHONE_DATA_DIR}/cl/"
IMAGE_SIZE=$2
# Step 1: Generate input data
echo "Step 1: Generate input data"
python tools/validate.py --generate_data true --random_seed 1 \
--input_file=${MODEL_DIR}/${INPUT_FILE_NAME} \
--input_shape=512,512,3
--input_shape="${IMAGE_SIZE},${IMAGE_SIZE},3"
# Step 2: convert tf model to mace model
echo "Step 2: convert tf model to mace model and optimize memory"
......@@ -35,10 +35,6 @@ bazel-bin/mace/python/tools/tf_converter --input=${TF_MODEL_FILE_PATH} \
--output_node=GCN/br_result_2/fcn_br \
--data_type=DT_HALF \
--runtime=gpu
bazel build mace/python/tools:memory_optimizer
bazel-bin/mace/python/tools/memory_optimizer ${MODEL_DIR}/${MACE_MODEL_NAME} \
${MODEL_DIR}/${MACE_OPT_MODEL_NAME}
# Step 3: Run model on the phone
echo "Step 3: Run model on the phone"
......@@ -49,21 +45,22 @@ bazel build -c opt --strip always mace/examples:mace_run \
adb shell "mkdir -p ${PHONE_DATA_DIR}"
adb shell "mkdir -p ${KERNEL_DIR}"
adb push mace/kernels/opencl/cl/ ${KERNEL_DIR}
adb push ${MODEL_DIR}/${MACE_OPT_MODEL_NAME} ${PHONE_DATA_DIR}
adb push mace/kernels/opencl/cl/* ${KERNEL_DIR}
adb push ${MODEL_DIR}/${MACE_MODEL_NAME} ${PHONE_DATA_DIR}
adb push ${MODEL_DIR}/${INPUT_FILE_NAME} ${PHONE_DATA_DIR}
adb push bazel-bin/mace/examples/mace_run ${PHONE_DATA_DIR}
num_threads=${1:-4}
adb </dev/null shell MACE_RUN_PARAMETER_PATH=${PHONE_DATA_DIR}/mace_run.config \
adb </dev/null shell MACE_CPP_MIN_VLOG_LEVEL=0 \
MACE_RUN_PARAMETER_PATH=${PHONE_DATA_DIR}/mace_run.config \
MACE_KERNEL_PATH=$KERNEL_DIR \
OMP_NUM_THREADS=$num_threads \
${PHONE_DATA_DIR}/mace_run \
--model=${PHONE_DATA_DIR}/${MACE_OPT_MODEL_NAME} \
--model=${PHONE_DATA_DIR}/${MACE_MODEL_NAME} \
--input=mace_input_node \
--output=mace_output_node \
--input_shape=1,512,512,3\
--input_shape="1,${IMAGE_SIZE},${IMAGE_SIZE},3"\
--input_file=${PHONE_DATA_DIR}/${INPUT_FILE_NAME} \
--output_file=${PHONE_DATA_DIR}/${OUTPUT_FILE_NAME} \
--device=OPENCL \
......@@ -81,4 +78,5 @@ python tools/validate.py --model_file ${TF_MODEL_FILE_PATH} \
--mace_out_file ${MODEL_DIR}/${OUTPUT_FILE_NAME} \
--input_node input \
--output_node GCN/br_result_2/fcn_br\
--output_shape 1,512,512,2
--input_shape "${IMAGE_SIZE},${IMAGE_SIZE},3" \
--output_shape "1,${IMAGE_SIZE},${IMAGE_SIZE},2"
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册