提交 4888bd5f 编写于 作者: 叶剑武

Merge branch 'fix-one-hot' into 'master'

Remove the GPU code of the OneHot op, since it only supports 1D input.

See merge request !1039
......@@ -79,12 +79,10 @@ MemoryBlock MemoryOptimizer::CreateMemoryBlock(
*op_def, "buffer_type", OpenCLBufferType::IN_OUT_CHANNEL));
}
std::vector<size_t> image_shape;
if (shape.size() == 1) {
shape = {shape[0], 1, 1, 1};
} else if (shape.size() == 2) {
if (shape.size() == 2) {
shape = {shape[0], 1, 1, shape[1]};
} else {
MACE_CHECK(shape.size() == 4) << "GPU only support 1D/2D/4D input";
MACE_CHECK(shape.size() == 4) << "GPU only support 2D/4D input";
}
OpenCLUtil::CalImage2DShape(shape, buffer_type, &image_shape);
block.set_x(image_shape[0]);
......
......@@ -29,22 +29,21 @@ class ConcatOpBase : public Operation {
public:
explicit ConcatOpBase(OpConstructContext *context)
: Operation(context),
axis_(Operation::GetOptionalArg<int>("axis", 3)),
checked_(false) {}
axis_(Operation::GetOptionalArg<int>("axis", 3)) {}
protected:
void Validate() {
int FormatAxis() {
const int32_t input_dims = this->Input(0)->dim_size();
axis_ =
axis_ < 0 ? axis_ + input_dims : axis_;
MACE_CHECK((0 <= axis_ && axis_ < input_dims),
"Expected concatenating axis in the range [", -input_dims, ", ",
input_dims, "], but got ", axis_);
return axis_;
}
protected:
int axis_;
bool checked_;
};
template <DeviceType D, class T>
......@@ -54,20 +53,17 @@ template <typename T>
class ConcatOp<DeviceType::CPU, T> : public ConcatOpBase {
public:
explicit ConcatOp(OpConstructContext *context)
: ConcatOpBase(context) {}
: ConcatOpBase(context),
has_data_format_(Operation::GetOptionalArg<int>(
"has_data_format", 0) == 1) {}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
if (!checked_) {
Validate();
auto has_df = Operation::GetOptionalArg<int>(
"has_data_format", 0);
if (has_df && this->Input(0)->dim_size() == 4) {
if (axis_ == 3) axis_ = 1;
else if (axis_ == 2) axis_ = 3;
else if (axis_ == 1) axis_ = 2;
}
checked_ = true;
int axis = FormatAxis();
if (has_data_format_ && this->Input(0)->dim_size() == 4) {
if (axis == 3) axis = 1;
else if (axis == 2) axis = 3;
else if (axis == 1) axis = 2;
}
const std::vector<const Tensor *> &inputs = this->Inputs();
Tensor *output = this->Output(0);
......@@ -76,7 +72,7 @@ class ConcatOp<DeviceType::CPU, T> : public ConcatOpBase {
std::vector<index_t> output_shape(input0->shape());
index_t inner_size = 1;
for (int i = 0; i < axis_; ++i) {
for (int i = 0; i < axis; ++i) {
inner_size *= output_shape[i];
}
std::vector<index_t> outer_sizes(inputs_count, 0);
......@@ -86,14 +82,14 @@ class ConcatOp<DeviceType::CPU, T> : public ConcatOpBase {
MACE_CHECK(input->dim_size() == input0->dim_size(),
"Ranks of all input tensors must be same.");
for (int j = 0; j < input->dim_size(); ++j) {
if (j == axis_) {
if (j == axis) {
continue;
}
MACE_CHECK(input->dim(j) == input0->dim(j),
"Dimensions of inputs should equal except axis.");
}
outer_sizes[i] = input->size() / inner_size;
output_shape[axis_] += input->dim(axis_);
output_shape[axis] += input->dim(axis);
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
......@@ -119,6 +115,9 @@ class ConcatOp<DeviceType::CPU, T> : public ConcatOpBase {
return MaceStatus::MACE_SUCCESS;
}
private:
bool has_data_format_;
};
#ifdef MACE_ENABLE_QUANTIZE
......@@ -130,7 +129,7 @@ class ConcatOp<DeviceType::CPU, uint8_t> : public ConcatOpBase {
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
Validate();
int axis = FormatAxis();
const std::vector<const Tensor *> &inputs = this->Inputs();
Tensor *output = this->Output(0);
MACE_CHECK(output->scale() != 0);
......@@ -139,7 +138,7 @@ class ConcatOp<DeviceType::CPU, uint8_t> : public ConcatOpBase {
std::vector<index_t> output_shape(input0->shape());
index_t inner_size = 1;
for (int i = 0; i < axis_; ++i) {
for (int i = 0; i < axis; ++i) {
inner_size *= output_shape[i];
}
std::vector<index_t> outer_sizes(inputs_count, 0);
......@@ -149,14 +148,14 @@ class ConcatOp<DeviceType::CPU, uint8_t> : public ConcatOpBase {
MACE_CHECK(input->dim_size() == input0->dim_size(),
"Ranks of all input tensors must be same.");
for (int j = 0; j < input->dim_size(); ++j) {
if (j == axis_) {
if (j == axis) {
continue;
}
MACE_CHECK(input->dim(j) == input0->dim(j),
"Dimensions of inputs should equal except axis.");
}
outer_sizes[i] = input->size() / inner_size;
output_shape[axis_] += input->dim(axis_);
output_shape[axis] += input->dim(axis);
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
......@@ -200,15 +199,14 @@ class ConcatOp<DeviceType::GPU, T> : public ConcatOpBase {
explicit ConcatOp(OpConstructContext *context)
: ConcatOpBase(context) {
if (context->device()->gpu_runtime()->UseImageMemory()) {
kernel_ = make_unique<opencl::image::ConcatKernel<T>>(axis_);
kernel_ = make_unique<opencl::image::ConcatKernel<T>>();
} else {
MACE_NOT_IMPLEMENTED;
}
}
MaceStatus Run(OpContext *context) override {
Validate();
Tensor *output = this->Output(0);
return kernel_->Compute(context, inputs_, output);
return kernel_->Compute(context, inputs_, FormatAxis(), output);
}
private:
......
......@@ -16,9 +16,6 @@
#include <memory>
#include "mace/core/operator.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/image/one_hot.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace ops {
......@@ -148,52 +145,9 @@ class OneHotOp<DeviceType::CPU, T> : public OneHotOpBase {
}
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class OneHotOp<DeviceType::GPU, T> : public OneHotOpBase {
public:
explicit OneHotOp(OpConstructContext *context) : OneHotOpBase(context) {
if (context->device()->gpu_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::OneHotKernel<T>(
depth_, on_value_, off_value_, axis_));
} else {
MACE_NOT_IMPLEMENTED;
}
}
MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(0);
Tensor *output = this->Output(0);
return kernel_->Compute(context, input, output);
}
private:
std::unique_ptr<OpenCLOneHotKernel> kernel_;
};
#endif // MACE_ENABLE_OPENCL
void RegisterOneHot(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "OneHot", OneHotOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "OneHot", OneHotOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "OneHot", OneHotOp, DeviceType::GPU, half);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("OneHot")
.SetDevicePlacerFunc(
[](OpConstructContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
}
if (op->output_shape(0).dims_size() != 2) {
return { DeviceType::CPU };
}
return { DeviceType::CPU, DeviceType::GPU };
}));
#endif // MACE_ENABLE_OPENCL
}
} // namespace ops
......
......@@ -61,9 +61,7 @@ void OneHot(int iters, int batch, int depth, int axis) {
MACE_BENCHMARK(MACE_BM_ONE_HOT_##N##_##DEPTH##_##AXIS##_##TYPE##_##DEVICE)
#define MACE_BM_ONE_HOT(N, DEPTH, AXIS) \
MACE_BM_ONE_HOT_MACRO(N, DEPTH, AXIS, float, CPU); \
MACE_BM_ONE_HOT_MACRO(N, DEPTH, AXIS, float, GPU); \
MACE_BM_ONE_HOT_MACRO(N, DEPTH, AXIS, half, GPU);
MACE_BM_ONE_HOT_MACRO(N, DEPTH, AXIS, float, CPU);
MACE_BM_ONE_HOT(512, 16, 0);
MACE_BM_ONE_HOT(512, 16, 1);
......
......@@ -45,7 +45,6 @@ void TestOneHot(const std::vector<index_t> &input_shape,
.AddFloatArg("on_value", on_value)
.AddFloatArg("off_value", off_value)
.AddIntArg("axis", axis)
.AddIntArg("data_format", DataFormat::NHWC)
.Finalize(net.NewOperatorDef());
// Run
......@@ -77,10 +76,6 @@ TEST_F(OneHotTest, Dim1) {
TestOneHot<DeviceType::CPU, float>(input_shape, input_data, expected_shape,
expected_data, 5, -1);
TestOneHot<DeviceType::GPU, float>(input_shape, input_data, expected_shape,
expected_data, 5, -1);
TestOneHot<DeviceType::GPU, half>(input_shape, input_data, expected_shape,
expected_data, 5, -1);
expected_shape = {5, 10};
expected_data = {
......@@ -93,10 +88,6 @@ TEST_F(OneHotTest, Dim1) {
TestOneHot<DeviceType::CPU, float>(input_shape, input_data, expected_shape,
expected_data, 5, 0);
TestOneHot<DeviceType::GPU, float>(input_shape, input_data, expected_shape,
expected_data, 5, 0);
TestOneHot<DeviceType::GPU, half>(input_shape, input_data, expected_shape,
expected_data, 5, 0);
}
TEST_F(OneHotTest, OnOffValue) {
......@@ -111,10 +102,6 @@ TEST_F(OneHotTest, OnOffValue) {
TestOneHot<DeviceType::CPU, float>(input_shape, input_data, expected_shape,
expected_data, 6, -1, 7, 8);
TestOneHot<DeviceType::GPU, float>(input_shape, input_data, expected_shape,
expected_data, 6, -1, 7, 8);
TestOneHot<DeviceType::GPU, half>(input_shape, input_data, expected_shape,
expected_data, 6, -1, 7, 8);
}
TEST_F(OneHotTest, Dim2) {
......
#include <common.h>
// One-hot encoding kernel for a 1-D input tensor stored as an image2d_t.
// Each work-item writes one DATA_TYPE4 pixel (4 packed output values) at
// (channel_idx, batch_idx) in the output image.
//
// With AXIS_0 defined, the output layout is {depth, N} (the class index is
// the image row): each work-item scans up to 4 consecutive input entries
// starting at channel_idx * 4 and sets the matching lane to on_value when
// the input value equals the current row index (batch_idx).
// Without AXIS_0, the output layout is {N, depth}: the work-item reads the
// single input value for row batch_idx and lights the lane whose packed
// column (i / 4, lane i % 4) matches channel_idx.
__kernel void one_hot(OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only image2d_t input,
__write_only image2d_t output,
#ifdef AXIS_0
__private const int in_size,   // number of elements in the 1-D input
#endif
__private const float on_value,
__private const float off_value) {
const int channel_idx = get_global_id(0);
const int batch_idx = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
// Guard against the padded global work size used with uniform work-groups.
if (channel_idx >= global_size_dim0 || batch_idx >= global_size_dim1) {
return;
}
#endif
// Start with all four lanes at off_value; flip matching lanes to on_value.
DATA_TYPE4 out = off_value;
#ifdef AXIS_0
// axis == 0 path: compare up to 4 consecutive input values against this
// row's class index (batch_idx). The nested ifs stop early at in_size.
int in_idx = channel_idx * 4;
DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(0, in_idx));
if (in.s0 == batch_idx) {
out.s0 = on_value;
}
if (++in_idx < in_size) {
in = READ_IMAGET(input, SAMPLER, (int2)(0, in_idx));
if (in.s0 == batch_idx) {
out.s1 = on_value;
}
if (++in_idx < in_size) {
in = READ_IMAGET(input, SAMPLER, (int2)(0, in_idx));
if (in.s0 == batch_idx) {
out.s2 = on_value;
}
if (++in_idx < in_size) {
in = READ_IMAGET(input, SAMPLER, (int2)(0, in_idx));
if (in.s0 == batch_idx) {
out.s3 = on_value;
}
}
}
}
#else
// last-axis path: one input value per output row; select the packed pixel
// (i / 4) and lane (i % 4) corresponding to the class index.
DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(0, batch_idx));
int i = in.s0;
if (i / 4 == channel_idx) {
switch (i % 4) {
case 0:
out.s0 = on_value;
break;
case 1:
out.s1 = on_value;
break;
case 2:
out.s2 = on_value;
break;
case 3:
out.s3 = on_value;
break;
}
}
#endif
WRITE_IMAGET(output, (int2)(channel_idx, batch_idx), out);
}
......@@ -31,6 +31,7 @@ class OpenCLConcatKernel {
virtual MaceStatus Compute(
OpContext *context,
const std::vector<const Tensor *> &input_list,
const int32_t axis,
Tensor *output) = 0;
MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLConcatKernel);
};
......
......@@ -34,8 +34,6 @@ std::vector<index_t> FormatBufferShape(
return buffer_shape;
} else if (buffer_shape_size == 2) { // NC
return {buffer_shape[0], 1, 1, buffer_shape[1]};
} else if (buffer_shape_size == 1) { // N
return {buffer_shape[0], 1, 1, 1};
} else {
LOG(FATAL) << "GPU only support 2D or 4D input and output";
}
......
......@@ -48,14 +48,14 @@ MaceStatus ConcatN(OpContext *context,
template <typename T>
class ConcatKernel : public OpenCLConcatKernel {
public:
explicit ConcatKernel(const int32_t axis) : axis_(axis) {}
ConcatKernel() {}
MaceStatus Compute(
OpContext *context,
const std::vector<const Tensor *> &input_list,
const int32_t axis,
Tensor *output) override;
private:
int32_t axis_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
......@@ -65,6 +65,7 @@ template <typename T>
MaceStatus ConcatKernel<T>::Compute(
OpContext *context,
const std::vector<const Tensor *> &input_list,
const int32_t axis,
Tensor *output) {
const int inputs_count = input_list.size();
......@@ -76,13 +77,13 @@ MaceStatus ConcatKernel<T>::Compute(
MACE_CHECK(input->dim_size() == input0->dim_size(),
"Ranks of all input tensors must be same.");
for (int j = 0; j < input->dim_size(); ++j) {
if (j == axis_) {
if (j == axis) {
continue;
}
MACE_CHECK(input->dim(j) == input0->dim(j),
"Dimensions of inputs should equal except axis.");
}
output_shape[axis_] += input->dim(axis_);
output_shape[axis] += input->dim(axis);
}
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(output_shape,
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_OPENCL_IMAGE_ONE_HOT_H_
#define MACE_OPS_OPENCL_IMAGE_ONE_HOT_H_
#include "mace/ops/opencl/one_hot.h"
#include <memory>
#include <vector>
#include <set>
#include <string>
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// OpenCL-image implementation of the OneHot op. Only 1-D input is
// supported (enforced in Compute); the result is a 2-D one-hot matrix
// whose orientation depends on axis_.
template <typename T>
class OneHotKernel : public OpenCLOneHotKernel {
public:
// depth:     length of the one-hot (class) dimension.
// on_value:  value written at the hot position.
// off_value: value written everywhere else.
// axis:      axis at which the one-hot dimension is inserted; -1 means
//            after the last input axis.
OneHotKernel(const int depth, const float on_value,
const float off_value, const int axis)
: depth_(depth), on_value_(on_value),
off_value_(off_value), axis_(axis) {}
MaceStatus Compute(
OpContext *context,
const Tensor *input,
Tensor *output) override;
private:
int depth_;          // length of the one-hot dimension
float on_value_;     // value at the hot position
float off_value_;    // value at all other positions
int axis_;           // insertion axis; -1 == last
cl::Kernel kernel_;  // compiled kernel, built lazily on first Compute()
uint32_t kwg_size_;  // max work-group size reported for kernel_
std::vector<index_t> input_shape_;  // last-seen shape, to skip re-setting args
};
// Runs the one-hot GPU kernel. Requires a 1-D input of length N and
// produces a 2-D output: {depth_, N} when the effective axis is 0,
// otherwise {N, depth_}. The kernel is compiled once and its arguments
// are refreshed only when the input shape changes.
template <typename T>
MaceStatus OneHotKernel<T>::Compute(
OpContext *context,
const Tensor *input,
Tensor *output) {
auto input_shape = input->shape();
// axis_ == -1 means "append the one-hot dim after the last input axis".
index_t axis = axis_ == -1 ? input->dim_size() : axis_;
MACE_CHECK(input->dim_size() == 1, "OneHot GPU only supports 1D input");
MACE_CHECK(axis >= 0 && axis <= input->dim_size());
std::vector<index_t> output_shape =
axis == 0 ? std::vector<index_t>{depth_, input_shape[0]} :
std::vector<index_t>{input_shape[0], depth_};
// Image layout: width packs dim 1 into groups of 4 (one pixel each),
// height is dim 0.
std::vector<size_t> output_image_shape{
static_cast<size_t>(RoundUpDiv4(output_shape[1])),
static_cast<size_t>(output_shape[0])};
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
// Build (and cache) the kernel on first use.
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("one_hot");
built_options.emplace("-Done_hot=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
// AXIS_0 selects the transposed ({depth, N}) code path in the .cl kernel.
if (axis == 0) {
built_options.emplace("-DAXIS_0");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("one_hot", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
// One work-item per output image pixel (4 packed values each).
const uint32_t gws[2] = {
static_cast<uint32_t>(output_image_shape[0]),
static_cast<uint32_t>(output_image_shape[1])
};
MACE_OUT_OF_RANGE_INIT(kernel_);
// Only re-set kernel arguments when the input shape changed since last run;
// the argument order must match the kernel's parameter list exactly.
if (!IsVecEqual(input_shape_, input->shape())) {
int idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
if (axis == 0) {
// The AXIS_0 kernel variant takes the input length for bounds checks.
kernel_.setArg(idx++, static_cast<int>(input_shape[0]));
}
kernel_.setArg(idx++, on_value_);
kernel_.setArg(idx++, off_value_);
input_shape_ = input->shape();
}
// NOTE(review): lws[0] becomes 0 when kwg_size_ < 64 — presumably
// TuningOrRun2DKernel treats a zero entry as "use default"; confirm.
const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
std::string tuning_key = Concat("one_hot", output->dim(0), output->dim(1));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
#endif // MACE_OPS_OPENCL_IMAGE_ONE_HOT_H_
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_OPENCL_ONE_HOT_H_
#define MACE_OPS_OPENCL_ONE_HOT_H_
#include "mace/public/mace.h"
#include "mace/utils/utils.h"
namespace mace {
class OpContext;
class Tensor;
namespace ops {
// Device-agnostic interface for the OpenCL OneHot kernel; concrete
// implementations (e.g. the image-memory variant) derive from this.
class OpenCLOneHotKernel {
public:
// One-hot encodes `input` and writes the result to `output`.
virtual MaceStatus Compute(
OpContext *context,
const Tensor *input,
Tensor *output) = 0;
MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLOneHotKernel);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_OPENCL_ONE_HOT_H_
......@@ -50,7 +50,6 @@ def _opencl_encrypt_kernel_impl(repository_ctx):
unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/fully_connected.cl"))
unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/lstmcell.cl"))
unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/matmul.cl"))
unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/one_hot.cl"))
unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/pad.cl"))
unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/pooling.cl"))
unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/pooling_buffer.cl"))
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册