Commit 927cdbb5 authored by Raman Sarokin, committed by TensorFlower Gardener

Added a specialized version of ConvolutionTransposed for 3x3 kernels with 2x2 stride.

Supported on PowerVR, NVIDIA, and AMD.

PiperOrigin-RevId: 293522528
Change-Id: Ieaf14db078e379244cb2323b3a8b7ad74dd8d92f
Parent 7e231e30
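For reference, a minimal host-side sketch of what the specialized kernel computes: a single-channel transposed convolution with a 3x3 kernel, 2x2 stride, and prepended padding of 1, matching the unit test at the bottom of this change. The function name and data layout here are illustrative only, not part of the commit.

#include <vector>

// Naive reference: transposed conv, 3x3 kernel, 2x2 stride, padding.prepended = 1.
// dst is sized (2 * src_h) x (2 * src_w), as in the unit test below.
std::vector<float> TransposedConv3x3Stride2(const std::vector<float>& src,
                                            int src_h, int src_w,
                                            const float weights[3][3],
                                            float bias) {
  const int dst_h = 2 * src_h;
  const int dst_w = 2 * src_w;
  std::vector<float> dst(dst_h * dst_w, bias);
  for (int y = 0; y < src_h; ++y) {
    for (int x = 0; x < src_w; ++x) {
      // Each source pixel scatters a weighted 3x3 patch into the output,
      // anchored at (2*y - 1, 2*x - 1) because of the prepended padding.
      for (int ky = 0; ky < 3; ++ky) {
        for (int kx = 0; kx < 3; ++kx) {
          const int dy = 2 * y - 1 + ky;
          const int dx = 2 * x - 1 + kx;
          if (dy >= 0 && dy < dst_h && dx >= 0 && dx < dst_w) {
            dst[dy * dst_w + dx] += src[y * src_w + x] * weights[ky][kx];
          }
        }
      }
    }
  }
  return dst;
}

The generated OpenCL kernel below computes the same result gather-style: each work item owns a 2x2 block of output pixels and accumulates the nine kernel taps from a 2x2 neighborhood of source pixels.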
tensorflow/lite/delegates/gpu/cl/kernels/BUILD
@@ -423,6 +423,47 @@ cc_library(
],
)
cc_library(
name = "convolution_transposed_3x3",
srcs = ["convolution_transposed_3x3.cc"],
hdrs = ["convolution_transposed_3x3.h"],
deps = [
":gpu_operation",
":util",
":work_group_picking",
"//tensorflow/lite/delegates/gpu/cl:buffer",
"//tensorflow/lite/delegates/gpu/cl:linear_storage",
"//tensorflow/lite/delegates/gpu/cl:precision",
"//tensorflow/lite/delegates/gpu/cl:tensor",
"//tensorflow/lite/delegates/gpu/cl:tensor_type",
"//tensorflow/lite/delegates/gpu/cl:util",
"//tensorflow/lite/delegates/gpu/common:data_type",
"//tensorflow/lite/delegates/gpu/common:operations",
"//tensorflow/lite/delegates/gpu/common:shape",
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/common:tensor",
"//tensorflow/lite/delegates/gpu/common:types",
],
)
cc_test(
name = "convolution_transposed_3x3_test",
srcs = ["convolution_transposed_3x3_test.cc"],
linkstatic = True,
tags = [
"linux",
"local",
],
deps = [
":cl_test",
":convolution_transposed_3x3",
"//tensorflow/lite/delegates/gpu/cl:tensor",
"//tensorflow/lite/delegates/gpu/common:operations",
"//tensorflow/lite/delegates/gpu/common:status",
"@com_google_googletest//:gtest_main",
],
)
cc_library(
name = "convolution_transposed_3x3_thin",
srcs = ["convolution_transposed_3x3_thin.cc"],
......

tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.cc
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.h"
#include <string>
#include <utility>
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
std::string GenerateConvolutionTransposedCode(
const OperationDef& op_def,
const std::vector<ElementwiseOperation*>& linked_operations,
ConvolutionTransposed3x3::WeightsUploadType weights_upload_type,
int2 padding, int3 work_group_launch_order) {
std::string c = GetCommonDefines(op_def.precision);
TensorCodeGenerator src_tensor(
"src_data", WHSPoint{"src_size.x", "src_size.y", "src_size.z"},
op_def.src_tensors[0]);
TensorCodeGenerator dst_tensor(
"dst_data", WHSPoint{"dst_size.x", "dst_size.y", "dst_size.z"},
op_def.dst_tensors[0]);
const auto src_tensor_type = op_def.src_tensors[0].storage_type;
const bool manual_clamp = src_tensor_type == TensorStorageType::BUFFER ||
src_tensor_type == TensorStorageType::IMAGE_BUFFER;
const bool need_local_mem =
weights_upload_type ==
ConvolutionTransposed3x3::WeightsUploadType::LOCAL_MEM_BY_THREADS ||
weights_upload_type ==
ConvolutionTransposed3x3::WeightsUploadType::LOCAL_MEM_ASYNC;
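// CONV(R, SRC, F) accumulates a 4x4 matrix-vector product: weights_cache[F..F+3]
// hold the four FLT4 weight columns for one kernel tap, one per input channel of SRC.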
switch (op_def.precision) {
case CalculationsPrecision::F32:
case CalculationsPrecision::F16:
c += "#define CONV(R, SRC, F) \\\n";
c += " R += SRC.x * weights_cache[F]; \\\n";
c += " R += SRC.y * weights_cache[F + 1]; \\\n";
c += " R += SRC.z * weights_cache[F + 2]; \\\n";
c += " R += SRC.w * weights_cache[F + 3]; \n";
break;
case CalculationsPrecision::F32_F16:
c += "#define CONV(R, SRC, F) \\\n";
c += " R += convert_float4(SRC.x * weights_cache[F] + SRC.y * "
"weights_cache[F + 1] + SRC.z * weights_cache[F + 2] + SRC.w * "
"weights_cache[F + 3]);\n";
break;
}
const std::string weights_space =
weights_upload_type ==
ConvolutionTransposed3x3::WeightsUploadType::CONSTANT_MEM
? "__constant"
: "__global";
const std::string pixel_stride =
op_def.IsBatchSupported() ? "dst_size.w" : "1";
c += "__attribute__((reqd_work_group_size(8, 4, 1)))\n";
c += "__kernel void main_function(\n";
c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
c += " " + weights_space + " FLT4* filters,\n";
c += " __read_only image2d_t biases";
c += GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int4 src_size, \n";
c += " int4 dst_size, \n";
c += " int filter_offset, \n";
c += " int2 padding \n";
c += ") {\n";
int3 launch_remap;
launch_remap[work_group_launch_order.x] = 0;
launch_remap[work_group_launch_order.y] = 1;
launch_remap[work_group_launch_order.z] = 2;
auto GetGlobalID = [&](int id) {
const std::string sid = std::to_string(id);
if (work_group_launch_order[id] == id) {
return "get_global_id(" + sid + ")";
} else {
return "get_group_id(" + std::to_string(launch_remap[id]) +
") * get_local_size(" + sid + ") + get_local_id(" + sid + ")";
}
};
if (op_def.IsBatchSupported()) {
c += " int linear_id = " + GetGlobalID(0) + ";\n";
c += " int X0 = linear_id / dst_size.w;\n";
c += " int B = linear_id % dst_size.w;\n";
c += " int DST_X = X0 * 2 * dst_size.w + B;\n";
c += " int SRC_X = linear_id + padding.x;\n";
} else {
c += " int X = " + GetGlobalID(0) + ";\n";
c += " int DST_X = X * 2;\n";
c += " int SRC_X = X + padding.x;\n";
}
c += " int Y = " + GetGlobalID(1) + ";\n";
c += " int DST_Y = Y * 2;\n";
c += " int SRC_Y = Y + padding.y;\n";
c += " int Z = " + GetGlobalID(2) + ";\n";
if (!need_local_mem) {
c += " if (DST_X >= dst_size.x || DST_Y >= dst_size.y || Z >= dst_size.z) "
"return;\n";
}
c += " ACCUM_FLT4 r0 = (ACCUM_FLT4)(0.0f);\n";
c += " ACCUM_FLT4 r1 = (ACCUM_FLT4)(0.0f);\n";
c += " ACCUM_FLT4 r2 = (ACCUM_FLT4)(0.0f);\n";
c += " ACCUM_FLT4 r3 = (ACCUM_FLT4)(0.0f);\n";
c += " int f_offset = Z * filter_offset;\n";
if (need_local_mem) {
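// 36 = 3x3 taps * 4 FLT4 columns per tap: all the weights for one destination slice.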
c += " __local FLT4 weights_cache[36];\n";
}
if (weights_upload_type ==
ConvolutionTransposed3x3::WeightsUploadType::LOCAL_MEM_BY_THREADS) {
c += " int local_id = (int)(get_local_id(1) * 8 + get_local_id(0));\n";
}
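// BUFFER and IMAGE_BUFFER storage types have no hardware boundary handling, so
// out-of-bounds source reads must be masked manually.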
if (manual_clamp) {
const std::string layer_offset = "src_size.x * src_size.y";
const std::string next_x = "SRC_X + " + pixel_stride;
c += " bool in_x0 = SRC_X >= 0 && SRC_X < src_size.x;\n";
c += " bool in_x1 = " + next_x + " >= 0 && " + next_x + " < src_size.x;\n";
c += " bool in_y0 = SRC_Y >= 0 && SRC_Y < src_size.y;\n";
c += " bool in_y1 = SRC_Y + 1 >= 0 && SRC_Y + 1 < src_size.y;\n";
if (src_tensor_type == TensorStorageType::BUFFER) {
c += " int xc0 = clamp(SRC_X, 0, src_size.x - 1);\n";
c += " int xc1 = clamp(" + next_x + ", 0, src_size.x - 1);\n";
c += " int yc0 = clamp(SRC_Y, 0, src_size.y - 1);\n";
c += " int yc1 = clamp(SRC_Y + 1, 0, src_size.y - 1);\n";
c += " " + src_tensor.GetAddressWHS("addr_0", "xc0", "yc0", "0");
c += " " + src_tensor.GetAddressWHS("addr_1", "xc1", "yc0", "0");
c += " " + src_tensor.GetAddressWHS("addr_2", "xc0", "yc1", "0");
c += " " + src_tensor.GetAddressWHS("addr_3", "xc1", "yc1", "0");
c += " int dz = " + layer_offset + ";\n";
} else { // TensorStorageType::IMAGE_BUFFER
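// Out-of-bounds taps get address -1, relying on out-of-range IMAGE_BUFFER reads
// returning zero; their per-slice stride dz_* is zeroed so the address stays
// out of range on every slice iteration.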
c += " " + src_tensor.GetAddressWHS("addr_0", "SRC_X", "SRC_Y", "0");
c += " " + src_tensor.GetAddressWHS("addr_1", next_x, "SRC_Y", "0");
c += " " + src_tensor.GetAddressWHS("addr_2", "SRC_X", "SRC_Y + 1", "0");
c += " " + src_tensor.GetAddressWHS("addr_3", next_x, "SRC_Y + 1", "0");
c += " addr_0 = select(-1, addr_0, (in_x0 && in_y0));\n";
c += " addr_1 = select(-1, addr_1, (in_x1 && in_y0));\n";
c += " addr_2 = select(-1, addr_2, (in_x0 && in_y1));\n";
c += " addr_3 = select(-1, addr_3, (in_x1 && in_y1));\n";
c += " int dz_0 = select(0, " + layer_offset + ", (in_x0 && in_y0));\n";
c += " int dz_1 = select(0, " + layer_offset + ", (in_x1 && in_y0));\n";
c += " int dz_2 = select(0, " + layer_offset + ", (in_x0 && in_y1));\n";
c += " int dz_3 = select(0, " + layer_offset + ", (in_x1 && in_y1));\n";
}
}
auto read_src = [&](int x, int y) {
if (manual_clamp) {
const std::string id = std::to_string(y * 2 + x);
const std::string addr = "addr_" + std::to_string(y * 2 + x);
if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) {
return src_tensor.Read(addr) + "; " + addr + " += dz_" + id + ";\n";
} else {
return src_tensor.Read(addr) + " * (FLT)(in_x" + std::to_string(x) +
" && in_y" + std::to_string(y) + "); " + addr + " += dz;\n";
}
} else {
return src_tensor.ReadWHS(
"SRC_X + " + std::to_string(x) + "*" + pixel_stride,
"SRC_Y + " + std::to_string(y), "s",
TextureAddressMode::ZERO) +
";\n";
}
};
const int padding_x_rem = abs(padding.x) % 2;
const int padding_y_rem = abs(padding.y) % 2;
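// Each pair is {output register, source pixel}: which src value feeds which of the
// four output pixels at kernel tap i. The mapping depends only on the parity of the
// padding; RearrangeWeightsData() orders the weights to match.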
std::vector<std::pair<int, int>> permutation;
if (padding_x_rem == 1 && padding_y_rem == 1) {
permutation = {{0, 0}, {1, 0}, {1, 1}, {2, 0}, {2, 2},
{3, 0}, {3, 1}, {3, 2}, {3, 3}};
} else if (padding_x_rem == 0 && padding_y_rem == 1) {
permutation = {{0, 0}, {0, 1}, {1, 1}, {2, 0}, {2, 1},
{2, 2}, {2, 3}, {3, 1}, {3, 3}};
} else if (padding_x_rem == 1 && padding_y_rem == 0) {
permutation = {{0, 0}, {0, 2}, {1, 0}, {1, 1}, {1, 2},
{1, 3}, {2, 2}, {3, 2}, {3, 3}};
} else { // padding_x_rem == 0 && padding_y_rem == 0
permutation = {{0, 0}, {0, 1}, {0, 2}, {0, 3}, {1, 1},
{1, 3}, {2, 2}, {2, 3}, {3, 3}};
}
c += " for (int s = 0; s < src_size.z; ++s) {\n";
if (need_local_mem) {
c += " barrier(CLK_LOCAL_MEM_FENCE);\n";
}
if (weights_upload_type ==
ConvolutionTransposed3x3::WeightsUploadType::LOCAL_MEM_ASYNC) {
c += " async_work_group_copy(weights_cache, filters + f_offset, 36, "
"0);\n";
} else if (weights_upload_type ==
ConvolutionTransposed3x3::WeightsUploadType::
LOCAL_MEM_BY_THREADS) {
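// The 8x4 work group has 32 threads; each loads one of the 36 FLT4 weights,
// and the first four threads load the remainder.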
c += " weights_cache[local_id] = filters[f_offset + local_id];\n";
c += " if (local_id < 4) {\n";
c += " weights_cache[local_id + 32] = filters[f_offset + local_id + "
"32];\n";
c += " };\n";
} else { // GLOBAL_MEM/CONSTANT_MEM
c +=
" " + weights_space + " FLT4* weights_cache = filters + f_offset;\n";
}
c += " FLT4 src0 = " + read_src(0, 0);
c += " FLT4 src1 = " + read_src(1, 0);
c += " FLT4 src2 = " + read_src(0, 1);
c += " FLT4 src3 = " + read_src(1, 1);
c += " f_offset += 36;\n";
if (need_local_mem) {
c += " barrier(CLK_LOCAL_MEM_FENCE);\n";
}
for (int i = 0; i < 9; ++i) {
const std::string r_name = "r" + std::to_string(permutation[i].first);
const std::string s_name = "src" + std::to_string(permutation[i].second);
const std::string w_name = std::to_string(i * 4);
c += " CONV(" + r_name + ", " + s_name + ", " + w_name + ");\n";
}
c += " }\n";
if (need_local_mem) {
c += " if (DST_X >= dst_size.x || DST_Y >= dst_size.y || Z >= dst_size.z) "
"return;\n";
}
c += " FLT4 bias_val = READ_IMAGE(biases, (int2)(Z, 0));\n";
for (int y = 0; y < 2; ++y) {
for (int x = 0; x < 2; ++x) {
const std::string s_x = std::to_string(x);
const std::string s_y = std::to_string(y);
const std::string id = std::to_string(y * 2 + x);
const std::string x_c = "DST_X + " + s_x + " * " + pixel_stride;
const std::string y_c = "DST_Y + " + s_y;
c += " if (" + x_c + " < dst_size.x && " + y_c + " < dst_size.y) {\n";
c += " FLT4 res0 = TO_FLT4(r" + id + ") + bias_val;\n";
const LinkingContext context{"res0", x_c, y_c, "Z"};
c += PostProcess(linked_operations, context);
c += " " + dst_tensor.WriteWHS("res0", x_c, y_c, "Z");
c += " }\n";
}
}
c += "}\n";
return c;
}
} // namespace
ConvolutionTransposed3x3::ConvolutionTransposed3x3(
const OperationDef& definition, const CLDevice& device, int2 padding)
: GPUOperation(definition),
padding_(padding),
work_group_launch_order_(2, 0, 1) {
if (device.IsPowerVR()) {
weights_upload_type_ = WeightsUploadType::LOCAL_MEM_ASYNC;
} else if (device.IsNvidia()) {
weights_upload_type_ = WeightsUploadType::LOCAL_MEM_BY_THREADS;
} else if (device.IsAMD()) {
weights_upload_type_ = WeightsUploadType::CONSTANT_MEM;
} else {
weights_upload_type_ = WeightsUploadType::GLOBAL_MEM;
}
}
ConvolutionTransposed3x3::ConvolutionTransposed3x3(
ConvolutionTransposed3x3&& operation)
: GPUOperation(std::move(operation)),
padding_(operation.padding_),
work_group_launch_order_(operation.work_group_launch_order_),
weights_(std::move(operation.weights_)),
weights_upload_type_(operation.weights_upload_type_),
biases_(std::move(operation.biases_)),
kernel_(std::move(operation.kernel_)),
work_group_size_(operation.work_group_size_) {}
ConvolutionTransposed3x3& ConvolutionTransposed3x3::operator=(
ConvolutionTransposed3x3&& operation) {
if (this != &operation) {
std::swap(padding_, operation.padding_);
std::swap(work_group_launch_order_, operation.work_group_launch_order_);
weights_ = std::move(operation.weights_);
std::swap(weights_upload_type_, operation.weights_upload_type_);
biases_ = std::move(operation.biases_);
kernel_ = std::move(operation.kernel_);
std::swap(work_group_size_, operation.work_group_size_);
GPUOperation::operator=(std::move(operation));
}
return *this;
}
Status ConvolutionTransposed3x3::Compile(
const CreationContext& creation_context) {
const auto code = GenerateConvolutionTransposedCode(
definition_, linked_operations_, weights_upload_type_, padding_,
work_group_launch_order_);
std::vector<CompilerOptions> options;
if (definition_.precision == CalculationsPrecision::F16 &&
creation_context.device->IsPowerVR()) {
options.push_back(CompilerOptions::POWERVR_FP16);
}
RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
code, "main_function", options, *creation_context.context,
*creation_context.device, &kernel_));
return OkStatus();
}
Status ConvolutionTransposed3x3::BindArguments() {
kernel_.ResetBindingCounter();
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_.GetMemoryPtr()));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(biases_.GetMemoryPtr()));
RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWBatchedHSB()));
RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWBatchedHSB()));
const int filters_offset = 4 * 9 * src_[0]->Slices();
RETURN_IF_ERROR(kernel_.SetBytesAuto(filters_offset));
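// Fold the prepended padding into a source-coordinate offset: both branches compute
// floor((padding - 1) / 2) under truncating integer division. The parity of the
// padding is handled by the weight reordering and kernel-side permutation instead.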
const int padding_x =
padding_.x >= 1 ? (padding_.x - 1) / 2 : (padding_.x - 2) / 2;
const int padding_y =
padding_.y >= 1 ? (padding_.y - 1) / 2 : (padding_.y - 2) / 2;
RETURN_IF_ERROR(
kernel_.SetBytesAuto(int2(padding_x * src_[0]->Batch(), padding_y)));
return OkStatus();
}
int3 ConvolutionTransposed3x3::GetGridSize() const {
const int grid_x =
IntegralDivideRoundUp(dst_[0]->Width(), 2) * dst_[0]->Batch();
const int grid_y = IntegralDivideRoundUp(dst_[0]->Height(), 2);
const int grid_z = dst_[0]->Slices();
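// The grid is rounded up to whole work groups and permuted by
// work_group_launch_order_; GetGlobalID() in the generated code
// applies the inverse permutation.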
int3 wg;
wg.x = IntegralDivideRoundUp(grid_x, work_group_size_.x);
wg.y = IntegralDivideRoundUp(grid_y, work_group_size_.y);
wg.z = IntegralDivideRoundUp(grid_z, work_group_size_.z);
return int3(wg[work_group_launch_order_[0]] * work_group_size_.x,
wg[work_group_launch_order_[1]] * work_group_size_.y,
wg[work_group_launch_order_[2]] * work_group_size_.z);
}
Status ConvolutionTransposed3x3::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}
bool IsConvolutionTransposed3x3Supported(
const CLDevice& device, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr) {
return attr.weights.shape.w == 3 && attr.weights.shape.h == 3 &&
attr.stride.w == 2 && attr.stride.h == 2;
}
Status CreateConvolutionTransposed3x3(
const CreationContext& creation_context, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr,
ConvolutionTransposed3x3* result) {
if (!IsConvolutionTransposed3x3Supported(*creation_context.device, definition,
attr)) {
return InvalidArgumentError(
"ConvolutionTransposed3x3 doesn't support this attributes");
}
const int2 padding = int2(attr.padding.prepended.w, attr.padding.prepended.h);
*result =
ConvolutionTransposed3x3(definition, *creation_context.device, padding);
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
LinearStorageCreateInfo create_info;
create_info.storage_type = LinearStorageType::TEXTURE_2D;
create_info.data_type = definition.GetDataType();
create_info.aligned_size = attr.weights.shape.o;
RETURN_IF_ERROR(CreateLinearStorage(
create_info, attr.bias, creation_context.context, &result->biases_));
return OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite

tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.h
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONVOLUTION_TRANSPOSED_3X3_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONVOLUTION_TRANSPOSED_3X3_H_
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/buffer.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace cl {
class ConvolutionTransposed3x3 : public GPUOperation {
public:
ConvolutionTransposed3x3() = default;
Status AddToQueue(CLCommandQueue* queue) override;
Status Compile(const CreationContext& creation_context) override;
// Move only
ConvolutionTransposed3x3(ConvolutionTransposed3x3&& operation);
ConvolutionTransposed3x3& operator=(ConvolutionTransposed3x3&& operation);
ConvolutionTransposed3x3(const ConvolutionTransposed3x3&) = delete;
ConvolutionTransposed3x3& operator=(const ConvolutionTransposed3x3&) = delete;
enum class WeightsUploadType {
LOCAL_MEM_ASYNC,
LOCAL_MEM_BY_THREADS,
GLOBAL_MEM,
CONSTANT_MEM,
};
private:
ConvolutionTransposed3x3(const OperationDef& definition,
const CLDevice& device, int2 padding);
friend Status CreateConvolutionTransposed3x3(
const CreationContext& creation_context, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr,
ConvolutionTransposed3x3* result);
template <DataType T>
Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);
template <DataType S, typename T>
void RearrangeWeightsData(const ::tflite::gpu::Tensor<OHWI, S>& weights,
absl::Span<T> dst);
Status BindArguments();
int3 GetGridSize() const;
int2 padding_;
int3 work_group_launch_order_;
Buffer weights_;
WeightsUploadType weights_upload_type_;
LinearStorage biases_;
CLKernel kernel_;
int3 work_group_size_ = int3(8, 4, 1);
};
template <DataType T>
Status ConvolutionTransposed3x3::UploadWeights(
const ::tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int kernel_x = 3; // This operation supports only 3x3 kernels.
const int kernel_y = 3;
const int flt4_count = kernel_x * kernel_y * src_depth * dst_depth * 4;
const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
const int flt4_size = f32_weights ? sizeof(float4) : sizeof(half4);
if (f32_weights) {
std::vector<float4> gpu_data(flt4_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
return CreateReadOnlyBuffer(flt4_size * flt4_count, gpu_data.data(),
context, &weights_);
} else {
std::vector<half4> gpu_data(flt4_count);
RearrangeWeightsData(weights, absl::MakeSpan(gpu_data));
return CreateReadOnlyBuffer(flt4_size * flt4_count, gpu_data.data(),
context, &weights_);
}
}
template <DataType S, typename T>
void ConvolutionTransposed3x3::RearrangeWeightsData(
const ::tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4);
const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
const int kernel_x = 3;
const int kernel_y = 3;
const int padding_x_rem = abs(padding_.x) % 2;
const int padding_y_rem = abs(padding_.y) % 2;
// Reorganize the weights so the generated kernel reads them sequentially: remap gives
// the tap order matching the kernel-side permutation for this padding parity.
std::vector<int> remap;
if (padding_x_rem == 1 && padding_y_rem == 1) {
remap = {4, 5, 3, 7, 1, 8, 6, 2, 0};
} else if (padding_x_rem == 0 && padding_y_rem == 1) {
remap = {5, 3, 4, 8, 6, 2, 0, 7, 1};
} else if (padding_x_rem == 1 && padding_y_rem == 0) {
remap = {7, 1, 8, 6, 2, 0, 4, 5, 3};
} else { // padding_x_rem == 0 && padding_y_rem == 0
remap = {8, 6, 2, 0, 7, 1, 5, 3, 4};
}
int counter = 0;
for (int d = 0; d < dst_depth; ++d) {
for (int s = 0; s < src_depth; ++s) {
for (int y = 0; y < kernel_y; ++y) {
for (int x = 0; x < kernel_x; ++x) {
const int kernel_index = remap[y * kernel_x + x];
const int kernel_index_x = kernel_index % kernel_x;
const int kernel_index_y = kernel_index / kernel_x;
T filters[4];
for (int j = 0; j < 4; ++j) {
for (int i = 0; i < 4; ++i) {
const int s_ch = s * 4 + i;
const int d_ch = d * 4 + j;
if (s_ch < weights.shape.i && d_ch < weights.shape.o) {
const int f_index = weights.shape.LinearIndex(
{d_ch, kernel_index_y, kernel_index_x, s_ch});
filters[i][j] = weights.data[f_index];
} else {
filters[i][j] = 0.0f;
}
}
}
dst[counter++] = filters[0];
dst[counter++] = filters[1];
dst[counter++] = filters[2];
dst[counter++] = filters[3];
}
}
}
}
}
bool IsConvolutionTransposed3x3Supported(
const CLDevice& device, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr);
Status CreateConvolutionTransposed3x3(
const CreationContext& creation_context, const OperationDef& definition,
const ConvolutionTransposedAttributes& attr,
ConvolutionTransposed3x3* result);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONVOLUTION_TRANSPOSED_3X3_H_

tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_test.cc
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, ConvolutionTransposed3x3) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 1);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f};
ConvolutionTransposedAttributes attr;
attr.padding.prepended = HW(1, 1);
attr.padding.appended = HW(0, 0);
attr.stride = HW(2, 2);
attr.weights.shape = OHWI(1, 3, 3, 1);
attr.weights.data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
attr.bias.shape = Linear(1);
attr.bias.data = {0.0f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvolutionTransposed3x3 operation;
ASSERT_OK(CreateConvolutionTransposed3x3(creation_context_, op_def, attr,
&operation));
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 4, 4, 1), &dst_tensor));
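// With all-ones 3x3 weights and stride 2, each output pixel sums the source pixels
// whose scattered 3x3 footprint covers it, e.g. dst(1, 1) = 0 + 1 + 2 + 3 = 6.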
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps),
{0.0f, 1.0f, 1.0f, 1.0f, 2.0f, 6.0f, 4.0f, 4.0f,
2.0f, 5.0f, 3.0f, 3.0f, 2.0f, 5.0f, 3.0f, 3.0f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite
tensorflow/lite/delegates/gpu/cl/selectors/BUILD
@@ -32,6 +32,7 @@ cc_library(
deps = [
"//tensorflow/lite/delegates/gpu/cl:tensor_type",
"//tensorflow/lite/delegates/gpu/cl/kernels:convolution_transposed",
"//tensorflow/lite/delegates/gpu/cl/kernels:convolution_transposed_3x3",
"//tensorflow/lite/delegates/gpu/cl/kernels:convolution_transposed_3x3_thin",
"//tensorflow/lite/delegates/gpu/cl/kernels:convolution_transposed_4x4",
"//tensorflow/lite/delegates/gpu/cl/kernels:convolution_transposed_thin",
......
tensorflow/lite/delegates/gpu/cl/selectors/convolution_transposed_selector.cc
@@ -17,6 +17,7 @@ limitations under the License.
#include "absl/memory/memory.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_4x4.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_thin.h"
@@ -66,6 +67,12 @@ Status SelectConvolutionTransposedPowerVR(
RETURN_IF_ERROR(CreateConvolutionTransposed3x3Thin(creation_context, op_def,
attr, &conv));
*ptr = absl::make_unique<ConvolutionTransposed3x3Thin>(std::move(conv));
} else if (IsConvolutionTransposed3x3Supported(*creation_context.device,
op_def, attr)) {
ConvolutionTransposed3x3 conv;
RETURN_IF_ERROR(
CreateConvolutionTransposed3x3(creation_context, op_def, attr, &conv));
*ptr = absl::make_unique<ConvolutionTransposed3x3>(std::move(conv));
} else if (IsConvolutionTransposed4x4Supported(*creation_context.device,
op_def, attr)) {
ConvolutionTransposed4x4 conv;
......