Commit a49117d4 authored by Raman Sarokin, committed by TensorFlower Gardener

Using default Compile for all ops.

PiperOrigin-RevId: 324316456
Change-Id: I1a9c31a6893174798420de07bc57811db013c2fe
Parent 5313d56b
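This commit moves per-operation OpenCL code generation out of each op's Compile() override and into its constructor (or a GenerateCode() helper), so every op can rely on the shared GPUOperation::Compile(). A minimal sketch of the resulting subclass contract, using only members visible in this diff (the generator name is a placeholder, not upstream code):

// Sketch of the new contract: a subclass fills code_, compiler_options_,
// and (if needed) work_group_size_ up front and drops its Compile() override.
class SomeConvOp : public GPUOperation {
 public:
  SomeConvOp(const OperationDef& definition, const DeviceInfo& device_info)
      : GPUOperation(definition) {
    // GenerateSomeConvCode stands in for the per-op code generator.
    code_ = GenerateSomeConvCode(definition_, device_info);
    if (definition_.precision == CalculationsPrecision::F16 &&
        device_info.IsPowerVR()) {
      compiler_options_.push_back(CompilerOptions::POWERVR_FP16);
    }
  }
  absl::Status BindArguments() override;
  int3 GetGridSize() const override;
};

The base GPUOperation::Compile() (see the hunk near the end of this diff) then merges linked elementwise code into code_, transforms it to CL, and builds the kernel with compiler_options_.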
......@@ -175,7 +175,15 @@ Conv3D::Conv3D(const OperationDef& definition,
kernel_size_(attr.weights.shape.w, attr.weights.shape.h,
attr.weights.shape.d),
dilation_(attr.dilations.w, attr.dilations.h, attr.dilations.d),
conv_params_(GuessBestParams(device, definition, attr)) {}
conv_params_(GuessBestParams(device, definition, attr)) {
const bool stride_correction =
definition_.IsBatchSupported() && stride_.x != 1;
code_ = GenerateConv3D(definition_, stride_correction, conv_params_);
if (definition_.precision == CalculationsPrecision::F16 &&
device.IsPowerVR()) {
compiler_options_.push_back(CompilerOptions::POWERVR_FP16);
}
}
Conv3D::Conv3D(Conv3D&& operation)
: GPUOperation(std::move(operation)),
......@@ -197,29 +205,6 @@ Conv3D& Conv3D::operator=(Conv3D&& operation) {
return *this;
}
absl::Status Conv3D::Compile(const CreationContext& creation_context) {
const bool stride_correction =
definition_.IsBatchSupported() && stride_.x != 1;
std::string code =
GenerateConv3D(definition_, stride_correction, conv_params_);
work_group_size_ = conv_params_.work_group_size;
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
std::vector<CompilerOptions> options;
if (definition_.precision == CalculationsPrecision::F16 &&
creation_context.device->IsPowerVR()) {
options.push_back(CompilerOptions::POWERVR_FP16);
}
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", options, *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status Conv3D::BindArguments() {
if (!conv_params_.x_kernel_is_1) {
RETURN_IF_ERROR(args_.SetInt("stride_x", stride_.x));
......
......@@ -40,7 +40,6 @@ class Conv3D : public GPUOperation {
public:
Conv3D() = default;
absl::Status Tune(const TuningParameters& params) override;
absl::Status Compile(const CreationContext& creation_context) override;
absl::Status BindArguments() override;
int3 GetGridSize() const override;
......
......@@ -151,7 +151,10 @@ ConvBuffer1x1::ConvParams GetBestParams(const CLDevice& device,
ConvBuffer1x1::ConvBuffer1x1(const OperationDef& definition,
const ConvParams& conv_params)
: GPUOperation(definition), conv_params_(conv_params) {}
: GPUOperation(definition), conv_params_(conv_params) {
code_ = GenerateConvBuffer1x1(definition_, conv_params_, &args_);
work_group_size_ = conv_params_.work_group_size;
}
ConvBuffer1x1::ConvBuffer1x1(ConvBuffer1x1&& operation)
: GPUOperation(std::move(operation)),
......@@ -300,21 +303,6 @@ std::string ConvBuffer1x1::GenerateConvBuffer1x1(
return c;
}
absl::Status ConvBuffer1x1::Compile(const CreationContext& creation_context) {
std::string code = GenerateConvBuffer1x1(definition_, conv_params_, &args_);
work_group_size_ = conv_params_.work_group_size;
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_));
return absl::OkStatus();
}
int3 ConvBuffer1x1::GetGridSize() const {
const int dst_width_elements = DivideRoundUp(
dst_[0]->Width() * dst_[0]->Batch(), (conv_params_.element_size / 4));
......
......@@ -48,7 +48,6 @@ class ConvBuffer1x1 : public GPUOperation {
ConvBuffer1x1& operator=(const ConvBuffer1x1&) = delete;
absl::Status Tune(const TuningParameters& params) override;
absl::Status Compile(const CreationContext& creation_context) override;
int3 GetGridSize() const override;
ConvWeightsDescription GetConvWeightsDescription() const {
......
......@@ -47,6 +47,32 @@ int GetOptimalMaxConstantSize(const DeviceInfo& info) {
}
} // namespace
ConvConstants::ConvConstants(const OperationDef& definition,
const Convolution2DAttributes& attr,
const DeviceInfo& device_info)
: GPUOperation(definition),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h),
stride_(attr.strides.w, attr.strides.h),
padding_(-attr.padding.prepended.w, -attr.padding.prepended.h),
dilation_(attr.dilations.w, attr.dilations.h),
src_channels_(attr.weights.shape.i),
dst_channels_(attr.weights.shape.o) {
const bool stride_correction =
definition_.IsBatchSupported() && stride_.x != 1;
code_ = GenerateConvolutionConstantCode(definition_, kernel_size_,
src_channels_, dst_channels_,
stride_correction, device_info);
if (definition_.precision == CalculationsPrecision::F16 &&
device_info.IsAdreno3xx()) {
compiler_options_.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
}
if (definition_.precision != CalculationsPrecision::F32 &&
device_info.IsPowerVR()) {
// BUG, some PowerVRs (GE8320) produce incorrect result without it
compiler_options_.push_back(CompilerOptions::CL_OPT_DISABLE);
}
}
ConvConstants::ConvConstants(ConvConstants&& kernel)
: GPUOperation(std::move(kernel)),
kernel_size_(kernel.kernel_size_),
......@@ -71,9 +97,9 @@ ConvConstants& ConvConstants::operator=(ConvConstants&& kernel) {
std::string ConvConstants::GenerateConvolutionConstantCode(
const OperationDef& op_def, const int2& kernel_size, int src_channels,
int dst_channels, bool stride_correction, const CLDevice& device) {
int dst_channels, bool stride_correction, const DeviceInfo& device_info) {
auto src_desc = op_def.src_tensors[0];
src_desc.SetTextureAddressMode(TextureAddressMode::ZERO);
src_desc.SetTextureAddressMode(GetFastestZeroMode(device_info));
if (op_def.IsBatchSupported()) {
src_desc.SetStateVar("BatchedWidth", "true");
}
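A recurring edit in this commit is replacing const CLDevice& with const DeviceInfo& in code-generation signatures, so generators depend on a plain value type describing the GPU rather than a live CL device handle. The shape implied by the calls in this diff (a sketch; only the members actually exercised here are listed):

// Assumed outline of DeviceInfo, inferred from its uses in this diff;
// obtained via CLDevice::GetInfo(), as in creation_context.device->GetInfo().
struct DeviceInfo {
  OpenCLVersion cl_version;
  bool IsAdreno() const;
  bool IsAdreno3xx() const;
  bool IsAdreno4xx() const;
  bool IsPowerVR() const;
  bool IsMali() const;
  bool IsIntel() const;
};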
......@@ -214,33 +240,6 @@ std::string ConvConstants::GenerateConvolutionConstantCode(
return c;
}
absl::Status ConvConstants::Compile(const CreationContext& creation_context) {
const bool stride_correction =
definition_.IsBatchSupported() && stride_.x != 1;
std::string code = GenerateConvolutionConstantCode(
definition_, kernel_size_, src_channels_, dst_channels_,
stride_correction, *creation_context.device);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
std::vector<CompilerOptions> options;
if (definition_.precision == CalculationsPrecision::F16 &&
creation_context.device->IsAdreno3xx()) {
options.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
}
if (definition_.precision != CalculationsPrecision::F32 &&
creation_context.device->IsPowerVR()) {
// BUG, some PowerVRs (GE8320) produce incorrect result without it
options.push_back(CompilerOptions::CL_OPT_DISABLE);
}
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", options, *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status ConvConstants::BindArguments() {
RETURN_IF_ERROR(args_.SetInt("stride_x", stride_.x));
RETURN_IF_ERROR(args_.SetInt("stride_y", stride_.y));
......@@ -284,7 +283,7 @@ absl::Status CreateConvConstants(const CreationContext& creation_context,
if (!IsConvConstantsSupported(*creation_context.device, definition, attr)) {
return absl::InvalidArgumentError("ConvConstants doesn't supported");
}
*result = ConvConstants(definition, attr);
*result = ConvConstants(definition, attr, creation_context.device->GetInfo());
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
......
......@@ -35,8 +35,6 @@ namespace cl {
class ConvConstants : public GPUOperation {
public:
ConvConstants() = default;
absl::Status Compile(const CreationContext& creation_context) override;
absl::Status BindArguments() override;
int3 GetGridSize() const override;
......@@ -50,15 +48,9 @@ class ConvConstants : public GPUOperation {
friend absl::Status CreateConvConstants(
const CreationContext& creation_context, const OperationDef& definition,
const Convolution2DAttributes& attr, ConvConstants* result);
explicit ConvConstants(const OperationDef& definition,
const Convolution2DAttributes& attr)
: GPUOperation(definition),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h),
stride_(attr.strides.w, attr.strides.h),
padding_(-attr.padding.prepended.w, -attr.padding.prepended.h),
dilation_(attr.dilations.w, attr.dilations.h),
src_channels_(attr.weights.shape.i),
dst_channels_(attr.weights.shape.o) {}
ConvConstants(const OperationDef& definition,
const Convolution2DAttributes& attr,
const DeviceInfo& device_info);
template <DataType T>
absl::Status UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
......@@ -70,7 +62,7 @@ class ConvConstants : public GPUOperation {
std::string GenerateConvolutionConstantCode(
const OperationDef& op_def, const int2& kernel_size, int src_channels,
int dst_channels, bool stride_correction, const CLDevice& device);
int dst_channels, bool stride_correction, const DeviceInfo& device_info);
int2 kernel_size_;
int2 stride_;
......
......@@ -179,29 +179,19 @@ ConvPowerVR& ConvPowerVR::operator=(ConvPowerVR&& operation) {
return *this;
}
absl::Status ConvPowerVR::Compile(const CreationContext& creation_context) {
void ConvPowerVR::GenerateCode(const DeviceInfo& device_info) {
const bool stride_correction =
definition_.IsBatchSupported() && stride_padding_.x != 1;
std::string code = GenerateConv(*creation_context.device, definition_,
stride_correction, conv_params_);
code_ =
GenerateConv(device_info, definition_, stride_correction, conv_params_);
work_group_size_ = conv_params_.work_group_size;
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
std::vector<CompilerOptions> options;
if (definition_.precision == CalculationsPrecision::F16 &&
creation_context.device->IsPowerVR()) {
options.push_back(CompilerOptions::POWERVR_FP16);
device_info.IsPowerVR()) {
compiler_options_.push_back(CompilerOptions::POWERVR_FP16);
}
if (conv_params_.IsPrivateMemBroadcast()) {
options.push_back(CompilerOptions::CL_2_0);
compiler_options_.push_back(CompilerOptions::CL_2_0);
}
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", options, *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status ConvPowerVR::BindArguments() {
......@@ -274,11 +264,12 @@ absl::Status ConvPowerVR::Tune(const TuningParameters& params) {
return absl::OkStatus();
}
std::string ConvPowerVR::GenerateConv(
const CLDevice& device, const OperationDef& op_def, bool stride_correction,
const ConvPowerVR::ConvParams& conv_params) {
std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
const OperationDef& op_def,
bool stride_correction,
const ConvParams& conv_params) {
auto src_desc = op_def.src_tensors[0];
src_desc.SetTextureAddressMode(GetFastestZeroMode(device));
src_desc.SetTextureAddressMode(GetFastestZeroMode(device_info));
if (op_def.IsBatchSupported()) {
src_desc.SetStateVar("BatchedWidth", "true");
}
......@@ -350,7 +341,7 @@ std::string ConvPowerVR::GenerateConv(
std::string c = GetCommonDefines(op_def.precision);
if (use_simd_broadcast) {
if (device.cl_version() == OpenCLVersion::CL_2_0) {
if (device_info.cl_version == OpenCLVersion::CL_2_0) {
c += "#pragma OPENCL EXTENSION cl_khr_subgroups : enable\n";
}
}
......@@ -363,7 +354,7 @@ std::string ConvPowerVR::GenerateConv(
std::to_string(work_group_size.y) + ", " +
std::to_string(work_group_size.z) + ")))\n";
}
if (use_simd_broadcast && device.IsIntel()) {
if (use_simd_broadcast && device_info.IsIntel()) {
c += "__attribute__((intel_reqd_sub_group_size(" +
std::to_string(simd_size) + ")))\n";
}
......@@ -498,7 +489,7 @@ std::string ConvPowerVR::GenerateConv(
}
}
};
const bool conditional_read = device.IsMali();
const bool conditional_read = device_info.IsMali();
auto read_src = [&]() {
const std::string cl_type = ToCLDataType(conv_params.weights_data_type);
for (int y = 0; y < block_size.y; ++y) {
......@@ -1004,6 +995,7 @@ absl::Status CreateConvPowerVR(const CreationContext& creation_context,
const Convolution2DAttributes& attr,
ConvPowerVR* result, const BHWC* dst_shape) {
*result = ConvPowerVR(definition, attr, *creation_context.device, dst_shape);
result->GenerateCode(creation_context.device->GetInfo());
return result->UploadData(attr.weights, attr.bias, creation_context.context);
}
......@@ -1012,6 +1004,7 @@ absl::Status CreateConvPowerVR(const CreationContext& creation_context,
const FullyConnectedAttributes& attr,
ConvPowerVR* result, const BHWC* dst_shape) {
*result = ConvPowerVR(definition, attr, *creation_context.device, dst_shape);
result->GenerateCode(creation_context.device->GetInfo());
return result->UploadData(attr.weights, attr.bias, creation_context.context);
}
......@@ -1021,6 +1014,7 @@ absl::Status CreateConvPowerVRDynamicWeights(
ConvPowerVR* result, const BHWC* dst_shape) {
*result = ConvPowerVR(definition, attr, weights_shape,
*creation_context.device, dst_shape);
result->GenerateCode(creation_context.device->GetInfo());
return result->UploadBias(attr.bias, creation_context.context);
}
......@@ -1031,6 +1025,7 @@ absl::Status CreateConvPowerVRWino4x4To6x6(
*result = ConvPowerVR(definition);
result->conv_params_ = result->GuessBestParamsWinograd(
*creation_context.device, definition, attr, dst_shape);
result->GenerateCode(creation_context.device->GetInfo());
return result->UploadDataForWinograd4x4To6x6(
attr.weights, *creation_context.device, creation_context.context);
}
......
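For ConvPowerVR (and ConvTexture below), code generation is exposed as GenerateCode(device_info) and invoked by the factory functions rather than the constructor, because some variants finish configuring the op after construction: the Winograd factories set conv_params_, block_size_, and different_weights_for_height_ first and generate code last. Client code is unchanged apart from compilation now going through the shared base class; a hedged usage sketch, with creation_context, op_def, and attr assumed set up as in the tests:

// Usage sketch: code is generated at creation time; Compile() is the
// default GPUOperation::Compile(), no per-op override remains.
ConvPowerVR op;
RETURN_IF_ERROR(CreateConvPowerVR(creation_context, op_def, attr, &op));
RETURN_IF_ERROR(op.Compile(creation_context));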
......@@ -42,7 +42,6 @@ class ConvPowerVR : public GPUOperation {
public:
ConvPowerVR() = default;
absl::Status Tune(const TuningParameters& params) override;
absl::Status Compile(const CreationContext& creation_context) override;
absl::Status BindArguments() override;
int3 GetGridSize() const override;
......@@ -137,6 +136,8 @@ class ConvPowerVR : public GPUOperation {
const BHWC* dst_shape = nullptr);
explicit ConvPowerVR(const OperationDef& definition);
void GenerateCode(const DeviceInfo& device_info);
template <DataType T>
absl::Status UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases,
......@@ -176,12 +177,6 @@ class ConvPowerVR : public GPUOperation {
const Convolution2DAttributes& attr, ConvPowerVR* result,
const BHWC* dst_shape);
friend std::string GenerateConv(const CLDevice& device,
const OperationDef& op_def,
bool stride_correction,
const ConvParams& conv_params,
Arguments* args);
ConvParams GuessBestParams(const CLDevice& device,
const OperationDef& definition,
const Convolution2DAttributes& attr,
......@@ -206,9 +201,9 @@ class ConvPowerVR : public GPUOperation {
bool different_weights_for_height,
const BHWC* dst_shape = nullptr) const;
std::string GenerateConv(const CLDevice& device, const OperationDef& op_def,
bool stride_correction,
const ConvPowerVR::ConvParams& conv_params);
std::string GenerateConv(const DeviceInfo& device_info,
const OperationDef& op_def, bool stride_correction,
const ConvParams& conv_params);
int4 stride_padding_;
int4 kernel_dilation_;
......
......@@ -30,9 +30,9 @@ namespace tflite {
namespace gpu {
namespace cl {
namespace {
bool UseFP16SIMD(const CLDevice& device, CalculationsPrecision precision,
bool UseFP16SIMD(const DeviceInfo& device_info, CalculationsPrecision precision,
bool kernel1x1) {
if (!device.IsAdreno()) {
if (!device_info.IsAdreno()) {
return false;
}
switch (precision) {
......@@ -40,7 +40,7 @@ bool UseFP16SIMD(const CLDevice& device, CalculationsPrecision precision,
case CalculationsPrecision::F32_F16:
return false;
case CalculationsPrecision::F16:
return device.IsAdreno3xx() && kernel1x1;
return device_info.IsAdreno3xx() && kernel1x1;
}
}
} // namespace
......@@ -96,9 +96,9 @@ std::string ConvTexture::GenerateConvCode(const OperationDef& op_def,
bool adreno4xx_optimization,
bool stride_correction,
bool different_weights_for_height,
const CLDevice& device) {
const DeviceInfo& device_info) {
auto src_desc = op_def.src_tensors[0];
src_desc.SetTextureAddressMode(GetFastestZeroMode(device));
src_desc.SetTextureAddressMode(GetFastestZeroMode(device_info));
if (op_def.IsBatchSupported()) {
src_desc.SetStateVar("BatchedWidth", "true");
}
......@@ -380,33 +380,23 @@ std::string ConvTexture::GenerateConvCode(const OperationDef& op_def,
return c;
}
absl::Status ConvTexture::Compile(const CreationContext& creation_context) {
void ConvTexture::GenerateCode(const DeviceInfo& device_info) {
auto storage_type = definition_.GetPrimaryStorageType();
bool is1x1 = kernel_size_.x == 1 && kernel_size_.y == 1;
bool adreno4xx_optimization =
stride_.x == 1 && stride_.y == 1 && padding_.x == 0 && padding_.y == 0 &&
creation_context.device->IsAdreno4xx() &&
device_info.IsAdreno4xx() &&
storage_type == TensorStorageType::TEXTURE_ARRAY &&
definition_.precision == CalculationsPrecision::F16;
const bool stride_correction =
definition_.IsBatchSupported() && stride_.x != 1;
std::string code =
GenerateConvCode(definition_, block_size_, is1x1, adreno4xx_optimization,
stride_correction, different_weights_for_height_,
*creation_context.device);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
std::vector<CompilerOptions> options;
if (UseFP16SIMD(*creation_context.device, definition_.precision, is1x1)) {
options.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
code_ = GenerateConvCode(definition_, block_size_, is1x1,
adreno4xx_optimization, stride_correction,
different_weights_for_height_, device_info);
if (UseFP16SIMD(device_info, definition_.precision, is1x1)) {
compiler_options_.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
}
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", options, *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status ConvTexture::BindArguments() {
......@@ -441,6 +431,7 @@ absl::Status CreateConvTexture(const CreationContext& creation_context,
const Convolution2DAttributes& attr,
ConvTexture* result) {
*result = ConvTexture(definition, attr);
result->GenerateCode(creation_context.device->GetInfo());
return result->UploadData(attr.weights, attr.bias, creation_context.context);
}
......@@ -449,6 +440,7 @@ absl::Status CreateConvTexture(const CreationContext& creation_context,
const FullyConnectedAttributes& attr,
ConvTexture* result) {
*result = ConvTexture(definition);
result->GenerateCode(creation_context.device->GetInfo());
return result->UploadData(attr.weights, attr.bias, creation_context.context);
}
......@@ -458,6 +450,7 @@ absl::Status CreateConvTextureWino4x4To6x6(
*result = ConvTexture(definition);
result->different_weights_for_height_ = true;
result->block_size_ = {4, 1, 2};
result->GenerateCode(creation_context.device->GetInfo());
return result->UploadDataForWinograd4x4To6x6(
attr.weights, *creation_context.device, creation_context.context);
}
......
......@@ -43,7 +43,6 @@ class ConvTexture : public GPUOperation {
public:
ConvTexture() = default;
absl::Status Tune(const TuningParameters& params) override;
absl::Status Compile(const CreationContext& creation_context) override;
absl::Status BindArguments() override;
int3 GetGridSize() const override;
......@@ -89,12 +88,14 @@ class ConvTexture : public GPUOperation {
absl::Span<T> dst_0, absl::Span<T> dst_1,
absl::Span<T> dst_2, absl::Span<T> dst_3);
void GenerateCode(const DeviceInfo& device_info);
std::string GenerateConvCode(const OperationDef& op_def,
const int3& block_size, bool is1x1,
bool adreno4xx_optimization,
bool stride_correction,
bool different_weights_for_height,
const CLDevice& device);
const DeviceInfo& device_info);
int2 kernel_size_;
int2 stride_;
......
......@@ -24,6 +24,13 @@ namespace tflite {
namespace gpu {
namespace cl {
ConverterToConvWeights::ConverterToConvWeights(
const OperationDef& definition,
const ConvWeightsDescription& conv_weights_desc)
: GPUOperation(definition), conv_weights_desc_(conv_weights_desc) {
code_ = GetConverterToConvWeightsCode(definition_, conv_weights_desc_);
}
ConverterToConvWeights::ConverterToConvWeights(
ConverterToConvWeights&& operation)
: GPUOperation(std::move(operation)),
......@@ -103,17 +110,6 @@ std::string ConverterToConvWeights::GetConverterToConvWeightsCode(
return c;
}
absl::Status ConverterToConvWeights::Compile(
const CreationContext& creation_context) {
std::string code =
GetConverterToConvWeightsCode(definition_, conv_weights_desc_);
RETURN_IF_ERROR(
args_.TransformToCLCode(creation_context.device->GetInfo(), {}, &code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status ConverterToConvWeights::BindArguments() {
float4 mask = GetMaskForLastPlane(src_[0]->Channels());
RETURN_IF_ERROR(args_.SetFloat("mask_x", mask.x));
......
......@@ -30,9 +30,7 @@ namespace cl {
class ConverterToConvWeights : public GPUOperation {
public:
ConverterToConvWeights(const OperationDef& definition,
const ConvWeightsDescription& conv_weights_desc)
: GPUOperation(definition), conv_weights_desc_(conv_weights_desc) {}
absl::Status Compile(const CreationContext& creation_context) override;
const ConvWeightsDescription& conv_weights_desc);
absl::Status BindArguments() override;
int3 GetGridSize() const override;
......
......@@ -54,6 +54,9 @@ ConvolutionTransposed::ConvolutionTransposed(
}
block_size_.z = 1;
}
code_ = GenerateConvolutionTransposedCode(definition_, device,
weights_are_buffer_, block_size_);
}
ConvolutionTransposed::ConvolutionTransposed(ConvolutionTransposed&& operation)
......@@ -331,24 +334,6 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode(
return c;
}
absl::Status ConvolutionTransposed::Compile(
const CreationContext& creation_context) {
std::string code = GenerateConvolutionTransposedCode(
definition_, *creation_context.device, weights_are_buffer_, block_size_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
std::vector<CompilerOptions> options;
// options.push_back(CompilerOptions::POWERVR_FP16);
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", options, *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status ConvolutionTransposed::BindArguments() {
RETURN_IF_ERROR(args_.SetInt("stride_x", stride_.x));
RETURN_IF_ERROR(args_.SetInt("stride_y", stride_.y));
......
......@@ -39,7 +39,6 @@ class ConvolutionTransposed : public GPUOperation {
public:
ConvolutionTransposed() = default;
absl::Status Tune(const TuningParameters& params) override;
absl::Status Compile(const CreationContext& creation_context) override;
absl::Status BindArguments() override;
int3 GetGridSize() const override;
......
......@@ -38,7 +38,19 @@ ConvolutionTransposed3D::ConvolutionTransposed3D(
stride_(attr.stride.w, attr.stride.h, attr.stride.d),
padding_(attr.padding.prepended.w, attr.padding.prepended.h,
attr.padding.prepended.d),
block_size_(2, 2, 1, 2) {}
block_size_(2, 2, 1, 2) {
code_ = GenerateConvolutionTransposed3DCode(definition_, device,
weights_are_buffer_, block_size_);
if (device.IsPowerVR() && block_size_.y != 1) {
bool is_texture3d = definition_.src_tensors[0].storage_type ==
TensorStorageType::TEXTURE_3D;
bool is_texture_array = definition_.src_tensors[0].storage_type ==
TensorStorageType::TEXTURE_ARRAY;
if (is_texture3d || is_texture_array) {
compiler_options_.push_back(CompilerOptions::CL_OPT_DISABLE);
}
}
}
ConvolutionTransposed3D::ConvolutionTransposed3D(
ConvolutionTransposed3D&& operation)
......@@ -356,32 +368,6 @@ std::string ConvolutionTransposed3D::GenerateConvolutionTransposed3DCode(
return c;
}
absl::Status ConvolutionTransposed3D::Compile(
const CreationContext& creation_context) {
std::string code = GenerateConvolutionTransposed3DCode(
definition_, *creation_context.device, weights_are_buffer_, block_size_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
std::vector<CompilerOptions> options;
if (creation_context.device->IsPowerVR() && block_size_.y != 1) {
bool is_texture3d = definition_.src_tensors[0].storage_type ==
TensorStorageType::TEXTURE_3D;
bool is_texture_array = definition_.src_tensors[0].storage_type ==
TensorStorageType::TEXTURE_ARRAY;
if (is_texture3d || is_texture_array) {
options.push_back(CompilerOptions::CL_OPT_DISABLE);
}
}
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", options, *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status ConvolutionTransposed3D::BindArguments() {
RETURN_IF_ERROR(args_.SetInt("stride_x", stride_.x));
RETURN_IF_ERROR(args_.SetInt("stride_y", stride_.y));
......
......@@ -39,7 +39,6 @@ class ConvolutionTransposed3D : public GPUOperation {
public:
ConvolutionTransposed3D() = default;
absl::Status Tune(const TuningParameters& params) override;
absl::Status Compile(const CreationContext& creation_context) override;
absl::Status BindArguments() override;
int3 GetGridSize() const override;
......
......@@ -42,6 +42,12 @@ ConvolutionTransposed3x3::ConvolutionTransposed3x3(
} else {
weights_upload_type_ = WeightsUploadType::GLOBAL_MEM;
}
code_ = GenerateConvolutionTransposedCode(definition_, weights_upload_type_,
padding_, work_group_launch_order_);
if (definition_.precision == CalculationsPrecision::F16 &&
device.IsPowerVR()) {
compiler_options_.push_back(CompilerOptions::POWERVR_FP16);
}
}
ConvolutionTransposed3x3::ConvolutionTransposed3x3(
......@@ -299,28 +305,6 @@ std::string ConvolutionTransposed3x3::GenerateConvolutionTransposedCode(
return c;
}
absl::Status ConvolutionTransposed3x3::Compile(
const CreationContext& creation_context) {
std::string code = GenerateConvolutionTransposedCode(
definition_, weights_upload_type_, padding_, work_group_launch_order_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
std::vector<CompilerOptions> options;
if (definition_.precision == CalculationsPrecision::F16 &&
creation_context.device->IsPowerVR()) {
options.push_back(CompilerOptions::POWERVR_FP16);
}
RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
code, "main_function", options, *creation_context.context,
*creation_context.device, &kernel_));
return absl::OkStatus();
}
absl::Status ConvolutionTransposed3x3::BindArguments() {
RETURN_IF_ERROR(args_.SetInt("filter_offset", 4 * 9 * src_[0]->Slices()));
const int padding_x =
......
......@@ -40,7 +40,6 @@ class ConvolutionTransposed3x3 : public GPUOperation {
absl::Status Tune(const TuningParameters& params) override {
return absl::OkStatus();
}
absl::Status Compile(const CreationContext& creation_context) override;
absl::Status BindArguments() override;
int3 GetGridSize() const override;
......
......@@ -27,22 +27,21 @@ namespace gpu {
namespace cl {
ConvolutionTransposed3x3Thin::ConvolutionTransposed3x3Thin(
const OperationDef& definition, const ConvolutionTransposedAttributes& attr)
: GPUOperation(definition),
src_channels_(attr.weights.shape.i),
dst_channels_(attr.weights.shape.o) {}
const OperationDef& definition, const ConvolutionTransposedAttributes& attr,
const DeviceInfo& device_info)
: GPUOperation(definition) {
code_ = GenerateConvolutionTransposedCode(
definition_, DivideRoundUp(attr.weights.shape.i, 4),
DivideRoundUp(attr.weights.shape.o, 4), device_info);
}
ConvolutionTransposed3x3Thin::ConvolutionTransposed3x3Thin(
ConvolutionTransposed3x3Thin&& operation)
: GPUOperation(std::move(operation)),
src_channels_(operation.src_channels_),
dst_channels_(operation.dst_channels_) {}
: GPUOperation(std::move(operation)) {}
ConvolutionTransposed3x3Thin& ConvolutionTransposed3x3Thin::operator=(
ConvolutionTransposed3x3Thin&& operation) {
if (this != &operation) {
std::swap(src_channels_, operation.src_channels_);
std::swap(dst_channels_, operation.dst_channels_);
GPUOperation::operator=(std::move(operation));
}
return *this;
......@@ -50,9 +49,9 @@ ConvolutionTransposed3x3Thin& ConvolutionTransposed3x3Thin::operator=(
std::string ConvolutionTransposed3x3Thin::GenerateConvolutionTransposedCode(
const OperationDef& op_def, int src_depth, int dst_depth,
const CLDevice& device) {
const DeviceInfo& device_info) {
auto src_desc = op_def.src_tensors[0];
src_desc.SetTextureAddressMode(GetFastestZeroMode(device));
src_desc.SetTextureAddressMode(GetFastestZeroMode(device_info));
AddSrcTensor("src_tensor", src_desc);
AddDstTensor("dst_tensor", op_def.dst_tensors[0]);
......@@ -184,22 +183,6 @@ std::string ConvolutionTransposed3x3Thin::GenerateConvolutionTransposedCode(
return c;
}
absl::Status ConvolutionTransposed3x3Thin::Compile(
const CreationContext& creation_context) {
std::string code = GenerateConvolutionTransposedCode(
definition_, DivideRoundUp(src_channels_, 4),
DivideRoundUp(dst_channels_, 4), *creation_context.device);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
int3 ConvolutionTransposed3x3Thin::GetGridSize() const {
const int grid_x = src_[0]->Width() * dst_[0]->Batch();
const int grid_y = src_[0]->Height();
......@@ -225,7 +208,8 @@ absl::Status CreateConvolutionTransposed3x3Thin(
return absl::InvalidArgumentError(
"ConvolutionTransposed3x3Thin doesn't support this attributes");
}
*result = ConvolutionTransposed3x3Thin(definition, attr);
*result = ConvolutionTransposed3x3Thin(definition, attr,
creation_context.device->GetInfo());
RETURN_IF_ERROR(
result->UploadData(attr.weights, attr.bias, creation_context.context));
return absl::OkStatus();
......
......@@ -37,7 +37,6 @@ namespace cl {
class ConvolutionTransposed3x3Thin : public GPUOperation {
public:
ConvolutionTransposed3x3Thin() = default;
absl::Status Compile(const CreationContext& creation_context) override;
int3 GetGridSize() const override;
// Move only
......@@ -55,7 +54,8 @@ class ConvolutionTransposed3x3Thin : public GPUOperation {
ConvolutionTransposed3x3Thin* result);
explicit ConvolutionTransposed3x3Thin(
const OperationDef& definition,
const ConvolutionTransposedAttributes& attr);
const ConvolutionTransposedAttributes& attr,
const DeviceInfo& device_info);
template <DataType T>
absl::Status UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases,
......@@ -67,18 +67,15 @@ class ConvolutionTransposed3x3Thin : public GPUOperation {
std::string GenerateConvolutionTransposedCode(const OperationDef& op_def,
int src_depth, int dst_depth,
const CLDevice& device);
int src_channels_;
int dst_channels_;
const DeviceInfo& device_info);
};
template <DataType T>
absl::Status ConvolutionTransposed3x3Thin::UploadData(
const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases, CLContext* context) {
const int src_depth = DivideRoundUp(src_channels_, 4);
const int dst_depth = DivideRoundUp(dst_channels_, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int kernel_x = 3; // This operation support only 3x3 kernel
const int kernel_y = 3;
const int flt4_count = kernel_x * kernel_y * src_depth * dst_depth * 4;
......@@ -131,8 +128,8 @@ absl::Status ConvolutionTransposed3x3Thin::UploadData(
template <DataType S, typename T>
void ConvolutionTransposed3x3Thin::RearrangeWeightsData(
const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
const int src_depth = DivideRoundUp(src_channels_, 4);
const int dst_depth = DivideRoundUp(dst_channels_, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int kernel_x = 3;
const int kernel_y = 3;
......@@ -151,7 +148,7 @@ void ConvolutionTransposed3x3Thin::RearrangeWeightsData(
for (int i = 0; i < 4; ++i) {
const int s_ch = s * 4 + i;
const int d_ch = d * 4 + j;
if (s_ch < src_channels_ && d_ch < dst_channels_) {
if (s_ch < weights.shape.i && d_ch < weights.shape.o) {
const int f_index = weights.shape.LinearIndex(
{d_ch, kernel_index_y, kernel_index_x, s_ch});
filters[i][j] = weights.data[f_index];
......
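The weight-upload helpers above now derive slice counts from weights.shape instead of the deleted src_channels_/dst_channels_ fields. DivideRoundUp is ceiling division, packing channels into 4-wide vectors; a minimal equivalent, assuming the usual definition:

// Assumed definition of the DivideRoundUp helper used above.
int DivideRoundUp(int n, int divisor) { return (n + divisor - 1) / divisor; }
// e.g. DivideRoundUp(6, 4) == 2: six input channels occupy two FLT4 slices.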
......@@ -40,6 +40,12 @@ ConvolutionTransposed4x4::ConvolutionTransposed4x4(
} else {
weights_upload_type_ = WeightsUploadType::GLOBAL_MEM;
}
code_ = GenerateConvolutionTransposedCode(definition_, weights_upload_type_);
if (definition_.precision == CalculationsPrecision::F16 &&
device.IsPowerVR()) {
compiler_options_.push_back(CompilerOptions::POWERVR_FP16);
}
}
ConvolutionTransposed4x4::ConvolutionTransposed4x4(
......@@ -57,8 +63,7 @@ ConvolutionTransposed4x4& ConvolutionTransposed4x4::operator=(
}
std::string ConvolutionTransposed4x4::GenerateConvolutionTransposedCode(
const OperationDef& op_def,
ConvolutionTransposed4x4::WeightsUploadType weights_upload_type) {
const OperationDef& op_def, WeightsUploadType weights_upload_type) {
auto src_desc = op_def.src_tensors[0];
src_desc.SetTextureAddressMode(TextureAddressMode::ZERO);
if (op_def.IsBatchSupported()) {
......@@ -290,28 +295,6 @@ std::string ConvolutionTransposed4x4::GenerateConvolutionTransposedCode(
return c;
}
absl::Status ConvolutionTransposed4x4::Compile(
const CreationContext& creation_context) {
std::string code =
GenerateConvolutionTransposedCode(definition_, weights_upload_type_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
std::vector<CompilerOptions> options;
if (definition_.precision == CalculationsPrecision::F16 &&
creation_context.device->IsPowerVR()) {
options.push_back(CompilerOptions::POWERVR_FP16);
}
RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
code, "main_function", options, *creation_context.context,
*creation_context.device, &kernel_));
return absl::OkStatus();
}
absl::Status ConvolutionTransposed4x4::BindArguments() {
return args_.SetInt("filter_offset", 4 * 16 * src_[0]->Slices());
}
......
......@@ -40,7 +40,6 @@ class ConvolutionTransposed4x4 : public GPUOperation {
absl::Status Tune(const TuningParameters& params) override {
return absl::OkStatus();
}
absl::Status Compile(const CreationContext& creation_context) override;
absl::Status BindArguments() override;
int3 GetGridSize() const override;
......@@ -73,8 +72,7 @@ class ConvolutionTransposed4x4 : public GPUOperation {
absl::Span<T> dst);
std::string GenerateConvolutionTransposedCode(
const OperationDef& op_def,
ConvolutionTransposed4x4::WeightsUploadType weights_upload_type);
const OperationDef& op_def, WeightsUploadType weights_upload_type);
WeightsUploadType weights_upload_type_;
};
......
......@@ -28,25 +28,25 @@ namespace gpu {
namespace cl {
ConvolutionTransposedThin::ConvolutionTransposedThin(
const OperationDef& definition, const ConvolutionTransposedAttributes& attr)
: GPUOperation(definition),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h),
src_channels_(attr.weights.shape.i),
dst_channels_(attr.weights.shape.o) {}
const OperationDef& definition, const ConvolutionTransposedAttributes& attr,
const DeviceInfo& device_info)
: GPUOperation(definition) {
code_ = GenerateConvolutionTransposedCode(
definition_, DivideRoundUp(attr.weights.shape.i, 4), attr.weights.shape.o,
int2(attr.weights.shape.w, attr.weights.shape.h));
if (definition_.precision == CalculationsPrecision::F16 &&
device_info.IsAdreno3xx()) {
compiler_options_.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
}
}
ConvolutionTransposedThin::ConvolutionTransposedThin(
ConvolutionTransposedThin&& operation)
: GPUOperation(std::move(operation)),
kernel_size_(operation.kernel_size_),
src_channels_(operation.src_channels_),
dst_channels_(operation.dst_channels_) {}
: GPUOperation(std::move(operation)) {}
ConvolutionTransposedThin& ConvolutionTransposedThin::operator=(
ConvolutionTransposedThin&& operation) {
if (this != &operation) {
std::swap(kernel_size_, operation.kernel_size_);
std::swap(src_channels_, operation.src_channels_);
std::swap(dst_channels_, operation.dst_channels_);
GPUOperation::operator=(std::move(operation));
}
return *this;
......@@ -151,29 +151,6 @@ std::string ConvolutionTransposedThin::GenerateConvolutionTransposedCode(
return c;
}
absl::Status ConvolutionTransposedThin::Compile(
const CreationContext& creation_context) {
std::string code = GenerateConvolutionTransposedCode(
definition_, DivideRoundUp(src_channels_, 4), dst_channels_,
kernel_size_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
std::vector<CompilerOptions> options;
if (definition_.precision == CalculationsPrecision::F16 &&
creation_context.device->IsAdreno3xx()) {
options.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
}
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
int3 ConvolutionTransposedThin::GetGridSize() const {
const int grid_x = src_[0]->Width() * dst_[0]->Batch();
const int grid_y = src_[0]->Height();
......@@ -197,7 +174,8 @@ absl::Status CreateConvolutionTransposedThin(
return absl::InvalidArgumentError(
"ConvolutionTransposedThin doesn't support this attributes");
}
*result = ConvolutionTransposedThin(definition, attr);
*result = ConvolutionTransposedThin(definition, attr,
creation_context.device->GetInfo());
RETURN_IF_ERROR(
result->UploadData(attr.weights, attr.bias, creation_context.context));
return absl::OkStatus();
......
......@@ -37,7 +37,6 @@ namespace cl {
class ConvolutionTransposedThin : public GPUOperation {
public:
ConvolutionTransposedThin() = default;
absl::Status Compile(const CreationContext& creation_context) override;
int3 GetGridSize() const override;
// Move only
......@@ -53,7 +52,8 @@ class ConvolutionTransposedThin : public GPUOperation {
const ConvolutionTransposedAttributes& attr,
ConvolutionTransposedThin* result);
ConvolutionTransposedThin(const OperationDef& definition,
const ConvolutionTransposedAttributes& attr);
const ConvolutionTransposedAttributes& attr,
const DeviceInfo& device_info);
template <DataType T>
absl::Status UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases,
......@@ -65,19 +65,15 @@ class ConvolutionTransposedThin : public GPUOperation {
std::string GenerateConvolutionTransposedCode(const OperationDef& op_def,
int src_depth, int dst_channels,
const int2& kernel_size);
int2 kernel_size_;
int src_channels_;
int dst_channels_;
};
template <DataType T>
absl::Status ConvolutionTransposedThin::UploadData(
const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases, CLContext* context) {
const int src_depth = DivideRoundUp(src_channels_, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int flt4_count =
kernel_size_.x * kernel_size_.y * src_depth * dst_channels_;
weights.shape.w * weights.shape.h * src_depth * weights.shape.o;
const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
......@@ -121,20 +117,20 @@ absl::Status ConvolutionTransposedThin::UploadData(
template <DataType S, typename T>
void ConvolutionTransposedThin::RearrangeWeightsData(
const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
const int src_depth = DivideRoundUp(src_channels_, 4);
const int kernel_x = kernel_size_.x;
const int kernel_y = kernel_size_.y;
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;
int counter = 0;
for (int s = 0; s < src_depth; ++s) {
for (int y = 0; y < kernel_y; ++y) {
for (int x = 0; x < kernel_x; ++x) {
std::vector<T> filters(dst_channels_);
for (int j = 0; j < dst_channels_; ++j) {
std::vector<T> filters(weights.shape.o);
for (int j = 0; j < weights.shape.o; ++j) {
for (int i = 0; i < 4; ++i) {
const int s_ch = s * 4 + i;
const int d_ch = j;
if (s_ch < src_channels_ && d_ch < dst_channels_) {
if (s_ch < weights.shape.i && d_ch < weights.shape.o) {
const int f_index = weights.shape.LinearIndex({d_ch, y, x, s_ch});
filters[j][i] = weights.data[f_index];
} else {
......@@ -142,7 +138,7 @@ void ConvolutionTransposedThin::RearrangeWeightsData(
}
}
}
for (int j = 0; j < dst_channels_; ++j) {
for (int j = 0; j < weights.shape.o; ++j) {
dst[counter++] = filters[j];
}
}
......
......@@ -70,7 +70,8 @@ std::string GetSrcValue(int channel_multiplier, const std::string coords) {
DepthwiseConvolution::DepthwiseConvolution(
const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr, bool weights_are_buffer)
const DepthwiseConvolution2DAttributes& attr, bool weights_are_buffer,
const DeviceInfo& device_info)
: GPUOperation(definition),
weights_are_buffer_(weights_are_buffer),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h, 0, 0),
......@@ -79,11 +80,17 @@ DepthwiseConvolution::DepthwiseConvolution(
dilation_(attr.dilations.w, attr.dilations.h, 0, 0),
channel_multiplier_(attr.weights.shape.o) {
work_group_size_ = int3(8, 8, 1);
const bool stride_correction =
definition_.IsBatchSupported() && stride_.x != 1;
code_ = GenerateDepthwiseConvolutionCode(definition_, stride_correction,
channel_multiplier_,
weights_are_buffer_, device_info);
}
DepthwiseConvolution::DepthwiseConvolution(
const OperationDef& definition,
const DepthwiseConvolution3DAttributes& attr, bool weights_are_buffer)
const DepthwiseConvolution3DAttributes& attr, bool weights_are_buffer,
const DeviceInfo& device_info)
: GPUOperation(definition),
weights_are_buffer_(weights_are_buffer),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h,
......@@ -94,6 +101,11 @@ DepthwiseConvolution::DepthwiseConvolution(
dilation_(attr.dilations.w, attr.dilations.h, attr.dilations.d, 0),
channel_multiplier_(attr.weights.shape.o) {
work_group_size_ = int3(8, 8, 1);
const bool stride_correction =
definition_.IsBatchSupported() && stride_.x != 1;
code_ = GenerateDepthwiseConvolutionCode(definition_, stride_correction,
channel_multiplier_,
weights_are_buffer_, device_info);
}
DepthwiseConvolution::DepthwiseConvolution(DepthwiseConvolution&& operation)
......@@ -121,9 +133,9 @@ DepthwiseConvolution& DepthwiseConvolution::operator=(
std::string DepthwiseConvolution::GenerateDepthwiseConvolutionCode(
const OperationDef& op_def, bool stride_correction, int channel_multiplier,
bool weights_are_buffer, const CLDevice& device) {
bool weights_are_buffer, const DeviceInfo& device_info) {
auto src_desc = op_def.src_tensors[0];
src_desc.SetTextureAddressMode(GetFastestZeroMode(device));
src_desc.SetTextureAddressMode(GetFastestZeroMode(device_info));
if (op_def.IsBatchSupported()) {
src_desc.SetStateVar("BatchedWidth", "true");
}
......@@ -270,24 +282,6 @@ std::string DepthwiseConvolution::GenerateDepthwiseConvolutionCode(
return c;
}
absl::Status DepthwiseConvolution::Compile(
const CreationContext& creation_context) {
const bool stride_correction =
definition_.IsBatchSupported() && stride_.x != 1;
std::string code = GenerateDepthwiseConvolutionCode(
definition_, stride_correction, channel_multiplier_, weights_are_buffer_,
*creation_context.device);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status DepthwiseConvolution::BindArguments() {
RETURN_IF_ERROR(args_.SetInt("kernel_size_x", kernel_size_.x));
RETURN_IF_ERROR(args_.SetInt("stride_x", stride_.x));
......@@ -321,7 +315,8 @@ absl::Status CreateDepthwiseConvolution(
const DepthwiseConvolution2DAttributes& attr,
DepthwiseConvolution* result) {
bool weights_are_buffer = creation_context.device->IsMali();
*result = DepthwiseConvolution(definition, attr, weights_are_buffer);
*result = DepthwiseConvolution(definition, attr, weights_are_buffer,
creation_context.device->GetInfo());
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
......@@ -344,7 +339,8 @@ absl::Status CreateDepthwiseConvolution(
const DepthwiseConvolution3DAttributes& attr,
DepthwiseConvolution* result) {
bool weights_are_buffer = creation_context.device->IsMali();
*result = DepthwiseConvolution(definition, attr, weights_are_buffer);
*result = DepthwiseConvolution(definition, attr, weights_are_buffer,
creation_context.device->GetInfo());
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
......
......@@ -38,7 +38,6 @@ namespace cl {
class DepthwiseConvolution : public GPUOperation {
public:
DepthwiseConvolution() = default;
absl::Status Compile(const CreationContext& creation_context) override;
absl::Status BindArguments() override;
int3 GetGridSize() const override;
......@@ -59,10 +58,10 @@ class DepthwiseConvolution : public GPUOperation {
DepthwiseConvolution* result);
DepthwiseConvolution(const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr,
bool weights_are_buffer);
bool weights_are_buffer, const DeviceInfo& device_info);
DepthwiseConvolution(const OperationDef& definition,
const DepthwiseConvolution3DAttributes& attr,
bool weights_are_buffer);
bool weights_are_buffer, const DeviceInfo& device_info);
template <DataType T>
absl::Status UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
......@@ -84,7 +83,7 @@ class DepthwiseConvolution : public GPUOperation {
bool stride_correction,
int channel_multiplier,
bool weights_are_buffer,
const CLDevice& device);
const DeviceInfo& device_info);
bool weights_are_buffer_;
......
......@@ -29,11 +29,19 @@ namespace cl {
DepthwiseConv3x3::DepthwiseConv3x3(const OperationDef& definition,
bool weights_are_buffer,
bool local_mem_uploads)
bool local_mem_uploads,
const DeviceInfo& device_info)
: GPUOperation(definition),
weights_are_buffer_(weights_are_buffer),
local_mem_uploads_(local_mem_uploads) {
work_group_size_ = int3(8, 4, 1);
code_ = GenerateDepthwiseConvCode(definition_, device_info,
weights_are_buffer_, local_mem_uploads_);
if (definition_.precision == CalculationsPrecision::F16 &&
device_info.IsPowerVR()) {
compiler_options_.push_back(CompilerOptions::POWERVR_FP16);
}
}
DepthwiseConv3x3::DepthwiseConv3x3(DepthwiseConv3x3&& operation)
......@@ -51,10 +59,10 @@ DepthwiseConv3x3& DepthwiseConv3x3::operator=(DepthwiseConv3x3&& operation) {
}
std::string DepthwiseConv3x3::GenerateDepthwiseConvCode(
const OperationDef& op_def, const CLDevice& device, bool weights_are_buffer,
bool local_mem_uploads) {
const OperationDef& op_def, const DeviceInfo& device_info,
bool weights_are_buffer, bool local_mem_uploads) {
auto src_desc = op_def.src_tensors[0];
src_desc.SetTextureAddressMode(GetFastestZeroMode(device));
src_desc.SetTextureAddressMode(GetFastestZeroMode(device_info));
AddSrcTensor("src_tensor", src_desc);
AddDstTensor("dst_tensor", op_def.dst_tensors[0]);
......@@ -281,28 +289,6 @@ std::string DepthwiseConv3x3::GenerateDepthwiseConvCode(
return c;
}
absl::Status DepthwiseConv3x3::Compile(
const CreationContext& creation_context) {
std::string code =
GenerateDepthwiseConvCode(definition_, *creation_context.device,
weights_are_buffer_, local_mem_uploads_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
std::vector<CompilerOptions> options;
if (definition_.precision == CalculationsPrecision::F16 &&
creation_context.device->IsPowerVR()) {
options.push_back(CompilerOptions::POWERVR_FP16);
}
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", options, *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status DepthwiseConv3x3::BindArguments() {
RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0]));
return args_.SetObjectRef("dst_tensor", dst_[0]);
......@@ -343,7 +329,8 @@ absl::Status CreateDepthwiseConv3x3(
creation_context.device->IsPowerVR() || creation_context.device->IsMali();
bool local_mem_uploads =
weights_are_buffer && creation_context.device->IsPowerVR();
*result = DepthwiseConv3x3(definition, weights_are_buffer, local_mem_uploads);
*result = DepthwiseConv3x3(definition, weights_are_buffer, local_mem_uploads,
creation_context.device->GetInfo());
return result->UploadWeightsAndBiases(attr.weights, attr.bias,
creation_context.context);
}
......
......@@ -39,7 +39,6 @@ class DepthwiseConv3x3 : public GPUOperation {
public:
DepthwiseConv3x3() = default;
absl::Status Tune(const TuningParameters& params) override;
absl::Status Compile(const CreationContext& creation_context) override;
absl::Status BindArguments() override;
int3 GetGridSize() const override;
......@@ -51,7 +50,8 @@ class DepthwiseConv3x3 : public GPUOperation {
private:
explicit DepthwiseConv3x3(const OperationDef& definition,
bool weights_are_buffer, bool local_mem_uploads);
bool weights_are_buffer, bool local_mem_uploads,
const DeviceInfo& device_info);
template <DataType T>
absl::Status UploadWeightsAndBiases(
const tflite::gpu::Tensor<OHWI, T>& weights,
......@@ -67,7 +67,7 @@ class DepthwiseConv3x3 : public GPUOperation {
const tflite::gpu::Tensor<Linear, S>& biases, absl::Span<T> dst);
std::string GenerateDepthwiseConvCode(const OperationDef& op_def,
const CLDevice& device,
const DeviceInfo& device_info,
bool weights_are_buffer,
bool local_mem_uploads);
......
......@@ -24,8 +24,22 @@ namespace tflite {
namespace gpu {
namespace cl {
FullyConnected::FullyConnected(const OperationDef& definition)
: GPUOperation(definition) {}
FullyConnected::FullyConnected(const OperationDef& definition,
const DeviceInfo& device_info)
: GPUOperation(definition) {
if (device_info.IsAdreno()) {
if (device_info.IsAdreno3xx()) {
work_group_size_ = int3(8, 4, 1);
} else if (device_info.IsAdreno4xx()) {
work_group_size_ = int3(16, 4, 1);
} else {
work_group_size_ = int3(32, 4, 1);
}
} else {
work_group_size_ = int3(16, 4, 1);
}
code_ = GetFullyConnectedKernelCode(definition_, work_group_size_);
}
FullyConnected::FullyConnected(FullyConnected&& kernel)
: GPUOperation(std::move(kernel)) {}
......@@ -92,36 +106,6 @@ std::string FullyConnected::GetFullyConnectedKernelCode(
return c;
}
absl::Status FullyConnected::Compile(const CreationContext& creation_context) {
int wg_width = 32;
int wg_height = 4;
int work_items;
do {
work_group_size_ = {wg_width, wg_height, 1};
wg_width /= 2;
std::string code =
GetFullyConnectedKernelCode(definition_, work_group_size_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
auto status = creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
if (!status.ok()) {
if (work_group_size_.x == 1) {
return status;
} else {
continue;
}
}
work_items = work_group_size_.x * work_group_size_.y * work_group_size_.z;
} while (work_items > kernel_.GetMaxWorkGroupSize());
return absl::OkStatus();
}
int3 FullyConnected::GetGridSize() const {
return int3(dst_[0]->Slices(), 1, 1);
}
......@@ -130,7 +114,7 @@ absl::Status CreateFullyConnected(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
FullyConnected* result) {
*result = FullyConnected(definition);
*result = FullyConnected(definition, creation_context.device->GetInfo());
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
......
......@@ -93,7 +93,6 @@ class FullyConnected : public GPUOperation {
return absl::OkStatus();
}
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
FullyConnected(FullyConnected&& kernel);
......@@ -102,7 +101,7 @@ class FullyConnected : public GPUOperation {
FullyConnected& operator=(const FullyConnected&) = delete;
private:
explicit FullyConnected(const OperationDef& definition);
FullyConnected(const OperationDef& definition, const DeviceInfo& device_info);
friend absl::Status CreateFullyConnected(
const CreationContext& creation_context, const OperationDef& definition,
const FullyConnectedAttributes& attr, FullyConnected* result);
......
......@@ -190,9 +190,9 @@ absl::Status GPUOperation::Compile(const CreationContext& creation_context) {
creation_context.device->GetInfo(),
{{dst_tensors_names_[0], element_wise_code}}, &code_));
RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
code_, "main_function", *creation_context.context,
code_, "main_function", compiler_options_, *creation_context.context,
*creation_context.device, &kernel_));
return PostCompileCheck();
return PostCompileCheck(creation_context.device->GetInfo());
}
ElementwiseOperation::ElementwiseOperation(ElementwiseOperation&& operation)
......
......@@ -103,7 +103,9 @@ class GPUOperation {
virtual absl::Status Compile(const CreationContext& creation_context);
virtual absl::Status PostCompileCheck() { return absl::OkStatus(); }
virtual absl::Status PostCompileCheck(const DeviceInfo& device_info) {
return absl::OkStatus();
}
const OperationDef& GetDefinition() const { return definition_; }
......
......@@ -25,7 +25,10 @@ namespace tflite {
namespace gpu {
namespace cl {
LSTM::LSTM(const OperationDef& definition) : GPUOperation(definition) {}
LSTM::LSTM(const OperationDef& definition, const DeviceInfo& device_info)
: GPUOperation(definition) {
code_ = GetLSTMCode(definition_, device_info);
}
LSTM::LSTM(LSTM&& kernel) : GPUOperation(std::move(kernel)) {}
......@@ -37,7 +40,7 @@ LSTM& LSTM::operator=(LSTM&& kernel) {
}
std::string LSTM::GetLSTMCode(const OperationDef& op_def,
const CLDevice& device) {
const DeviceInfo& device_info) {
AddSrcTensor("intermediate", op_def.src_tensors[0]);
AddSrcTensor("prev_state", op_def.src_tensors[1]);
AddDstTensor("new_state", op_def.dst_tensors[0]);
......@@ -56,7 +59,8 @@ std::string LSTM::GetLSTMCode(const OperationDef& op_def,
c += " FLT4 r1 = args.intermediate.Read(0, 0, Z + state_stride, B);\n";
c += " FLT4 r2 = args.intermediate.Read(0, 0, Z + state_stride * 2, B);\n";
c += " FLT4 r3 = args.intermediate.Read(0, 0, Z + state_stride * 3, B);\n";
if (op_def.precision != CalculationsPrecision::F32 && device.IsAdreno()) {
if (op_def.precision != CalculationsPrecision::F32 &&
device_info.IsAdreno()) {
c += " FLT4 input_gate;\n";
c += " FLT4 new_input;\n";
c += " FLT4 forget_gate;\n";
......@@ -101,15 +105,6 @@ std::string LSTM::GetLSTMCode(const OperationDef& op_def,
return c;
}
absl::Status LSTM::Compile(const CreationContext& creation_context) {
std::string code = GetLSTMCode(definition_, *creation_context.device);
RETURN_IF_ERROR(
args_.TransformToCLCode(creation_context.device->GetInfo(), {}, &code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
int3 LSTM::GetGridSize() const {
const int grid_x = dst_[0]->Batch();
const int grid_y = dst_[0]->Slices();
......@@ -117,7 +112,9 @@ int3 LSTM::GetGridSize() const {
return int3(grid_x, grid_y, grid_z);
}
LSTM CreateLSTM(const OperationDef& definition) { return LSTM(definition); }
LSTM CreateLSTM(const OperationDef& definition, const DeviceInfo& device_info) {
return LSTM(definition, device_info);
}
} // namespace cl
} // namespace gpu
......
......@@ -27,9 +27,8 @@ namespace cl {
class LSTM : public GPUOperation {
public:
explicit LSTM(const OperationDef& definition);
LSTM(const OperationDef& definition, const DeviceInfo& device_info);
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
LSTM(LSTM&& kernel);
......@@ -38,10 +37,11 @@ class LSTM : public GPUOperation {
LSTM& operator=(const LSTM&) = delete;
private:
std::string GetLSTMCode(const OperationDef& op_def, const CLDevice& device);
std::string GetLSTMCode(const OperationDef& op_def,
const DeviceInfo& device_info);
};
LSTM CreateLSTM(const OperationDef& definition);
LSTM CreateLSTM(const OperationDef& definition, const DeviceInfo& device_info);
} // namespace cl
} // namespace gpu
......
......@@ -67,7 +67,7 @@ TEST_F(OpenCLOperationTest, LSTM) {
op_def.dst_tensors.push_back({data_type, storage, Layout::BHWC});
TensorFloat32 new_state;
TensorFloat32 new_activ;
LSTM operation = CreateLSTM(op_def);
LSTM operation = CreateLSTM(op_def, env_.GetDevicePtr()->GetInfo());
ASSERT_OK(ExecuteGPUOperation(
{src_tensor, prev_state}, creation_context_, &operation,
{BHWC(1, 1, 1, 4), BHWC(1, 1, 1, 4)}, {&new_state, &new_activ}));
......
......@@ -25,19 +25,25 @@ namespace gpu {
namespace cl {
MaxUnpooling::MaxUnpooling(const OperationDef& definition,
const MaxUnpooling2DAttributes& attr)
const MaxUnpooling2DAttributes& attr,
const DeviceInfo& device_info)
: GPUOperation(definition),
stride_(attr.strides.w, attr.strides.h, 0, 0),
padding_(attr.padding.appended.w, attr.padding.appended.h, 0, 0),
kernel_size_(attr.kernel.w, attr.kernel.h, 0, 0) {}
kernel_size_(attr.kernel.w, attr.kernel.h, 0, 0) {
code_ = GetMaxUnpoolingKernelCode(definition_, device_info);
}
MaxUnpooling::MaxUnpooling(const OperationDef& definition,
const MaxUnpooling3DAttributes& attr)
const MaxUnpooling3DAttributes& attr,
const DeviceInfo& device_info)
: GPUOperation(definition),
stride_(attr.strides.w, attr.strides.h, attr.strides.d, 0),
padding_(attr.padding.appended.w, attr.padding.appended.h,
attr.padding.appended.d, 0),
kernel_size_(attr.kernel.w, attr.kernel.h, attr.kernel.d, 0) {}
kernel_size_(attr.kernel.w, attr.kernel.h, attr.kernel.d, 0) {
code_ = GetMaxUnpoolingKernelCode(definition_, device_info);
}
MaxUnpooling::MaxUnpooling(MaxUnpooling&& kernel)
: GPUOperation(std::move(kernel)),
......@@ -55,16 +61,16 @@ MaxUnpooling& MaxUnpooling::operator=(MaxUnpooling&& kernel) {
return *this;
}
std::string MaxUnpooling::GetMaxUnpoolingKernelCode(const OperationDef& op_def,
const CLDevice& device) {
std::string MaxUnpooling::GetMaxUnpoolingKernelCode(
const OperationDef& op_def, const DeviceInfo& device_info) {
auto src_desc = op_def.src_tensors[0];
src_desc.SetTextureAddressMode(GetFastestZeroMode(device));
src_desc.SetTextureAddressMode(GetFastestZeroMode(device_info));
if (op_def.IsBatchSupported()) {
src_desc.SetStateVar("BatchedWidth", "true");
}
AddSrcTensor("src_tensor", src_desc);
auto src_ind_desc = op_def.src_tensors[1];
src_ind_desc.SetTextureAddressMode(GetFastestZeroMode(device));
src_ind_desc.SetTextureAddressMode(GetFastestZeroMode(device_info));
if (op_def.IsBatchSupported()) {
src_ind_desc.SetStateVar("BatchedWidth", "true");
}
......@@ -169,20 +175,6 @@ std::string MaxUnpooling::GetMaxUnpoolingKernelCode(const OperationDef& op_def,
return c;
}
absl::Status MaxUnpooling::Compile(const CreationContext& creation_context) {
std::string code =
GetMaxUnpoolingKernelCode(definition_, *creation_context.device);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status MaxUnpooling::BindArguments() {
if (definition_.dst_tensors[0].HasAxis(Axis::WIDTH)) {
RETURN_IF_ERROR(args_.SetInt("stride_x", stride_.x));
......@@ -210,13 +202,15 @@ int3 MaxUnpooling::GetGridSize() const {
}
MaxUnpooling CreateMaxUnpooling(const OperationDef& definition,
const MaxUnpooling2DAttributes& attr) {
return MaxUnpooling(definition, attr);
const MaxUnpooling2DAttributes& attr,
const DeviceInfo& device_info) {
return MaxUnpooling(definition, attr, device_info);
}
MaxUnpooling CreateMaxUnpooling(const OperationDef& definition,
const MaxUnpooling3DAttributes& attr) {
return MaxUnpooling(definition, attr);
const MaxUnpooling3DAttributes& attr,
const DeviceInfo& device_info) {
return MaxUnpooling(definition, attr, device_info);
}
} // namespace cl
......
......@@ -28,13 +28,14 @@ namespace cl {
class MaxUnpooling : public GPUOperation {
public:
MaxUnpooling(const OperationDef& definition,
const MaxUnpooling2DAttributes& attr);
const MaxUnpooling2DAttributes& attr,
const DeviceInfo& device_info);
MaxUnpooling(const OperationDef& definition,
const MaxUnpooling3DAttributes& attr);
const MaxUnpooling3DAttributes& attr,
const DeviceInfo& device_info);
absl::Status BindArguments() override;
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
MaxUnpooling(MaxUnpooling&& kernel);
......@@ -44,7 +45,7 @@ class MaxUnpooling : public GPUOperation {
private:
std::string GetMaxUnpoolingKernelCode(const OperationDef& op_def,
const CLDevice& device);
const DeviceInfo& device_info);
int4 stride_;
int4 padding_;
......@@ -52,10 +53,12 @@ class MaxUnpooling : public GPUOperation {
};
MaxUnpooling CreateMaxUnpooling(const OperationDef& definition,
const MaxUnpooling2DAttributes& attr);
const MaxUnpooling2DAttributes& attr,
const DeviceInfo& device_info);
MaxUnpooling CreateMaxUnpooling(const OperationDef& definition,
const MaxUnpooling3DAttributes& attr);
const MaxUnpooling3DAttributes& attr,
const DeviceInfo& device_info);
} // namespace cl
} // namespace gpu
......
......@@ -55,7 +55,8 @@ TEST_F(OpenCLOperationTest, MaxUnpooling) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
MaxUnpooling operation = CreateMaxUnpooling(op_def, attr);
MaxUnpooling operation =
CreateMaxUnpooling(op_def, attr, env_.GetDevicePtr()->GetInfo());
ASSERT_OK(ExecuteGPUOperation({src_tensor, src_ind_tensor},
creation_context_, &operation,
BHWC(1, 4, 4, 1), &dst_tensor));
......
......@@ -26,6 +26,18 @@ namespace tflite {
namespace gpu {
namespace cl {
Mean::Mean(const OperationDef& definition, const DeviceInfo& device_info)
: GPUOperation(definition) {
  // For the work group size:
  // must be: (x * y) % 4 == 0;
  // must be: z == 1;
work_group_size_ = int3(16, 16, 1);
if (device_info.IsAdreno3xx()) {
work_group_size_ = int3(16, 8, 1);
}
code_ = GetMeanKernelCode(definition_, work_group_size_);
}
Mean::Mean(Mean&& operation) : GPUOperation(std::move(operation)) {}
Mean& Mean::operator=(Mean&& operation) {
......@@ -96,25 +108,6 @@ std::string Mean::GetMeanKernelCode(const OperationDef& op_def,
return c;
}
absl::Status Mean::Compile(const CreationContext& creation_context) {
// must be: (x * y) % 4 = 0;
// must be: z = 1;
work_group_size_ = int3(16, 16, 1);
if (creation_context.device->IsAdreno3xx()) {
work_group_size_ = int3(16, 8, 1);
}
std::string code = GetMeanKernelCode(definition_, work_group_size_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status Mean::BindArguments() {
const double total_size = src_[0]->Width() * src_[0]->Height();
const double size_0 = work_group_size_.x * work_group_size_.y;
......@@ -131,7 +124,9 @@ int3 Mean::GetGridSize() const {
return int3(grid_x, grid_y, grid_z);
}
Mean CreateMean(const OperationDef& definition) { return Mean(definition); }
Mean CreateMean(const OperationDef& definition, const DeviceInfo& device_info) {
return Mean(definition, device_info);
}
} // namespace cl
} // namespace gpu
......
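A small helper that encodes the two constraints documented in the Mean constructor above; a sketch, not part of the patch. Both sizes the constructor picks satisfy it: 16 * 16 = 256 and 16 * 8 = 128, each divisible by 4 with z == 1.

bool IsValidMeanWorkGroupSize(const int3& wg) {
  return wg.z == 1 && (wg.x * wg.y) % 4 == 0;
}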
......@@ -29,14 +29,13 @@ namespace cl {
class Mean : public GPUOperation {
public:
Mean() = default;
explicit Mean(const OperationDef& definition) : GPUOperation(definition) {}
Mean(const OperationDef& definition, const DeviceInfo& device_info);
absl::Status Tune(const TuningParameters& params) override {
return absl::OkStatus();
}
absl::Status BindArguments() override;
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
Mean(Mean&& operation);
......@@ -49,7 +48,7 @@ class Mean : public GPUOperation {
const int3& work_group_size);
};
Mean CreateMean(const OperationDef& definition);
Mean CreateMean(const OperationDef& definition, const DeviceInfo& device_info);
} // namespace cl
} // namespace gpu
......
......@@ -26,7 +26,9 @@ namespace gpu {
namespace cl {
MeanStdDevNormalization::MeanStdDevNormalization(const OperationDef& definition)
: GPUOperation(definition) {}
: GPUOperation(definition) {
code_ = GetNormalizationCode(definition_);
}
std::string MeanStdDevNormalization::GetNormalizationCode(
const OperationDef& op_def) {
......@@ -70,16 +72,6 @@ std::string MeanStdDevNormalization::GetNormalizationCode(
return c;
}
absl::Status MeanStdDevNormalization::Compile(
const CreationContext& creation_context) {
std::string code = GetNormalizationCode(definition_);
RETURN_IF_ERROR(
args_.TransformToCLCode(creation_context.device->GetInfo(), {}, &code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
int3 MeanStdDevNormalization::GetGridSize() const {
const int grid_x = dst_[0]->Batch();
const int grid_y = 1;
......
......@@ -31,7 +31,6 @@ class MeanStdDevNormalization : public GPUOperation {
explicit MeanStdDevNormalization(const OperationDef& definition);
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
MeanStdDevNormalization(MeanStdDevNormalization&& kernel) = default;
......
......@@ -47,7 +47,7 @@ TEST_F(OpenCLOperationTest, Mean) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
Mean operation = CreateMean(op_def);
Mean operation = CreateMean(op_def, env_.GetDevicePtr()->GetInfo());
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 1, 1, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), {2.5f}));
......
......@@ -26,14 +26,14 @@ namespace gpu {
namespace cl {
Padding::Padding(const OperationDef& definition, const PadAttributes& attr)
: GPUOperation(definition), attributes_(attr) {}
: GPUOperation(definition) {
code_ = GetPaddingCode(definition_, attr);
}
Padding::Padding(Padding&& kernel)
: GPUOperation(std::move(kernel)), attributes_(kernel.attributes_) {}
Padding::Padding(Padding&& kernel) : GPUOperation(std::move(kernel)) {}
Padding& Padding::operator=(Padding&& kernel) {
if (this != &kernel) {
std::swap(attributes_, kernel.attributes_);
GPUOperation::operator=(std::move(kernel));
}
return *this;
......@@ -43,10 +43,10 @@ std::string Padding::GetPaddingCode(const OperationDef& op_def,
const PadAttributes& attr) {
AddSrcTensor("src_tensor", op_def.src_tensors[0]);
AddDstTensor("dst_tensor", op_def.dst_tensors[0]);
args_.AddInt("prepended_x");
args_.AddInt("prepended_y");
args_.AddInt("prepended_z");
args_.AddInt("prepended_w");
args_.AddInt("prepended_x", attr.prepended.w);
args_.AddInt("prepended_y", attr.prepended.h);
args_.AddInt("prepended_z", attr.prepended.c);
args_.AddInt("prepended_w", attr.prepended.b);
const std::string dst_batch =
op_def.dst_tensors[0].HasAxis(Axis::BATCH) ? "B" : "0";
......@@ -149,27 +149,6 @@ std::string Padding::GetPaddingCode(const OperationDef& op_def,
return c;
}
absl::Status Padding::Compile(const CreationContext& creation_context) {
std::string code = GetPaddingCode(definition_, attributes_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status Padding::BindArguments() {
RETURN_IF_ERROR(args_.SetInt("prepended_x", attributes_.prepended.w));
RETURN_IF_ERROR(args_.SetInt("prepended_y", attributes_.prepended.h));
RETURN_IF_ERROR(args_.SetInt("prepended_z", attributes_.prepended.c));
RETURN_IF_ERROR(args_.SetInt("prepended_w", attributes_.prepended.b));
return absl::OkStatus();
}
int3 Padding::GetGridSize() const {
const int grid_x = dst_[0]->Width() * dst_[0]->Batch();
const int grid_y = dst_[0]->Height();
......
......@@ -28,10 +28,7 @@ namespace cl {
class Padding : public GPUOperation {
public:
Padding(const OperationDef& definition, const PadAttributes& attr);
absl::Status BindArguments() override;
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
Padding(Padding&& kernel);
......@@ -42,8 +39,6 @@ class Padding : public GPUOperation {
private:
std::string GetPaddingCode(const OperationDef& op_def,
const PadAttributes& attr);
PadAttributes attributes_;
};
Padding CreatePadding(const OperationDef& definition,
......
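The Padding change above is the general recipe for dropping BindArguments: pass the attribute value straight to args_.AddInt at construction instead of declaring the argument and setting it per dispatch. Sketched side by side for a hypothetical one-argument op (BlockOp and BlockAttributes are illustrative names; the two constructor definitions are alternatives, not meant to coexist):

// Two-phase style (pre-patch): declare in the constructor, bind per run.
BlockOp::BlockOp(const OperationDef& definition, const BlockAttributes& attr)
    : GPUOperation(definition), attr_(attr) {
  args_.AddInt("block_size");
}
absl::Status BlockOp::BindArguments() {
  return args_.SetInt("block_size", attr_.block_size);
}

// One-phase style (post-patch): bake the value in; the attr_ member and the
// BindArguments override both disappear when the value is fixed at creation.
BlockOp::BlockOp(const OperationDef& definition, const BlockAttributes& attr)
    : GPUOperation(definition) {
  args_.AddInt("block_size", attr.block_size);
}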
......@@ -25,23 +25,27 @@ namespace gpu {
namespace cl {
Pooling::Pooling(const OperationDef& definition,
const Pooling2DAttributes& attr)
const Pooling2DAttributes& attr, const DeviceInfo& device_info)
: GPUOperation(definition),
stride_(attr.strides.w, attr.strides.h, 0, 0),
padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, 0, 0),
kernel_size_(attr.kernel.w, attr.kernel.h, 0, 0),
type_(attr.type),
output_indices_(attr.output_indices) {}
output_indices_(attr.output_indices) {
GenerateCode(device_info);
}
Pooling::Pooling(const OperationDef& definition,
const Pooling3DAttributes& attr)
const Pooling3DAttributes& attr, const DeviceInfo& device_info)
: GPUOperation(definition),
stride_(attr.strides.w, attr.strides.h, attr.strides.d, 0),
padding_(-attr.padding.prepended.w, -attr.padding.prepended.h,
-attr.padding.prepended.d, 0),
kernel_size_(attr.kernel.w, attr.kernel.h, attr.kernel.d, 0),
type_(attr.type),
output_indices_(attr.output_indices) {}
output_indices_(attr.output_indices) {
GenerateCode(device_info);
}
Pooling::Pooling(Pooling&& kernel)
: GPUOperation(std::move(kernel)),
......@@ -63,11 +67,11 @@ Pooling& Pooling::operator=(Pooling&& kernel) {
return *this;
}
std::string Pooling::GetAveragePoolingKernelCode(const OperationDef& op_def,
bool stride_correction,
const CLDevice& device) {
std::string Pooling::GetAveragePoolingKernelCode(
const OperationDef& op_def, bool stride_correction,
const DeviceInfo& device_info) {
auto src_desc = op_def.src_tensors[0];
src_desc.SetTextureAddressMode(GetFastestZeroMode(device));
src_desc.SetTextureAddressMode(GetFastestZeroMode(device_info));
if (op_def.IsBatchSupported()) {
src_desc.SetStateVar("BatchedWidth", "true");
}
......@@ -344,33 +348,16 @@ std::string Pooling::GetMaxPoolingKernelCode(const OperationDef& op_def,
return c;
}
absl::Status Pooling::Compile(const CreationContext& creation_context) {
std::string code;
void Pooling::GenerateCode(const DeviceInfo& device_info) {
const bool stride_correction =
definition_.IsBatchSupported() && stride_.x != 1;
switch (type_) {
case PoolingType::AVERAGE:
code = GetAveragePoolingKernelCode(definition_, stride_correction,
*creation_context.device);
break;
case PoolingType::MAX:
code = GetMaxPoolingKernelCode(definition_, stride_correction,
output_indices_);
break;
default:
return absl::InvalidArgumentError(
"You should create another kernel with this params");
break;
}
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
if (type_ == PoolingType::AVERAGE) {
code_ = GetAveragePoolingKernelCode(definition_, stride_correction,
device_info);
} else if (type_ == PoolingType::MAX) {
code_ = GetMaxPoolingKernelCode(definition_, stride_correction,
output_indices_);
}
}
absl::Status Pooling::BindArguments() {
......@@ -400,13 +387,15 @@ int3 Pooling::GetGridSize() const {
}
Pooling CreatePooling(const OperationDef& definition,
const Pooling2DAttributes& attr) {
return Pooling(definition, attr);
const Pooling2DAttributes& attr,
const DeviceInfo& device_info) {
return Pooling(definition, attr, device_info);
}
Pooling CreatePooling(const OperationDef& definition,
const Pooling3DAttributes& attr) {
return Pooling(definition, attr);
const Pooling3DAttributes& attr,
const DeviceInfo& device_info) {
return Pooling(definition, attr, device_info);
}
} // namespace cl
......
......@@ -29,12 +29,13 @@ namespace cl {
class Pooling : public GPUOperation {
public:
Pooling(const OperationDef& definition, const Pooling2DAttributes& attr);
Pooling(const OperationDef& definition, const Pooling3DAttributes& attr);
Pooling(const OperationDef& definition, const Pooling2DAttributes& attr,
const DeviceInfo& device_info);
Pooling(const OperationDef& definition, const Pooling3DAttributes& attr,
const DeviceInfo& device_info);
absl::Status BindArguments() override;
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
Pooling(Pooling&& kernel);
......@@ -45,11 +46,13 @@ class Pooling : public GPUOperation {
private:
std::string GetAveragePoolingKernelCode(const OperationDef& op_def,
bool stride_correction,
const CLDevice& device);
const DeviceInfo& device_info);
std::string GetMaxPoolingKernelCode(const OperationDef& op_def,
bool stride_correction,
bool output_indices);
void GenerateCode(const DeviceInfo& device_info);
int4 stride_;
int4 padding_;
int4 kernel_size_;
......@@ -59,10 +62,12 @@ class Pooling : public GPUOperation {
};
Pooling CreatePooling(const OperationDef& definition,
const Pooling2DAttributes& attr);
const Pooling2DAttributes& attr,
const DeviceInfo& device_info);
Pooling CreatePooling(const OperationDef& definition,
const Pooling3DAttributes& attr);
const Pooling3DAttributes& attr,
const DeviceInfo& device_info);
} // namespace cl
} // namespace gpu
......
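One consequence of moving codegen into the Pooling constructor: GenerateCode is void, so the InvalidArgumentError the old Compile returned for unsupported pooling types has no channel back to the caller. If that check still matters, it has to run before construction; a call-site guard sketch (CreateCheckedPooling is hypothetical):

absl::Status CreateCheckedPooling(const OperationDef& op_def,
                                  const Pooling2DAttributes& attr,
                                  const DeviceInfo& device_info,
                                  Pooling* result) {
  if (attr.type != PoolingType::AVERAGE && attr.type != PoolingType::MAX) {
    return absl::InvalidArgumentError(
        "Pooling supports only AVERAGE and MAX");
  }
  *result = CreatePooling(op_def, attr, device_info);
  return absl::OkStatus();
}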
......@@ -52,7 +52,8 @@ TEST_F(OpenCLOperationTest, AveragePooling) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
Pooling operation = CreatePooling(op_def, attr);
Pooling operation =
CreatePooling(op_def, attr, env_.GetDevicePtr()->GetInfo());
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 1, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), {3.0f, 4.0f}));
......@@ -81,7 +82,8 @@ TEST_F(OpenCLOperationTest, AveragePoolingNonEmptyPadding) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
Pooling operation = CreatePooling(op_def, attr);
Pooling operation =
CreatePooling(op_def, attr, env_.GetDevicePtr()->GetInfo());
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
......@@ -111,7 +113,8 @@ TEST_F(OpenCLOperationTest, MaxPooling) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
Pooling operation = CreatePooling(op_def, attr);
Pooling operation =
CreatePooling(op_def, attr, env_.GetDevicePtr()->GetInfo());
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 1, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), {8.0f, 7.0f}));
......@@ -143,7 +146,8 @@ TEST_F(OpenCLOperationTest, MaxPoolingIndices) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
TensorFloat32 dst_tensor_ind;
Pooling operation = CreatePooling(op_def, attr);
Pooling operation =
CreatePooling(op_def, attr, env_.GetDevicePtr()->GetInfo());
ASSERT_OK(ExecuteGPUOperation({src_tensor}, creation_context_, &operation,
{BHWC(1, 1, 1, 2), BHWC(1, 1, 1, 2)},
{&dst_tensor, &dst_tensor_ind}));
......
......@@ -24,6 +24,10 @@ namespace tflite {
namespace gpu {
namespace cl {
Reshape::Reshape(const OperationDef& definition) : GPUOperation(definition) {
code_ = GetReshapeCode(definition_);
}
Reshape::Reshape(Reshape&& operation) : GPUOperation(std::move(operation)) {}
Reshape& Reshape::operator=(Reshape&& operation) {
......@@ -92,19 +96,6 @@ std::string Reshape::GetReshapeCode(const OperationDef& op_def) {
return c;
}
absl::Status Reshape::Compile(const CreationContext& creation_context) {
std::string code = GetReshapeCode(definition_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
int3 Reshape::GetGridSize() const {
const int grid_x = dst_[0]->Width() * dst_[0]->Batch();
const int grid_y = dst_[0]->Height();
......
......@@ -27,10 +27,9 @@ namespace cl {
class Reshape : public GPUOperation {
public:
explicit Reshape(const OperationDef& definition) : GPUOperation(definition) {}
explicit Reshape(const OperationDef& definition);
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
Reshape(Reshape&& operation);
......
......@@ -24,6 +24,11 @@ namespace tflite {
namespace gpu {
namespace cl {
Reshapex4::Reshapex4(const OperationDef& definition)
: GPUOperation(definition) {
code_ = GetReshapeCode(definition_);
}
Reshapex4::Reshapex4(Reshapex4&& operation)
: GPUOperation(std::move(operation)) {}
......@@ -77,19 +82,6 @@ std::string Reshapex4::GetReshapeCode(const OperationDef& op_def) {
return c;
}
absl::Status Reshapex4::Compile(const CreationContext& creation_context) {
std::string code = GetReshapeCode(definition_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
int3 Reshapex4::GetGridSize() const {
const int grid_x = dst_[0]->Width() * dst_[0]->Batch();
const int grid_y = dst_[0]->Height();
......
......@@ -28,11 +28,9 @@ namespace cl {
class Reshapex4 : public GPUOperation {
public:
explicit Reshapex4(const OperationDef& definition)
: GPUOperation(definition) {}
explicit Reshapex4(const OperationDef& definition);
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
Reshapex4(Reshapex4&& operation);
......
......@@ -24,6 +24,11 @@ namespace tflite {
namespace gpu {
namespace cl {
Resize::Resize(const OperationDef& definition, const Resize2DAttributes& attr)
: GPUOperation(definition), attr_(attr) {
code_ = GetResizeCode(definition_, attr_);
}
Resize::Resize(Resize&& operation)
: GPUOperation(std::move(operation)), attr_(operation.attr_) {}
......@@ -127,19 +132,6 @@ std::string Resize::GetResizeCode(const OperationDef& op_def,
return c;
}
absl::Status Resize::Compile(const CreationContext& creation_context) {
std::string code = GetResizeCode(definition_, attr_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status Resize::BindArguments() {
RETURN_IF_ERROR(args_.SetInt("border_x", src_[0]->Width() - 1));
RETURN_IF_ERROR(args_.SetInt("border_y", src_[0]->Height() - 1));
......@@ -164,6 +156,12 @@ Resize CreateResize(const OperationDef& definition,
return Resize(definition, attr);
}
Resize3D::Resize3D(const OperationDef& definition,
const Resize3DAttributes& attr)
: GPUOperation(definition), attr_(attr) {
code_ = GetResize3DCode(definition_, attr_);
}
Resize3D::Resize3D(Resize3D&& operation)
: GPUOperation(std::move(operation)), attr_(operation.attr_) {}
......@@ -288,19 +286,6 @@ std::string Resize3D::GetResize3DCode(const OperationDef& op_def,
return c;
}
absl::Status Resize3D::Compile(const CreationContext& creation_context) {
std::string code = GetResize3DCode(definition_, attr_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status Resize3D::BindArguments() {
RETURN_IF_ERROR(args_.SetInt("border_x", src_[0]->Width() - 1));
RETURN_IF_ERROR(args_.SetInt("border_y", src_[0]->Height() - 1));
......
......@@ -29,7 +29,6 @@ class Resize : public GPUOperation {
public:
absl::Status BindArguments() override;
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
Resize(Resize&& operation);
......@@ -41,8 +40,7 @@ class Resize : public GPUOperation {
const Resize2DAttributes& attr);
private:
Resize(const OperationDef& definition, const Resize2DAttributes& attr)
: GPUOperation(definition), attr_(attr) {}
Resize(const OperationDef& definition, const Resize2DAttributes& attr);
std::string GetResizeCode(const OperationDef& op_def,
const Resize2DAttributes& attr);
......@@ -57,7 +55,6 @@ class Resize3D : public GPUOperation {
public:
absl::Status BindArguments() override;
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
Resize3D(Resize3D&& operation);
......@@ -69,8 +66,7 @@ class Resize3D : public GPUOperation {
const Resize3DAttributes& attr);
private:
Resize3D(const OperationDef& definition, const Resize3DAttributes& attr)
: GPUOperation(definition), attr_(attr) {}
Resize3D(const OperationDef& definition, const Resize3DAttributes& attr);
std::string GetResize3DCode(const OperationDef& op_def,
const Resize3DAttributes& attr);
......
......@@ -25,6 +25,10 @@ namespace tflite {
namespace gpu {
namespace cl {
Softmax::Softmax(const OperationDef& definition) : GPUOperation(definition) {
code_ = GetSoftmaxKernelCode(definition_);
}
Softmax::Softmax(Softmax&& kernel) : GPUOperation(std::move(kernel)) {}
Softmax& Softmax::operator=(Softmax&& kernel) {
......@@ -71,19 +75,6 @@ std::string Softmax::GetSoftmaxKernelCode(const OperationDef& op_def) {
return c;
}
absl::Status Softmax::Compile(const CreationContext& creation_context) {
std::string code = GetSoftmaxKernelCode(definition_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
int3 Softmax::GetGridSize() const {
const int grid_x = dst_[0]->Width() * dst_[0]->Batch();
const int grid_y = dst_[0]->Height();
......
......@@ -29,10 +29,9 @@ namespace cl {
class Softmax : public GPUOperation {
public:
Softmax() = default;
explicit Softmax(const OperationDef& definition) : GPUOperation(definition) {}
explicit Softmax(const OperationDef& definition);
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
Softmax(Softmax&& kernel);
......
......@@ -24,6 +24,12 @@ namespace tflite {
namespace gpu {
namespace cl {
Softmax1x1::Softmax1x1(const OperationDef& definition)
: GPUOperation(definition) {
work_group_size_ = int3(32, 1, 1);
code_ = GetSoftmaxKernelCode(definition_);
}
Softmax1x1::Softmax1x1(Softmax1x1&& kernel) : GPUOperation(std::move(kernel)) {}
Softmax1x1& Softmax1x1::operator=(Softmax1x1&& kernel) {
......@@ -103,20 +109,6 @@ std::string Softmax1x1::GetSoftmaxKernelCode(const OperationDef& op_def) {
return c;
}
absl::Status Softmax1x1::Compile(const CreationContext& creation_context) {
std::string code = GetSoftmaxKernelCode(definition_);
std::string element_wise_code;
work_group_size_ = int3(32, 1, 1);
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status Softmax1x1::BindArguments() {
float4 mask = GetMaskForLastPlane(src_[0]->Channels());
RETURN_IF_ERROR(args_.SetFloat("mask_x", mask.x));
......
......@@ -28,14 +28,12 @@ namespace cl {
class Softmax1x1 : public GPUOperation {
public:
Softmax1x1() = default;
explicit Softmax1x1(const OperationDef& definition)
: GPUOperation(definition) {}
explicit Softmax1x1(const OperationDef& definition);
absl::Status Tune(const TuningParameters& params) override {
return absl::OkStatus();
}
absl::Status BindArguments() override;
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
Softmax1x1(Softmax1x1&& kernel);
......
......@@ -26,6 +26,12 @@ namespace tflite {
namespace gpu {
namespace cl {
SpaceToDepth::SpaceToDepth(const OperationDef& op_def,
const SpaceToDepthAttributes& attr)
: GPUOperation(op_def), attr_(attr) {
code_ = GetSpaceToDepthCode(definition_);
}
SpaceToDepth::SpaceToDepth(SpaceToDepth&& operation)
: GPUOperation(std::move(operation)), attr_(operation.attr_) {}
......@@ -82,19 +88,6 @@ std::string SpaceToDepth::GetSpaceToDepthCode(const OperationDef& op_def) {
return c;
}
absl::Status SpaceToDepth::Compile(const CreationContext& creation_context) {
std::string code = GetSpaceToDepthCode(definition_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status SpaceToDepth::BindArguments() {
RETURN_IF_ERROR(args_.SetInt("block_size", attr_.block_size));
return absl::OkStatus();
......
......@@ -28,11 +28,9 @@ namespace cl {
class SpaceToDepth : public GPUOperation {
public:
SpaceToDepth(const OperationDef& op_def, const SpaceToDepthAttributes& attr)
: GPUOperation(op_def), attr_(attr) {}
SpaceToDepth(const OperationDef& op_def, const SpaceToDepthAttributes& attr);
absl::Status BindArguments() override;
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
SpaceToDepth(SpaceToDepth&& operation);
SpaceToDepth& operator=(SpaceToDepth&& operation);
......
......@@ -30,24 +30,23 @@ namespace cl {
DepthwiseConvPlus1x1Conv::DepthwiseConvPlus1x1Conv(
const OperationDef& definition,
const DepthwiseConvolution2DAttributes& dw_attr,
const Convolution2DAttributes& conv_attr)
: GPUOperation(definition),
dw_attr_(dw_attr),
result_depth_(DivideRoundUp(conv_attr.weights.shape.o, 4)) {
const Convolution2DAttributes& conv_attr, const DeviceInfo& device_info)
: GPUOperation(definition), dw_attr_(dw_attr) {
work_group_size_ = int3(8, 8, 1);
code_ =
GenerateCode(definition_, dw_attr_,
DivideRoundUp(conv_attr.weights.shape.o, 4), device_info);
}
DepthwiseConvPlus1x1Conv::DepthwiseConvPlus1x1Conv(
DepthwiseConvPlus1x1Conv&& operation)
: GPUOperation(std::move(operation)),
dw_attr_(std::move(operation.dw_attr_)),
result_depth_(operation.result_depth_) {}
dw_attr_(std::move(operation.dw_attr_)) {}
DepthwiseConvPlus1x1Conv& DepthwiseConvPlus1x1Conv::operator=(
DepthwiseConvPlus1x1Conv&& operation) {
if (this != &operation) {
dw_attr_ = std::move(operation.dw_attr_);
std::swap(result_depth_, operation.result_depth_);
GPUOperation::operator=(std::move(operation));
}
return *this;
......@@ -147,9 +146,9 @@ absl::Status DepthwiseConvPlus1x1Conv::UploadWeights(
std::string DepthwiseConvPlus1x1Conv::GenerateCode(
const OperationDef& op_def, const DepthwiseConvolution2DAttributes& dw_attr,
int result_depth, const CLDevice& device) {
int result_depth, const DeviceInfo& device_info) {
auto src_desc = op_def.src_tensors[0];
src_desc.SetTextureAddressMode(GetFastestZeroMode(device));
src_desc.SetTextureAddressMode(GetFastestZeroMode(device_info));
AddSrcTensor("src_tensor", src_desc);
AddDstTensor("dst_tensor", op_def.dst_tensors[0]);
......@@ -243,21 +242,6 @@ std::string DepthwiseConvPlus1x1Conv::GenerateCode(
return c;
}
absl::Status DepthwiseConvPlus1x1Conv::Compile(
const CreationContext& creation_context) {
std::string code = GenerateCode(definition_, dw_attr_, result_depth_,
*creation_context.device);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
int3 DepthwiseConvPlus1x1Conv::GetGridSize() const {
const int grid_x = dst_[0]->Width() * dst_[0]->Batch();
const int grid_y = dst_[0]->Height();
......@@ -289,7 +273,8 @@ absl::Status CreateDepthwiseConvPlus1x1Conv(
const DepthwiseConvolution2DAttributes& dw_attr,
const Convolution2DAttributes& conv_attr,
DepthwiseConvPlus1x1Conv* result) {
*result = DepthwiseConvPlus1x1Conv(definition, dw_attr, conv_attr);
*result = DepthwiseConvPlus1x1Conv(definition, dw_attr, conv_attr,
creation_context.device->GetInfo());
RETURN_IF_ERROR(
result->UploadWeights(dw_attr, conv_attr, creation_context.context));
return absl::OkStatus();
......
......@@ -37,7 +37,6 @@ class DepthwiseConvPlus1x1Conv : public GPUOperation {
public:
DepthwiseConvPlus1x1Conv() = default;
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
DepthwiseConvPlus1x1Conv(DepthwiseConvPlus1x1Conv&& operation);
......@@ -53,7 +52,8 @@ class DepthwiseConvPlus1x1Conv : public GPUOperation {
DepthwiseConvPlus1x1Conv* result);
DepthwiseConvPlus1x1Conv(const OperationDef& definition,
const DepthwiseConvolution2DAttributes& dw_attr,
const Convolution2DAttributes& conv_attr);
const Convolution2DAttributes& conv_attr,
const DeviceInfo& device_info);
absl::Status UploadWeights(const DepthwiseConvolution2DAttributes& dw_attr,
const Convolution2DAttributes& conv_attr,
......@@ -61,10 +61,9 @@ class DepthwiseConvPlus1x1Conv : public GPUOperation {
std::string GenerateCode(const OperationDef& op_def,
const DepthwiseConvolution2DAttributes& dw_attr,
int result_depth, const CLDevice& device);
int result_depth, const DeviceInfo& device_info);
DepthwiseConvolution2DAttributes dw_attr_;
int result_depth_;
};
bool IsDepthwiseConvPlus1x1ConvSupported(
......
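With the result_depth_ member gone, the depth is computed inline where the constructor needs it: DivideRoundUp(conv_attr.weights.shape.o, 4) is simply the number of 4-channel output slices.

// E.g. 10 output channels -> (10 + 3) / 4 = 3 slices.
const int result_depth = DivideRoundUp(conv_attr.weights.shape.o, 4);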
......@@ -79,6 +79,7 @@ StridedSlice::StridedSlice(const OperationDef& definition,
const SliceAttributes& attr)
: GPUOperation(definition), attributes_(attr) {
work_group_size_ = int3(8, 4, 1);
code_ = GetStridedSliceCode(definition_, Is4Aligned(attributes_));
}
StridedSlice::StridedSlice(StridedSlice&& operation)
......@@ -153,19 +154,6 @@ std::string StridedSlice::GetStridedSliceCode(const OperationDef& op_def,
return c;
}
absl::Status StridedSlice::Compile(const CreationContext& creation_context) {
std::string code = GetStridedSliceCode(definition_, Is4Aligned(attributes_));
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status StridedSlice::BindArguments() {
int4 offset = GetOffset(attributes_, src_[0]->Width(), src_[0]->Height(),
src_[0]->Channels(), src_[0]->Batch());
......
......@@ -29,7 +29,6 @@ class StridedSlice : public GPUOperation {
StridedSlice(const OperationDef& definition, const SliceAttributes& attr);
absl::Status BindArguments() override;
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
StridedSlice(StridedSlice&& operation);
......
......@@ -25,6 +25,12 @@ namespace tflite {
namespace gpu {
namespace cl {
Transpose::Transpose(const OperationDef& definition,
const TransposeAttributes& attr)
: GPUOperation(definition), attr_(attr) {
code_ = GetTransposeCode(definition_, attr_);
}
Transpose::Transpose(Transpose&& operation)
: GPUOperation(std::move(operation)), attr_(operation.attr_) {}
......@@ -107,19 +113,6 @@ std::string Transpose::GetTransposeCode(const OperationDef& op_def,
return c;
}
absl::Status Transpose::Compile(const CreationContext& creation_context) {
std::string code = GetTransposeCode(definition_, attr_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
int3 Transpose::GetGridSize() const {
const int grid_x = dst_[0]->Width() * dst_[0]->Batch();
const int grid_y = dst_[0]->Height();
......
......@@ -26,10 +26,8 @@ namespace cl {
class Transpose : public GPUOperation {
public:
Transpose(const OperationDef& definition, const TransposeAttributes& attr)
: GPUOperation(definition), attr_(attr) {}
Transpose(const OperationDef& definition, const TransposeAttributes& attr);
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
Transpose(Transpose&& operation);
......
......@@ -100,6 +100,11 @@ TextureAddressMode GetFastestZeroMode(const CLDevice& device) {
: TextureAddressMode::ZERO;
}
TextureAddressMode GetFastestZeroMode(const DeviceInfo& device_info) {
return device_info.IsAdreno3xx() ? TextureAddressMode::DONT_CARE
: TextureAddressMode::ZERO;
}
float4 GetMaskForLastPlane(int channels) {
float4 mask = float4(0.0f);
const int reminder = channels % 4 == 0 ? 4 : channels % 4;
......
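Worked through, the mask helper above: for channels = 6 the reminder is 6 % 4 = 2, so the first two lanes become 1.0 and the mask is (1, 1, 0, 0); for any multiple of 4 the reminder is forced to 4 and the mask is all ones. A typical consumer, as in Softmax1x1::BindArguments earlier in this diff:

// Zero out the padded channel lanes of the last 4-channel plane.
float4 mask = GetMaskForLastPlane(src_[0]->Channels());
RETURN_IF_ERROR(args_.SetFloat("mask_x", mask.x));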
......@@ -95,6 +95,7 @@ void RearrangeWeightsToOHWIOGroupI4O4(
// textures on Adreno3xx devices. Using CLK_ADDRESS_NONE is significantly faster
// than CLK_ADDRESS_CLAMP on Adreno 3xx.
TextureAddressMode GetFastestZeroMode(const CLDevice& device);
TextureAddressMode GetFastestZeroMode(const DeviceInfo& device_info);
// Returns a float4 mask for the last plane (batch of 4 channels);
// assumes that the plane size is 4.
......
......@@ -32,6 +32,21 @@ namespace tflite {
namespace gpu {
namespace cl {
Winograd4x4To36::Winograd4x4To36(const OperationDef& definition,
const Padding2D& padding,
const DeviceInfo& device_info)
: GPUOperation(definition), padding_(padding) {
work_group_size_ = int3(32, 1, 1);
code_ = GetWinograd4x4To36Code(definition_);
if (device_info.IsAdreno()) {
compiler_options_.push_back(CompilerOptions::ADRENO_MORE_WAVES);
}
if (definition_.precision == CalculationsPrecision::F16 &&
device_info.IsPowerVR()) {
compiler_options_.push_back(CompilerOptions::POWERVR_FP16);
}
}
Winograd4x4To36::Winograd4x4To36(Winograd4x4To36&& operation)
: GPUOperation(std::move(operation)), padding_(operation.padding_) {}
......@@ -219,30 +234,6 @@ std::string Winograd4x4To36::GetWinograd4x4To36Code(
return c;
}
absl::Status Winograd4x4To36::Compile(const CreationContext& creation_context) {
std::vector<CompilerOptions> options;
if (creation_context.device->IsAdreno()) {
options.push_back(CompilerOptions::ADRENO_MORE_WAVES);
}
if (definition_.precision == CalculationsPrecision::F16 &&
creation_context.device->IsPowerVR()) {
options.push_back(CompilerOptions::POWERVR_FP16);
}
RETURN_IF_ERROR(UploadBt(creation_context.context));
std::string code = GetWinograd4x4To36Code(definition_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
code, "main_function", options, *creation_context.context,
*creation_context.device, &kernel_));
work_group_size_ = SelectBestWorkGroup();
return absl::OkStatus();
}
absl::Status Winograd4x4To36::UploadBt(CLContext* context) {
tflite::gpu::Tensor<Linear, DataType::FLOAT32> bt_aligned;
bt_aligned.shape = Linear(6 * 8);
......@@ -311,10 +302,22 @@ absl::Status CreateWinograd4x4To36(const CreationContext& creation_context,
const OperationDef& definition,
const Padding2D& padding,
Winograd4x4To36* result) {
*result = Winograd4x4To36(definition, padding);
*result =
Winograd4x4To36(definition, padding, creation_context.device->GetInfo());
return result->UploadBt(creation_context.context);
}
Winograd36To4x4::Winograd36To4x4(const OperationDef& definition,
const DeviceInfo& device_info)
: GPUOperation(definition) {
work_group_size_ = int3(32, 1, 1);
if (definition_.precision == CalculationsPrecision::F16 &&
device_info.IsPowerVR()) {
compiler_options_.push_back(CompilerOptions::POWERVR_FP16);
}
code_ = GetWinograd36To4x4Code(definition_);
}
Winograd36To4x4::Winograd36To4x4(Winograd36To4x4&& operation)
: GPUOperation(std::move(operation)) {}
......@@ -434,26 +437,6 @@ std::string Winograd36To4x4::GetWinograd36To4x4Code(
return c;
}
absl::Status Winograd36To4x4::Compile(const CreationContext& creation_context) {
std::vector<CompilerOptions> options;
if (definition_.precision == CalculationsPrecision::F16 &&
creation_context.device->IsPowerVR()) {
options.push_back(CompilerOptions::POWERVR_FP16);
}
std::string code = GetWinograd36To4x4Code(definition_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
code, "main_function", options, *creation_context.context,
*creation_context.device, &kernel_));
work_group_size_ = SelectBestWorkGroup();
return absl::OkStatus();
}
absl::Status Winograd36To4x4::UploadAt(CLContext* context) {
tflite::gpu::Tensor<Linear, DataType::FLOAT32> at_aligned;
at_aligned.shape = Linear(4 * 8);
......@@ -519,7 +502,7 @@ absl::Status CreateWinograd36To4x4(
const CreationContext& creation_context, const OperationDef& definition,
const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases,
Winograd36To4x4* result) {
*result = Winograd36To4x4(definition);
*result = Winograd36To4x4(definition, creation_context.device->GetInfo());
TensorLinearDescriptor desc;
desc.storage_type = LinearStorageType::TEXTURE_2D;
desc.element_type = definition.GetDataType();
......
......@@ -34,14 +34,11 @@ namespace cl {
class Winograd4x4To36 : public GPUOperation {
public:
Winograd4x4To36() = default;
Winograd4x4To36(const OperationDef& definition, const Padding2D& padding)
: GPUOperation(definition), padding_(padding) {
work_group_size_ = int3(128, 1, 1);
}
Winograd4x4To36(const OperationDef& definition, const Padding2D& padding,
const DeviceInfo& device_info);
absl::Status BindArguments() override;
int3 GetGridSize() const override;
absl::Status Tune(const TuningParameters& params) override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
Winograd4x4To36(Winograd4x4To36&& operation);
......@@ -72,14 +69,11 @@ absl::Status CreateWinograd4x4To36(const CreationContext& creation_context,
class Winograd36To4x4 : public GPUOperation {
public:
Winograd36To4x4() = default;
explicit Winograd36To4x4(const OperationDef& definition)
: GPUOperation(definition) {
work_group_size_ = int3(128, 1, 1);
}
Winograd36To4x4(const OperationDef& definition,
const DeviceInfo& device_info);
absl::Status BindArguments() override;
int3 GetGridSize() const override;
absl::Status Tune(const TuningParameters& params) override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
Winograd36To4x4(Winograd36To4x4&& operation);
......
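Constructor-time codegen does not remove the fallible step for the Winograd ops: weight upload still goes through the factory. A call-site sketch, assuming a populated CreationContext:

Winograd4x4To36 wino;
RETURN_IF_ERROR(
    CreateWinograd4x4To36(creation_context, op_def, padding, &wino));
// The constructor already set work_group_size_, code_ and the Adreno /
// PowerVR compiler options from DeviceInfo; only the Bt upload can fail here.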
......@@ -270,18 +270,20 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context,
inputs[0]->tensor.shape.b, gpu_op);
}
case OperationType::LSTM: {
SelectLSTM(op_def, gpu_op);
SelectLSTM(op_def, creation_context.device->GetInfo(), gpu_op);
return absl::OkStatus();
}
case OperationType::MAX_UNPOOLING_2D: {
auto attr =
absl::any_cast<MaxUnpooling2DAttributes>(node.operation.attributes);
SelectMaxUnpooling(attr, op_def, gpu_op);
SelectMaxUnpooling(attr, op_def, creation_context.device->GetInfo(),
gpu_op);
return absl::OkStatus();
}
case OperationType::MEAN: {
auto attr = absl::any_cast<MeanAttributes>(node.operation.attributes);
return SelectMean(attr, op_def, gpu_op);
return SelectMean(attr, op_def, creation_context.device->GetInfo(),
gpu_op);
}
case OperationType::MUL: {
if (inputs.size() == 2) {
......@@ -333,7 +335,7 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context,
case OperationType::POOLING_2D: {
auto attr =
absl::any_cast<Pooling2DAttributes>(node.operation.attributes);
SelectPooling(attr, op_def, gpu_op);
SelectPooling(attr, op_def, creation_context.device->GetInfo(), gpu_op);
return absl::OkStatus();
}
case OperationType::PRELU: {
......
......@@ -45,9 +45,9 @@ namespace tflite {
namespace gpu {
namespace cl {
void SelectLSTM(const OperationDef& op_def,
void SelectLSTM(const OperationDef& op_def, const DeviceInfo& device_info,
std::unique_ptr<GPUOperation>* ptr) {
LSTM operation = CreateLSTM(op_def);
LSTM operation = CreateLSTM(op_def, device_info);
*ptr = absl::make_unique<LSTM>(std::move(operation));
}
......@@ -69,15 +69,17 @@ absl::Status SelectPReLU(const PReLUAttributes& attr,
}
void SelectPooling(const Pooling2DAttributes& attr, const OperationDef& op_def,
const DeviceInfo& device_info,
std::unique_ptr<GPUOperation>* ptr) {
Pooling pooling = CreatePooling(op_def, attr);
Pooling pooling = CreatePooling(op_def, attr, device_info);
*ptr = absl::make_unique<Pooling>(std::move(pooling));
}
void SelectMaxUnpooling(const MaxUnpooling2DAttributes& attr,
const OperationDef& op_def,
const DeviceInfo& device_info,
std::unique_ptr<GPUOperation>* ptr) {
MaxUnpooling operation = CreateMaxUnpooling(op_def, attr);
MaxUnpooling operation = CreateMaxUnpooling(op_def, attr, device_info);
*ptr = absl::make_unique<MaxUnpooling>(std::move(operation));
}
......@@ -151,11 +153,12 @@ void SelectStridedSlice(const SliceAttributes& attr, const OperationDef& op_def,
}
absl::Status SelectMean(const MeanAttributes& attr, const OperationDef& op_def,
const DeviceInfo& device_info,
std::unique_ptr<GPUOperation>* ptr) {
if (attr.dims != std::set<Axis>({Axis::HEIGHT, Axis::WIDTH})) {
return absl::UnimplementedError("Mean operation supports only HW plane");
}
Mean operation = CreateMean(op_def);
Mean operation = CreateMean(op_def, device_info);
*ptr = absl::make_unique<Mean>(std::move(operation));
return absl::OkStatus();
}
......
......@@ -28,7 +28,8 @@ namespace tflite {
namespace gpu {
namespace cl {
void SelectLSTM(const OperationDef& op_def, std::unique_ptr<GPUOperation>* ptr);
void SelectLSTM(const OperationDef& op_def, const DeviceInfo& device_info,
std::unique_ptr<GPUOperation>* ptr);
void SelectReLU(const CreationContext& creation_context,
const ReLUAttributes& attr, const OperationDef& op_def,
......@@ -40,10 +41,12 @@ absl::Status SelectPReLU(const PReLUAttributes& attr,
std::unique_ptr<GPUOperation>* ptr);
void SelectPooling(const Pooling2DAttributes& attr, const OperationDef& op_def,
const DeviceInfo& device_info,
std::unique_ptr<GPUOperation>* ptr);
void SelectMaxUnpooling(const MaxUnpooling2DAttributes& attr,
const OperationDef& op_def,
const DeviceInfo& device_info,
std::unique_ptr<GPUOperation>* ptr);
void SelectAdd(const OperationDef& op_def, const std::vector<int>& channels,
......@@ -70,6 +73,7 @@ void SelectStridedSlice(const SliceAttributes& attr, const OperationDef& op_def,
std::unique_ptr<GPUOperation>* ptr);
absl::Status SelectMean(const MeanAttributes& attr, const OperationDef& op_def,
const DeviceInfo& device_info,
std::unique_ptr<GPUOperation>* ptr);
void SelectSoftmax(const BHWC& shape, const OperationDef& op_def,
......
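A composite call-site sketch for the reworked selectors, mirroring the GPUOperationFromNode changes above (variable names are illustrative):

std::unique_ptr<GPUOperation> gpu_op;
const DeviceInfo& device_info = creation_context.device->GetInfo();
// Infallible selectors: device-aware codegen already happened in the ctor.
SelectLSTM(op_def, device_info, &gpu_op);
SelectPooling(pool_attr, op_def, device_info, &gpu_op);
// Mean still returns a status: it validates the reduction axes first.
RETURN_IF_ERROR(SelectMean(mean_attr, op_def, device_info, &gpu_op));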