Commit 8959c4ce authored by Raman Sarokin, committed by TensorFlower Gardener

DepthwiseConvolution(2D/3D) converted to generic GPUOperation.

PiperOrigin-RevId: 328239471
Change-Id: I404f86d58a73b353c49aa56fefcba134026a1eb8
Parent 7d262f59
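In outline, the change drops the move-only DepthwiseConvolution subclass and assembles the operation directly as a generic GPUOperation: the factory adds the scalar arguments up front, generates the kernel source into op.code_, uploads weights and biases, and sets the dispatch-grid mapping. A minimal sketch of the resulting call site, using only names that appear in the diff below (the surrounding setup is assumed to exist):

```cpp
// Sketch only: assumes a DeviceInfo, OperationDef, and
// DepthwiseConvolution2DAttributes are already populated.
GPUOperation op = CreateDepthwiseConvolution2D(device_info, op_def, attr);
// No BindArguments()/GetGridSize() overrides remain: the kernel source
// lives in op.code_, the scalar args (kernel size, stride, padding,
// dilation) and the weights/bias objects in op.args_, and the grid
// mapping in op.tensor_to_grid_ (TensorToGrid::kWBToX_HDToY_SToZ).
```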
@@ -66,100 +66,24 @@ std::string GetSrcValue(int channel_multiplier, const std::string coords) {
return c;
}
} // namespace
DepthwiseConvolution::DepthwiseConvolution(
const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr, bool weights_are_buffer)
: GPUOperation(definition),
weights_are_buffer_(weights_are_buffer),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h, 0, 0),
stride_(attr.strides.w, attr.strides.h, 0, 0),
padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, 0, 0),
dilation_(attr.dilations.w, attr.dilations.h, 0, 0),
channel_multiplier_(attr.weights.shape.o) {
work_group_size_ = int3(8, 8, 1);
const bool stride_correction =
definition_.IsBatchSupported() && stride_.x != 1;
code_ = GenerateDepthwiseConvolutionCode(
definition_, stride_correction, channel_multiplier_, weights_are_buffer_);
}
DepthwiseConvolution::DepthwiseConvolution(
const OperationDef& definition,
const DepthwiseConvolution3DAttributes& attr, bool weights_are_buffer)
: GPUOperation(definition),
weights_are_buffer_(weights_are_buffer),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h,
attr.weights.shape.d, 0),
stride_(attr.strides.w, attr.strides.h, attr.strides.d, 0),
padding_(-attr.padding.prepended.w, -attr.padding.prepended.h,
-attr.padding.prepended.d, 0),
dilation_(attr.dilations.w, attr.dilations.h, attr.dilations.d, 0),
channel_multiplier_(attr.weights.shape.o) {
work_group_size_ = int3(8, 8, 1);
const bool stride_correction =
definition_.IsBatchSupported() && stride_.x != 1;
code_ = GenerateDepthwiseConvolutionCode(
definition_, stride_correction, channel_multiplier_, weights_are_buffer_);
}
DepthwiseConvolution::DepthwiseConvolution(DepthwiseConvolution&& operation)
: GPUOperation(std::move(operation)),
weights_are_buffer_(operation.weights_are_buffer_),
kernel_size_(operation.kernel_size_),
stride_(operation.stride_),
padding_(operation.padding_),
dilation_(operation.dilation_),
channel_multiplier_(operation.channel_multiplier_) {}
DepthwiseConvolution& DepthwiseConvolution::operator=(
DepthwiseConvolution&& operation) {
if (this != &operation) {
std::swap(weights_are_buffer_, operation.weights_are_buffer_);
std::swap(kernel_size_, operation.kernel_size_);
std::swap(stride_, operation.stride_);
std::swap(padding_, operation.padding_);
std::swap(dilation_, operation.dilation_);
std::swap(channel_multiplier_, operation.channel_multiplier_);
GPUOperation::operator=(std::move(operation));
}
return *this;
}
std::string DepthwiseConvolution::GenerateDepthwiseConvolutionCode(
const OperationDef& op_def, bool stride_correction, int channel_multiplier,
bool weights_are_buffer) {
std::string GenerateDepthwiseConvolutionCode(const OperationDef& op_def,
bool stride_correction,
int channel_multiplier,
bool weights_are_buffer,
GPUOperation* op) {
auto src_desc = op_def.src_tensors[0];
src_desc.SetTextureAddressMode(TextureAddressMode::ZERO);
if (op_def.IsBatchSupported()) {
src_desc.SetStateVar("BatchedWidth", "true");
}
AddSrcTensor("src_tensor", src_desc);
op->AddSrcTensor("src_tensor", src_desc);
auto dst_desc = op_def.dst_tensors[0];
if (op_def.IsBatchSupported()) {
dst_desc.SetStateVar("BatchedWidth", "true");
}
AddDstTensor("dst_tensor", dst_desc);
args_.AddInt("kernel_size_x");
args_.AddInt("stride_x");
args_.AddInt("padding_x");
args_.AddInt("dilation_x");
args_.AddInt("kernel_size_y");
args_.AddInt("stride_y");
args_.AddInt("padding_y");
args_.AddInt("dilation_y");
if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) {
args_.AddInt("kernel_size_z");
args_.AddInt("stride_z");
args_.AddInt("padding_z");
args_.AddInt("dilation_z");
}
if (!IsSpecializedCase(channel_multiplier)) {
args_.AddInt("ch_multiplier");
}
op->AddDstTensor("dst_tensor", dst_desc);
const auto src_tensor_type = op_def.src_tensors[0].storage_type;
@@ -171,14 +95,14 @@ std::string DepthwiseConvolution::GenerateDepthwiseConvolutionCode(
c += "__kernel void main_function(\n";
c += "$0) {\n";
c += " int X = get_global_id(0);\n";
c += " int Y = get_global_id(1);\n";
if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) {
c += " int linear_id_2 = get_global_id(2);\n";
c += " int S = linear_id_2 / args.dst_tensor.Depth();\n";
c += " int Z = linear_id_2 % args.dst_tensor.Depth();\n";
c += " int linear_id_1 = get_global_id(1);\n";
c += " int Y = linear_id_1 / args.dst_tensor.Depth();\n";
c += " int Z = linear_id_1 % args.dst_tensor.Depth();\n";
} else {
c += " int S = get_global_id(2);\n";
c += " int Y = get_global_id(1);\n";
}
c += " int S = get_global_id(2);\n";
c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || "
"S >= args.dst_tensor.Slices()) { \n";
c += " return; \n";
@@ -186,11 +110,16 @@ std::string DepthwiseConvolution::GenerateDepthwiseConvolutionCode(
c += " ACCUM_FLT4 r = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
if (stride_correction) {
c += " int x_offseted = " +
GetXStrideCorrected("X", "args.src_tensor.Batch()", "args.stride_x",
"args.padding_x") +
GetXStrideCorrectedV2("X", "args.src_tensor.Batch()", "args.stride_x",
"args.padding_x") +
";\n";
} else {
c += " int x_offseted = X * args.stride_x + args.padding_x;\n";
if (op_def.IsBatchSupported()) {
c += " int x_offseted = X * args.stride_x + args.padding_x * "
"args.src_tensor.Batch();\n";
} else {
c += " int x_offseted = X * args.stride_x + args.padding_x;\n";
}
}
c += " int y_offseted = Y * args.stride_y + args.padding_y;\n";
std::string weights_offset = "args.kernel_size_x * args.kernel_size_y";
@@ -218,7 +147,10 @@ std::string DepthwiseConvolution::GenerateDepthwiseConvolutionCode(
c += " int y_c = y_offseted + ky * args.dilation_y;\n";
c += " bool outside_y = y_c < 0 || y_c >= args.src_tensor.Height();\n";
c += " for (int kx = 0; kx < args.kernel_size_x; ++kx) {\n";
c += " int x_c = x_offseted + kx * args.dilation_x;\n";
const std::string dilation_x =
op_def.IsBatchSupported() ? "args.dilation_x * args.src_tensor.Batch()"
: "args.dilation_x";
c += " int x_c = x_offseted + kx * " + dilation_x + ";\n";
c += " bool outside_x = x_c < 0 || x_c >= args.src_tensor.Width();\n";
c += " if (" + check + ") {\n";
if (weights_are_buffer) {
@@ -252,7 +184,10 @@ std::string DepthwiseConvolution::GenerateDepthwiseConvolutionCode(
c += " for (int ky = 0; ky < args.kernel_size_y; ++ky) {\n";
c += " int y_c = y_offseted + ky * args.dilation_y;\n";
c += " for (int kx = 0; kx < args.kernel_size_x; ++kx) {\n";
c += " int x_c = x_offseted + kx * args.dilation_x;\n";
const std::string dilation_x =
op_def.IsBatchSupported() ? "args.dilation_x * args.src_tensor.Batch()"
: "args.dilation_x";
c += " int x_c = x_offseted + kx * " + dilation_x + ";\n";
c += GetSrcValue(channel_multiplier, flat_coords);
if (weights_are_buffer) {
c += " FLT4 f = args.weights.Read(fx_c);\n";
@@ -277,67 +212,80 @@ std::string DepthwiseConvolution::GenerateDepthwiseConvolutionCode(
return c;
}
} // namespace
absl::Status DepthwiseConvolution::BindArguments() {
RETURN_IF_ERROR(args_.SetInt("kernel_size_x", kernel_size_.x));
RETURN_IF_ERROR(args_.SetInt("stride_x", stride_.x));
RETURN_IF_ERROR(args_.SetInt("padding_x", padding_.x * src_[0]->Batch()));
RETURN_IF_ERROR(args_.SetInt("dilation_x", dilation_.x * src_[0]->Batch()));
RETURN_IF_ERROR(args_.SetInt("kernel_size_y", kernel_size_.y));
RETURN_IF_ERROR(args_.SetInt("stride_y", stride_.y));
RETURN_IF_ERROR(args_.SetInt("padding_y", padding_.y));
RETURN_IF_ERROR(args_.SetInt("dilation_y", dilation_.y));
if (definition_.dst_tensors[0].HasAxis(Axis::DEPTH)) {
RETURN_IF_ERROR(args_.SetInt("kernel_size_z", kernel_size_.z));
RETURN_IF_ERROR(args_.SetInt("stride_z", stride_.z));
RETURN_IF_ERROR(args_.SetInt("padding_z", padding_.z));
RETURN_IF_ERROR(args_.SetInt("dilation_z", dilation_.z));
}
if (!IsSpecializedCase(channel_multiplier_)) {
RETURN_IF_ERROR(args_.SetInt("ch_multiplier", channel_multiplier_));
}
return absl::OkStatus();
}
int3 DepthwiseConvolution::GetGridSize() const {
const int grid_x = dst_[0]->Width() * dst_[0]->Batch();
const int grid_y = dst_[0]->Height();
const int grid_z = dst_[0]->Slices() * dst_[0]->Depth();
return int3(grid_x, grid_y, grid_z);
}
DepthwiseConvolution CreateDepthwiseConvolution(
GPUOperation CreateDepthwiseConvolution2D(
const DeviceInfo& device_info, const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr) {
bool weights_are_buffer = device_info.IsMali();
DepthwiseConvolution result(definition, attr, weights_are_buffer);
result.UploadWeights(attr.weights);
GPUOperation op(definition);
op.args_.AddInt("kernel_size_x", attr.weights.shape.w);
op.args_.AddInt("stride_x", attr.strides.w);
op.args_.AddInt("padding_x", -attr.padding.prepended.w);
op.args_.AddInt("dilation_x", attr.dilations.w);
op.args_.AddInt("kernel_size_y", attr.weights.shape.h);
op.args_.AddInt("stride_y", attr.strides.h);
op.args_.AddInt("padding_y", -attr.padding.prepended.h);
op.args_.AddInt("dilation_y", attr.dilations.h);
if (!IsSpecializedCase(attr.weights.shape.o)) {
op.args_.AddInt("ch_multiplier", attr.weights.shape.o);
}
const bool stride_correction =
definition.IsBatchSupported() && attr.strides.w != 1;
op.code_ = GenerateDepthwiseConvolutionCode(definition, stride_correction,
attr.weights.shape.o,
weights_are_buffer, &op);
UploadWeightsForDWConv2D(attr.weights, weights_are_buffer,
definition.precision, &op);
op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ;
TensorLinearDescriptor desc;
desc.storage_type = weights_are_buffer ? LinearStorageType::BUFFER
: LinearStorageType::TEXTURE_2D;
desc.element_type = definition.GetDataType();
desc.UploadLinearData(attr.bias);
result.args_.AddObject(
op.args_.AddObject(
"biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
return result;
return op;
}
DepthwiseConvolution CreateDepthwiseConvolution(
GPUOperation CreateDepthwiseConvolution3D(
const DeviceInfo& device_info, const OperationDef& definition,
const DepthwiseConvolution3DAttributes& attr) {
bool weights_are_buffer = device_info.IsMali();
DepthwiseConvolution result(definition, attr, weights_are_buffer);
result.UploadWeights(attr.weights);
GPUOperation op(definition);
op.args_.AddInt("kernel_size_x", attr.weights.shape.w);
op.args_.AddInt("stride_x", attr.strides.w);
op.args_.AddInt("padding_x", -attr.padding.prepended.w);
op.args_.AddInt("dilation_x", attr.dilations.w);
op.args_.AddInt("kernel_size_y", attr.weights.shape.h);
op.args_.AddInt("stride_y", attr.strides.h);
op.args_.AddInt("padding_y", -attr.padding.prepended.h);
op.args_.AddInt("dilation_y", attr.dilations.h);
op.args_.AddInt("kernel_size_z", attr.weights.shape.d);
op.args_.AddInt("stride_z", attr.strides.d);
op.args_.AddInt("padding_z", -attr.padding.prepended.d);
op.args_.AddInt("dilation_z", attr.dilations.d);
if (!IsSpecializedCase(attr.weights.shape.o)) {
op.args_.AddInt("ch_multiplier", attr.weights.shape.o);
}
const bool stride_correction =
definition.IsBatchSupported() && attr.strides.w != 1;
op.code_ = GenerateDepthwiseConvolutionCode(definition, stride_correction,
attr.weights.shape.o,
weights_are_buffer, &op);
UploadWeightsForDWConv3D(attr.weights, weights_are_buffer,
definition.precision, &op);
op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ;
TensorLinearDescriptor desc;
desc.storage_type = weights_are_buffer ? LinearStorageType::BUFFER
: LinearStorageType::TEXTURE_2D;
desc.element_type = definition.GetDataType();
desc.UploadLinearData(attr.bias);
result.args_.AddObject(
op.args_.AddObject(
"biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
return result;
return op;
}
} // namespace cl
......
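For context on the ch_multiplier argument above: a depthwise convolution with channel multiplier m produces src_channels * m output channels, and output channel d reads input channel d / m. The generated kernel hard-codes small multipliers when IsSpecializedCase holds and otherwise passes ch_multiplier as a runtime argument. A self-contained sketch of the mapping, with hypothetical shapes:

```cpp
// Standalone illustration of the depthwise channel-multiplier mapping;
// the shape values here are hypothetical, not taken from the diff.
#include <iostream>

int main() {
  const int src_channels = 3;
  const int m = 2;  // channel multiplier (attr.weights.shape.o)
  for (int d = 0; d < src_channels * m; ++d) {
    // Each output channel d is computed from input channel d / m.
    std::cout << "dst channel " << d << " <- src channel " << d / m << "\n";
  }
  return 0;
}
```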
@@ -35,102 +35,9 @@ namespace tflite {
namespace gpu {
namespace cl {
class DepthwiseConvolution : public GPUOperation {
public:
DepthwiseConvolution() = default;
absl::Status BindArguments() override;
int3 GetGridSize() const override;
// Move only
DepthwiseConvolution(DepthwiseConvolution&& operation);
DepthwiseConvolution& operator=(DepthwiseConvolution&& operation);
DepthwiseConvolution(const DepthwiseConvolution&) = delete;
DepthwiseConvolution& operator=(const DepthwiseConvolution&) = delete;
private:
friend DepthwiseConvolution CreateDepthwiseConvolution(
const DeviceInfo& device_info, const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr);
friend DepthwiseConvolution CreateDepthwiseConvolution(
const DeviceInfo& device_info, const OperationDef& definition,
const DepthwiseConvolution3DAttributes& attr);
DepthwiseConvolution(const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr,
bool weights_are_buffer);
DepthwiseConvolution(const OperationDef& definition,
const DepthwiseConvolution3DAttributes& attr,
bool weights_are_buffer);
template <DataType T>
void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights);
template <DataType S, typename T>
void RearrangeWeightsData(const tflite::gpu::Tensor<OHWI, S>& weights,
absl::Span<T> dst);
template <DataType T>
void UploadWeights(const tflite::gpu::Tensor<OHWDI, T>& weights);
template <DataType S, typename T>
void RearrangeWeightsData(const tflite::gpu::Tensor<OHWDI, S>& weights,
absl::Span<T> dst);
std::string GenerateDepthwiseConvolutionCode(const OperationDef& op_def,
bool stride_correction,
int channel_multiplier,
bool weights_are_buffer);
bool weights_are_buffer_;
int4 kernel_size_;
int4 stride_;
int4 padding_;
int4 dilation_;
int channel_multiplier_;
};
template <DataType T>
void DepthwiseConvolution::UploadWeights(
const tflite::gpu::Tensor<OHWI, T>& weights) {
const int dst_channels = weights.shape.i * weights.shape.o;
const int dst_slices = DivideRoundUp(dst_channels, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;
const int elements_count = kernel_x * kernel_y * dst_slices;
const bool fp32_weights = definition_.precision == CalculationsPrecision::F32;
const int float4_size = fp32_weights ? 16 : 8;
std::vector<uint8_t> data(float4_size * elements_count);
if (fp32_weights) {
float4* ptr = reinterpret_cast<float4*>(data.data());
RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count));
} else {
half4* ptr = reinterpret_cast<half4*>(data.data());
RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count));
}
if (weights_are_buffer_) {
BufferDescriptor desc;
desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc.element_size = 4;
desc.size = float4_size * elements_count;
desc.data = std::move(data);
args_.AddObject("weights", absl::make_unique<BufferDescriptor>(desc));
} else {
Texture2DDescriptor desc;
desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc.size = int2(kernel_x * kernel_y, dst_slices);
desc.data = std::move(data);
args_.AddObject("weights", absl::make_unique<Texture2DDescriptor>(desc));
}
}
template <DataType S, typename T>
void DepthwiseConvolution::RearrangeWeightsData(
const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
void RearrangeWeightsForDWConv2D(const tflite::gpu::Tensor<OHWI, S>& weights,
absl::Span<T> dst) {
const int dst_channels = weights.shape.i * weights.shape.o;
const int dst_depth = DivideRoundUp(dst_channels, 4);
const int kernel_x = weights.shape.w;
@@ -158,50 +65,50 @@ void DepthwiseConvolution::RearrangeWeightsData(
}
template <DataType T>
void DepthwiseConvolution::UploadWeights(
const tflite::gpu::Tensor<OHWDI, T>& weights) {
void UploadWeightsForDWConv2D(const tflite::gpu::Tensor<OHWI, T>& weights,
bool weights_are_buffer,
CalculationsPrecision precision,
GPUOperation* op) {
const int dst_channels = weights.shape.i * weights.shape.o;
const int dst_slices = DivideRoundUp(dst_channels, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;
const int kernel_z = weights.shape.d;
const int elements_count = kernel_x * kernel_y * kernel_z * dst_slices;
const int elements_count = kernel_x * kernel_y * dst_slices;
const bool fp32_weights = definition_.precision == CalculationsPrecision::F32;
const bool fp32_weights = precision == CalculationsPrecision::F32;
const int float4_size = fp32_weights ? 16 : 8;
std::vector<uint8_t> data(float4_size * elements_count);
if (fp32_weights) {
float4* ptr = reinterpret_cast<float4*>(data.data());
RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count));
RearrangeWeightsForDWConv2D(weights, absl::MakeSpan(ptr, elements_count));
} else {
half4* ptr = reinterpret_cast<half4*>(data.data());
RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count));
RearrangeWeightsForDWConv2D(weights, absl::MakeSpan(ptr, elements_count));
}
if (weights_are_buffer_) {
if (weights_are_buffer) {
BufferDescriptor desc;
desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc.element_size = 4;
desc.size = float4_size * elements_count;
desc.data = std::move(data);
args_.AddObject("weights",
absl::make_unique<BufferDescriptor>(std::move(desc)));
op->args_.AddObject("weights", absl::make_unique<BufferDescriptor>(desc));
} else {
Texture2DDescriptor desc;
desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc.size = int2(kernel_x * kernel_y * kernel_z, dst_slices);
desc.size = int2(kernel_x * kernel_y, dst_slices);
desc.data = std::move(data);
args_.AddObject("weights",
absl::make_unique<Texture2DDescriptor>(std::move(desc)));
op->args_.AddObject("weights",
absl::make_unique<Texture2DDescriptor>(desc));
}
}
template <DataType S, typename T>
void DepthwiseConvolution::RearrangeWeightsData(
const tflite::gpu::Tensor<OHWDI, S>& weights, absl::Span<T> dst) {
void RearrangeWeightsForDWConv3D(const tflite::gpu::Tensor<OHWDI, S>& weights,
absl::Span<T> dst) {
const int dst_channels = weights.shape.i * weights.shape.o;
const int dst_slices = DivideRoundUp(dst_channels, 4);
const int kernel_x = weights.shape.w;
@@ -231,11 +138,55 @@ void DepthwiseConvolution::RearrangeWeightsData(
}
}
DepthwiseConvolution CreateDepthwiseConvolution(
template <DataType T>
void UploadWeightsForDWConv3D(const tflite::gpu::Tensor<OHWDI, T>& weights,
bool weights_are_buffer,
CalculationsPrecision precision,
GPUOperation* op) {
const int dst_channels = weights.shape.i * weights.shape.o;
const int dst_slices = DivideRoundUp(dst_channels, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;
const int kernel_z = weights.shape.d;
const int elements_count = kernel_x * kernel_y * kernel_z * dst_slices;
const bool fp32_weights = precision == CalculationsPrecision::F32;
const int float4_size = fp32_weights ? 16 : 8;
std::vector<uint8_t> data(float4_size * elements_count);
if (fp32_weights) {
float4* ptr = reinterpret_cast<float4*>(data.data());
RearrangeWeightsForDWConv3D(weights, absl::MakeSpan(ptr, elements_count));
} else {
half4* ptr = reinterpret_cast<half4*>(data.data());
RearrangeWeightsForDWConv3D(weights, absl::MakeSpan(ptr, elements_count));
}
if (weights_are_buffer) {
BufferDescriptor desc;
desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc.element_size = 4;
desc.size = float4_size * elements_count;
desc.data = std::move(data);
op->args_.AddObject("weights",
absl::make_unique<BufferDescriptor>(std::move(desc)));
} else {
Texture2DDescriptor desc;
desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc.size = int2(kernel_x * kernel_y * kernel_z, dst_slices);
desc.data = std::move(data);
op->args_.AddObject(
"weights", absl::make_unique<Texture2DDescriptor>(std::move(desc)));
}
}
GPUOperation CreateDepthwiseConvolution2D(
const DeviceInfo& device_info, const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr);
DepthwiseConvolution CreateDepthwiseConvolution(
GPUOperation CreateDepthwiseConvolution3D(
const DeviceInfo& device_info, const OperationDef& definition,
const DepthwiseConvolution3DAttributes& attr);
......
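The upload helpers above size their staging buffer as float4_size * elements_count, packing four consecutive destination channels into one float4 slice. A runnable arithmetic sketch with hypothetical weight shapes:

```cpp
// Hypothetical OHWI shape illustrating UploadWeightsForDWConv2D's
// buffer sizing; DivideRoundUp mirrors the helper used above.
#include <iostream>

int DivideRoundUp(int n, int d) { return (n + d - 1) / d; }

int main() {
  const int o = 2, h = 3, w = 3, i = 8;  // weights.shape (hypothetical)
  const int dst_channels = i * o;                         // 16 channels
  const int dst_slices = DivideRoundUp(dst_channels, 4);  // 4 slices
  const int elements_count = w * h * dst_slices;          // 36 float4s
  const bool fp32_weights = true;   // CalculationsPrecision::F32
  const int float4_size = fp32_weights ? 16 : 8;  // bytes per element
  std::cout << "staging buffer: " << float4_size * elements_count
            << " bytes\n";  // prints 576
  return 0;
}
```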
@@ -55,7 +55,7 @@ TEST_F(OpenCLOperationTest, DepthwiseConvSimpleWeights) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
DepthwiseConvolution operation = CreateDepthwiseConvolution(
GPUOperation operation = CreateDepthwiseConvolution2D(
creation_context_.GetDeviceInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
@@ -90,7 +90,7 @@ TEST_F(OpenCLOperationTest, DepthwiseConvNoMultiplier) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
DepthwiseConvolution operation = CreateDepthwiseConvolution(
GPUOperation operation = CreateDepthwiseConvolution2D(
creation_context_.GetDeviceInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
@@ -126,7 +126,7 @@ TEST_F(OpenCLOperationTest, DepthwiseConvMultiplier2) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
DepthwiseConvolution operation = CreateDepthwiseConvolution(
GPUOperation operation = CreateDepthwiseConvolution2D(
creation_context_.GetDeviceInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 4), &dst_tensor));
......
@@ -33,8 +33,8 @@ std::unique_ptr<GPUOperation> SelectDWConvolutionAdreno(
return absl::make_unique<DepthwiseConv3x3>(
CreateDepthwiseConv3x3(device_info, op_def, attr));
} else {
return absl::make_unique<DepthwiseConvolution>(
CreateDepthwiseConvolution(device_info, op_def, attr));
return absl::make_unique<GPUOperation>(
CreateDepthwiseConvolution2D(device_info, op_def, attr));
}
}
@@ -45,8 +45,8 @@ std::unique_ptr<GPUOperation> SelectDWConvolutionPowerVR(
return absl::make_unique<DepthwiseConv3x3>(
CreateDepthwiseConv3x3(device_info, op_def, attr));
} else {
return absl::make_unique<DepthwiseConvolution>(
CreateDepthwiseConvolution(device_info, op_def, attr));
return absl::make_unique<GPUOperation>(
CreateDepthwiseConvolution2D(device_info, op_def, attr));
}
}
@@ -62,8 +62,8 @@ std::unique_ptr<GPUOperation> SelectDWConvolutionMali(
return absl::make_unique<DepthwiseConv3x3>(
CreateDepthwiseConv3x3(device_info, op_def, attr));
} else {
return absl::make_unique<DepthwiseConvolution>(
CreateDepthwiseConvolution(device_info, op_def, attr));
return absl::make_unique<GPUOperation>(
CreateDepthwiseConvolution2D(device_info, op_def, attr));
}
}
} // namespace
......
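A note on the selector updates above: because the factories now return a concrete GPUOperation by value, each selector can move the result straight into a unique_ptr instead of wrapping a subclass. A minimal sketch of the shared tail (the wrapper name is hypothetical):

```cpp
// Hypothetical helper showing the common pattern of the three selectors.
std::unique_ptr<GPUOperation> WrapGenericDWConv(
    const DeviceInfo& device_info, const OperationDef& op_def,
    const DepthwiseConvolution2DAttributes& attr) {
  return absl::make_unique<GPUOperation>(
      CreateDepthwiseConvolution2D(device_info, op_def, attr));
}
```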