Commit a49117d4 authored by Raman Sarokin, committed by TensorFlower Gardener

Using default Compile for all ops.

PiperOrigin-RevId: 324316456
Change-Id: I1a9c31a6893174798420de07bc57811db013c2fe
Parent 5313d56b
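This commit moves per-operation OpenCL code generation out of each op's Compile() override and into its constructor (or a GenerateCode() helper), so every op can rely on the shared GPUOperation::Compile(). A minimal sketch of the resulting subclass contract, using only members visible in this diff (the generator name is a placeholder, not upstream code):

// Sketch of the new contract: a subclass fills code_, compiler_options_,
// and (if needed) work_group_size_ up front and drops its Compile() override.
class SomeConvOp : public GPUOperation {
 public:
  SomeConvOp(const OperationDef& definition, const DeviceInfo& device_info)
      : GPUOperation(definition) {
    // GenerateSomeConvCode stands in for the per-op code generator.
    code_ = GenerateSomeConvCode(definition_, device_info);
    if (definition_.precision == CalculationsPrecision::F16 &&
        device_info.IsPowerVR()) {
      compiler_options_.push_back(CompilerOptions::POWERVR_FP16);
    }
  }
  absl::Status BindArguments() override;
  int3 GetGridSize() const override;
};

The base GPUOperation::Compile() (see the hunk near the end of this diff) then merges linked elementwise code into code_, transforms it to CL, and builds the kernel with compiler_options_.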
......@@ -175,7 +175,15 @@ Conv3D::Conv3D(const OperationDef& definition,
kernel_size_(attr.weights.shape.w, attr.weights.shape.h,
attr.weights.shape.d),
dilation_(attr.dilations.w, attr.dilations.h, attr.dilations.d),
conv_params_(GuessBestParams(device, definition, attr)) {}
conv_params_(GuessBestParams(device, definition, attr)) {
const bool stride_correction =
definition_.IsBatchSupported() && stride_.x != 1;
code_ = GenerateConv3D(definition_, stride_correction, conv_params_);
if (definition_.precision == CalculationsPrecision::F16 &&
device.IsPowerVR()) {
compiler_options_.push_back(CompilerOptions::POWERVR_FP16);
}
}
Conv3D::Conv3D(Conv3D&& operation)
: GPUOperation(std::move(operation)),
......@@ -197,29 +205,6 @@ Conv3D& Conv3D::operator=(Conv3D&& operation) {
return *this;
}
absl::Status Conv3D::Compile(const CreationContext& creation_context) {
const bool stride_correction =
definition_.IsBatchSupported() && stride_.x != 1;
std::string code =
GenerateConv3D(definition_, stride_correction, conv_params_);
work_group_size_ = conv_params_.work_group_size;
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
std::vector<CompilerOptions> options;
if (definition_.precision == CalculationsPrecision::F16 &&
creation_context.device->IsPowerVR()) {
options.push_back(CompilerOptions::POWERVR_FP16);
}
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", options, *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status Conv3D::BindArguments() {
if (!conv_params_.x_kernel_is_1) {
RETURN_IF_ERROR(args_.SetInt("stride_x", stride_.x));
......
......@@ -40,7 +40,6 @@ class Conv3D : public GPUOperation {
public:
Conv3D() = default;
absl::Status Tune(const TuningParameters& params) override;
absl::Status Compile(const CreationContext& creation_context) override;
absl::Status BindArguments() override;
int3 GetGridSize() const override;
......
......@@ -151,7 +151,10 @@ ConvBuffer1x1::ConvParams GetBestParams(const CLDevice& device,
ConvBuffer1x1::ConvBuffer1x1(const OperationDef& definition,
const ConvParams& conv_params)
: GPUOperation(definition), conv_params_(conv_params) {}
: GPUOperation(definition), conv_params_(conv_params) {
code_ = GenerateConvBuffer1x1(definition_, conv_params_, &args_);
work_group_size_ = conv_params_.work_group_size;
}
ConvBuffer1x1::ConvBuffer1x1(ConvBuffer1x1&& operation)
: GPUOperation(std::move(operation)),
......@@ -300,21 +303,6 @@ std::string ConvBuffer1x1::GenerateConvBuffer1x1(
return c;
}
absl::Status ConvBuffer1x1::Compile(const CreationContext& creation_context) {
std::string code = GenerateConvBuffer1x1(definition_, conv_params_, &args_);
work_group_size_ = conv_params_.work_group_size;
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_));
return absl::OkStatus();
}
int3 ConvBuffer1x1::GetGridSize() const {
const int dst_width_elements = DivideRoundUp(
dst_[0]->Width() * dst_[0]->Batch(), (conv_params_.element_size / 4));
......
......@@ -48,7 +48,6 @@ class ConvBuffer1x1 : public GPUOperation {
ConvBuffer1x1& operator=(const ConvBuffer1x1&) = delete;
absl::Status Tune(const TuningParameters& params) override;
absl::Status Compile(const CreationContext& creation_context) override;
int3 GetGridSize() const override;
ConvWeightsDescription GetConvWeightsDescription() const {
......
......@@ -47,6 +47,32 @@ int GetOptimalMaxConstantSize(const DeviceInfo& info) {
}
} // namespace
ConvConstants::ConvConstants(const OperationDef& definition,
const Convolution2DAttributes& attr,
const DeviceInfo& device_info)
: GPUOperation(definition),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h),
stride_(attr.strides.w, attr.strides.h),
padding_(-attr.padding.prepended.w, -attr.padding.prepended.h),
dilation_(attr.dilations.w, attr.dilations.h),
src_channels_(attr.weights.shape.i),
dst_channels_(attr.weights.shape.o) {
const bool stride_correction =
definition_.IsBatchSupported() && stride_.x != 1;
code_ = GenerateConvolutionConstantCode(definition_, kernel_size_,
src_channels_, dst_channels_,
stride_correction, device_info);
if (definition_.precision == CalculationsPrecision::F16 &&
device_info.IsAdreno3xx()) {
compiler_options_.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
}
if (definition_.precision != CalculationsPrecision::F32 &&
device_info.IsPowerVR()) {
// BUG, some PowerVRs (GE8320) produce incorrect result without it
compiler_options_.push_back(CompilerOptions::CL_OPT_DISABLE);
}
}
ConvConstants::ConvConstants(ConvConstants&& kernel)
: GPUOperation(std::move(kernel)),
kernel_size_(kernel.kernel_size_),
......@@ -71,9 +97,9 @@ ConvConstants& ConvConstants::operator=(ConvConstants&& kernel) {
std::string ConvConstants::GenerateConvolutionConstantCode(
const OperationDef& op_def, const int2& kernel_size, int src_channels,
int dst_channels, bool stride_correction, const CLDevice& device) {
int dst_channels, bool stride_correction, const DeviceInfo& device_info) {
auto src_desc = op_def.src_tensors[0];
src_desc.SetTextureAddressMode(TextureAddressMode::ZERO);
src_desc.SetTextureAddressMode(GetFastestZeroMode(device_info));
if (op_def.IsBatchSupported()) {
src_desc.SetStateVar("BatchedWidth", "true");
}
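A recurring edit in this commit is replacing const CLDevice& with const DeviceInfo& in code-generation signatures, so generators depend on a plain value type describing the GPU rather than a live CL device handle. The shape implied by the calls in this diff (a sketch; only the members actually exercised here are listed):

// Assumed outline of DeviceInfo, inferred from its uses in this diff;
// obtained via CLDevice::GetInfo(), as in creation_context.device->GetInfo().
struct DeviceInfo {
  OpenCLVersion cl_version;
  bool IsAdreno() const;
  bool IsAdreno3xx() const;
  bool IsAdreno4xx() const;
  bool IsPowerVR() const;
  bool IsMali() const;
  bool IsIntel() const;
};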
......@@ -214,33 +240,6 @@ std::string ConvConstants::GenerateConvolutionConstantCode(
return c;
}
absl::Status ConvConstants::Compile(const CreationContext& creation_context) {
const bool stride_correction =
definition_.IsBatchSupported() && stride_.x != 1;
std::string code = GenerateConvolutionConstantCode(
definition_, kernel_size_, src_channels_, dst_channels_,
stride_correction, *creation_context.device);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
std::vector<CompilerOptions> options;
if (definition_.precision == CalculationsPrecision::F16 &&
creation_context.device->IsAdreno3xx()) {
options.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
}
if (definition_.precision != CalculationsPrecision::F32 &&
creation_context.device->IsPowerVR()) {
// BUG, some PowerVRs (GE8320) produce incorrect result without it
options.push_back(CompilerOptions::CL_OPT_DISABLE);
}
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", options, *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status ConvConstants::BindArguments() {
RETURN_IF_ERROR(args_.SetInt("stride_x", stride_.x));
RETURN_IF_ERROR(args_.SetInt("stride_y", stride_.y));
......@@ -284,7 +283,7 @@ absl::Status CreateConvConstants(const CreationContext& creation_context,
if (!IsConvConstantsSupported(*creation_context.device, definition, attr)) {
return absl::InvalidArgumentError("ConvConstants doesn't supported");
}
*result = ConvConstants(definition, attr);
*result = ConvConstants(definition, attr, creation_context.device->GetInfo());
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
......
......@@ -35,8 +35,6 @@ namespace cl {
class ConvConstants : public GPUOperation {
public:
ConvConstants() = default;
absl::Status Compile(const CreationContext& creation_context) override;
absl::Status BindArguments() override;
int3 GetGridSize() const override;
......@@ -50,15 +48,9 @@ class ConvConstants : public GPUOperation {
friend absl::Status CreateConvConstants(
const CreationContext& creation_context, const OperationDef& definition,
const Convolution2DAttributes& attr, ConvConstants* result);
explicit ConvConstants(const OperationDef& definition,
const Convolution2DAttributes& attr)
: GPUOperation(definition),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h),
stride_(attr.strides.w, attr.strides.h),
padding_(-attr.padding.prepended.w, -attr.padding.prepended.h),
dilation_(attr.dilations.w, attr.dilations.h),
src_channels_(attr.weights.shape.i),
dst_channels_(attr.weights.shape.o) {}
ConvConstants(const OperationDef& definition,
const Convolution2DAttributes& attr,
const DeviceInfo& device_info);
template <DataType T>
absl::Status UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
......@@ -70,7 +62,7 @@ class ConvConstants : public GPUOperation {
std::string GenerateConvolutionConstantCode(
const OperationDef& op_def, const int2& kernel_size, int src_channels,
int dst_channels, bool stride_correction, const CLDevice& device);
int dst_channels, bool stride_correction, const DeviceInfo& device_info);
int2 kernel_size_;
int2 stride_;
......
......@@ -179,29 +179,19 @@ ConvPowerVR& ConvPowerVR::operator=(ConvPowerVR&& operation) {
return *this;
}
absl::Status ConvPowerVR::Compile(const CreationContext& creation_context) {
void ConvPowerVR::GenerateCode(const DeviceInfo& device_info) {
const bool stride_correction =
definition_.IsBatchSupported() && stride_padding_.x != 1;
std::string code = GenerateConv(*creation_context.device, definition_,
stride_correction, conv_params_);
code_ =
GenerateConv(device_info, definition_, stride_correction, conv_params_);
work_group_size_ = conv_params_.work_group_size;
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
std::vector<CompilerOptions> options;
if (definition_.precision == CalculationsPrecision::F16 &&
creation_context.device->IsPowerVR()) {
options.push_back(CompilerOptions::POWERVR_FP16);
device_info.IsPowerVR()) {
compiler_options_.push_back(CompilerOptions::POWERVR_FP16);
}
if (conv_params_.IsPrivateMemBroadcast()) {
options.push_back(CompilerOptions::CL_2_0);
compiler_options_.push_back(CompilerOptions::CL_2_0);
}
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", options, *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status ConvPowerVR::BindArguments() {
......@@ -274,11 +264,12 @@ absl::Status ConvPowerVR::Tune(const TuningParameters& params) {
return absl::OkStatus();
}
std::string ConvPowerVR::GenerateConv(
const CLDevice& device, const OperationDef& op_def, bool stride_correction,
const ConvPowerVR::ConvParams& conv_params) {
std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
const OperationDef& op_def,
bool stride_correction,
const ConvParams& conv_params) {
auto src_desc = op_def.src_tensors[0];
src_desc.SetTextureAddressMode(GetFastestZeroMode(device));
src_desc.SetTextureAddressMode(GetFastestZeroMode(device_info));
if (op_def.IsBatchSupported()) {
src_desc.SetStateVar("BatchedWidth", "true");
}
......@@ -350,7 +341,7 @@ std::string ConvPowerVR::GenerateConv(
std::string c = GetCommonDefines(op_def.precision);
if (use_simd_broadcast) {
if (device.cl_version() == OpenCLVersion::CL_2_0) {
if (device_info.cl_version == OpenCLVersion::CL_2_0) {
c += "#pragma OPENCL EXTENSION cl_khr_subgroups : enable\n";
}
}
......@@ -363,7 +354,7 @@ std::string ConvPowerVR::GenerateConv(
std::to_string(work_group_size.y) + ", " +
std::to_string(work_group_size.z) + ")))\n";
}
if (use_simd_broadcast && device.IsIntel()) {
if (use_simd_broadcast && device_info.IsIntel()) {
c += "__attribute__((intel_reqd_sub_group_size(" +
std::to_string(simd_size) + ")))\n";
}
......@@ -498,7 +489,7 @@ std::string ConvPowerVR::GenerateConv(
}
}
};
const bool conditional_read = device.IsMali();
const bool conditional_read = device_info.IsMali();
auto read_src = [&]() {
const std::string cl_type = ToCLDataType(conv_params.weights_data_type);
for (int y = 0; y < block_size.y; ++y) {
......@@ -1004,6 +995,7 @@ absl::Status CreateConvPowerVR(const CreationContext& creation_context,
const Convolution2DAttributes& attr,
ConvPowerVR* result, const BHWC* dst_shape) {
*result = ConvPowerVR(definition, attr, *creation_context.device, dst_shape);
result->GenerateCode(creation_context.device->GetInfo());
return result->UploadData(attr.weights, attr.bias, creation_context.context);
}
......@@ -1012,6 +1004,7 @@ absl::Status CreateConvPowerVR(const CreationContext& creation_context,
const FullyConnectedAttributes& attr,
ConvPowerVR* result, const BHWC* dst_shape) {
*result = ConvPowerVR(definition, attr, *creation_context.device, dst_shape);
result->GenerateCode(creation_context.device->GetInfo());
return result->UploadData(attr.weights, attr.bias, creation_context.context);
}
......@@ -1021,6 +1014,7 @@ absl::Status CreateConvPowerVRDynamicWeights(
ConvPowerVR* result, const BHWC* dst_shape) {
*result = ConvPowerVR(definition, attr, weights_shape,
*creation_context.device, dst_shape);
result->GenerateCode(creation_context.device->GetInfo());
return result->UploadBias(attr.bias, creation_context.context);
}
......@@ -1031,6 +1025,7 @@ absl::Status CreateConvPowerVRWino4x4To6x6(
*result = ConvPowerVR(definition);
result->conv_params_ = result->GuessBestParamsWinograd(
*creation_context.device, definition, attr, dst_shape);
result->GenerateCode(creation_context.device->GetInfo());
return result->UploadDataForWinograd4x4To6x6(
attr.weights, *creation_context.device, creation_context.context);
}
......
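For ConvPowerVR (and ConvTexture below), code generation is exposed as GenerateCode(device_info) and invoked by the factory functions rather than the constructor, because some variants finish configuring the op after construction: the Winograd factories set conv_params_, block_size_, and different_weights_for_height_ first and generate code last. Client code is unchanged apart from compilation now going through the shared base class; a hedged usage sketch, with creation_context, op_def, and attr assumed set up as in the tests:

// Usage sketch: code is generated at creation time; Compile() is the
// default GPUOperation::Compile(), no per-op override remains.
ConvPowerVR op;
RETURN_IF_ERROR(CreateConvPowerVR(creation_context, op_def, attr, &op));
RETURN_IF_ERROR(op.Compile(creation_context));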
......@@ -42,7 +42,6 @@ class ConvPowerVR : public GPUOperation {
public:
ConvPowerVR() = default;
absl::Status Tune(const TuningParameters& params) override;
absl::Status Compile(const CreationContext& creation_context) override;
absl::Status BindArguments() override;
int3 GetGridSize() const override;
......@@ -137,6 +136,8 @@ class ConvPowerVR : public GPUOperation {
const BHWC* dst_shape = nullptr);
explicit ConvPowerVR(const OperationDef& definition);
void GenerateCode(const DeviceInfo& device_info);
template <DataType T>
absl::Status UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases,
......@@ -176,12 +177,6 @@ class ConvPowerVR : public GPUOperation {
const Convolution2DAttributes& attr, ConvPowerVR* result,
const BHWC* dst_shape);
friend std::string GenerateConv(const CLDevice& device,
const OperationDef& op_def,
bool stride_correction,
const ConvParams& conv_params,
Arguments* args);
ConvParams GuessBestParams(const CLDevice& device,
const OperationDef& definition,
const Convolution2DAttributes& attr,
......@@ -206,9 +201,9 @@ class ConvPowerVR : public GPUOperation {
bool different_weights_for_height,
const BHWC* dst_shape = nullptr) const;
std::string GenerateConv(const CLDevice& device, const OperationDef& op_def,
bool stride_correction,
const ConvPowerVR::ConvParams& conv_params);
std::string GenerateConv(const DeviceInfo& device_info,
const OperationDef& op_def, bool stride_correction,
const ConvParams& conv_params);
int4 stride_padding_;
int4 kernel_dilation_;
......
......@@ -30,9 +30,9 @@ namespace tflite {
namespace gpu {
namespace cl {
namespace {
bool UseFP16SIMD(const CLDevice& device, CalculationsPrecision precision,
bool UseFP16SIMD(const DeviceInfo& device_info, CalculationsPrecision precision,
bool kernel1x1) {
if (!device.IsAdreno()) {
if (!device_info.IsAdreno()) {
return false;
}
switch (precision) {
......@@ -40,7 +40,7 @@ bool UseFP16SIMD(const CLDevice& device, CalculationsPrecision precision,
case CalculationsPrecision::F32_F16:
return false;
case CalculationsPrecision::F16:
return device.IsAdreno3xx() && kernel1x1;
return device_info.IsAdreno3xx() && kernel1x1;
}
}
} // namespace
......@@ -96,9 +96,9 @@ std::string ConvTexture::GenerateConvCode(const OperationDef& op_def,
bool adreno4xx_optimization,
bool stride_correction,
bool different_weights_for_height,
const CLDevice& device) {
const DeviceInfo& device_info) {
auto src_desc = op_def.src_tensors[0];
src_desc.SetTextureAddressMode(GetFastestZeroMode(device));
src_desc.SetTextureAddressMode(GetFastestZeroMode(device_info));
if (op_def.IsBatchSupported()) {
src_desc.SetStateVar("BatchedWidth", "true");
}
......@@ -380,33 +380,23 @@ std::string ConvTexture::GenerateConvCode(const OperationDef& op_def,
return c;
}
absl::Status ConvTexture::Compile(const CreationContext& creation_context) {
void ConvTexture::GenerateCode(const DeviceInfo& device_info) {
auto storage_type = definition_.GetPrimaryStorageType();
bool is1x1 = kernel_size_.x == 1 && kernel_size_.y == 1;
bool adreno4xx_optimization =
stride_.x == 1 && stride_.y == 1 && padding_.x == 0 && padding_.y == 0 &&
creation_context.device->IsAdreno4xx() &&
device_info.IsAdreno4xx() &&
storage_type == TensorStorageType::TEXTURE_ARRAY &&
definition_.precision == CalculationsPrecision::F16;
const bool stride_correction =
definition_.IsBatchSupported() && stride_.x != 1;
std::string code =
GenerateConvCode(definition_, block_size_, is1x1, adreno4xx_optimization,
stride_correction, different_weights_for_height_,
*creation_context.device);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
std::vector<CompilerOptions> options;
if (UseFP16SIMD(*creation_context.device, definition_.precision, is1x1)) {
options.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
code_ = GenerateConvCode(definition_, block_size_, is1x1,
adreno4xx_optimization, stride_correction,
different_weights_for_height_, device_info);
if (UseFP16SIMD(device_info, definition_.precision, is1x1)) {
compiler_options_.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
}
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", options, *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status ConvTexture::BindArguments() {
......@@ -441,6 +431,7 @@ absl::Status CreateConvTexture(const CreationContext& creation_context,
const Convolution2DAttributes& attr,
ConvTexture* result) {
*result = ConvTexture(definition, attr);
result->GenerateCode(creation_context.device->GetInfo());
return result->UploadData(attr.weights, attr.bias, creation_context.context);
}
......@@ -449,6 +440,7 @@ absl::Status CreateConvTexture(const CreationContext& creation_context,
const FullyConnectedAttributes& attr,
ConvTexture* result) {
*result = ConvTexture(definition);
result->GenerateCode(creation_context.device->GetInfo());
return result->UploadData(attr.weights, attr.bias, creation_context.context);
}
......@@ -458,6 +450,7 @@ absl::Status CreateConvTextureWino4x4To6x6(
*result = ConvTexture(definition);
result->different_weights_for_height_ = true;
result->block_size_ = {4, 1, 2};
result->GenerateCode(creation_context.device->GetInfo());
return result->UploadDataForWinograd4x4To6x6(
attr.weights, *creation_context.device, creation_context.context);
}
......
......@@ -43,7 +43,6 @@ class ConvTexture : public GPUOperation {
public:
ConvTexture() = default;
absl::Status Tune(const TuningParameters& params) override;
absl::Status Compile(const CreationContext& creation_context) override;
absl::Status BindArguments() override;
int3 GetGridSize() const override;
......@@ -89,12 +88,14 @@ class ConvTexture : public GPUOperation {
absl::Span<T> dst_0, absl::Span<T> dst_1,
absl::Span<T> dst_2, absl::Span<T> dst_3);
void GenerateCode(const DeviceInfo& device_info);
std::string GenerateConvCode(const OperationDef& op_def,
const int3& block_size, bool is1x1,
bool adreno4xx_optimization,
bool stride_correction,
bool different_weights_for_height,
const CLDevice& device);
const DeviceInfo& device_info);
int2 kernel_size_;
int2 stride_;
......
......@@ -24,6 +24,13 @@ namespace tflite {
namespace gpu {
namespace cl {
ConverterToConvWeights::ConverterToConvWeights(
const OperationDef& definition,
const ConvWeightsDescription& conv_weights_desc)
: GPUOperation(definition), conv_weights_desc_(conv_weights_desc) {
code_ = GetConverterToConvWeightsCode(definition_, conv_weights_desc_);
}
ConverterToConvWeights::ConverterToConvWeights(
ConverterToConvWeights&& operation)
: GPUOperation(std::move(operation)),
......@@ -103,17 +110,6 @@ std::string ConverterToConvWeights::GetConverterToConvWeightsCode(
return c;
}
absl::Status ConverterToConvWeights::Compile(
const CreationContext& creation_context) {
std::string code =
GetConverterToConvWeightsCode(definition_, conv_weights_desc_);
RETURN_IF_ERROR(
args_.TransformToCLCode(creation_context.device->GetInfo(), {}, &code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status ConverterToConvWeights::BindArguments() {
float4 mask = GetMaskForLastPlane(src_[0]->Channels());
RETURN_IF_ERROR(args_.SetFloat("mask_x", mask.x));
......
......@@ -30,9 +30,7 @@ namespace cl {
class ConverterToConvWeights : public GPUOperation {
public:
ConverterToConvWeights(const OperationDef& definition,
const ConvWeightsDescription& conv_weights_desc)
: GPUOperation(definition), conv_weights_desc_(conv_weights_desc) {}
absl::Status Compile(const CreationContext& creation_context) override;
const ConvWeightsDescription& conv_weights_desc);
absl::Status BindArguments() override;
int3 GetGridSize() const override;
......
......@@ -54,6 +54,9 @@ ConvolutionTransposed::ConvolutionTransposed(
}
block_size_.z = 1;
}
code_ = GenerateConvolutionTransposedCode(definition_, device,
weights_are_buffer_, block_size_);
}
ConvolutionTransposed::ConvolutionTransposed(ConvolutionTransposed&& operation)
......@@ -331,24 +334,6 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode(
return c;
}
absl::Status ConvolutionTransposed::Compile(
const CreationContext& creation_context) {
std::string code = GenerateConvolutionTransposedCode(
definition_, *creation_context.device, weights_are_buffer_, block_size_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
std::vector<CompilerOptions> options;
// options.push_back(CompilerOptions::POWERVR_FP16);
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", options, *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status ConvolutionTransposed::BindArguments() {
RETURN_IF_ERROR(args_.SetInt("stride_x", stride_.x));
RETURN_IF_ERROR(args_.SetInt("stride_y", stride_.y));
......
......@@ -39,7 +39,6 @@ class ConvolutionTransposed : public GPUOperation {
public:
ConvolutionTransposed() = default;
absl::Status Tune(const TuningParameters& params) override;
absl::Status Compile(const CreationContext& creation_context) override;
absl::Status BindArguments() override;
int3 GetGridSize() const override;
......
......@@ -38,7 +38,19 @@ ConvolutionTransposed3D::ConvolutionTransposed3D(
stride_(attr.stride.w, attr.stride.h, attr.stride.d),
padding_(attr.padding.prepended.w, attr.padding.prepended.h,
attr.padding.prepended.d),
block_size_(2, 2, 1, 2) {}
block_size_(2, 2, 1, 2) {
code_ = GenerateConvolutionTransposed3DCode(definition_, device,
weights_are_buffer_, block_size_);
if (device.IsPowerVR() && block_size_.y != 1) {
bool is_texture3d = definition_.src_tensors[0].storage_type ==
TensorStorageType::TEXTURE_3D;
bool is_texture_array = definition_.src_tensors[0].storage_type ==
TensorStorageType::TEXTURE_ARRAY;
if (is_texture3d || is_texture_array) {
compiler_options_.push_back(CompilerOptions::CL_OPT_DISABLE);
}
}
}
ConvolutionTransposed3D::ConvolutionTransposed3D(
ConvolutionTransposed3D&& operation)
......@@ -356,32 +368,6 @@ std::string ConvolutionTransposed3D::GenerateConvolutionTransposed3DCode(
return c;
}
absl::Status ConvolutionTransposed3D::Compile(
const CreationContext& creation_context) {
std::string code = GenerateConvolutionTransposed3DCode(
definition_, *creation_context.device, weights_are_buffer_, block_size_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
std::vector<CompilerOptions> options;
if (creation_context.device->IsPowerVR() && block_size_.y != 1) {
bool is_texture3d = definition_.src_tensors[0].storage_type ==
TensorStorageType::TEXTURE_3D;
bool is_texture_array = definition_.src_tensors[0].storage_type ==
TensorStorageType::TEXTURE_ARRAY;
if (is_texture3d || is_texture_array) {
options.push_back(CompilerOptions::CL_OPT_DISABLE);
}
}
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", options, *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status ConvolutionTransposed3D::BindArguments() {
RETURN_IF_ERROR(args_.SetInt("stride_x", stride_.x));
RETURN_IF_ERROR(args_.SetInt("stride_y", stride_.y));
......
......@@ -39,7 +39,6 @@ class ConvolutionTransposed3D : public GPUOperation {
public:
ConvolutionTransposed3D() = default;
absl::Status Tune(const TuningParameters& params) override;
absl::Status Compile(const CreationContext& creation_context) override;
absl::Status BindArguments() override;
int3 GetGridSize() const override;
......
......@@ -42,6 +42,12 @@ ConvolutionTransposed3x3::ConvolutionTransposed3x3(
} else {
weights_upload_type_ = WeightsUploadType::GLOBAL_MEM;
}
code_ = GenerateConvolutionTransposedCode(definition_, weights_upload_type_,
padding_, work_group_launch_order_);
if (definition_.precision == CalculationsPrecision::F16 &&
device.IsPowerVR()) {
compiler_options_.push_back(CompilerOptions::POWERVR_FP16);
}
}
ConvolutionTransposed3x3::ConvolutionTransposed3x3(
......@@ -299,28 +305,6 @@ std::string ConvolutionTransposed3x3::GenerateConvolutionTransposedCode(
return c;
}
absl::Status ConvolutionTransposed3x3::Compile(
const CreationContext& creation_context) {
std::string code = GenerateConvolutionTransposedCode(
definition_, weights_upload_type_, padding_, work_group_launch_order_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
std::vector<CompilerOptions> options;
if (definition_.precision == CalculationsPrecision::F16 &&
creation_context.device->IsPowerVR()) {
options.push_back(CompilerOptions::POWERVR_FP16);
}
RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
code, "main_function", options, *creation_context.context,
*creation_context.device, &kernel_));
return absl::OkStatus();
}
absl::Status ConvolutionTransposed3x3::BindArguments() {
RETURN_IF_ERROR(args_.SetInt("filter_offset", 4 * 9 * src_[0]->Slices()));
const int padding_x =
......
......@@ -40,7 +40,6 @@ class ConvolutionTransposed3x3 : public GPUOperation {
absl::Status Tune(const TuningParameters& params) override {
return absl::OkStatus();
}
absl::Status Compile(const CreationContext& creation_context) override;
absl::Status BindArguments() override;
int3 GetGridSize() const override;
......
......@@ -27,22 +27,21 @@ namespace gpu {
namespace cl {
ConvolutionTransposed3x3Thin::ConvolutionTransposed3x3Thin(
const OperationDef& definition, const ConvolutionTransposedAttributes& attr)
: GPUOperation(definition),
src_channels_(attr.weights.shape.i),
dst_channels_(attr.weights.shape.o) {}
const OperationDef& definition, const ConvolutionTransposedAttributes& attr,
const DeviceInfo& device_info)
: GPUOperation(definition) {
code_ = GenerateConvolutionTransposedCode(
definition_, DivideRoundUp(attr.weights.shape.i, 4),
DivideRoundUp(attr.weights.shape.o, 4), device_info);
}
ConvolutionTransposed3x3Thin::ConvolutionTransposed3x3Thin(
ConvolutionTransposed3x3Thin&& operation)
: GPUOperation(std::move(operation)),
src_channels_(operation.src_channels_),
dst_channels_(operation.dst_channels_) {}
: GPUOperation(std::move(operation)) {}
ConvolutionTransposed3x3Thin& ConvolutionTransposed3x3Thin::operator=(
ConvolutionTransposed3x3Thin&& operation) {
if (this != &operation) {
std::swap(src_channels_, operation.src_channels_);
std::swap(dst_channels_, operation.dst_channels_);
GPUOperation::operator=(std::move(operation));
}
return *this;
......@@ -50,9 +49,9 @@ ConvolutionTransposed3x3Thin& ConvolutionTransposed3x3Thin::operator=(
std::string ConvolutionTransposed3x3Thin::GenerateConvolutionTransposedCode(
const OperationDef& op_def, int src_depth, int dst_depth,
const CLDevice& device) {
const DeviceInfo& device_info) {
auto src_desc = op_def.src_tensors[0];
src_desc.SetTextureAddressMode(GetFastestZeroMode(device));
src_desc.SetTextureAddressMode(GetFastestZeroMode(device_info));
AddSrcTensor("src_tensor", src_desc);
AddDstTensor("dst_tensor", op_def.dst_tensors[0]);
......@@ -184,22 +183,6 @@ std::string ConvolutionTransposed3x3Thin::GenerateConvolutionTransposedCode(
return c;
}
absl::Status ConvolutionTransposed3x3Thin::Compile(
const CreationContext& creation_context) {
std::string code = GenerateConvolutionTransposedCode(
definition_, DivideRoundUp(src_channels_, 4),
DivideRoundUp(dst_channels_, 4), *creation_context.device);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
int3 ConvolutionTransposed3x3Thin::GetGridSize() const {
const int grid_x = src_[0]->Width() * dst_[0]->Batch();
const int grid_y = src_[0]->Height();
......@@ -225,7 +208,8 @@ absl::Status CreateConvolutionTransposed3x3Thin(
return absl::InvalidArgumentError(
"ConvolutionTransposed3x3Thin doesn't support this attributes");
}
*result = ConvolutionTransposed3x3Thin(definition, attr);
*result = ConvolutionTransposed3x3Thin(definition, attr,
creation_context.device->GetInfo());
RETURN_IF_ERROR(
result->UploadData(attr.weights, attr.bias, creation_context.context));
return absl::OkStatus();
......
......@@ -37,7 +37,6 @@ namespace cl {
class ConvolutionTransposed3x3Thin : public GPUOperation {
public:
ConvolutionTransposed3x3Thin() = default;
absl::Status Compile(const CreationContext& creation_context) override;
int3 GetGridSize() const override;
// Move only
......@@ -55,7 +54,8 @@ class ConvolutionTransposed3x3Thin : public GPUOperation {
ConvolutionTransposed3x3Thin* result);
explicit ConvolutionTransposed3x3Thin(
const OperationDef& definition,
const ConvolutionTransposedAttributes& attr);
const ConvolutionTransposedAttributes& attr,
const DeviceInfo& device_info);
template <DataType T>
absl::Status UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases,
......@@ -67,18 +67,15 @@ class ConvolutionTransposed3x3Thin : public GPUOperation {
std::string GenerateConvolutionTransposedCode(const OperationDef& op_def,
int src_depth, int dst_depth,
const CLDevice& device);
int src_channels_;
int dst_channels_;
const DeviceInfo& device_info);
};
template <DataType T>
absl::Status ConvolutionTransposed3x3Thin::UploadData(
const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases, CLContext* context) {
const int src_depth = DivideRoundUp(src_channels_, 4);
const int dst_depth = DivideRoundUp(dst_channels_, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int kernel_x = 3; // This operation support only 3x3 kernel
const int kernel_y = 3;
const int flt4_count = kernel_x * kernel_y * src_depth * dst_depth * 4;
......@@ -131,8 +128,8 @@ absl::Status ConvolutionTransposed3x3Thin::UploadData(
template <DataType S, typename T>
void ConvolutionTransposed3x3Thin::RearrangeWeightsData(
const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
const int src_depth = DivideRoundUp(src_channels_, 4);
const int dst_depth = DivideRoundUp(dst_channels_, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int kernel_x = 3;
const int kernel_y = 3;
......@@ -151,7 +148,7 @@ void ConvolutionTransposed3x3Thin::RearrangeWeightsData(
for (int i = 0; i < 4; ++i) {
const int s_ch = s * 4 + i;
const int d_ch = d * 4 + j;
if (s_ch < src_channels_ && d_ch < dst_channels_) {
if (s_ch < weights.shape.i && d_ch < weights.shape.o) {
const int f_index = weights.shape.LinearIndex(
{d_ch, kernel_index_y, kernel_index_x, s_ch});
filters[i][j] = weights.data[f_index];
......
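The weight-upload helpers above now derive slice counts from weights.shape instead of the deleted src_channels_/dst_channels_ fields. DivideRoundUp is ceiling division, packing channels into 4-wide vectors; a minimal equivalent, assuming the usual definition:

// Assumed definition of the DivideRoundUp helper used above.
int DivideRoundUp(int n, int divisor) { return (n + divisor - 1) / divisor; }
// e.g. DivideRoundUp(6, 4) == 2: six input channels occupy two FLT4 slices.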
......@@ -40,6 +40,12 @@ ConvolutionTransposed4x4::ConvolutionTransposed4x4(
} else {
weights_upload_type_ = WeightsUploadType::GLOBAL_MEM;
}
code_ = GenerateConvolutionTransposedCode(definition_, weights_upload_type_);
if (definition_.precision == CalculationsPrecision::F16 &&
device.IsPowerVR()) {
compiler_options_.push_back(CompilerOptions::POWERVR_FP16);
}
}
ConvolutionTransposed4x4::ConvolutionTransposed4x4(
......@@ -57,8 +63,7 @@ ConvolutionTransposed4x4& ConvolutionTransposed4x4::operator=(
}
std::string ConvolutionTransposed4x4::GenerateConvolutionTransposedCode(
const OperationDef& op_def,
ConvolutionTransposed4x4::WeightsUploadType weights_upload_type) {
const OperationDef& op_def, WeightsUploadType weights_upload_type) {
auto src_desc = op_def.src_tensors[0];
src_desc.SetTextureAddressMode(TextureAddressMode::ZERO);
if (op_def.IsBatchSupported()) {
......@@ -290,28 +295,6 @@ std::string ConvolutionTransposed4x4::GenerateConvolutionTransposedCode(
return c;
}
absl::Status ConvolutionTransposed4x4::Compile(
const CreationContext& creation_context) {
std::string code =
GenerateConvolutionTransposedCode(definition_, weights_upload_type_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
std::vector<CompilerOptions> options;
if (definition_.precision == CalculationsPrecision::F16 &&
creation_context.device->IsPowerVR()) {
options.push_back(CompilerOptions::POWERVR_FP16);
}
RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
code, "main_function", options, *creation_context.context,
*creation_context.device, &kernel_));
return absl::OkStatus();
}
absl::Status ConvolutionTransposed4x4::BindArguments() {
return args_.SetInt("filter_offset", 4 * 16 * src_[0]->Slices());
}
......
......@@ -40,7 +40,6 @@ class ConvolutionTransposed4x4 : public GPUOperation {
absl::Status Tune(const TuningParameters& params) override {
return absl::OkStatus();
}
absl::Status Compile(const CreationContext& creation_context) override;
absl::Status BindArguments() override;
int3 GetGridSize() const override;
......@@ -73,8 +72,7 @@ class ConvolutionTransposed4x4 : public GPUOperation {
absl::Span<T> dst);
std::string GenerateConvolutionTransposedCode(
const OperationDef& op_def,
ConvolutionTransposed4x4::WeightsUploadType weights_upload_type);
const OperationDef& op_def, WeightsUploadType weights_upload_type);
WeightsUploadType weights_upload_type_;
};
......
......@@ -28,25 +28,25 @@ namespace gpu {
namespace cl {
ConvolutionTransposedThin::ConvolutionTransposedThin(
const OperationDef& definition, const ConvolutionTransposedAttributes& attr)
: GPUOperation(definition),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h),
src_channels_(attr.weights.shape.i),
dst_channels_(attr.weights.shape.o) {}
const OperationDef& definition, const ConvolutionTransposedAttributes& attr,
const DeviceInfo& device_info)
: GPUOperation(definition) {
code_ = GenerateConvolutionTransposedCode(
definition_, DivideRoundUp(attr.weights.shape.i, 4), attr.weights.shape.o,
int2(attr.weights.shape.w, attr.weights.shape.h));
if (definition_.precision == CalculationsPrecision::F16 &&
device_info.IsAdreno3xx()) {
compiler_options_.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
}
}
ConvolutionTransposedThin::ConvolutionTransposedThin(
ConvolutionTransposedThin&& operation)
: GPUOperation(std::move(operation)),
kernel_size_(operation.kernel_size_),
src_channels_(operation.src_channels_),
dst_channels_(operation.dst_channels_) {}
: GPUOperation(std::move(operation)) {}
ConvolutionTransposedThin& ConvolutionTransposedThin::operator=(
ConvolutionTransposedThin&& operation) {
if (this != &operation) {
std::swap(kernel_size_, operation.kernel_size_);
std::swap(src_channels_, operation.src_channels_);
std::swap(dst_channels_, operation.dst_channels_);
GPUOperation::operator=(std::move(operation));
}
return *this;
......@@ -151,29 +151,6 @@ std::string ConvolutionTransposedThin::GenerateConvolutionTransposedCode(
return c;
}
absl::Status ConvolutionTransposedThin::Compile(
const CreationContext& creation_context) {
std::string code = GenerateConvolutionTransposedCode(
definition_, DivideRoundUp(src_channels_, 4), dst_channels_,
kernel_size_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
std::vector<CompilerOptions> options;
if (definition_.precision == CalculationsPrecision::F16 &&
creation_context.device->IsAdreno3xx()) {
options.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
}
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
int3 ConvolutionTransposedThin::GetGridSize() const {
const int grid_x = src_[0]->Width() * dst_[0]->Batch();
const int grid_y = src_[0]->Height();
......@@ -197,7 +174,8 @@ absl::Status CreateConvolutionTransposedThin(
return absl::InvalidArgumentError(
"ConvolutionTransposedThin doesn't support this attributes");
}
*result = ConvolutionTransposedThin(definition, attr);
*result = ConvolutionTransposedThin(definition, attr,
creation_context.device->GetInfo());
RETURN_IF_ERROR(
result->UploadData(attr.weights, attr.bias, creation_context.context));
return absl::OkStatus();
......
......@@ -37,7 +37,6 @@ namespace cl {
class ConvolutionTransposedThin : public GPUOperation {
public:
ConvolutionTransposedThin() = default;
absl::Status Compile(const CreationContext& creation_context) override;
int3 GetGridSize() const override;
// Move only
......@@ -53,7 +52,8 @@ class ConvolutionTransposedThin : public GPUOperation {
const ConvolutionTransposedAttributes& attr,
ConvolutionTransposedThin* result);
ConvolutionTransposedThin(const OperationDef& definition,
const ConvolutionTransposedAttributes& attr);
const ConvolutionTransposedAttributes& attr,
const DeviceInfo& device_info);
template <DataType T>
absl::Status UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases,
......@@ -65,19 +65,15 @@ class ConvolutionTransposedThin : public GPUOperation {
std::string GenerateConvolutionTransposedCode(const OperationDef& op_def,
int src_depth, int dst_channels,
const int2& kernel_size);
int2 kernel_size_;
int src_channels_;
int dst_channels_;
};
template <DataType T>
absl::Status ConvolutionTransposedThin::UploadData(
const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases, CLContext* context) {
const int src_depth = DivideRoundUp(src_channels_, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int flt4_count =
kernel_size_.x * kernel_size_.y * src_depth * dst_channels_;
weights.shape.w * weights.shape.h * src_depth * weights.shape.o;
const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
......@@ -121,20 +117,20 @@ absl::Status ConvolutionTransposedThin::UploadData(
template <DataType S, typename T>
void ConvolutionTransposedThin::RearrangeWeightsData(
const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
const int src_depth = DivideRoundUp(src_channels_, 4);
const int kernel_x = kernel_size_.x;
const int kernel_y = kernel_size_.y;
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;
int counter = 0;
for (int s = 0; s < src_depth; ++s) {
for (int y = 0; y < kernel_y; ++y) {
for (int x = 0; x < kernel_x; ++x) {
std::vector<T> filters(dst_channels_);
for (int j = 0; j < dst_channels_; ++j) {
std::vector<T> filters(weights.shape.o);
for (int j = 0; j < weights.shape.o; ++j) {
for (int i = 0; i < 4; ++i) {
const int s_ch = s * 4 + i;
const int d_ch = j;
if (s_ch < src_channels_ && d_ch < dst_channels_) {
if (s_ch < weights.shape.i && d_ch < weights.shape.o) {
const int f_index = weights.shape.LinearIndex({d_ch, y, x, s_ch});
filters[j][i] = weights.data[f_index];
} else {
......@@ -142,7 +138,7 @@ void ConvolutionTransposedThin::RearrangeWeightsData(
}
}
}
for (int j = 0; j < dst_channels_; ++j) {
for (int j = 0; j < weights.shape.o; ++j) {
dst[counter++] = filters[j];
}
}
......
......@@ -70,7 +70,8 @@ std::string GetSrcValue(int channel_multiplier, const std::string coords) {
DepthwiseConvolution::DepthwiseConvolution(
const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr, bool weights_are_buffer)
const DepthwiseConvolution2DAttributes& attr, bool weights_are_buffer,
const DeviceInfo& device_info)
: GPUOperation(definition),
weights_are_buffer_(weights_are_buffer),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h, 0, 0),
......@@ -79,11 +80,17 @@ DepthwiseConvolution::DepthwiseConvolution(
dilation_(attr.dilations.w, attr.dilations.h, 0, 0),
channel_multiplier_(attr.weights.shape.o) {
work_group_size_ = int3(8, 8, 1);
const bool stride_correction =
definition_.IsBatchSupported() && stride_.x != 1;
code_ = GenerateDepthwiseConvolutionCode(definition_, stride_correction,
channel_multiplier_,
weights_are_buffer_, device_info);
}
DepthwiseConvolution::DepthwiseConvolution(
const OperationDef& definition,
const DepthwiseConvolution3DAttributes& attr, bool weights_are_buffer)
const DepthwiseConvolution3DAttributes& attr, bool weights_are_buffer,
const DeviceInfo& device_info)
: GPUOperation(definition),
weights_are_buffer_(weights_are_buffer),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h,
......@@ -94,6 +101,11 @@ DepthwiseConvolution::DepthwiseConvolution(
dilation_(attr.dilations.w, attr.dilations.h, attr.dilations.d, 0),
channel_multiplier_(attr.weights.shape.o) {
work_group_size_ = int3(8, 8, 1);
const bool stride_correction =
definition_.IsBatchSupported() && stride_.x != 1;
code_ = GenerateDepthwiseConvolutionCode(definition_, stride_correction,
channel_multiplier_,
weights_are_buffer_, device_info);
}
DepthwiseConvolution::DepthwiseConvolution(DepthwiseConvolution&& operation)
......@@ -121,9 +133,9 @@ DepthwiseConvolution& DepthwiseConvolution::operator=(
std::string DepthwiseConvolution::GenerateDepthwiseConvolutionCode(
const OperationDef& op_def, bool stride_correction, int channel_multiplier,
bool weights_are_buffer, const CLDevice& device) {
bool weights_are_buffer, const DeviceInfo& device_info) {
auto src_desc = op_def.src_tensors[0];
src_desc.SetTextureAddressMode(GetFastestZeroMode(device));
src_desc.SetTextureAddressMode(GetFastestZeroMode(device_info));
if (op_def.IsBatchSupported()) {
src_desc.SetStateVar("BatchedWidth", "true");
}
......@@ -270,24 +282,6 @@ std::string DepthwiseConvolution::GenerateDepthwiseConvolutionCode(
return c;
}
absl::Status DepthwiseConvolution::Compile(
const CreationContext& creation_context) {
const bool stride_correction =
definition_.IsBatchSupported() && stride_.x != 1;
std::string code = GenerateDepthwiseConvolutionCode(
definition_, stride_correction, channel_multiplier_, weights_are_buffer_,
*creation_context.device);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status DepthwiseConvolution::BindArguments() {
RETURN_IF_ERROR(args_.SetInt("kernel_size_x", kernel_size_.x));
RETURN_IF_ERROR(args_.SetInt("stride_x", stride_.x));
......@@ -321,7 +315,8 @@ absl::Status CreateDepthwiseConvolution(
const DepthwiseConvolution2DAttributes& attr,
DepthwiseConvolution* result) {
bool weights_are_buffer = creation_context.device->IsMali();
*result = DepthwiseConvolution(definition, attr, weights_are_buffer);
*result = DepthwiseConvolution(definition, attr, weights_are_buffer,
creation_context.device->GetInfo());
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
......@@ -344,7 +339,8 @@ absl::Status CreateDepthwiseConvolution(
const DepthwiseConvolution3DAttributes& attr,
DepthwiseConvolution* result) {
bool weights_are_buffer = creation_context.device->IsMali();
*result = DepthwiseConvolution(definition, attr, weights_are_buffer);
*result = DepthwiseConvolution(definition, attr, weights_are_buffer,
creation_context.device->GetInfo());
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
......
......@@ -38,7 +38,6 @@ namespace cl {
class DepthwiseConvolution : public GPUOperation {
public:
DepthwiseConvolution() = default;
absl::Status Compile(const CreationContext& creation_context) override;
absl::Status BindArguments() override;
int3 GetGridSize() const override;
......@@ -59,10 +58,10 @@ class DepthwiseConvolution : public GPUOperation {
DepthwiseConvolution* result);
DepthwiseConvolution(const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr,
bool weights_are_buffer);
bool weights_are_buffer, const DeviceInfo& device_info);
DepthwiseConvolution(const OperationDef& definition,
const DepthwiseConvolution3DAttributes& attr,
bool weights_are_buffer);
bool weights_are_buffer, const DeviceInfo& device_info);
template <DataType T>
absl::Status UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
......@@ -84,7 +83,7 @@ class DepthwiseConvolution : public GPUOperation {
bool stride_correction,
int channel_multiplier,
bool weights_are_buffer,
const CLDevice& device);
const DeviceInfo& device_info);
bool weights_are_buffer_;
......
......@@ -29,11 +29,19 @@ namespace cl {
DepthwiseConv3x3::DepthwiseConv3x3(const OperationDef& definition,
bool weights_are_buffer,
bool local_mem_uploads)
bool local_mem_uploads,
const DeviceInfo& device_info)
: GPUOperation(definition),
weights_are_buffer_(weights_are_buffer),
local_mem_uploads_(local_mem_uploads) {
work_group_size_ = int3(8, 4, 1);
code_ = GenerateDepthwiseConvCode(definition_, device_info,
weights_are_buffer_, local_mem_uploads_);
if (definition_.precision == CalculationsPrecision::F16 &&
device_info.IsPowerVR()) {
compiler_options_.push_back(CompilerOptions::POWERVR_FP16);
}
}
DepthwiseConv3x3::DepthwiseConv3x3(DepthwiseConv3x3&& operation)
......@@ -51,10 +59,10 @@ DepthwiseConv3x3& DepthwiseConv3x3::operator=(DepthwiseConv3x3&& operation) {
}
std::string DepthwiseConv3x3::GenerateDepthwiseConvCode(
const OperationDef& op_def, const CLDevice& device, bool weights_are_buffer,
bool local_mem_uploads) {
const OperationDef& op_def, const DeviceInfo& device_info,
bool weights_are_buffer, bool local_mem_uploads) {
auto src_desc = op_def.src_tensors[0];
src_desc.SetTextureAddressMode(GetFastestZeroMode(device));
src_desc.SetTextureAddressMode(GetFastestZeroMode(device_info));
AddSrcTensor("src_tensor", src_desc);
AddDstTensor("dst_tensor", op_def.dst_tensors[0]);
......@@ -281,28 +289,6 @@ std::string DepthwiseConv3x3::GenerateDepthwiseConvCode(
return c;
}
absl::Status DepthwiseConv3x3::Compile(
const CreationContext& creation_context) {
std::string code =
GenerateDepthwiseConvCode(definition_, *creation_context.device,
weights_are_buffer_, local_mem_uploads_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
std::vector<CompilerOptions> options;
if (definition_.precision == CalculationsPrecision::F16 &&
creation_context.device->IsPowerVR()) {
options.push_back(CompilerOptions::POWERVR_FP16);
}
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", options, *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status DepthwiseConv3x3::BindArguments() {
RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0]));
return args_.SetObjectRef("dst_tensor", dst_[0]);
......@@ -343,7 +329,8 @@ absl::Status CreateDepthwiseConv3x3(
creation_context.device->IsPowerVR() || creation_context.device->IsMali();
bool local_mem_uploads =
weights_are_buffer && creation_context.device->IsPowerVR();
*result = DepthwiseConv3x3(definition, weights_are_buffer, local_mem_uploads);
*result = DepthwiseConv3x3(definition, weights_are_buffer, local_mem_uploads,
creation_context.device->GetInfo());
return result->UploadWeightsAndBiases(attr.weights, attr.bias,
creation_context.context);
}
......
......@@ -39,7 +39,6 @@ class DepthwiseConv3x3 : public GPUOperation {
public:
DepthwiseConv3x3() = default;
absl::Status Tune(const TuningParameters& params) override;
absl::Status Compile(const CreationContext& creation_context) override;
absl::Status BindArguments() override;
int3 GetGridSize() const override;
......@@ -51,7 +50,8 @@ class DepthwiseConv3x3 : public GPUOperation {
private:
explicit DepthwiseConv3x3(const OperationDef& definition,
bool weights_are_buffer, bool local_mem_uploads);
bool weights_are_buffer, bool local_mem_uploads,
const DeviceInfo& device_info);
template <DataType T>
absl::Status UploadWeightsAndBiases(
const tflite::gpu::Tensor<OHWI, T>& weights,
......@@ -67,7 +67,7 @@ class DepthwiseConv3x3 : public GPUOperation {
const tflite::gpu::Tensor<Linear, S>& biases, absl::Span<T> dst);
std::string GenerateDepthwiseConvCode(const OperationDef& op_def,
const CLDevice& device,
const DeviceInfo& device_info,
bool weights_are_buffer,
bool local_mem_uploads);
......
......@@ -24,8 +24,22 @@ namespace tflite {
namespace gpu {
namespace cl {
FullyConnected::FullyConnected(const OperationDef& definition)
: GPUOperation(definition) {}
FullyConnected::FullyConnected(const OperationDef& definition,
const DeviceInfo& device_info)
: GPUOperation(definition) {
if (device_info.IsAdreno()) {
if (device_info.IsAdreno3xx()) {
work_group_size_ = int3(8, 4, 1);
} else if (device_info.IsAdreno4xx()) {
work_group_size_ = int3(16, 4, 1);
} else {
work_group_size_ = int3(32, 4, 1);
}
} else {
work_group_size_ = int3(16, 4, 1);
}
code_ = GetFullyConnectedKernelCode(definition_, work_group_size_);
}
FullyConnected::FullyConnected(FullyConnected&& kernel)
: GPUOperation(std::move(kernel)) {}
......@@ -92,36 +106,6 @@ std::string FullyConnected::GetFullyConnectedKernelCode(
return c;
}
absl::Status FullyConnected::Compile(const CreationContext& creation_context) {
int wg_width = 32;
int wg_height = 4;
int work_items;
do {
work_group_size_ = {wg_width, wg_height, 1};
wg_width /= 2;
std::string code =
GetFullyConnectedKernelCode(definition_, work_group_size_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
auto status = creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
if (!status.ok()) {
if (work_group_size_.x == 1) {
return status;
} else {
continue;
}
}
work_items = work_group_size_.x * work_group_size_.y * work_group_size_.z;
} while (work_items > kernel_.GetMaxWorkGroupSize());
return absl::OkStatus();
}
int3 FullyConnected::GetGridSize() const {
return int3(dst_[0]->Slices(), 1, 1);
}
......@@ -130,7 +114,7 @@ absl::Status CreateFullyConnected(const CreationContext& creation_context,
const OperationDef& definition,
const FullyConnectedAttributes& attr,
FullyConnected* result) {
*result = FullyConnected(definition);
*result = FullyConnected(definition, creation_context.device->GetInfo());
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
......
......@@ -93,7 +93,6 @@ class FullyConnected : public GPUOperation {
return absl::OkStatus();
}
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
FullyConnected(FullyConnected&& kernel);
......@@ -102,7 +101,7 @@ class FullyConnected : public GPUOperation {
FullyConnected& operator=(const FullyConnected&) = delete;
private:
explicit FullyConnected(const OperationDef& definition);
FullyConnected(const OperationDef& definition, const DeviceInfo& device_info);
friend absl::Status CreateFullyConnected(
const CreationContext& creation_context, const OperationDef& definition,
const FullyConnectedAttributes& attr, FullyConnected* result);
......
......@@ -190,9 +190,9 @@ absl::Status GPUOperation::Compile(const CreationContext& creation_context) {
creation_context.device->GetInfo(),
{{dst_tensors_names_[0], element_wise_code}}, &code_));
RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
code_, "main_function", *creation_context.context,
code_, "main_function", compiler_options_, *creation_context.context,
*creation_context.device, &kernel_));
return PostCompileCheck();
return PostCompileCheck(creation_context.device->GetInfo());
}
ElementwiseOperation::ElementwiseOperation(ElementwiseOperation&& operation)
......
......@@ -103,7 +103,9 @@ class GPUOperation {
virtual absl::Status Compile(const CreationContext& creation_context);
virtual absl::Status PostCompileCheck() { return absl::OkStatus(); }
virtual absl::Status PostCompileCheck(const DeviceInfo& device_info) {
return absl::OkStatus();
}
const OperationDef& GetDefinition() const { return definition_; }
......
......@@ -25,7 +25,10 @@ namespace tflite {
namespace gpu {
namespace cl {
LSTM::LSTM(const OperationDef& definition) : GPUOperation(definition) {}
LSTM::LSTM(const OperationDef& definition, const DeviceInfo& device_info)
: GPUOperation(definition) {
code_ = GetLSTMCode(definition_, device_info);
}
LSTM::LSTM(LSTM&& kernel) : GPUOperation(std::move(kernel)) {}
......@@ -37,7 +40,7 @@ LSTM& LSTM::operator=(LSTM&& kernel) {
}
std::string LSTM::GetLSTMCode(const OperationDef& op_def,
const CLDevice& device) {
const DeviceInfo& device_info) {
AddSrcTensor("intermediate", op_def.src_tensors[0]);
AddSrcTensor("prev_state", op_def.src_tensors[1]);
AddDstTensor("new_state", op_def.dst_tensors[0]);
......@@ -56,7 +59,8 @@ std::string LSTM::GetLSTMCode(const OperationDef& op_def,
c += " FLT4 r1 = args.intermediate.Read(0, 0, Z + state_stride, B);\n";
c += " FLT4 r2 = args.intermediate.Read(0, 0, Z + state_stride * 2, B);\n";
c += " FLT4 r3 = args.intermediate.Read(0, 0, Z + state_stride * 3, B);\n";
if (op_def.precision != CalculationsPrecision::F32 && device.IsAdreno()) {
if (op_def.precision != CalculationsPrecision::F32 &&
device_info.IsAdreno()) {
c += " FLT4 input_gate;\n";
c += " FLT4 new_input;\n";
c += " FLT4 forget_gate;\n";
......@@ -101,15 +105,6 @@ std::string LSTM::GetLSTMCode(const OperationDef& op_def,
return c;
}
absl::Status LSTM::Compile(const CreationContext& creation_context) {
std::string code = GetLSTMCode(definition_, *creation_context.device);
RETURN_IF_ERROR(
args_.TransformToCLCode(creation_context.device->GetInfo(), {}, &code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
int3 LSTM::GetGridSize() const {
const int grid_x = dst_[0]->Batch();
const int grid_y = dst_[0]->Slices();
......@@ -117,7 +112,9 @@ int3 LSTM::GetGridSize() const {
return int3(grid_x, grid_y, grid_z);
}
LSTM CreateLSTM(const OperationDef& definition) { return LSTM(definition); }
LSTM CreateLSTM(const OperationDef& definition, const DeviceInfo& device_info) {
return LSTM(definition, device_info);
}
} // namespace cl
} // namespace gpu
......
......@@ -27,9 +27,8 @@ namespace cl {
class LSTM : public GPUOperation {
public:
explicit LSTM(const OperationDef& definition);
LSTM(const OperationDef& definition, const DeviceInfo& device_info);
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
LSTM(LSTM&& kernel);
......@@ -38,10 +37,11 @@ class LSTM : public GPUOperation {
LSTM& operator=(const LSTM&) = delete;
private:
std::string GetLSTMCode(const OperationDef& op_def, const CLDevice& device);
std::string GetLSTMCode(const OperationDef& op_def,
const DeviceInfo& device_info);
};
LSTM CreateLSTM(const OperationDef& definition);
LSTM CreateLSTM(const OperationDef& definition, const DeviceInfo& device_info);
} // namespace cl
} // namespace gpu
......
......@@ -67,7 +67,7 @@ TEST_F(OpenCLOperationTest, LSTM) {
op_def.dst_tensors.push_back({data_type, storage, Layout::BHWC});
TensorFloat32 new_state;
TensorFloat32 new_activ;
LSTM operation = CreateLSTM(op_def);
LSTM operation = CreateLSTM(op_def, env_.GetDevicePtr()->GetInfo());
ASSERT_OK(ExecuteGPUOperation(
{src_tensor, prev_state}, creation_context_, &operation,
{BHWC(1, 1, 1, 4), BHWC(1, 1, 1, 4)}, {&new_state, &new_activ}));
......
......@@ -25,19 +25,25 @@ namespace gpu {
namespace cl {
MaxUnpooling::MaxUnpooling(const OperationDef& definition,
const MaxUnpooling2DAttributes& attr)
const MaxUnpooling2DAttributes& attr,
const DeviceInfo& device_info)
: GPUOperation(definition),
stride_(attr.strides.w, attr.strides.h, 0, 0),
padding_(attr.padding.appended.w, attr.padding.appended.h, 0, 0),
kernel_size_(attr.kernel.w, attr.kernel.h, 0, 0) {}
kernel_size_(attr.kernel.w, attr.kernel.h, 0, 0) {
code_ = GetMaxUnpoolingKernelCode(definition_, device_info);
}
MaxUnpooling::MaxUnpooling(const OperationDef& definition,
const MaxUnpooling3DAttributes& attr)
const MaxUnpooling3DAttributes& attr,
const DeviceInfo& device_info)
: GPUOperation(definition),
stride_(attr.strides.w, attr.strides.h, attr.strides.d, 0),
padding_(attr.padding.appended.w, attr.padding.appended.h,
attr.padding.appended.d, 0),
kernel_size_(attr.kernel.w, attr.kernel.h, attr.kernel.d, 0) {}
kernel_size_(attr.kernel.w, attr.kernel.h, attr.kernel.d, 0) {
code_ = GetMaxUnpoolingKernelCode(definition_, device_info);
}
MaxUnpooling::MaxUnpooling(MaxUnpooling&& kernel)
: GPUOperation(std::move(kernel)),
......@@ -55,16 +61,16 @@ MaxUnpooling& MaxUnpooling::operator=(MaxUnpooling&& kernel) {
return *this;
}
std::string MaxUnpooling::GetMaxUnpoolingKernelCode(const OperationDef& op_def,
const CLDevice& device) {
std::string MaxUnpooling::GetMaxUnpoolingKernelCode(
const OperationDef& op_def, const DeviceInfo& device_info) {
auto src_desc = op_def.src_tensors[0];
src_desc.SetTextureAddressMode(GetFastestZeroMode(device));
src_desc.SetTextureAddressMode(GetFastestZeroMode(device_info));
if (op_def.IsBatchSupported()) {
src_desc.SetStateVar("BatchedWidth", "true");
}
AddSrcTensor("src_tensor", src_desc);
auto src_ind_desc = op_def.src_tensors[1];
src_ind_desc.SetTextureAddressMode(GetFastestZeroMode(device));
src_ind_desc.SetTextureAddressMode(GetFastestZeroMode(device_info));
if (op_def.IsBatchSupported()) {
src_ind_desc.SetStateVar("BatchedWidth", "true");
}
......@@ -169,20 +175,6 @@ std::string MaxUnpooling::GetMaxUnpoolingKernelCode(const OperationDef& op_def,
return c;
}
absl::Status MaxUnpooling::Compile(const CreationContext& creation_context) {
std::string code =
GetMaxUnpoolingKernelCode(definition_, *creation_context.device);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status MaxUnpooling::BindArguments() {
if (definition_.dst_tensors[0].HasAxis(Axis::WIDTH)) {
RETURN_IF_ERROR(args_.SetInt("stride_x", stride_.x));
......@@ -210,13 +202,15 @@ int3 MaxUnpooling::GetGridSize() const {
}
MaxUnpooling CreateMaxUnpooling(const OperationDef& definition,
const MaxUnpooling2DAttributes& attr) {
return MaxUnpooling(definition, attr);
const MaxUnpooling2DAttributes& attr,
const DeviceInfo& device_info) {
return MaxUnpooling(definition, attr, device_info);
}
MaxUnpooling CreateMaxUnpooling(const OperationDef& definition,
const MaxUnpooling3DAttributes& attr) {
return MaxUnpooling(definition, attr);
const MaxUnpooling3DAttributes& attr,
const DeviceInfo& device_info) {
return MaxUnpooling(definition, attr, device_info);
}
} // namespace cl
......
......@@ -28,13 +28,14 @@ namespace cl {
class MaxUnpooling : public GPUOperation {
public:
MaxUnpooling(const OperationDef& definition,
const MaxUnpooling2DAttributes& attr);
const MaxUnpooling2DAttributes& attr,
const DeviceInfo& device_info);
MaxUnpooling(const OperationDef& definition,
const MaxUnpooling3DAttributes& attr);
const MaxUnpooling3DAttributes& attr,
const DeviceInfo& device_info);
absl::Status BindArguments() override;
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
MaxUnpooling(MaxUnpooling&& kernel);
......@@ -44,7 +45,7 @@ class MaxUnpooling : public GPUOperation {
private:
std::string GetMaxUnpoolingKernelCode(const OperationDef& op_def,
const CLDevice& device);
const DeviceInfo& device_info);
int4 stride_;
int4 padding_;
......@@ -52,10 +53,12 @@ class MaxUnpooling : public GPUOperation {
};
MaxUnpooling CreateMaxUnpooling(const OperationDef& definition,
const MaxUnpooling2DAttributes& attr);
const MaxUnpooling2DAttributes& attr,
const DeviceInfo& device_info);
MaxUnpooling CreateMaxUnpooling(const OperationDef& definition,
const MaxUnpooling3DAttributes& attr);
const MaxUnpooling3DAttributes& attr,
const DeviceInfo& device_info);
} // namespace cl
} // namespace gpu
......
......@@ -55,7 +55,8 @@ TEST_F(OpenCLOperationTest, MaxUnpooling) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
MaxUnpooling operation = CreateMaxUnpooling(op_def, attr);
MaxUnpooling operation =
CreateMaxUnpooling(op_def, attr, env_.GetDevicePtr()->GetInfo());
ASSERT_OK(ExecuteGPUOperation({src_tensor, src_ind_tensor},
creation_context_, &operation,
BHWC(1, 4, 4, 1), &dst_tensor));
......
......@@ -26,6 +26,18 @@ namespace tflite {
namespace gpu {
namespace cl {
Mean::Mean(const OperationDef& definition, const DeviceInfo& device_info)
: GPUOperation(definition) {
  // For the work group size:
  // must be: (x * y) % 4 == 0;
  // must be: z == 1;
work_group_size_ = int3(16, 16, 1);
if (device_info.IsAdreno3xx()) {
work_group_size_ = int3(16, 8, 1);
}
code_ = GetMeanKernelCode(definition_, work_group_size_);
}
Mean::Mean(Mean&& operation) : GPUOperation(std::move(operation)) {}
Mean& Mean::operator=(Mean&& operation) {
......@@ -96,25 +108,6 @@ std::string Mean::GetMeanKernelCode(const OperationDef& op_def,
return c;
}
absl::Status Mean::Compile(const CreationContext& creation_context) {
// must be: (x * y) % 4 = 0;
// must be: z = 1;
work_group_size_ = int3(16, 16, 1);
if (creation_context.device->IsAdreno3xx()) {
work_group_size_ = int3(16, 8, 1);
}
std::string code = GetMeanKernelCode(definition_, work_group_size_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status Mean::BindArguments() {
const double total_size = src_[0]->Width() * src_[0]->Height();
const double size_0 = work_group_size_.x * work_group_size_.y;
......@@ -131,7 +124,9 @@ int3 Mean::GetGridSize() const {
return int3(grid_x, grid_y, grid_z);
}
Mean CreateMean(const OperationDef& definition) { return Mean(definition); }
Mean CreateMean(const OperationDef& definition, const DeviceInfo& device_info) {
return Mean(definition, device_info);
}
} // namespace cl
} // namespace gpu
......
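A small helper that encodes the two constraints documented in the Mean constructor above; a sketch, not part of the patch. Both sizes the constructor picks satisfy it: 16 * 16 = 256 and 16 * 8 = 128, each divisible by 4 with z == 1.

bool IsValidMeanWorkGroupSize(const int3& wg) {
  return wg.z == 1 && (wg.x * wg.y) % 4 == 0;
}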
......@@ -29,14 +29,13 @@ namespace cl {
class Mean : public GPUOperation {
public:
Mean() = default;
explicit Mean(const OperationDef& definition) : GPUOperation(definition) {}
Mean(const OperationDef& definition, const DeviceInfo& device_info);
absl::Status Tune(const TuningParameters& params) override {
return absl::OkStatus();
}
absl::Status BindArguments() override;
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
Mean(Mean&& operation);
......@@ -49,7 +48,7 @@ class Mean : public GPUOperation {
const int3& work_group_size);
};
Mean CreateMean(const OperationDef& definition);
Mean CreateMean(const OperationDef& definition, const DeviceInfo& device_info);
} // namespace cl
} // namespace gpu
......
......@@ -26,7 +26,9 @@ namespace gpu {
namespace cl {
MeanStdDevNormalization::MeanStdDevNormalization(const OperationDef& definition)
: GPUOperation(definition) {}
: GPUOperation(definition) {
code_ = GetNormalizationCode(definition_);
}
std::string MeanStdDevNormalization::GetNormalizationCode(
const OperationDef& op_def) {
......@@ -70,16 +72,6 @@ std::string MeanStdDevNormalization::GetNormalizationCode(
return c;
}
absl::Status MeanStdDevNormalization::Compile(
const CreationContext& creation_context) {
std::string code = GetNormalizationCode(definition_);
RETURN_IF_ERROR(
args_.TransformToCLCode(creation_context.device->GetInfo(), {}, &code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
int3 MeanStdDevNormalization::GetGridSize() const {
const int grid_x = dst_[0]->Batch();
const int grid_y = 1;
......
......@@ -31,7 +31,6 @@ class MeanStdDevNormalization : public GPUOperation {
explicit MeanStdDevNormalization(const OperationDef& definition);
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
MeanStdDevNormalization(MeanStdDevNormalization&& kernel) = default;
......
......@@ -47,7 +47,7 @@ TEST_F(OpenCLOperationTest, Mean) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
Mean operation = CreateMean(op_def);
Mean operation = CreateMean(op_def, env_.GetDevicePtr()->GetInfo());
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 1, 1, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), {2.5f}));
......
......@@ -26,14 +26,14 @@ namespace gpu {
namespace cl {
Padding::Padding(const OperationDef& definition, const PadAttributes& attr)
: GPUOperation(definition), attributes_(attr) {}
: GPUOperation(definition) {
code_ = GetPaddingCode(definition_, attr);
}
Padding::Padding(Padding&& kernel)
: GPUOperation(std::move(kernel)), attributes_(kernel.attributes_) {}
Padding::Padding(Padding&& kernel) : GPUOperation(std::move(kernel)) {}
Padding& Padding::operator=(Padding&& kernel) {
if (this != &kernel) {
std::swap(attributes_, kernel.attributes_);
GPUOperation::operator=(std::move(kernel));
}
return *this;
......@@ -43,10 +43,10 @@ std::string Padding::GetPaddingCode(const OperationDef& op_def,
const PadAttributes& attr) {
AddSrcTensor("src_tensor", op_def.src_tensors[0]);
AddDstTensor("dst_tensor", op_def.dst_tensors[0]);
args_.AddInt("prepended_x");
args_.AddInt("prepended_y");
args_.AddInt("prepended_z");
args_.AddInt("prepended_w");
args_.AddInt("prepended_x", attr.prepended.w);
args_.AddInt("prepended_y", attr.prepended.h);
args_.AddInt("prepended_z", attr.prepended.c);
args_.AddInt("prepended_w", attr.prepended.b);
const std::string dst_batch =
op_def.dst_tensors[0].HasAxis(Axis::BATCH) ? "B" : "0";
......@@ -149,27 +149,6 @@ std::string Padding::GetPaddingCode(const OperationDef& op_def,
return c;
}
absl::Status Padding::Compile(const CreationContext& creation_context) {
std::string code = GetPaddingCode(definition_, attributes_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status Padding::BindArguments() {
RETURN_IF_ERROR(args_.SetInt("prepended_x", attributes_.prepended.w));
RETURN_IF_ERROR(args_.SetInt("prepended_y", attributes_.prepended.h));
RETURN_IF_ERROR(args_.SetInt("prepended_z", attributes_.prepended.c));
RETURN_IF_ERROR(args_.SetInt("prepended_w", attributes_.prepended.b));
return absl::OkStatus();
}
int3 Padding::GetGridSize() const {
const int grid_x = dst_[0]->Width() * dst_[0]->Batch();
const int grid_y = dst_[0]->Height();
......
......@@ -28,10 +28,7 @@ namespace cl {
class Padding : public GPUOperation {
public:
Padding(const OperationDef& definition, const PadAttributes& attr);
absl::Status BindArguments() override;
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
Padding(Padding&& kernel);
......@@ -42,8 +39,6 @@ class Padding : public GPUOperation {
private:
std::string GetPaddingCode(const OperationDef& op_def,
const PadAttributes& attr);
PadAttributes attributes_;
};
Padding CreatePadding(const OperationDef& definition,
......
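The Padding change above is the general recipe for dropping BindArguments: pass the attribute value straight to args_.AddInt at construction instead of declaring the argument and setting it per dispatch. Sketched side by side for a hypothetical one-argument op (BlockOp and BlockAttributes are illustrative names; the two constructor definitions are alternatives, not meant to coexist):

// Two-phase style (pre-patch): declare in the constructor, bind per run.
BlockOp::BlockOp(const OperationDef& definition, const BlockAttributes& attr)
    : GPUOperation(definition), attr_(attr) {
  args_.AddInt("block_size");
}
absl::Status BlockOp::BindArguments() {
  return args_.SetInt("block_size", attr_.block_size);
}

// One-phase style (post-patch): bake the value in; the attr_ member and the
// BindArguments override both disappear when the value is fixed at creation.
BlockOp::BlockOp(const OperationDef& definition, const BlockAttributes& attr)
    : GPUOperation(definition) {
  args_.AddInt("block_size", attr.block_size);
}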
......@@ -25,23 +25,27 @@ namespace gpu {
namespace cl {
Pooling::Pooling(const OperationDef& definition,
const Pooling2DAttributes& attr)
const Pooling2DAttributes& attr, const DeviceInfo& device_info)
: GPUOperation(definition),
stride_(attr.strides.w, attr.strides.h, 0, 0),
padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, 0, 0),
kernel_size_(attr.kernel.w, attr.kernel.h, 0, 0),
type_(attr.type),
output_indices_(attr.output_indices) {}
output_indices_(attr.output_indices) {
GenerateCode(device_info);
}
Pooling::Pooling(const OperationDef& definition,
const Pooling3DAttributes& attr)
const Pooling3DAttributes& attr, const DeviceInfo& device_info)
: GPUOperation(definition),
stride_(attr.strides.w, attr.strides.h, attr.strides.d, 0),
padding_(-attr.padding.prepended.w, -attr.padding.prepended.h,
-attr.padding.prepended.d, 0),
kernel_size_(attr.kernel.w, attr.kernel.h, attr.kernel.d, 0),
type_(attr.type),
output_indices_(attr.output_indices) {}
output_indices_(attr.output_indices) {
GenerateCode(device_info);
}
Pooling::Pooling(Pooling&& kernel)
: GPUOperation(std::move(kernel)),
......@@ -63,11 +67,11 @@ Pooling& Pooling::operator=(Pooling&& kernel) {
return *this;
}
std::string Pooling::GetAveragePoolingKernelCode(const OperationDef& op_def,
bool stride_correction,
const CLDevice& device) {
std::string Pooling::GetAveragePoolingKernelCode(
const OperationDef& op_def, bool stride_correction,
const DeviceInfo& device_info) {
auto src_desc = op_def.src_tensors[0];
src_desc.SetTextureAddressMode(GetFastestZeroMode(device));
src_desc.SetTextureAddressMode(GetFastestZeroMode(device_info));
if (op_def.IsBatchSupported()) {
src_desc.SetStateVar("BatchedWidth", "true");
}
......@@ -344,33 +348,16 @@ std::string Pooling::GetMaxPoolingKernelCode(const OperationDef& op_def,
return c;
}
absl::Status Pooling::Compile(const CreationContext& creation_context) {
std::string code;
void Pooling::GenerateCode(const DeviceInfo& device_info) {
const bool stride_correction =
definition_.IsBatchSupported() && stride_.x != 1;
switch (type_) {
case PoolingType::AVERAGE:
code = GetAveragePoolingKernelCode(definition_, stride_correction,
*creation_context.device);
break;
case PoolingType::MAX:
code = GetMaxPoolingKernelCode(definition_, stride_correction,
output_indices_);
break;
default:
return absl::InvalidArgumentError(
"You should create another kernel with this params");
break;
}
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
if (type_ == PoolingType::AVERAGE) {
code_ = GetAveragePoolingKernelCode(definition_, stride_correction,
device_info);
} else if (type_ == PoolingType::MAX) {
code_ = GetMaxPoolingKernelCode(definition_, stride_correction,
output_indices_);
}
}
absl::Status Pooling::BindArguments() {
......@@ -400,13 +387,15 @@ int3 Pooling::GetGridSize() const {
}
Pooling CreatePooling(const OperationDef& definition,
const Pooling2DAttributes& attr) {
return Pooling(definition, attr);
const Pooling2DAttributes& attr,
const DeviceInfo& device_info) {
return Pooling(definition, attr, device_info);
}
Pooling CreatePooling(const OperationDef& definition,
const Pooling3DAttributes& attr) {
return Pooling(definition, attr);
const Pooling3DAttributes& attr,
const DeviceInfo& device_info) {
return Pooling(definition, attr, device_info);
}
} // namespace cl
......
......@@ -29,12 +29,13 @@ namespace cl {
class Pooling : public GPUOperation {
public:
Pooling(const OperationDef& definition, const Pooling2DAttributes& attr);
Pooling(const OperationDef& definition, const Pooling3DAttributes& attr);
Pooling(const OperationDef& definition, const Pooling2DAttributes& attr,
const DeviceInfo& device_info);
Pooling(const OperationDef& definition, const Pooling3DAttributes& attr,
const DeviceInfo& device_info);
absl::Status BindArguments() override;
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
Pooling(Pooling&& kernel);
......@@ -45,11 +46,13 @@ class Pooling : public GPUOperation {
private:
std::string GetAveragePoolingKernelCode(const OperationDef& op_def,
bool stride_correction,
const CLDevice& device);
const DeviceInfo& device_info);
std::string GetMaxPoolingKernelCode(const OperationDef& op_def,
bool stride_correction,
bool output_indices);
void GenerateCode(const DeviceInfo& device_info);
int4 stride_;
int4 padding_;
int4 kernel_size_;
......@@ -59,10 +62,12 @@ class Pooling : public GPUOperation {
};
Pooling CreatePooling(const OperationDef& definition,
const Pooling2DAttributes& attr);
const Pooling2DAttributes& attr,
const DeviceInfo& device_info);
Pooling CreatePooling(const OperationDef& definition,
const Pooling3DAttributes& attr);
const Pooling3DAttributes& attr,
const DeviceInfo& device_info);
} // namespace cl
} // namespace gpu
......
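One consequence of moving codegen into the Pooling constructor: GenerateCode is void, so the InvalidArgumentError the old Compile returned for unsupported pooling types has no channel back to the caller. If that check still matters, it has to run before construction; a call-site guard sketch (CreateCheckedPooling is hypothetical):

absl::Status CreateCheckedPooling(const OperationDef& op_def,
                                  const Pooling2DAttributes& attr,
                                  const DeviceInfo& device_info,
                                  Pooling* result) {
  if (attr.type != PoolingType::AVERAGE && attr.type != PoolingType::MAX) {
    return absl::InvalidArgumentError(
        "Pooling supports only AVERAGE and MAX");
  }
  *result = CreatePooling(op_def, attr, device_info);
  return absl::OkStatus();
}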
......@@ -52,7 +52,8 @@ TEST_F(OpenCLOperationTest, AveragePooling) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
Pooling operation = CreatePooling(op_def, attr);
Pooling operation =
CreatePooling(op_def, attr, env_.GetDevicePtr()->GetInfo());
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 1, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), {3.0f, 4.0f}));
......@@ -81,7 +82,8 @@ TEST_F(OpenCLOperationTest, AveragePoolingNonEmptyPadding) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
Pooling operation = CreatePooling(op_def, attr);
Pooling operation =
CreatePooling(op_def, attr, env_.GetDevicePtr()->GetInfo());
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
......@@ -111,7 +113,8 @@ TEST_F(OpenCLOperationTest, MaxPooling) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
Pooling operation = CreatePooling(op_def, attr);
Pooling operation =
CreatePooling(op_def, attr, env_.GetDevicePtr()->GetInfo());
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 1, 1, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), {8.0f, 7.0f}));
......@@ -143,7 +146,8 @@ TEST_F(OpenCLOperationTest, MaxPoolingIndices) {
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
TensorFloat32 dst_tensor_ind;
Pooling operation = CreatePooling(op_def, attr);
Pooling operation =
CreatePooling(op_def, attr, env_.GetDevicePtr()->GetInfo());
ASSERT_OK(ExecuteGPUOperation({src_tensor}, creation_context_, &operation,
{BHWC(1, 1, 1, 2), BHWC(1, 1, 1, 2)},
{&dst_tensor, &dst_tensor_ind}));
......
......@@ -24,6 +24,10 @@ namespace tflite {
namespace gpu {
namespace cl {
Reshape::Reshape(const OperationDef& definition) : GPUOperation(definition) {
code_ = GetReshapeCode(definition_);
}
Reshape::Reshape(Reshape&& operation) : GPUOperation(std::move(operation)) {}
Reshape& Reshape::operator=(Reshape&& operation) {
......@@ -92,19 +96,6 @@ std::string Reshape::GetReshapeCode(const OperationDef& op_def) {
return c;
}
absl::Status Reshape::Compile(const CreationContext& creation_context) {
std::string code = GetReshapeCode(definition_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
int3 Reshape::GetGridSize() const {
const int grid_x = dst_[0]->Width() * dst_[0]->Batch();
const int grid_y = dst_[0]->Height();
......
......@@ -27,10 +27,9 @@ namespace cl {
class Reshape : public GPUOperation {
public:
explicit Reshape(const OperationDef& definition) : GPUOperation(definition) {}
explicit Reshape(const OperationDef& definition);
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
Reshape(Reshape&& operation);
......
......@@ -24,6 +24,11 @@ namespace tflite {
namespace gpu {
namespace cl {
Reshapex4::Reshapex4(const OperationDef& definition)
: GPUOperation(definition) {
code_ = GetReshapeCode(definition_);
}
Reshapex4::Reshapex4(Reshapex4&& operation)
: GPUOperation(std::move(operation)) {}
......@@ -77,19 +82,6 @@ std::string Reshapex4::GetReshapeCode(const OperationDef& op_def) {
return c;
}
absl::Status Reshapex4::Compile(const CreationContext& creation_context) {
std::string code = GetReshapeCode(definition_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
int3 Reshapex4::GetGridSize() const {
const int grid_x = dst_[0]->Width() * dst_[0]->Batch();
const int grid_y = dst_[0]->Height();
......
......@@ -28,11 +28,9 @@ namespace cl {
class Reshapex4 : public GPUOperation {
public:
explicit Reshapex4(const OperationDef& definition)
: GPUOperation(definition) {}
explicit Reshapex4(const OperationDef& definition);
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
Reshapex4(Reshapex4&& operation);
......
......@@ -24,6 +24,11 @@ namespace tflite {
namespace gpu {
namespace cl {
Resize::Resize(const OperationDef& definition, const Resize2DAttributes& attr)
: GPUOperation(definition), attr_(attr) {
code_ = GetResizeCode(definition_, attr_);
}
Resize::Resize(Resize&& operation)
: GPUOperation(std::move(operation)), attr_(operation.attr_) {}
......@@ -127,19 +132,6 @@ std::string Resize::GetResizeCode(const OperationDef& op_def,
return c;
}
absl::Status Resize::Compile(const CreationContext& creation_context) {
std::string code = GetResizeCode(definition_, attr_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status Resize::BindArguments() {
RETURN_IF_ERROR(args_.SetInt("border_x", src_[0]->Width() - 1));
RETURN_IF_ERROR(args_.SetInt("border_y", src_[0]->Height() - 1));
......@@ -164,6 +156,12 @@ Resize CreateResize(const OperationDef& definition,
return Resize(definition, attr);
}
Resize3D::Resize3D(const OperationDef& definition,
const Resize3DAttributes& attr)
: GPUOperation(definition), attr_(attr) {
code_ = GetResize3DCode(definition_, attr_);
}
Resize3D::Resize3D(Resize3D&& operation)
: GPUOperation(std::move(operation)), attr_(operation.attr_) {}
......@@ -288,19 +286,6 @@ std::string Resize3D::GetResize3DCode(const OperationDef& op_def,
return c;
}
absl::Status Resize3D::Compile(const CreationContext& creation_context) {
std::string code = GetResize3DCode(definition_, attr_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status Resize3D::BindArguments() {
RETURN_IF_ERROR(args_.SetInt("border_x", src_[0]->Width() - 1));
RETURN_IF_ERROR(args_.SetInt("border_y", src_[0]->Height() - 1));
......
......@@ -29,7 +29,6 @@ class Resize : public GPUOperation {
public:
absl::Status BindArguments() override;
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
Resize(Resize&& operation);
......@@ -41,8 +40,7 @@ class Resize : public GPUOperation {
const Resize2DAttributes& attr);
private:
Resize(const OperationDef& definition, const Resize2DAttributes& attr)
: GPUOperation(definition), attr_(attr) {}
Resize(const OperationDef& definition, const Resize2DAttributes& attr);
std::string GetResizeCode(const OperationDef& op_def,
const Resize2DAttributes& attr);
......@@ -57,7 +55,6 @@ class Resize3D : public GPUOperation {
public:
absl::Status BindArguments() override;
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
Resize3D(Resize3D&& operation);
......@@ -69,8 +66,7 @@ class Resize3D : public GPUOperation {
const Resize3DAttributes& attr);
private:
Resize3D(const OperationDef& definition, const Resize3DAttributes& attr)
: GPUOperation(definition), attr_(attr) {}
Resize3D(const OperationDef& definition, const Resize3DAttributes& attr);
std::string GetResize3DCode(const OperationDef& op_def,
const Resize3DAttributes& attr);
......
......@@ -25,6 +25,10 @@ namespace tflite {
namespace gpu {
namespace cl {
Softmax::Softmax(const OperationDef& definition) : GPUOperation(definition) {
code_ = GetSoftmaxKernelCode(definition_);
}
Softmax::Softmax(Softmax&& kernel) : GPUOperation(std::move(kernel)) {}
Softmax& Softmax::operator=(Softmax&& kernel) {
......@@ -71,19 +75,6 @@ std::string Softmax::GetSoftmaxKernelCode(const OperationDef& op_def) {
return c;
}
absl::Status Softmax::Compile(const CreationContext& creation_context) {
std::string code = GetSoftmaxKernelCode(definition_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
int3 Softmax::GetGridSize() const {
const int grid_x = dst_[0]->Width() * dst_[0]->Batch();
const int grid_y = dst_[0]->Height();
......
......@@ -29,10 +29,9 @@ namespace cl {
class Softmax : public GPUOperation {
public:
Softmax() = default;
explicit Softmax(const OperationDef& definition) : GPUOperation(definition) {}
explicit Softmax(const OperationDef& definition);
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
Softmax(Softmax&& kernel);
......
......@@ -24,6 +24,12 @@ namespace tflite {
namespace gpu {
namespace cl {
Softmax1x1::Softmax1x1(const OperationDef& definition)
: GPUOperation(definition) {
work_group_size_ = int3(32, 1, 1);
code_ = GetSoftmaxKernelCode(definition_);
}
Softmax1x1::Softmax1x1(Softmax1x1&& kernel) : GPUOperation(std::move(kernel)) {}
Softmax1x1& Softmax1x1::operator=(Softmax1x1&& kernel) {
......@@ -103,20 +109,6 @@ std::string Softmax1x1::GetSoftmaxKernelCode(const OperationDef& op_def) {
return c;
}
absl::Status Softmax1x1::Compile(const CreationContext& creation_context) {
std::string code = GetSoftmaxKernelCode(definition_);
std::string element_wise_code;
work_group_size_ = int3(32, 1, 1);
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status Softmax1x1::BindArguments() {
float4 mask = GetMaskForLastPlane(src_[0]->Channels());
RETURN_IF_ERROR(args_.SetFloat("mask_x", mask.x));
......
......@@ -28,14 +28,12 @@ namespace cl {
class Softmax1x1 : public GPUOperation {
public:
Softmax1x1() = default;
explicit Softmax1x1(const OperationDef& definition)
: GPUOperation(definition) {}
explicit Softmax1x1(const OperationDef& definition);
absl::Status Tune(const TuningParameters& params) override {
return absl::OkStatus();
}
absl::Status BindArguments() override;
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
Softmax1x1(Softmax1x1&& kernel);
......
......@@ -26,6 +26,12 @@ namespace tflite {
namespace gpu {
namespace cl {
SpaceToDepth::SpaceToDepth(const OperationDef& op_def,
const SpaceToDepthAttributes& attr)
: GPUOperation(op_def), attr_(attr) {
code_ = GetSpaceToDepthCode(definition_);
}
SpaceToDepth::SpaceToDepth(SpaceToDepth&& operation)
: GPUOperation(std::move(operation)), attr_(operation.attr_) {}
......@@ -82,19 +88,6 @@ std::string SpaceToDepth::GetSpaceToDepthCode(const OperationDef& op_def) {
return c;
}
absl::Status SpaceToDepth::Compile(const CreationContext& creation_context) {
std::string code = GetSpaceToDepthCode(definition_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status SpaceToDepth::BindArguments() {
RETURN_IF_ERROR(args_.SetInt("block_size", attr_.block_size));
return absl::OkStatus();
......
......@@ -28,11 +28,9 @@ namespace cl {
class SpaceToDepth : public GPUOperation {
public:
SpaceToDepth(const OperationDef& op_def, const SpaceToDepthAttributes& attr)
: GPUOperation(op_def), attr_(attr) {}
SpaceToDepth(const OperationDef& op_def, const SpaceToDepthAttributes& attr);
absl::Status BindArguments() override;
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
SpaceToDepth(SpaceToDepth&& operation);
SpaceToDepth& operator=(SpaceToDepth&& operation);
......
......@@ -30,24 +30,23 @@ namespace cl {
DepthwiseConvPlus1x1Conv::DepthwiseConvPlus1x1Conv(
const OperationDef& definition,
const DepthwiseConvolution2DAttributes& dw_attr,
const Convolution2DAttributes& conv_attr)
: GPUOperation(definition),
dw_attr_(dw_attr),
result_depth_(DivideRoundUp(conv_attr.weights.shape.o, 4)) {
const Convolution2DAttributes& conv_attr, const DeviceInfo& device_info)
: GPUOperation(definition), dw_attr_(dw_attr) {
work_group_size_ = int3(8, 8, 1);
code_ =
GenerateCode(definition_, dw_attr_,
DivideRoundUp(conv_attr.weights.shape.o, 4), device_info);
}
DepthwiseConvPlus1x1Conv::DepthwiseConvPlus1x1Conv(
DepthwiseConvPlus1x1Conv&& operation)
: GPUOperation(std::move(operation)),
dw_attr_(std::move(operation.dw_attr_)),
result_depth_(operation.result_depth_) {}
dw_attr_(std::move(operation.dw_attr_)) {}
DepthwiseConvPlus1x1Conv& DepthwiseConvPlus1x1Conv::operator=(
DepthwiseConvPlus1x1Conv&& operation) {
if (this != &operation) {
dw_attr_ = std::move(operation.dw_attr_);
std::swap(result_depth_, operation.result_depth_);
GPUOperation::operator=(std::move(operation));
}
return *this;
......@@ -147,9 +146,9 @@ absl::Status DepthwiseConvPlus1x1Conv::UploadWeights(
std::string DepthwiseConvPlus1x1Conv::GenerateCode(
const OperationDef& op_def, const DepthwiseConvolution2DAttributes& dw_attr,
int result_depth, const CLDevice& device) {
int result_depth, const DeviceInfo& device_info) {
auto src_desc = op_def.src_tensors[0];
src_desc.SetTextureAddressMode(GetFastestZeroMode(device));
src_desc.SetTextureAddressMode(GetFastestZeroMode(device_info));
AddSrcTensor("src_tensor", src_desc);
AddDstTensor("dst_tensor", op_def.dst_tensors[0]);
......@@ -243,21 +242,6 @@ std::string DepthwiseConvPlus1x1Conv::GenerateCode(
return c;
}
absl::Status DepthwiseConvPlus1x1Conv::Compile(
const CreationContext& creation_context) {
std::string code = GenerateCode(definition_, dw_attr_, result_depth_,
*creation_context.device);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
int3 DepthwiseConvPlus1x1Conv::GetGridSize() const {
const int grid_x = dst_[0]->Width() * dst_[0]->Batch();
const int grid_y = dst_[0]->Height();
......@@ -289,7 +273,8 @@ absl::Status CreateDepthwiseConvPlus1x1Conv(
const DepthwiseConvolution2DAttributes& dw_attr,
const Convolution2DAttributes& conv_attr,
DepthwiseConvPlus1x1Conv* result) {
*result = DepthwiseConvPlus1x1Conv(definition, dw_attr, conv_attr);
*result = DepthwiseConvPlus1x1Conv(definition, dw_attr, conv_attr,
creation_context.device->GetInfo());
RETURN_IF_ERROR(
result->UploadWeights(dw_attr, conv_attr, creation_context.context));
return absl::OkStatus();
......
......@@ -37,7 +37,6 @@ class DepthwiseConvPlus1x1Conv : public GPUOperation {
public:
DepthwiseConvPlus1x1Conv() = default;
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
DepthwiseConvPlus1x1Conv(DepthwiseConvPlus1x1Conv&& operation);
......@@ -53,7 +52,8 @@ class DepthwiseConvPlus1x1Conv : public GPUOperation {
DepthwiseConvPlus1x1Conv* result);
DepthwiseConvPlus1x1Conv(const OperationDef& definition,
const DepthwiseConvolution2DAttributes& dw_attr,
const Convolution2DAttributes& conv_attr);
const Convolution2DAttributes& conv_attr,
const DeviceInfo& device_info);
absl::Status UploadWeights(const DepthwiseConvolution2DAttributes& dw_attr,
const Convolution2DAttributes& conv_attr,
......@@ -61,10 +61,9 @@ class DepthwiseConvPlus1x1Conv : public GPUOperation {
std::string GenerateCode(const OperationDef& op_def,
const DepthwiseConvolution2DAttributes& dw_attr,
int result_depth, const CLDevice& device);
int result_depth, const DeviceInfo& device_info);
DepthwiseConvolution2DAttributes dw_attr_;
int result_depth_;
};
bool IsDepthwiseConvPlus1x1ConvSupported(
......
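With the result_depth_ member gone, the depth is computed inline where the constructor needs it: DivideRoundUp(conv_attr.weights.shape.o, 4) is simply the number of 4-channel output slices.

// E.g. 10 output channels -> (10 + 3) / 4 = 3 slices.
const int result_depth = DivideRoundUp(conv_attr.weights.shape.o, 4);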
......@@ -79,6 +79,7 @@ StridedSlice::StridedSlice(const OperationDef& definition,
const SliceAttributes& attr)
: GPUOperation(definition), attributes_(attr) {
work_group_size_ = int3(8, 4, 1);
code_ = GetStridedSliceCode(definition_, Is4Aligned(attributes_));
}
StridedSlice::StridedSlice(StridedSlice&& operation)
......@@ -153,19 +154,6 @@ std::string StridedSlice::GetStridedSliceCode(const OperationDef& op_def,
return c;
}
absl::Status StridedSlice::Compile(const CreationContext& creation_context) {
std::string code = GetStridedSliceCode(definition_, Is4Aligned(attributes_));
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
absl::Status StridedSlice::BindArguments() {
int4 offset = GetOffset(attributes_, src_[0]->Width(), src_[0]->Height(),
src_[0]->Channels(), src_[0]->Batch());
......
......@@ -29,7 +29,6 @@ class StridedSlice : public GPUOperation {
StridedSlice(const OperationDef& definition, const SliceAttributes& attr);
absl::Status BindArguments() override;
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
StridedSlice(StridedSlice&& operation);
......
......@@ -25,6 +25,12 @@ namespace tflite {
namespace gpu {
namespace cl {
Transpose::Transpose(const OperationDef& definition,
const TransposeAttributes& attr)
: GPUOperation(definition), attr_(attr) {
code_ = GetTransposeCode(definition_, attr_);
}
Transpose::Transpose(Transpose&& operation)
: GPUOperation(std::move(operation)), attr_(operation.attr_) {}
......@@ -107,19 +113,6 @@ std::string Transpose::GetTransposeCode(const OperationDef& op_def,
return c;
}
absl::Status Transpose::Compile(const CreationContext& creation_context) {
std::string code = GetTransposeCode(definition_, attr_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context,
*creation_context.device, &kernel_);
}
int3 Transpose::GetGridSize() const {
const int grid_x = dst_[0]->Width() * dst_[0]->Batch();
const int grid_y = dst_[0]->Height();
......
......@@ -26,10 +26,8 @@ namespace cl {
class Transpose : public GPUOperation {
public:
Transpose(const OperationDef& definition, const TransposeAttributes& attr)
: GPUOperation(definition), attr_(attr) {}
Transpose(const OperationDef& definition, const TransposeAttributes& attr);
int3 GetGridSize() const override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
Transpose(Transpose&& operation);
......
......@@ -100,6 +100,11 @@ TextureAddressMode GetFastestZeroMode(const CLDevice& device) {
: TextureAddressMode::ZERO;
}
TextureAddressMode GetFastestZeroMode(const DeviceInfo& device_info) {
return device_info.IsAdreno3xx() ? TextureAddressMode::DONT_CARE
: TextureAddressMode::ZERO;
}
float4 GetMaskForLastPlane(int channels) {
float4 mask = float4(0.0f);
const int reminder = channels % 4 == 0 ? 4 : channels % 4;
......
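Worked through, the mask helper above: for channels = 6 the reminder is 6 % 4 = 2, so the first two lanes become 1.0 and the mask is (1, 1, 0, 0); for any multiple of 4 the reminder is forced to 4 and the mask is all ones. A typical consumer, as in Softmax1x1::BindArguments earlier in this diff:

// Zero out the padded channel lanes of the last 4-channel plane.
float4 mask = GetMaskForLastPlane(src_[0]->Channels());
RETURN_IF_ERROR(args_.SetFloat("mask_x", mask.x));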
......@@ -95,6 +95,7 @@ void RearrangeWeightsToOHWIOGroupI4O4(
// textures on Adreno3xx devices. Using CLK_ADDRESS_NONE is significantly faster
// than CLK_ADDRESS_CLAMP on Adreno 3xx.
TextureAddressMode GetFastestZeroMode(const CLDevice& device);
TextureAddressMode GetFastestZeroMode(const DeviceInfo& device_info);
// Returns a float4 mask for the last plane (batch of 4 channels);
// assumes that the plane size is 4.
......
......@@ -32,6 +32,21 @@ namespace tflite {
namespace gpu {
namespace cl {
Winograd4x4To36::Winograd4x4To36(const OperationDef& definition,
const Padding2D& padding,
const DeviceInfo& device_info)
: GPUOperation(definition), padding_(padding) {
work_group_size_ = int3(32, 1, 1);
code_ = GetWinograd4x4To36Code(definition_);
if (device_info.IsAdreno()) {
compiler_options_.push_back(CompilerOptions::ADRENO_MORE_WAVES);
}
if (definition_.precision == CalculationsPrecision::F16 &&
device_info.IsPowerVR()) {
compiler_options_.push_back(CompilerOptions::POWERVR_FP16);
}
}
Winograd4x4To36::Winograd4x4To36(Winograd4x4To36&& operation)
: GPUOperation(std::move(operation)), padding_(operation.padding_) {}
......@@ -219,30 +234,6 @@ std::string Winograd4x4To36::GetWinograd4x4To36Code(
return c;
}
absl::Status Winograd4x4To36::Compile(const CreationContext& creation_context) {
std::vector<CompilerOptions> options;
if (creation_context.device->IsAdreno()) {
options.push_back(CompilerOptions::ADRENO_MORE_WAVES);
}
if (definition_.precision == CalculationsPrecision::F16 &&
creation_context.device->IsPowerVR()) {
options.push_back(CompilerOptions::POWERVR_FP16);
}
RETURN_IF_ERROR(UploadBt(creation_context.context));
std::string code = GetWinograd4x4To36Code(definition_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
code, "main_function", options, *creation_context.context,
*creation_context.device, &kernel_));
work_group_size_ = SelectBestWorkGroup();
return absl::OkStatus();
}
absl::Status Winograd4x4To36::UploadBt(CLContext* context) {
tflite::gpu::Tensor<Linear, DataType::FLOAT32> bt_aligned;
bt_aligned.shape = Linear(6 * 8);
......@@ -311,10 +302,22 @@ absl::Status CreateWinograd4x4To36(const CreationContext& creation_context,
const OperationDef& definition,
const Padding2D& padding,
Winograd4x4To36* result) {
*result = Winograd4x4To36(definition, padding);
*result =
Winograd4x4To36(definition, padding, creation_context.device->GetInfo());
return result->UploadBt(creation_context.context);
}
Winograd36To4x4::Winograd36To4x4(const OperationDef& definition,
const DeviceInfo& device_info)
: GPUOperation(definition) {
work_group_size_ = int3(32, 1, 1);
if (definition_.precision == CalculationsPrecision::F16 &&
device_info.IsPowerVR()) {
compiler_options_.push_back(CompilerOptions::POWERVR_FP16);
}
code_ = GetWinograd36To4x4Code(definition_);
}
Winograd36To4x4::Winograd36To4x4(Winograd36To4x4&& operation)
: GPUOperation(std::move(operation)) {}
......@@ -434,26 +437,6 @@ std::string Winograd36To4x4::GetWinograd36To4x4Code(
return c;
}
absl::Status Winograd36To4x4::Compile(const CreationContext& creation_context) {
std::vector<CompilerOptions> options;
if (definition_.precision == CalculationsPrecision::F16 &&
creation_context.device->IsPowerVR()) {
options.push_back(CompilerOptions::POWERVR_FP16);
}
std::string code = GetWinograd36To4x4Code(definition_);
std::string element_wise_code;
RETURN_IF_ERROR(
MergeOperations(linked_operations_, &args_, &element_wise_code));
RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
{{"dst_tensor", element_wise_code}},
&code));
RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
code, "main_function", options, *creation_context.context,
*creation_context.device, &kernel_));
work_group_size_ = SelectBestWorkGroup();
return absl::OkStatus();
}
absl::Status Winograd36To4x4::UploadAt(CLContext* context) {
tflite::gpu::Tensor<Linear, DataType::FLOAT32> at_aligned;
at_aligned.shape = Linear(4 * 8);
......@@ -519,7 +502,7 @@ absl::Status CreateWinograd36To4x4(
const CreationContext& creation_context, const OperationDef& definition,
const tflite::gpu::Tensor<Linear, DataType::FLOAT32>& biases,
Winograd36To4x4* result) {
*result = Winograd36To4x4(definition);
*result = Winograd36To4x4(definition, creation_context.device->GetInfo());
TensorLinearDescriptor desc;
desc.storage_type = LinearStorageType::TEXTURE_2D;
desc.element_type = definition.GetDataType();
......
......@@ -34,14 +34,11 @@ namespace cl {
class Winograd4x4To36 : public GPUOperation {
public:
Winograd4x4To36() = default;
Winograd4x4To36(const OperationDef& definition, const Padding2D& padding)
: GPUOperation(definition), padding_(padding) {
work_group_size_ = int3(128, 1, 1);
}
Winograd4x4To36(const OperationDef& definition, const Padding2D& padding,
const DeviceInfo& device_info);
absl::Status BindArguments() override;
int3 GetGridSize() const override;
absl::Status Tune(const TuningParameters& params) override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
Winograd4x4To36(Winograd4x4To36&& operation);
......@@ -72,14 +69,11 @@ absl::Status CreateWinograd4x4To36(const CreationContext& creation_context,
class Winograd36To4x4 : public GPUOperation {
public:
Winograd36To4x4() = default;
explicit Winograd36To4x4(const OperationDef& definition)
: GPUOperation(definition) {
work_group_size_ = int3(128, 1, 1);
}
Winograd36To4x4(const OperationDef& definition,
const DeviceInfo& device_info);
absl::Status BindArguments() override;
int3 GetGridSize() const override;
absl::Status Tune(const TuningParameters& params) override;
absl::Status Compile(const CreationContext& creation_context) override;
// Move only
Winograd36To4x4(Winograd36To4x4&& operation);
......
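Constructor-time codegen does not remove the fallible step for the Winograd ops: weight upload still goes through the factory. A call-site sketch, assuming a populated CreationContext:

Winograd4x4To36 wino;
RETURN_IF_ERROR(
    CreateWinograd4x4To36(creation_context, op_def, padding, &wino));
// The constructor already set work_group_size_, code_ and the Adreno /
// PowerVR compiler options from DeviceInfo; only the Bt upload can fail here.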
......@@ -270,18 +270,20 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context,
inputs[0]->tensor.shape.b, gpu_op);
}
case OperationType::LSTM: {
SelectLSTM(op_def, gpu_op);
SelectLSTM(op_def, creation_context.device->GetInfo(), gpu_op);
return absl::OkStatus();
}
case OperationType::MAX_UNPOOLING_2D: {
auto attr =
absl::any_cast<MaxUnpooling2DAttributes>(node.operation.attributes);
SelectMaxUnpooling(attr, op_def, gpu_op);
SelectMaxUnpooling(attr, op_def, creation_context.device->GetInfo(),
gpu_op);
return absl::OkStatus();
}
case OperationType::MEAN: {
auto attr = absl::any_cast<MeanAttributes>(node.operation.attributes);
return SelectMean(attr, op_def, gpu_op);
return SelectMean(attr, op_def, creation_context.device->GetInfo(),
gpu_op);
}
case OperationType::MUL: {
if (inputs.size() == 2) {
......@@ -333,7 +335,7 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context,
case OperationType::POOLING_2D: {
auto attr =
absl::any_cast<Pooling2DAttributes>(node.operation.attributes);
SelectPooling(attr, op_def, gpu_op);
SelectPooling(attr, op_def, creation_context.device->GetInfo(), gpu_op);
return absl::OkStatus();
}
case OperationType::PRELU: {
......
......@@ -45,9 +45,9 @@ namespace tflite {
namespace gpu {
namespace cl {
void SelectLSTM(const OperationDef& op_def,
void SelectLSTM(const OperationDef& op_def, const DeviceInfo& device_info,
std::unique_ptr<GPUOperation>* ptr) {
LSTM operation = CreateLSTM(op_def);
LSTM operation = CreateLSTM(op_def, device_info);
*ptr = absl::make_unique<LSTM>(std::move(operation));
}
......@@ -69,15 +69,17 @@ absl::Status SelectPReLU(const PReLUAttributes& attr,
}
void SelectPooling(const Pooling2DAttributes& attr, const OperationDef& op_def,
const DeviceInfo& device_info,
std::unique_ptr<GPUOperation>* ptr) {
Pooling pooling = CreatePooling(op_def, attr);
Pooling pooling = CreatePooling(op_def, attr, device_info);
*ptr = absl::make_unique<Pooling>(std::move(pooling));
}
void SelectMaxUnpooling(const MaxUnpooling2DAttributes& attr,
const OperationDef& op_def,
const DeviceInfo& device_info,
std::unique_ptr<GPUOperation>* ptr) {
MaxUnpooling operation = CreateMaxUnpooling(op_def, attr);
MaxUnpooling operation = CreateMaxUnpooling(op_def, attr, device_info);
*ptr = absl::make_unique<MaxUnpooling>(std::move(operation));
}
......@@ -151,11 +153,12 @@ void SelectStridedSlice(const SliceAttributes& attr, const OperationDef& op_def,
}
absl::Status SelectMean(const MeanAttributes& attr, const OperationDef& op_def,
const DeviceInfo& device_info,
std::unique_ptr<GPUOperation>* ptr) {
if (attr.dims != std::set<Axis>({Axis::HEIGHT, Axis::WIDTH})) {
return absl::UnimplementedError("Mean operation supports only HW plane");
}
Mean operation = CreateMean(op_def);
Mean operation = CreateMean(op_def, device_info);
*ptr = absl::make_unique<Mean>(std::move(operation));
return absl::OkStatus();
}
......
......@@ -28,7 +28,8 @@ namespace tflite {
namespace gpu {
namespace cl {
void SelectLSTM(const OperationDef& op_def, std::unique_ptr<GPUOperation>* ptr);
void SelectLSTM(const OperationDef& op_def, const DeviceInfo& device_info,
std::unique_ptr<GPUOperation>* ptr);
void SelectReLU(const CreationContext& creation_context,
const ReLUAttributes& attr, const OperationDef& op_def,
......@@ -40,10 +41,12 @@ absl::Status SelectPReLU(const PReLUAttributes& attr,
std::unique_ptr<GPUOperation>* ptr);
void SelectPooling(const Pooling2DAttributes& attr, const OperationDef& op_def,
const DeviceInfo& device_info,
std::unique_ptr<GPUOperation>* ptr);
void SelectMaxUnpooling(const MaxUnpooling2DAttributes& attr,
const OperationDef& op_def,
const DeviceInfo& device_info,
std::unique_ptr<GPUOperation>* ptr);
void SelectAdd(const OperationDef& op_def, const std::vector<int>& channels,
......@@ -70,6 +73,7 @@ void SelectStridedSlice(const SliceAttributes& attr, const OperationDef& op_def,
std::unique_ptr<GPUOperation>* ptr);
absl::Status SelectMean(const MeanAttributes& attr, const OperationDef& op_def,
const DeviceInfo& device_info,
std::unique_ptr<GPUOperation>* ptr);
void SelectSoftmax(const BHWC& shape, const OperationDef& op_def,
......
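A composite call-site sketch for the reworked selectors, mirroring the GPUOperationFromNode changes above (variable names are illustrative):

std::unique_ptr<GPUOperation> gpu_op;
const DeviceInfo& device_info = creation_context.device->GetInfo();
// Infallible selectors: device-aware codegen already happened in the ctor.
SelectLSTM(op_def, device_info, &gpu_op);
SelectPooling(pool_attr, op_def, device_info, &gpu_op);
// Mean still returns a status: it validates the reduction axes first.
RETURN_IF_ERROR(SelectMean(mean_attr, op_def, device_info, &gpu_op));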