Commit 89945e8b authored by Johannes Reifferscheid, committed by TensorFlower Gardener

Make IrEmitterNested private; replace it with a free function.

Also merge it with IrEmitter::EmitCallToNestedComputation.

PiperOrigin-RevId: 549553449
Parent f6a3feaa
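In short, call sites stop going through the protected IrEmitter member function and instead call a free function in the xla::gpu namespace. A minimal before/after sketch of the API change, using the member names (b_, hlo_module_config_, ir_emitter_context_) that appear at the call sites in the diff below:

    // Before this commit: a protected member function of IrEmitter.
    TF_RETURN_IF_ERROR(
        EmitCallToNestedComputation(computation, operands, output));

    // After this commit: a free function declared in ir_emitter_nested.h.
    TF_RETURN_IF_ERROR(CallNestedComputation(
        &b_, hlo_module_config_, computation, *ir_emitter_context_,
        operands, output));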
@@ -362,6 +362,7 @@ cc_library(
        "//tensorflow/compiler/xla/service:custom_call_target_registry",
        "//tensorflow/compiler/xla/service:elemental_ir_emitter",
        "//tensorflow/compiler/xla/service:hlo_execution_profile",
        "//tensorflow/compiler/xla/service:hlo_module_config",
        "//tensorflow/compiler/xla/service:name_uniquer",
        "//tensorflow/compiler/xla/service:pattern_matcher",
        "//tensorflow/compiler/xla/service:shape_inference",
...
@@ -15,11 +15,9 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/gpu/ir_emitter.h"
#include <iterator>
#include <utility>
// IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc"
#include "absl/algorithm/container.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
@@ -36,27 +34,9 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h"
#include "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/tsl/platform/errors.h"
// Convenience function that casts the provided llvm::Value* using IRBuilder
// to the default address space. This is useful in particular when generating
// IR for the AMDGPU target, as its kernel variables are in address space 5
// instead of the default address space.
static llvm::Value* AddrCastToDefault(llvm::Value* arg, llvm::IRBuilder<>& b) {
  llvm::Type* arg_type = arg->getType();
  CHECK(arg_type->isPointerTy());
  if (arg_type->getPointerAddressSpace() != 0) {
    llvm::Type* generic_arg_type = llvm::PointerType::getWithSamePointeeType(
        llvm::cast<llvm::PointerType>(arg_type), 0);
    llvm::Value* addrspacecast_arg =
        b.CreateAddrSpaceCast(arg, generic_arg_type);
    return addrspacecast_arg;
  }
  return arg;
}
namespace xla {
using llvm_ir::SetToFirstInsertPoint;
@@ -144,32 +124,6 @@ Status IrEmitter::HandleTuple(HloInstruction* tuple) {
  return OkStatus();
}
Status IrEmitter::EmitCallToNestedComputation(
    const HloComputation& nested_computation,
    absl::Span<llvm::Value* const> operands, llvm::Value* output) {
  TF_RET_CHECK(nested_computation.num_parameters() > 0);
  TF_ASSIGN_OR_RETURN(llvm::Function * emitted_function,
                      IrEmitterNested(hlo_module_config_, nested_computation,
                                      ir_emitter_context_)
                          .CodegenNestedComputation());

  // Operands are in the default address space for non-AMDGPU targets. For the
  // AMDGPU target, however, alloca variables must be addrspace-cast from
  // addrspace 5 to addrspace 0.
  std::vector<llvm::Value*> arguments;
  absl::c_transform(
      operands, std::back_inserter(arguments),
      [this](llvm::Value* arg) { return AddrCastToDefault(arg, b_); });
  llvm::Value* casted_output = AddrCastToDefault(output, b_);
  arguments.push_back(casted_output);
  Call(emitted_function, arguments);
  return OkStatus();
}
bool IrEmitter::MaybeEmitDirectAtomicOperation(
    const HloComputation& computation, llvm::Value* output_address,
    llvm::Value* source_address) {
@@ -402,9 +356,9 @@ Status IrEmitter::EmitAtomicOperationUsingCAS(const HloComputation& computation,
                                     cas_old_output_address, "cas_old_output");
  Store(cas_old_output, cas_new_output_address);

  // Emits code to calculate new_output = operation(old_output, source);
  TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
      computation, {binop_output_address, source_address},
      binop_output_address));
  TF_RETURN_IF_ERROR(CallNestedComputation(
      &b_, hlo_module_config_, computation, *ir_emitter_context_,
      {binop_output_address, source_address}, binop_output_address));

  llvm::Value* cas_new_output = Load(cas_new_output_address->getAllocatedType(),
                                     cas_new_output_address, "cas_new_output");
@@ -565,8 +519,9 @@ Status IrEmitter::HandleCall(HloInstruction* call) {
  for (HloInstruction* operand : call->operands()) {
    operand_addresses.push_back(GetBasePointer(*operand));
  }
  return EmitCallToNestedComputation(*call->to_apply(), operand_addresses,
                                     GetBasePointer(*call));
  return CallNestedComputation(&b_, hlo_module_config_, *call->to_apply(),
                               *ir_emitter_context_, operand_addresses,
                               GetBasePointer(*call));
}
Status IrEmitter::HandleCustomCall(HloInstruction*) {
@@ -637,8 +592,9 @@ StatusOr<std::vector<llvm::Value*>> IrEmitter::ComputeNestedElementFromAddrs(
    EmitTuple(tuple_array, allocas_for_returned_scalars, &b_);
  }

  TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
      computation, parameter_elements_addrs, return_buffer));
  TF_RETURN_IF_ERROR(CallNestedComputation(
      &b_, hlo_module_config_, computation, *ir_emitter_context_,
      parameter_elements_addrs, return_buffer));

  std::vector<llvm::Value*> returned_scalars;
  returned_scalars.reserve(allocas_for_returned_scalars.size());
...
@@ -125,13 +125,6 @@ class IrEmitter : public DfsHloVisitorWithDefault,
      const HloInstruction& hlo,
      const llvm_ir::ElementGenerator& body_emitter) = 0;

  // Emits a call in IR to the given nested computation with the given operands
  // and output. If no IR function has been previously emitted for the
  // computation, also emits such a function.
  Status EmitCallToNestedComputation(const HloComputation& nested_computation,
                                     absl::Span<llvm::Value* const> operands,
                                     llvm::Value* output);

  // Emits an atomic operation that implements `nested_computation` in the
  // sequentially consistent memory model. `output_address` and `source_address`
  // are the arguments of the nested computation. For example,
...
@@ -12,12 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/xla/service/gpu/ir_emitter_nested.h"
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "absl/strings/str_cat.h"
@@ -25,22 +22,60 @@ limitations under the License.
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "tensorflow/compiler/xla/hlo/ir/hlo_casting_utils.h"
#include "tensorflow/compiler/xla/hlo/ir/hlo_computation.h"
#include "tensorflow/compiler/xla/hlo/ir/hlo_instruction.h"
#include "tensorflow/compiler/xla/hlo/ir/hlo_opcode.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_constants.h"
#include "tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emitter.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emitter_context.h"
#include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
#include "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h"
#include "tensorflow/compiler/xla/service/name_uniquer.h"
#include "tensorflow/tsl/platform/status.h"
namespace xla {
namespace gpu {
namespace {
class IrEmitterNested : public IrEmitter {
 public:
  // Constructs an LLVM IR emitter for a nested HLO computation. `function` is
  // the containing IR function this emitter produces IR to. See
  // IrEmitter::IrEmitter for the meanings of other arguments.
  IrEmitterNested(const HloModuleConfig& hlo_module_config,
                  const HloComputation& nested_computation,
                  IrEmitterContext* ir_emitter_context);

  IrEmitterNested(const IrEmitterNested&) = delete;
  IrEmitterNested& operator=(const IrEmitterNested&) = delete;

  // Overrides the default empty implementation. Binds the given instruction
  // "parameter" with the parameter of the IR function.
  Status HandleParameter(HloInstruction* parameter) override;

  // Generates the code for the computation passed in the constructor, if it
  // wasn't already generated previously.
  // As well as generating the code for the function, emits code for global
  // constants, and also populates related information in 'ir_emitter_context_'
  // for large-constant initializations. Large constants don't get initializers
  // in the generated code and so must be initialized by XLA. The value of these
  // constants will be stored in 'content'. Constants with initializers in the
  // generated code will have an empty 'content'.
  //
  // The allocation index for these constants will always be -1 (i.e. it doesn't
  // correspond to any allocation).
  StatusOr<llvm::Function*> CodegenNestedComputation();

 protected:
  Status EmitTargetElementLoop(
      const HloInstruction& hlo,
      const llvm_ir::ElementGenerator& element_generator) override;

 private:
  // Emits constants to the generated LLVM IR, and also populates related
  // information in 'ir_emitter_context_' for large-constant initializations.
  Status EmitConstants(const HloComputation& computation);

  const HloComputation& nested_computation_;
};

IrEmitterNested::IrEmitterNested(const HloModuleConfig& hlo_module_config,
                                 const HloComputation& nested_computation,
@@ -219,5 +254,52 @@ Status IrEmitterNested::EmitConstants(const HloComputation& computation) {
  return OkStatus();
}
// Casts the provided llvm::Value* to the default address space. This is useful
// in particular when generating IR for the AMDGPU target, whose kernel
// variables are in address space 5 instead of the default address space.
static llvm::Value* AddrCastToDefault(llvm::Value* arg, llvm::IRBuilder<>& b) {
  llvm::Type* arg_type = arg->getType();
  CHECK(arg_type->isPointerTy());
  if (arg_type->getPointerAddressSpace() != 0) {
    llvm::Type* generic_arg_type = llvm::PointerType::getWithSamePointeeType(
        llvm::cast<llvm::PointerType>(arg_type), 0);
    llvm::Value* addrspacecast_arg =
        b.CreateAddrSpaceCast(arg, generic_arg_type);
    return addrspacecast_arg;
  }
  return arg;
}
} // namespace
Status CallNestedComputation(llvm::IRBuilder<>* builder,
                             const HloModuleConfig& hlo_module_config,
                             const HloComputation& nested_computation,
                             IrEmitterContext& ir_emitter_context,
                             absl::Span<llvm::Value* const> operands,
                             llvm::Value* output) {
  TF_RET_CHECK(nested_computation.num_parameters() > 0);

  TF_ASSIGN_OR_RETURN(llvm::Function * emitted_function,
                      IrEmitterNested(hlo_module_config, nested_computation,
                                      &ir_emitter_context)
                          .CodegenNestedComputation());

  // Operands are in the default address space for non-AMDGPU targets. For the
  // AMDGPU target, however, alloca variables must be addrspace-cast from
  // addrspace 5 to addrspace 0.
  std::vector<llvm::Value*> arguments;
  absl::c_transform(
      operands, std::back_inserter(arguments),
      [builder](llvm::Value* arg) { return AddrCastToDefault(arg, *builder); });
  llvm::Value* casted_output = AddrCastToDefault(output, *builder);
  arguments.push_back(casted_output);
  builder->CreateCall(emitted_function, arguments);
  return OkStatus();
}
} // namespace gpu
} // namespace xla
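For readers unfamiliar with the AMDGPU address-space issue that AddrCastToDefault handles above, here is a standalone sketch (not part of this commit; the function name ToGenericAddrSpace is hypothetical) of the same cast written against only the public LLVM C++ API:

    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/IRBuilder.h"

    // Returns `ptr` unchanged if it is already in the generic address space
    // (0); otherwise emits an addrspacecast. On AMDGPU, allocas live in
    // address space 5, so they must be cast before being passed as generic
    // pointer arguments.
    llvm::Value* ToGenericAddrSpace(llvm::Value* ptr, llvm::IRBuilder<>& b) {
      auto* ptr_ty = llvm::cast<llvm::PointerType>(ptr->getType());
      if (ptr_ty->getAddressSpace() == 0) return ptr;  // Already generic.
      return b.CreateAddrSpaceCast(
          ptr, llvm::PointerType::getWithSamePointeeType(ptr_ty, 0));
    }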
@@ -16,8 +12,12 @@ limitations under the License.
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMITTER_NESTED_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMITTER_NESTED_H_
#include "llvm/IR/Function.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emitter.h"
#include "absl/types/span.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Value.h"
#include "tensorflow/compiler/xla/hlo/ir/hlo_computation.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emitter_context.h"
#include "tensorflow/compiler/xla/service/hlo_module_config.h"
namespace xla {
namespace gpu {
@@ -36,48 +40,12 @@ namespace gpu {
// - N pointers to the buffers of each of the N parameters to the computation,
// - a pointer to the output buffer of the computation, and
// - a pointer to the top-level temp buffer.
//
class IrEmitterNested : public IrEmitter {
 public:
  // Constructs an LLVM IR emitter for a nested HLO computation. `function` is
  // the containing IR function this emitter produces IR to. See
  // IrEmitter::IrEmitter for the meanings of other arguments.
  IrEmitterNested(const HloModuleConfig& hlo_module_config,
                  const HloComputation& nested_computation,
                  IrEmitterContext* ir_emitter_context);

  IrEmitterNested(const IrEmitterNested&) = delete;
  IrEmitterNested& operator=(const IrEmitterNested&) = delete;

  // Overrides the default empty implementation. Binds the given instruction
  // "parameter" with the parameter of the IR function.
  Status HandleParameter(HloInstruction* parameter) override;

  // Generates the code for the computation passed in the constructor, if it
  // wasn't already generated previously.
  // As well as generating the code for the function, emits code for global
  // constants, and also populates related information in 'ir_emitter_context_'
  // for large-constant initializations. Large constants don't get initializers
  // in the generated code and so must be initialized by XLA. The value of these
  // constants will be stored in 'content'. Constants with initializers in the
  // generated code will have an empty 'content'.
  //
  // The allocation index for these constants will always be -1 (i.e. it doesn't
  // correspond to any allocation).
  StatusOr<llvm::Function*> CodegenNestedComputation();

 protected:
  Status EmitTargetElementLoop(
      const HloInstruction& hlo,
      const llvm_ir::ElementGenerator& body_emitter) override;

 private:
  // Emits constants to the generated LLVM IR, and also populates related
  // information in 'ir_emitter_context_' for large-constant initializations.
  Status EmitConstants(const HloComputation& computation);

  const HloComputation& nested_computation_;
};
Status CallNestedComputation(llvm::IRBuilder<>* builder,
                             const HloModuleConfig& hlo_module_config,
                             const HloComputation& nested_computation,
                             IrEmitterContext& ir_emitter_context,
                             absl::Span<llvm::Value* const> operands,
                             llvm::Value* output);
} // namespace gpu
} // namespace xla
...
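To make the calling convention documented in ir_emitter_nested.h above concrete: for a nested computation with two parameters, the generated IR function corresponds roughly to the following C-level prototype. This is a hypothetical illustration only; actual functions receive generated names, and the buffers are raw device pointers.

    // Hypothetical prototype for N = 2: one pointer per parameter buffer,
    // then the output buffer, then the top-level temp buffer.
    extern "C" void nested_computation_2(void* param0, void* param1,
                                         void* output, void* temp_buffer);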
@@ -21,7 +21,6 @@ limitations under the License.
#include <cstring>
#include <functional>
#include <iterator>
#include <limits>
#include <map>
#include <memory>
#include <numeric>
@@ -96,7 +95,6 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/gpu/fusions/tiling_util.h"
#include "tensorflow/compiler/xla/service/gpu/gemm_thunk.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_asm_opts_util.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_constants.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_conv_runner.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_device_info.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
@@ -106,6 +104,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/gpu/infeed_thunk.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emitter_context.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emitter_nested.h"
#include "tensorflow/compiler/xla/service/gpu/kernel_arguments.h"
#include "tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h"
#include "tensorflow/compiler/xla/service/gpu/kernel_thunk.h"
@@ -125,7 +124,6 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/gpu/thunk.h"
#include "tensorflow/compiler/xla/service/gpu/while_thunk.h"
#include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h"
#include "tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h"
#include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h"
#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
#include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h"
@@ -2285,9 +2283,9 @@ Status IrEmitterUnnested::EmitSelectAndScatter(mlir::Operation* op) {
      GetOrCreateSubComputationFromRegion(&select_and_scatter_op.getSelect(),
                                          /*is_fusion=*/false));

  TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
      *select_computation, {selected_value_address, operand_address},
      select_return_buffer));
  TF_RETURN_IF_ERROR(CallNestedComputation(
      &b_, hlo_module_config_, *select_computation, *ir_emitter_context_,
      {selected_value_address, operand_address}, select_return_buffer));

  llvm::Value* result =
      Load(select_return_buffer->getAllocatedType(), select_return_buffer);
@@ -2620,9 +2618,10 @@ Status IrEmitterUnnested::EmitScatter(
          *desc.update_computation, output_address, input_address,
          desc.output.GetElementLlvmType());
    } else {
      return EmitCallToNestedComputation(*desc.update_computation,
                                         {output_address, input_address},
                                         output_address);
      return CallNestedComputation(
          &b_, hlo_module_config_, *desc.update_computation,
          *ir_emitter_context_, {output_address, input_address},
          output_address);
    }
  };
@@ -2906,7 +2905,8 @@ Status IrEmitterUnnested::EmitSort(mlir::Operation* op) {
            : standard_num_iterations_in_sort_dim,
        kTileSize,
        [&](absl::Span<llvm::Value* const> operands, llvm::Value* output) {
          return EmitCallToNestedComputation(*comparator, operands, output);
          return CallNestedComputation(&b_, hlo_module_config_, *comparator,
                                       *ir_emitter_context_, operands, output);
        });
  };

  std::vector<int64_t> xor_masks;
...