Commit 89945e8b authored by Johannes Reifferscheid, committed by TensorFlower Gardener

Make IrEmitterNested private; replace it with a free function.

Also merge it with IrEmitter::EmitCallToNestedComputation.

PiperOrigin-RevId: 549553449
Parent f6a3feaa
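In short, call sites stop going through the protected IrEmitter member function and instead call a free function in the xla::gpu namespace. A minimal before/after sketch of the API change, using the member names (b_, hlo_module_config_, ir_emitter_context_) that appear at the call sites in the diff below:

    // Before this commit: a protected member function of IrEmitter.
    TF_RETURN_IF_ERROR(
        EmitCallToNestedComputation(computation, operands, output));

    // After this commit: a free function declared in ir_emitter_nested.h.
    TF_RETURN_IF_ERROR(CallNestedComputation(
        &b_, hlo_module_config_, computation, *ir_emitter_context_,
        operands, output));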
@@ -362,6 +362,7 @@ cc_library(
        "//tensorflow/compiler/xla/service:custom_call_target_registry",
        "//tensorflow/compiler/xla/service:elemental_ir_emitter",
        "//tensorflow/compiler/xla/service:hlo_execution_profile",
        "//tensorflow/compiler/xla/service:hlo_module_config",
        "//tensorflow/compiler/xla/service:name_uniquer",
        "//tensorflow/compiler/xla/service:pattern_matcher",
        "//tensorflow/compiler/xla/service:shape_inference",
...
@@ -15,11 +15,9 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/gpu/ir_emitter.h"
#include <iterator>
#include <utility>
// IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc"
#include "absl/algorithm/container.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
@@ -36,27 +34,9 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h"
#include "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/tsl/platform/errors.h"
// Convenience function that casts the provided llvm::Value* using IRBuilder
// to the default address space. This is useful in particular when generating
// IR for the AMDGPU target, as its kernel variables are in address space 5
// instead of the default address space.
static llvm::Value* AddrCastToDefault(llvm::Value* arg, llvm::IRBuilder<>& b) {
  llvm::Type* arg_type = arg->getType();
  CHECK(arg_type->isPointerTy());
  if (arg_type->getPointerAddressSpace() != 0) {
    llvm::Type* generic_arg_type = llvm::PointerType::getWithSamePointeeType(
        llvm::cast<llvm::PointerType>(arg_type), 0);
    llvm::Value* addrspacecast_arg =
        b.CreateAddrSpaceCast(arg, generic_arg_type);
    return addrspacecast_arg;
  }
  return arg;
}
namespace xla {
using llvm_ir::SetToFirstInsertPoint;
@@ -144,32 +124,6 @@ Status IrEmitter::HandleTuple(HloInstruction* tuple) {
  return OkStatus();
}
Status IrEmitter::EmitCallToNestedComputation(
    const HloComputation& nested_computation,
    absl::Span<llvm::Value* const> operands, llvm::Value* output) {
  TF_RET_CHECK(nested_computation.num_parameters() > 0);
  TF_ASSIGN_OR_RETURN(llvm::Function * emitted_function,
                      IrEmitterNested(hlo_module_config_, nested_computation,
                                      ir_emitter_context_)
                          .CodegenNestedComputation());

  // Operands are in the default address space for non-AMDGPU targets. For the
  // AMDGPU target, however, alloca variables must be addrspace-cast from
  // addrspace 5 to addrspace 0.
  std::vector<llvm::Value*> arguments;
  absl::c_transform(
      operands, std::back_inserter(arguments),
      [this](llvm::Value* arg) { return AddrCastToDefault(arg, b_); });
  llvm::Value* casted_output = AddrCastToDefault(output, b_);
  arguments.push_back(casted_output);
  Call(emitted_function, arguments);
  return OkStatus();
}
bool IrEmitter::MaybeEmitDirectAtomicOperation(
    const HloComputation& computation, llvm::Value* output_address,
    llvm::Value* source_address) {
@@ -402,9 +356,9 @@ Status IrEmitter::EmitAtomicOperationUsingCAS(const HloComputation& computation,
                                     cas_old_output_address, "cas_old_output");
  Store(cas_old_output, cas_new_output_address);

  // Emits code to calculate new_output = operation(old_output, source);
  TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
      computation, {binop_output_address, source_address},
      binop_output_address));
  TF_RETURN_IF_ERROR(CallNestedComputation(
      &b_, hlo_module_config_, computation, *ir_emitter_context_,
      {binop_output_address, source_address}, binop_output_address));

  llvm::Value* cas_new_output = Load(cas_new_output_address->getAllocatedType(),
                                     cas_new_output_address, "cas_new_output");
@@ -565,8 +519,9 @@ Status IrEmitter::HandleCall(HloInstruction* call) {
  for (HloInstruction* operand : call->operands()) {
    operand_addresses.push_back(GetBasePointer(*operand));
  }
  return EmitCallToNestedComputation(*call->to_apply(), operand_addresses,
                                     GetBasePointer(*call));
  return CallNestedComputation(&b_, hlo_module_config_, *call->to_apply(),
                               *ir_emitter_context_, operand_addresses,
                               GetBasePointer(*call));
}
Status IrEmitter::HandleCustomCall(HloInstruction*) {
@@ -637,8 +592,9 @@ StatusOr<std::vector<llvm::Value*>> IrEmitter::ComputeNestedElementFromAddrs(
    EmitTuple(tuple_array, allocas_for_returned_scalars, &b_);
  }

  TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
      computation, parameter_elements_addrs, return_buffer));
  TF_RETURN_IF_ERROR(CallNestedComputation(
      &b_, hlo_module_config_, computation, *ir_emitter_context_,
      parameter_elements_addrs, return_buffer));

  std::vector<llvm::Value*> returned_scalars;
  returned_scalars.reserve(allocas_for_returned_scalars.size());
...
@@ -125,13 +125,6 @@ class IrEmitter : public DfsHloVisitorWithDefault,
      const HloInstruction& hlo,
      const llvm_ir::ElementGenerator& body_emitter) = 0;

  // Emits a call in IR to the given nested computation with the given operands
  // and output. If no IR function has been previously emitted for the
  // computation, also emits such a function.
  Status EmitCallToNestedComputation(const HloComputation& nested_computation,
                                     absl::Span<llvm::Value* const> operands,
                                     llvm::Value* output);

  // Emits an atomic operation that implements `nested_computation` in the
  // sequentially consistent memory model. `output_address` and `source_address`
  // are the arguments of the nested computation. For example,
...
@@ -12,12 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/xla/service/gpu/ir_emitter_nested.h"
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "absl/strings/str_cat.h"
@@ -25,22 +22,60 @@ limitations under the License.
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "tensorflow/compiler/xla/hlo/ir/hlo_casting_utils.h"
#include "tensorflow/compiler/xla/hlo/ir/hlo_computation.h"
#include "tensorflow/compiler/xla/hlo/ir/hlo_instruction.h"
#include "tensorflow/compiler/xla/hlo/ir/hlo_opcode.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_constants.h"
#include "tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emitter.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emitter_context.h"
#include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
#include "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h"
#include "tensorflow/compiler/xla/service/name_uniquer.h"
#include "tensorflow/tsl/platform/status.h"
namespace xla {
namespace gpu {
namespace {
class IrEmitterNested : public IrEmitter {
 public:
  // Constructs an LLVM IR emitter for a nested HLO computation. `function` is
  // the containing IR function this emitter produces IR to. See
  // IrEmitter::IrEmitter for the meanings of other arguments.
  IrEmitterNested(const HloModuleConfig& hlo_module_config,
                  const HloComputation& nested_computation,
                  IrEmitterContext* ir_emitter_context);

  IrEmitterNested(const IrEmitterNested&) = delete;
  IrEmitterNested& operator=(const IrEmitterNested&) = delete;

  // Overrides the default empty implementation. Binds the given instruction
  // "parameter" with the parameter of the IR function.
  Status HandleParameter(HloInstruction* parameter) override;

  // Generates the code for the computation passed in the constructor, if it
  // wasn't already generated previously.
  // As well as generating the code for the function, emits code for global
  // constants, and also populates related information in 'ir_emitter_context_'
  // for large-constant initializations. Large constants don't get initializers
  // in the generated code and so must be initialized by XLA. The value of these
  // constants will be stored in 'content'. Constants with initializers in the
  // generated code will have an empty 'content'.
  //
  // The allocation index for these constants will always be -1 (i.e. it doesn't
  // correspond to any allocation).
  StatusOr<llvm::Function*> CodegenNestedComputation();

 protected:
  Status EmitTargetElementLoop(
      const HloInstruction& hlo,
      const llvm_ir::ElementGenerator& element_generator) override;

 private:
  // Emits constants to the generated LLVM IR, and also populates related
  // information in 'ir_emitter_context_' for large-constant initializations.
  Status EmitConstants(const HloComputation& computation);

  const HloComputation& nested_computation_;
};

IrEmitterNested::IrEmitterNested(const HloModuleConfig& hlo_module_config,
                                 const HloComputation& nested_computation,
@@ -219,5 +254,52 @@ Status IrEmitterNested::EmitConstants(const HloComputation& computation) {
  return OkStatus();
}
// Casts the provided llvm::Value* to the default address space. This is useful
// in particular when generating IR for the AMDGPU target, whose kernel
// variables are in address space 5 instead of the default address space.
static llvm::Value* AddrCastToDefault(llvm::Value* arg, llvm::IRBuilder<>& b) {
  llvm::Type* arg_type = arg->getType();
  CHECK(arg_type->isPointerTy());
  if (arg_type->getPointerAddressSpace() != 0) {
    llvm::Type* generic_arg_type = llvm::PointerType::getWithSamePointeeType(
        llvm::cast<llvm::PointerType>(arg_type), 0);
    llvm::Value* addrspacecast_arg =
        b.CreateAddrSpaceCast(arg, generic_arg_type);
    return addrspacecast_arg;
  }
  return arg;
}
} // namespace
Status CallNestedComputation(llvm::IRBuilder<>* builder,
                             const HloModuleConfig& hlo_module_config,
                             const HloComputation& nested_computation,
                             IrEmitterContext& ir_emitter_context,
                             absl::Span<llvm::Value* const> operands,
                             llvm::Value* output) {
  TF_RET_CHECK(nested_computation.num_parameters() > 0);

  TF_ASSIGN_OR_RETURN(llvm::Function * emitted_function,
                      IrEmitterNested(hlo_module_config, nested_computation,
                                      &ir_emitter_context)
                          .CodegenNestedComputation());

  // Operands are in the default address space for non-AMDGPU targets. For the
  // AMDGPU target, however, alloca variables must be addrspace-cast from
  // addrspace 5 to addrspace 0.
  std::vector<llvm::Value*> arguments;
  absl::c_transform(
      operands, std::back_inserter(arguments),
      [builder](llvm::Value* arg) { return AddrCastToDefault(arg, *builder); });
  llvm::Value* casted_output = AddrCastToDefault(output, *builder);
  arguments.push_back(casted_output);
  builder->CreateCall(emitted_function, arguments);
  return OkStatus();
}
} // namespace gpu
} // namespace xla
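For readers unfamiliar with the AMDGPU address-space issue that AddrCastToDefault handles above, here is a standalone sketch (not part of this commit; the function name ToGenericAddrSpace is hypothetical) of the same cast written against only the public LLVM C++ API:

    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/IRBuilder.h"

    // Returns `ptr` unchanged if it is already in the generic address space
    // (0); otherwise emits an addrspacecast. On AMDGPU, allocas live in
    // address space 5, so they must be cast before being passed as generic
    // pointer arguments.
    llvm::Value* ToGenericAddrSpace(llvm::Value* ptr, llvm::IRBuilder<>& b) {
      auto* ptr_ty = llvm::cast<llvm::PointerType>(ptr->getType());
      if (ptr_ty->getAddressSpace() == 0) return ptr;  // Already generic.
      return b.CreateAddrSpaceCast(
          ptr, llvm::PointerType::getWithSamePointeeType(ptr_ty, 0));
    }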
@@ -16,8 +12,12 @@ limitations under the License.
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMITTER_NESTED_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMITTER_NESTED_H_
#include "llvm/IR/Function.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emitter.h"
#include "absl/types/span.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Value.h"
#include "tensorflow/compiler/xla/hlo/ir/hlo_computation.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emitter_context.h"
#include "tensorflow/compiler/xla/service/hlo_module_config.h"
namespace xla {
namespace gpu {
@@ -36,48 +40,12 @@ namespace gpu {
// - N pointers to the buffers of each of the N parameters to the computation,
// - a pointer to the output buffer of the computation, and
// - a pointer to the top-level temp buffer.
//
class IrEmitterNested : public IrEmitter {
 public:
  // Constructs an LLVM IR emitter for a nested HLO computation. `function` is
  // the containing IR function this emitter produces IR to. See
  // IrEmitter::IrEmitter for the meanings of other arguments.
  IrEmitterNested(const HloModuleConfig& hlo_module_config,
                  const HloComputation& nested_computation,
                  IrEmitterContext* ir_emitter_context);

  IrEmitterNested(const IrEmitterNested&) = delete;
  IrEmitterNested& operator=(const IrEmitterNested&) = delete;

  // Overrides the default empty implementation. Binds the given instruction
  // "parameter" with the parameter of the IR function.
  Status HandleParameter(HloInstruction* parameter) override;

  // Generates the code for the computation passed in the constructor, if it
  // wasn't already generated previously.
  // As well as generating the code for the function, emits code for global
  // constants, and also populates related information in 'ir_emitter_context_'
  // for large-constant initializations. Large constants don't get initializers
  // in the generated code and so must be initialized by XLA. The value of these
  // constants will be stored in 'content'. Constants with initializers in the
  // generated code will have an empty 'content'.
  //
  // The allocation index for these constants will always be -1 (i.e. it doesn't
  // correspond to any allocation).
  StatusOr<llvm::Function*> CodegenNestedComputation();

 protected:
  Status EmitTargetElementLoop(
      const HloInstruction& hlo,
      const llvm_ir::ElementGenerator& body_emitter) override;

 private:
  // Emits constants to the generated LLVM IR, and also populates related
  // information in 'ir_emitter_context_' for large-constant initializations.
  Status EmitConstants(const HloComputation& computation);

  const HloComputation& nested_computation_;
};
Status CallNestedComputation(llvm::IRBuilder<>* builder,
                             const HloModuleConfig& hlo_module_config,
                             const HloComputation& nested_computation,
                             IrEmitterContext& ir_emitter_context,
                             absl::Span<llvm::Value* const> operands,
                             llvm::Value* output);
} // namespace gpu
} // namespace xla
...
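To make the calling convention documented in ir_emitter_nested.h above concrete: for a nested computation with two parameters, the generated IR function corresponds roughly to the following C-level prototype. This is a hypothetical illustration only; actual functions receive generated names, and the buffers are raw device pointers.

    // Hypothetical prototype for N = 2: one pointer per parameter buffer,
    // then the output buffer, then the top-level temp buffer.
    extern "C" void nested_computation_2(void* param0, void* param1,
                                         void* output, void* temp_buffer);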
@@ -21,7 +21,6 @@ limitations under the License.
#include <cstring>
#include <functional>
#include <iterator>
#include <limits>
#include <map>
#include <memory>
#include <numeric>
@@ -96,7 +95,6 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/gpu/fusions/tiling_util.h"
#include "tensorflow/compiler/xla/service/gpu/gemm_thunk.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_asm_opts_util.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_constants.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_conv_runner.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_device_info.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
@@ -106,6 +104,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/gpu/infeed_thunk.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emitter_context.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emitter_nested.h"
#include "tensorflow/compiler/xla/service/gpu/kernel_arguments.h"
#include "tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h"
#include "tensorflow/compiler/xla/service/gpu/kernel_thunk.h"
@@ -125,7 +124,6 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/gpu/thunk.h"
#include "tensorflow/compiler/xla/service/gpu/while_thunk.h"
#include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h"
#include "tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h"
#include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h"
#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
#include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h"
@@ -2285,9 +2283,9 @@ Status IrEmitterUnnested::EmitSelectAndScatter(mlir::Operation* op) {
      GetOrCreateSubComputationFromRegion(&select_and_scatter_op.getSelect(),
                                          /*is_fusion=*/false));

  TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
      *select_computation, {selected_value_address, operand_address},
      select_return_buffer));
  TF_RETURN_IF_ERROR(CallNestedComputation(
      &b_, hlo_module_config_, *select_computation, *ir_emitter_context_,
      {selected_value_address, operand_address}, select_return_buffer));

  llvm::Value* result =
      Load(select_return_buffer->getAllocatedType(), select_return_buffer);
@@ -2620,9 +2618,10 @@ Status IrEmitterUnnested::EmitScatter(
          *desc.update_computation, output_address, input_address,
          desc.output.GetElementLlvmType());
    } else {
      return EmitCallToNestedComputation(*desc.update_computation,
                                         {output_address, input_address},
                                         output_address);
      return CallNestedComputation(
          &b_, hlo_module_config_, *desc.update_computation,
          *ir_emitter_context_, {output_address, input_address},
          output_address);
    }
  };
@@ -2906,7 +2905,8 @@ Status IrEmitterUnnested::EmitSort(mlir::Operation* op) {
            : standard_num_iterations_in_sort_dim,
        kTileSize,
        [&](absl::Span<llvm::Value* const> operands, llvm::Value* output) {
          return EmitCallToNestedComputation(*comparator, operands, output);
          return CallNestedComputation(&b_, hlo_module_config_, *comparator,
                                       *ir_emitter_context_, operands, output);
        });
  };

  std::vector<int64_t> xor_masks;
...