add rocm llvm compiler as an option

fsx950223 · fsx950223 · commit 7bd9ab87aed6 · 2024-10-20T10:59:58.000Z
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
@@ -61,7 +61,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/tsl/platform/cuda_libdevice_path.h"
+
 #include "tensorflow/tsl/platform/env.h"
 #include "tensorflow/tsl/platform/logging.h"
 #include "tensorflow/tsl/platform/path.h"
@@ -73,6 +73,8 @@ limitations under the License.
 #if !defined(PLATFORM_GOOGLE) && TENSORFLOW_USE_ROCM
 #include "tensorflow/tsl/platform/rocm_rocdl_path.h"
 #include "rocm/rocm_config.h"
+#else
+#include "tensorflow/tsl/platform/cuda_libdevice_path.h"
 #endif
 
 namespace xla {
@@ -423,7 +425,7 @@ auto DumpCallbackForModule(std::string module_identifier,
     DumpModule(tsl::io::JoinPath(outputs_dir, basename), module);
   };
 }
-
+#ifdef TENSORFLOW_HSACO_USE_ROCM_LLVM
 Status LinkAndOptimizeModule(llvm::Module* module, GpuVersion gpu_version,
                              const HloModuleConfig& hlo_module_config,
                              const std::string& device_bitcode_dir_path,
@@ -437,7 +439,99 @@ Status LinkAndOptimizeModule(llvm::Module* module, GpuVersion gpu_version,
                        device_bitcode_dir_path, ir_path, linked_ir_path,
                        optimized_ir_path);
 }
+#else
+Status LinkAndOptimizeModule(llvm::Module* module, GpuVersion gpu_version,
+                             const HloModuleConfig& hlo_module_config,
+                             const std::string& device_bitcode_dir_path,
+                             TargetModuleLinker module_linker,
+                             llvm::Triple default_target_triple,
+                             llvm::TargetMachine* target_machine,
+                             int inline_threshold) {
+  TF_RETURN_IF_ERROR(module_linker(module, gpu_version, hlo_module_config,
+                                   device_bitcode_dir_path));
+
+  llvm::LoopAnalysisManager lam;
+  llvm::FunctionAnalysisManager fam;
+  llvm::CGSCCAnalysisManager cgam;
+  llvm::ModuleAnalysisManager mam;
+
+  fam.registerPass([&] { return target_machine->getTargetIRAnalysis(); });
+
+  llvm::PipelineTuningOptions pto;
+  pto.SLPVectorization = true;
+  pto.InlinerThreshold = inline_threshold;
 
+  llvm::PassInstrumentationCallbacks pic;
+
+  llvm::StandardInstrumentations si(module->getContext(), false);
+  si.registerCallbacks(pic, &mam);
+
+  llvm::PassBuilder pb(target_machine, pto, std::nullopt, &pic);
+  pb.registerModuleAnalyses(mam);
+  pb.registerCGSCCAnalyses(cgam);
+  pb.registerFunctionAnalyses(fam);
+  pb.registerLoopAnalyses(lam);
+  pb.crossRegisterProxies(lam, fam, cgam, mam);
+
+  if (hlo_module_config.debug_options().xla_gpu_dump_llvmir()) {
+    std::string outputs_dir;
+    if (!tsl::io::GetTestUndeclaredOutputsDir(&outputs_dir)) {
+      outputs_dir = hlo_module_config.debug_options().xla_dump_to();
+    }
+    if (!outputs_dir.empty()) {
+      pic.registerBeforeNonSkippedPassCallback(
+          DumpCallbackForModule(module->getModuleIdentifier(), outputs_dir));
+    } else {
+      LOG(ERROR) << "--xla_gpu_dump_llvmir is set, but neither the environment "
+                 << "variable TEST_UNDECLARED_OUTPUTS_DIR nor the flag "
+                 << "--xla_dump_to is set, so the llvm dumps are disabled.";
+    }
+  }
+
+  int32_t opt_level =
+      hlo_module_config.debug_options().xla_backend_optimization_level();
+
+  if (opt_level < 2) {
+    LOG(ERROR) << std::string(80, '*');
+    LOG(ERROR) << "The XLA GPU backend doesn't support unoptimized code "
+                  "generation but ";
+    LOG(ERROR) << "--xla_backend_optimization_level is set to " << opt_level
+               << "!";
+    LOG(ERROR) << "(Supported configuration is "
+                  "--xla_backend_optimization_level >= 2.)";
+    LOG(ERROR) << std::string(80, '*');
+  }
+  llvm::OptimizationLevel ol;
+  switch (opt_level) {
+    case 0:
+      ol = llvm::OptimizationLevel::O0;
+      break;
+    case 1:
+      ol = llvm::OptimizationLevel::O1;
+      break;
+    case 2:
+      ol = llvm::OptimizationLevel::O2;
+      break;
+    case 3:
+      ol = llvm::OptimizationLevel::O3;
+      break;
+  }
+
+  llvm::ModulePassManager mpm;
+  mpm.addPass(llvm::VerifierPass());
+  if (ol == llvm::OptimizationLevel::O0) {
+    mpm.addPass(pb.buildO0DefaultPipeline(ol));
+  } else {
+    mpm.addPass(pb.buildPerModuleDefaultPipeline(ol));
+  }
+  mpm.addPass(llvm::VerifierPass());
+
+  mpm.run(*module, mam);
+
+  return OkStatus();
+}
+
+#endif
 // One-time module initializer.
 // Must be called only once -- DO NOT CALL DIRECTLY.
 void NVPTXBackendInit(const HloModuleConfig& hlo_module_config) {
@@ -491,7 +585,7 @@ void NVPTXBackendInit(const HloModuleConfig& hlo_module_config) {
 }  // namespace
 
 namespace nvptx {
-
+#ifdef GOOGLE_CUDA
 std::string CantFindCudaMessage(absl::string_view msg,
                                 absl::string_view xla_gpu_cuda_data_dir) {
   return absl::StrCat(
@@ -588,7 +682,7 @@ StatusOr<std::string> CompileToPtx(
     TF_RETURN_IF_ERROR(LinkAndOptimizeModule(
         module, gpu_version, hlo_module_config, libdevice_dir_path,
         NVPTXTargetModuleLinker, default_target_triple, target_machine.get(),
-        kDefaultInlineThreshold, "", "", ""));
+        kDefaultInlineThreshold));
 
     uint64_t end_usecs = tsl::Env::Default()->NowMicros();
     RecordLlvmPassesDuration(end_usecs - start_usecs);
@@ -603,7 +697,7 @@ StatusOr<std::string> CompileToPtx(
   }
   return ptx;
 }
-
+#endif // GOOGLE_CUDA
 }  // namespace nvptx
 
 namespace {
@@ -685,6 +779,8 @@ Status ReadHsaco(std::string hsaco_path, std::vector<uint8_t>& hsaco) {
 
 // Emits the given module to HSA Code Object. target_machine is an initialized
 // TargetMachine for the AMDGPU target.
+#ifdef TENSORFLOW_HSACO_USE_ROCM_LLVM
+
 StatusOr<std::vector<uint8_t>> EmitModuleToHsaco(
     llvm::Module* module, llvm::TargetMachine* target_machine,
     const std::string& optimized_ir_path, const std::string& isabin_path,
@@ -751,6 +847,118 @@ StatusOr<std::vector<uint8_t>> EmitModuleToHsaco(
   return hsaco;
 }
 
+#else
+
+StatusOr<std::vector<uint8_t>> EmitModuleToHsaco(
+    llvm::Module* module, llvm::TargetMachine* target_machine) {
+  auto* env = tsl::Env::Default();
+  std::vector<std::string> tempdir_vector;
+  env->GetLocalTempDirectories(&tempdir_vector);
+  if (tempdir_vector.empty()) {
+    return xla::InternalError(
+        "Unable to locate a temporary directory for compile-time artifacts.");
+  }
+  std::string tempdir_name = tempdir_vector.front();
+  VLOG(1) << "Compile-time artifacts located at: " << tempdir_name;
+
+  bool keep_tempfiles = false;
+  TF_CHECK_OK(tsl::ReadBoolFromEnvVar("TF_ROCM_KEEP_XLA_TEMPFILES",
+                                      /*default_val=*/false, &keep_tempfiles));
+  // Prepare filenames for all stages of compilation:
+  // IR, binary ISA, and HSACO.
+  std::string random_number = std::to_string(tsl::random::New64());
+  std::string ir_filename =
+      absl::StrCat(module->getModuleIdentifier(), random_number + ".ll");
+  std::string ir_path = tsl::io::JoinPath(tempdir_name, ir_filename);
+
+  std::string ir_opt_filename =
+      absl::StrCat(module->getModuleIdentifier(), random_number + "_opt.ll");
+  std::string ir_opt_path = tsl::io::JoinPath(tempdir_name, ir_opt_filename);
+
+  std::string isabin_filename =
+      absl::StrCat(module->getModuleIdentifier(), random_number + ".o");
+  std::string isabin_path = tsl::io::JoinPath(tempdir_name, isabin_filename);
+
+  std::string hsaco_filename =
+      absl::StrCat(module->getModuleIdentifier(), random_number + ".hsaco");
+  std::string hsaco_path = tsl::io::JoinPath(tempdir_name, hsaco_filename);
+
+  std::error_code ec;
+
+  // Dump LLVM IR.
+  std::unique_ptr<llvm::raw_fd_ostream> ir_fs(
+      new llvm::raw_fd_ostream(ir_path, ec, llvm::sys::fs::OF_None));
+  module->print(*ir_fs, nullptr);
+  ir_fs->flush();
+
+  // Emit GCN ISA binary.
+  llvm::legacy::PassManager pm;
+  pm.add(new llvm::TargetLibraryInfoWrapperPass(
+      llvm::Triple(module->getTargetTriple())));
+  llvm::SmallVector<char, 0> stream;
+  llvm::raw_svector_ostream pstream(stream);
+  std::unique_ptr<llvm::raw_fd_ostream> isabin_fs(
+      new llvm::raw_fd_ostream(isabin_path, ec, llvm::sys::fs::OF_Text));
+  module->setDataLayout(target_machine->createDataLayout());
+  target_machine->addPassesToEmitFile(pm, *isabin_fs, nullptr,
+                                      llvm::CGFT_ObjectFile);
+  pm.run(*module);
+  isabin_fs->flush();
+
+  if (keep_tempfiles) {
+    std::unique_ptr<llvm::raw_fd_ostream> ir_fs(
+        new llvm::raw_fd_ostream(ir_opt_path, ec, llvm::sys::fs::OF_None));
+    module->print(*ir_fs, nullptr);
+    ir_fs->flush();
+  }
+  // Locate lld.
+  std::string lld_path;
+  if (std::getenv("ROCM_PATH")) {
+       lld_path = tsl::io::JoinPath(std::getenv("ROCM_PATH"), "llvm/bin");
+  }
+  else {
+       lld_path = tsl::io::JoinPath("/opt/rocm", "llvm/bin");
+  }
+
+  auto lld_program = llvm::sys::findProgramByName("ld.lld", {lld_path});
+  if (!lld_program) {
+    return xla::InternalError("unable to find ld.lld in %s: %s", lld_path,
+                              lld_program.getError().message());
+  }
+  std::vector<llvm::StringRef> lld_args{
+      llvm_ir::AsStringRef("ld.lld"),    llvm_ir::AsStringRef("-flavor"),
+      llvm_ir::AsStringRef("gnu"),       llvm_ir::AsStringRef("-shared"),
+      llvm_ir::AsStringRef(isabin_path), llvm_ir::AsStringRef("-o"),
+      llvm_ir::AsStringRef(hsaco_path),
+  };
+
+  std::string error_message;
+  int lld_result =
+      llvm::sys::ExecuteAndWait(*lld_program, llvm_ir::AsArrayRef(lld_args),
+                                std::nullopt, {}, 0, 0, &error_message);
+  if (lld_result) {
+    return xla::InternalError("ld.lld execute fail: %s, error code %d",
+                              error_message, lld_result);
+  }
+
+  // Read HSACO.
+  std::ifstream hsaco_file(hsaco_path, std::ios::binary | std::ios::ate);
+  std::ifstream::pos_type hsaco_file_size = hsaco_file.tellg();
+
+  std::vector<uint8_t> hsaco(hsaco_file_size);
+  hsaco_file.seekg(0, std::ios::beg);
+  hsaco_file.read(reinterpret_cast<char*>(&hsaco[0]), hsaco_file_size);
+  hsaco_file.close();
+  if (!keep_tempfiles) {
+    remove(ir_path.c_str());
+    remove(isabin_path.c_str());
+    remove(hsaco_path.c_str());
+  }
+  return hsaco;
+}
+
+#endif // TENSORFLOW_HSACO_USE_ROCM_LLVM
+
 // Links ROCm-Device-Libs into the given module if the module needs it.
 Status LinkROCDLIfNecessary(llvm::Module* module, std::string gcn_arch_name,
                             const std::string& rocdl_dir_path,
@@ -850,7 +1058,13 @@ std::pair<std::string, std::string> GetFeatureStrFromGCNArchName(
 std::unique_ptr<llvm::TargetMachine> AMDGPUGetTargetMachine(
     llvm::Triple target_triple, GpuVersion gpu_version,
     const HloModuleConfig& hlo_module_config) {
-  return {};
+  auto compute_capability =
+      std::get_if<se::RocmComputeCapability>(&gpu_version);
+
+  std::string gcn_arch_name = compute_capability->gcn_arch_name();
+  auto arch = GetFeatureStrFromGCNArchName(gcn_arch_name);
+  return GetTargetMachine(std::move(target_triple), arch.first,
+                          hlo_module_config, arch.second);
 }
 
 void AMDGPUBackendInit(const HloModuleConfig& hlo_module_config) {
@@ -860,22 +1074,21 @@ void AMDGPUBackendInit(const HloModuleConfig& hlo_module_config) {
   // Initialize the AMDGPU target; it's the only target we link with, so call
   // its specific initialization functions instead of the catch-all
   // InitializeAll*.
-#if TENSORFLOW_USE_ROCM
+
   InitHsacoCacheDir();
   LLVMInitializeAMDGPUTarget();
   LLVMInitializeAMDGPUTargetInfo();
   LLVMInitializeAMDGPUTargetMC();
   LLVMInitializeAMDGPUAsmPrinter();
 
-#endif
-
   llvm::PassRegistry* registry = llvm::PassRegistry::getPassRegistry();
   InitializePasses(registry);
 }
 
 }  // namespace
 
 namespace amdgpu {
+#ifdef TENSORFLOW_USE_ROCM
 StatusOr<std::vector<uint8_t>> CompileToHsaco(
     llvm::Module* module, GpuVersion gpu_version,
     const HloModuleConfig& hlo_module_config,
@@ -900,7 +1113,7 @@ StatusOr<std::vector<uint8_t>> CompileToHsaco(
     auto pos = str.find('\n');
     if (pos != std::string::npos) str = str.substr(pos + 1);
   }
-
+  // str += hlo_module_config.compilation_cache_key();
   {
     tsl::profiler::TraceMe activity(
         [&] { return absl::StrCat("Compiling IR", module->getName().str()); },
@@ -933,7 +1146,7 @@ StatusOr<std::vector<uint8_t>> CompileToHsaco(
     std::unique_ptr<llvm::TargetMachine> target_machine =
         AMDGPUGetTargetMachine(default_target_triple, gpu_version,
                                hlo_module_config);
-
+#ifdef TENSORFLOW_HSACO_USE_ROCM_LLVM
     auto* env = tsl::Env::Default();
     // Prepare filenames for all stages of compilation:
     // IR, binary ISA, and HSACO.
@@ -971,9 +1184,19 @@ StatusOr<std::vector<uint8_t>> CompileToHsaco(
         std::vector<std::string>{ir_path, linked_ir_path, optimized_ir_path,
                                  isabin_path});
   }
+#else
+    // Link with ROCm-Device-Libs, and optimize the LLVM module.
+    TF_RETURN_IF_ERROR(LinkAndOptimizeModule(
+        module, gpu_version, hlo_module_config, rocdl_dir_path,
+        AMDGPUTargetModuleLinker, default_target_triple, target_machine.get(),
+        kAMDGPUInlineThreshold));
+
+    // Lower optimized LLVM module to HSA code object.
+    TF_ASSIGN_OR_RETURN(hsaco, EmitModuleToHsaco(module, target_machine.get()));
+#endif // TENSORFLOW_HSACO_USE_ROCM_LLVM
   return hsaco;
 }
-
+#endif // TENSORFLOW_USE_ROCM
 }  // namespace amdgpu
 
 }  // namespace gpu