@@ -61,7 +61,7 @@ limitations under the License.
6161#include " tensorflow/compiler/xla/status_macros.h"
6262#include " tensorflow/compiler/xla/types.h"
6363#include " tensorflow/compiler/xla/util.h"
64- # include " tensorflow/tsl/platform/cuda_libdevice_path.h "
64+
6565#include " tensorflow/tsl/platform/env.h"
6666#include " tensorflow/tsl/platform/logging.h"
6767#include " tensorflow/tsl/platform/path.h"
@@ -73,6 +73,8 @@ limitations under the License.
7373#if !defined(PLATFORM_GOOGLE) && TENSORFLOW_USE_ROCM
7474#include " tensorflow/tsl/platform/rocm_rocdl_path.h"
7575#include " rocm/rocm_config.h"
76+ #else
77+ #include " tensorflow/tsl/platform/cuda_libdevice_path.h"
7678#endif
7779
7880namespace xla {
@@ -423,7 +425,7 @@ auto DumpCallbackForModule(std::string module_identifier,
423425 DumpModule (tsl::io::JoinPath (outputs_dir, basename), module );
424426 };
425427}
426-
428+ # ifdef TENSORFLOW_HSACO_USE_ROCM_LLVM
427429Status LinkAndOptimizeModule (llvm::Module* module , GpuVersion gpu_version,
428430 const HloModuleConfig& hlo_module_config,
429431 const std::string& device_bitcode_dir_path,
@@ -437,7 +439,99 @@ Status LinkAndOptimizeModule(llvm::Module* module, GpuVersion gpu_version,
437439 device_bitcode_dir_path, ir_path, linked_ir_path,
438440 optimized_ir_path);
439441}
442+ #else
443+ Status LinkAndOptimizeModule (llvm::Module* module , GpuVersion gpu_version,
444+ const HloModuleConfig& hlo_module_config,
445+ const std::string& device_bitcode_dir_path,
446+ TargetModuleLinker module_linker,
447+ llvm::Triple default_target_triple,
448+ llvm::TargetMachine* target_machine,
449+ int inline_threshold) {
450+ TF_RETURN_IF_ERROR (module_linker (module , gpu_version, hlo_module_config,
451+ device_bitcode_dir_path));
452+
453+ llvm::LoopAnalysisManager lam;
454+ llvm::FunctionAnalysisManager fam;
455+ llvm::CGSCCAnalysisManager cgam;
456+ llvm::ModuleAnalysisManager mam;
457+
458+ fam.registerPass ([&] { return target_machine->getTargetIRAnalysis (); });
459+
460+ llvm::PipelineTuningOptions pto;
461+ pto.SLPVectorization = true ;
462+ pto.InlinerThreshold = inline_threshold;
440463
464+ llvm::PassInstrumentationCallbacks pic;
465+
466+ llvm::StandardInstrumentations si (module ->getContext (), false );
467+ si.registerCallbacks (pic, &mam);
468+
469+ llvm::PassBuilder pb (target_machine, pto, std::nullopt , &pic);
470+ pb.registerModuleAnalyses (mam);
471+ pb.registerCGSCCAnalyses (cgam);
472+ pb.registerFunctionAnalyses (fam);
473+ pb.registerLoopAnalyses (lam);
474+ pb.crossRegisterProxies (lam, fam, cgam, mam);
475+
476+ if (hlo_module_config.debug_options ().xla_gpu_dump_llvmir ()) {
477+ std::string outputs_dir;
478+ if (!tsl::io::GetTestUndeclaredOutputsDir (&outputs_dir)) {
479+ outputs_dir = hlo_module_config.debug_options ().xla_dump_to ();
480+ }
481+ if (!outputs_dir.empty ()) {
482+ pic.registerBeforeNonSkippedPassCallback (
483+ DumpCallbackForModule (module ->getModuleIdentifier (), outputs_dir));
484+ } else {
485+ LOG (ERROR) << " --xla_gpu_dump_llvmir is set, but neither the environment "
486+ << " variable TEST_UNDECLARED_OUTPUTS_DIR nor the flag "
487+ << " --xla_dump_to is set, so the llvm dumps are disabled." ;
488+ }
489+ }
490+
491+ int32_t opt_level =
492+ hlo_module_config.debug_options ().xla_backend_optimization_level ();
493+
494+ if (opt_level < 2 ) {
495+ LOG (ERROR) << std::string (80 , ' *' );
496+ LOG (ERROR) << " The XLA GPU backend doesn't support unoptimized code "
497+ " generation but " ;
498+ LOG (ERROR) << " --xla_backend_optimization_level is set to " << opt_level
499+ << " !" ;
500+ LOG (ERROR) << " (Supported configuration is "
501+ " --xla_backend_optimization_level >= 2.)" ;
502+ LOG (ERROR) << std::string (80 , ' *' );
503+ }
504+ llvm::OptimizationLevel ol;
505+ switch (opt_level) {
506+ case 0 :
507+ ol = llvm::OptimizationLevel::O0;
508+ break ;
509+ case 1 :
510+ ol = llvm::OptimizationLevel::O1;
511+ break ;
512+ case 2 :
513+ ol = llvm::OptimizationLevel::O2;
514+ break ;
515+ case 3 :
516+ ol = llvm::OptimizationLevel::O3;
517+ break ;
518+ }
519+
520+ llvm::ModulePassManager mpm;
521+ mpm.addPass (llvm::VerifierPass ());
522+ if (ol == llvm::OptimizationLevel::O0) {
523+ mpm.addPass (pb.buildO0DefaultPipeline (ol));
524+ } else {
525+ mpm.addPass (pb.buildPerModuleDefaultPipeline (ol));
526+ }
527+ mpm.addPass (llvm::VerifierPass ());
528+
529+ mpm.run (*module , mam);
530+
531+ return OkStatus ();
532+ }
533+
534+ #endif
441535// One-time module initializer.
442536// Must be called only once -- DO NOT CALL DIRECTLY.
443537void NVPTXBackendInit (const HloModuleConfig& hlo_module_config) {
@@ -491,7 +585,7 @@ void NVPTXBackendInit(const HloModuleConfig& hlo_module_config) {
491585} // namespace
492586
493587namespace nvptx {
494-
588+ # ifdef GOOGLE_CUDA
495589std::string CantFindCudaMessage (absl::string_view msg,
496590 absl::string_view xla_gpu_cuda_data_dir) {
497591 return absl::StrCat (
@@ -588,7 +682,7 @@ StatusOr<std::string> CompileToPtx(
588682 TF_RETURN_IF_ERROR (LinkAndOptimizeModule (
589683 module , gpu_version, hlo_module_config, libdevice_dir_path,
590684 NVPTXTargetModuleLinker, default_target_triple, target_machine.get (),
591- kDefaultInlineThreshold , " " , " " , " " ));
685+ kDefaultInlineThreshold ));
592686
593687 uint64_t end_usecs = tsl::Env::Default ()->NowMicros ();
594688 RecordLlvmPassesDuration (end_usecs - start_usecs);
@@ -603,7 +697,7 @@ StatusOr<std::string> CompileToPtx(
603697 }
604698 return ptx;
605699}
606-
700+ # endif // GOOGLE_CUDA
607701} // namespace nvptx
608702
609703namespace {
@@ -685,6 +779,8 @@ Status ReadHsaco(std::string hsaco_path, std::vector<uint8_t>& hsaco) {
685779
686780// Emits the given module to HSA Code Object. target_machine is an initialized
687781// TargetMachine for the AMDGPU target.
782+ #ifdef TENSORFLOW_HSACO_USE_ROCM_LLVM
783+
688784StatusOr<std::vector<uint8_t >> EmitModuleToHsaco (
689785 llvm::Module* module , llvm::TargetMachine* target_machine,
690786 const std::string& optimized_ir_path, const std::string& isabin_path,
@@ -751,6 +847,118 @@ StatusOr<std::vector<uint8_t>> EmitModuleToHsaco(
751847 return hsaco;
752848}
753849
850+ #else
851+
852+ StatusOr<std::vector<uint8_t >> EmitModuleToHsaco (
853+ llvm::Module* module , llvm::TargetMachine* target_machine) {
854+ auto * env = tsl::Env::Default ();
855+ std::vector<std::string> tempdir_vector;
856+ env->GetLocalTempDirectories (&tempdir_vector);
857+ if (tempdir_vector.empty ()) {
858+ return xla::InternalError (
859+ " Unable to locate a temporary directory for compile-time artifacts." );
860+ }
861+ std::string tempdir_name = tempdir_vector.front ();
862+ VLOG (1 ) << " Compile-time artifacts located at: " << tempdir_name;
863+
864+ bool keep_tempfiles = false ;
865+ TF_CHECK_OK (tsl::ReadBoolFromEnvVar (" TF_ROCM_KEEP_XLA_TEMPFILES" ,
866+ /* default_val=*/ false , &keep_tempfiles));
867+ // Prepare filenames for all stages of compilation:
868+ // IR, binary ISA, and HSACO.
869+ std::string random_number = std::to_string (tsl::random::New64 ());
870+ std::string ir_filename =
871+ absl::StrCat (module ->getModuleIdentifier (), random_number + " .ll" );
872+ std::string ir_path = tsl::io::JoinPath (tempdir_name, ir_filename);
873+
874+ std::string ir_opt_filename =
875+ absl::StrCat (module ->getModuleIdentifier (), random_number + " _opt.ll" );
876+ std::string ir_opt_path = tsl::io::JoinPath (tempdir_name, ir_opt_filename);
877+
878+ std::string isabin_filename =
879+ absl::StrCat (module ->getModuleIdentifier (), random_number + " .o" );
880+ std::string isabin_path = tsl::io::JoinPath (tempdir_name, isabin_filename);
881+
882+ std::string hsaco_filename =
883+ absl::StrCat (module ->getModuleIdentifier (), random_number + " .hsaco" );
884+ std::string hsaco_path = tsl::io::JoinPath (tempdir_name, hsaco_filename);
885+
886+ std::error_code ec;
887+
888+ // Dump LLVM IR.
889+ std::unique_ptr<llvm::raw_fd_ostream> ir_fs (
890+ new llvm::raw_fd_ostream (ir_path, ec, llvm::sys::fs::OF_None));
891+ module ->print (*ir_fs, nullptr );
892+ ir_fs->flush ();
893+
894+ // Emit GCN ISA binary.
895+ llvm::legacy::PassManager pm;
896+ pm.add (new llvm::TargetLibraryInfoWrapperPass (
897+ llvm::Triple (module ->getTargetTriple ())));
898+ llvm::SmallVector<char , 0 > stream;
899+ llvm::raw_svector_ostream pstream (stream);
900+ std::unique_ptr<llvm::raw_fd_ostream> isabin_fs (
901+ new llvm::raw_fd_ostream (isabin_path, ec, llvm::sys::fs::OF_Text));
902+ module ->setDataLayout (target_machine->createDataLayout ());
903+ target_machine->addPassesToEmitFile (pm, *isabin_fs, nullptr ,
904+ llvm::CGFT_ObjectFile);
905+ pm.run (*module );
906+ isabin_fs->flush ();
907+
908+ if (keep_tempfiles) {
909+ std::unique_ptr<llvm::raw_fd_ostream> ir_fs (
910+ new llvm::raw_fd_ostream (ir_opt_path, ec, llvm::sys::fs::OF_None));
911+ module ->print (*ir_fs, nullptr );
912+ ir_fs->flush ();
913+ }
914+ // Locate lld.
915+ std::string lld_path;
916+ if (std::getenv (" ROCM_PATH" )) {
917+ lld_path = tsl::io::JoinPath (std::getenv (" ROCM_PATH" ), " llvm/bin" );
918+ }
919+ else {
920+ lld_path = tsl::io::JoinPath (" /opt/rocm" , " llvm/bin" );
921+ }
922+
923+ auto lld_program = llvm::sys::findProgramByName (" ld.lld" , {lld_path});
924+ if (!lld_program) {
925+ return xla::InternalError (" unable to find ld.lld in %s: %s" , lld_path,
926+ lld_program.getError ().message ());
927+ }
928+ std::vector<llvm::StringRef> lld_args{
929+ llvm_ir::AsStringRef (" ld.lld" ), llvm_ir::AsStringRef (" -flavor" ),
930+ llvm_ir::AsStringRef (" gnu" ), llvm_ir::AsStringRef (" -shared" ),
931+ llvm_ir::AsStringRef (isabin_path), llvm_ir::AsStringRef (" -o" ),
932+ llvm_ir::AsStringRef (hsaco_path),
933+ };
934+
935+ std::string error_message;
936+ int lld_result =
937+ llvm::sys::ExecuteAndWait (*lld_program, llvm_ir::AsArrayRef (lld_args),
938+ std::nullopt , {}, 0 , 0 , &error_message);
939+ if (lld_result) {
940+ return xla::InternalError (" ld.lld execute fail: %s, error code %d" ,
941+ error_message, lld_result);
942+ }
943+
944+ // Read HSACO.
945+ std::ifstream hsaco_file (hsaco_path, std::ios::binary | std::ios::ate);
946+ std::ifstream::pos_type hsaco_file_size = hsaco_file.tellg ();
947+
948+ std::vector<uint8_t > hsaco (hsaco_file_size);
949+ hsaco_file.seekg (0 , std::ios::beg);
950+ hsaco_file.read (reinterpret_cast <char *>(&hsaco[0 ]), hsaco_file_size);
951+ hsaco_file.close ();
952+ if (!keep_tempfiles) {
953+ remove (ir_path.c_str ());
954+ remove (isabin_path.c_str ());
955+ remove (hsaco_path.c_str ());
956+ }
957+ return hsaco;
958+ }
959+
960+ #endif // TENSORFLOW_HSACO_USE_ROCM_LLVM
961+
754962// Links ROCm-Device-Libs into the given module if the module needs it.
755963Status LinkROCDLIfNecessary (llvm::Module* module , std::string gcn_arch_name,
756964 const std::string& rocdl_dir_path,
@@ -850,7 +1058,13 @@ std::pair<std::string, std::string> GetFeatureStrFromGCNArchName(
8501058std::unique_ptr<llvm::TargetMachine> AMDGPUGetTargetMachine (
8511059 llvm::Triple target_triple, GpuVersion gpu_version,
8521060 const HloModuleConfig& hlo_module_config) {
853- return {};
1061+ auto compute_capability =
1062+ std::get_if<se::RocmComputeCapability>(&gpu_version);
1063+
1064+ std::string gcn_arch_name = compute_capability->gcn_arch_name ();
1065+ auto arch = GetFeatureStrFromGCNArchName (gcn_arch_name);
1066+ return GetTargetMachine (std::move (target_triple), arch.first ,
1067+ hlo_module_config, arch.second );
8541068}
8551069
8561070void AMDGPUBackendInit (const HloModuleConfig& hlo_module_config) {
@@ -860,22 +1074,21 @@ void AMDGPUBackendInit(const HloModuleConfig& hlo_module_config) {
8601074 // Initialize the AMDGPU target; it's the only target we link with, so call
8611075 // its specific initialization functions instead of the catch-all
8621076 // InitializeAll*.
863- # if TENSORFLOW_USE_ROCM
1077+
8641078 InitHsacoCacheDir ();
8651079 LLVMInitializeAMDGPUTarget ();
8661080 LLVMInitializeAMDGPUTargetInfo ();
8671081 LLVMInitializeAMDGPUTargetMC ();
8681082 LLVMInitializeAMDGPUAsmPrinter ();
8691083
870- #endif
871-
8721084 llvm::PassRegistry* registry = llvm::PassRegistry::getPassRegistry ();
8731085 InitializePasses (registry);
8741086}
8751087
8761088} // namespace
8771089
8781090namespace amdgpu {
1091+ #ifdef TENSORFLOW_USE_ROCM
8791092StatusOr<std::vector<uint8_t >> CompileToHsaco (
8801093 llvm::Module* module , GpuVersion gpu_version,
8811094 const HloModuleConfig& hlo_module_config,
@@ -900,7 +1113,7 @@ StatusOr<std::vector<uint8_t>> CompileToHsaco(
9001113 auto pos = str.find (' \n ' );
9011114 if (pos != std::string::npos) str = str.substr (pos + 1 );
9021115 }
903-
1116+ // str += hlo_module_config.compilation_cache_key();
9041117 {
9051118 tsl::profiler::TraceMe activity (
9061119 [&] { return absl::StrCat (" Compiling IR" , module ->getName ().str ()); },
@@ -933,7 +1146,7 @@ StatusOr<std::vector<uint8_t>> CompileToHsaco(
9331146 std::unique_ptr<llvm::TargetMachine> target_machine =
9341147 AMDGPUGetTargetMachine (default_target_triple, gpu_version,
9351148 hlo_module_config);
936-
1149+ # ifdef TENSORFLOW_HSACO_USE_ROCM_LLVM
9371150 auto * env = tsl::Env::Default ();
9381151 // Prepare filenames for all stages of compilation:
9391152 // IR, binary ISA, and HSACO.
@@ -971,9 +1184,19 @@ StatusOr<std::vector<uint8_t>> CompileToHsaco(
9711184 std::vector<std::string>{ir_path, linked_ir_path, optimized_ir_path,
9721185 isabin_path});
9731186 }
1187+ #else
1188+ // Link with ROCm-Device-Libs, and optimize the LLVM module.
1189+ TF_RETURN_IF_ERROR (LinkAndOptimizeModule (
1190+ module , gpu_version, hlo_module_config, rocdl_dir_path,
1191+ AMDGPUTargetModuleLinker, default_target_triple, target_machine.get (),
1192+ kAMDGPUInlineThreshold ));
1193+
1194+ // Lower optimized LLVM module to HSA code object.
1195+ TF_ASSIGN_OR_RETURN (hsaco, EmitModuleToHsaco (module , target_machine.get ()));
1196+ #endif // TENSORFLOW_HSACO_USE_ROCM_LLVM
9741197 return hsaco;
9751198}
976-
1199+ #endif // TENSORFLOW_USE_ROCM
9771200} // namespace amdgpu
9781201
9791202} // namespace gpu
0 commit comments