Skip to content

Commit 7bd9ab8

Browse files
committed
add rocm llvm compiler as an option
1 parent c8d05cc commit 7bd9ab8

File tree

1 file changed

+235
-12
lines changed

1 file changed

+235
-12
lines changed

tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc

Lines changed: 235 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ limitations under the License.
6161
#include "tensorflow/compiler/xla/status_macros.h"
6262
#include "tensorflow/compiler/xla/types.h"
6363
#include "tensorflow/compiler/xla/util.h"
64-
#include "tensorflow/tsl/platform/cuda_libdevice_path.h"
64+
6565
#include "tensorflow/tsl/platform/env.h"
6666
#include "tensorflow/tsl/platform/logging.h"
6767
#include "tensorflow/tsl/platform/path.h"
@@ -73,6 +73,8 @@ limitations under the License.
7373
#if !defined(PLATFORM_GOOGLE) && TENSORFLOW_USE_ROCM
7474
#include "tensorflow/tsl/platform/rocm_rocdl_path.h"
7575
#include "rocm/rocm_config.h"
76+
#else
77+
#include "tensorflow/tsl/platform/cuda_libdevice_path.h"
7678
#endif
7779

7880
namespace xla {
@@ -423,7 +425,7 @@ auto DumpCallbackForModule(std::string module_identifier,
423425
DumpModule(tsl::io::JoinPath(outputs_dir, basename), module);
424426
};
425427
}
426-
428+
#ifdef TENSORFLOW_HSACO_USE_ROCM_LLVM
427429
Status LinkAndOptimizeModule(llvm::Module* module, GpuVersion gpu_version,
428430
const HloModuleConfig& hlo_module_config,
429431
const std::string& device_bitcode_dir_path,
@@ -437,7 +439,99 @@ Status LinkAndOptimizeModule(llvm::Module* module, GpuVersion gpu_version,
437439
device_bitcode_dir_path, ir_path, linked_ir_path,
438440
optimized_ir_path);
439441
}
442+
#else
443+
// Links `module` against the device bitcode under `device_bitcode_dir_path`
// (via `module_linker`) and then optimizes it in-process with LLVM's new pass
// manager.
//
// The pipeline strength is taken from --xla_backend_optimization_level. Levels
// below 2 are reported as unsupported through LOG(ERROR) but are still used;
// a value outside 0..3 leaves the LLVM level default-constructed.
// `inline_threshold` is forwarded to the inliner and `target_machine` provides
// the target IR analysis. `default_target_triple` is not read by this variant.
//
// Returns the linker's error if linking fails, OkStatus() otherwise.
Status LinkAndOptimizeModule(llvm::Module* module, GpuVersion gpu_version,
                             const HloModuleConfig& hlo_module_config,
                             const std::string& device_bitcode_dir_path,
                             TargetModuleLinker module_linker,
                             llvm::Triple default_target_triple,
                             llvm::TargetMachine* target_machine,
                             int inline_threshold) {
  TF_RETURN_IF_ERROR(module_linker(module, gpu_version, hlo_module_config,
                                   device_bitcode_dir_path));

  // NOTE(review): the analysis managers are declared in the same order as
  // before, ahead of everything that registers with them; destruction order
  // may matter to LLVM -- confirm before reordering.
  llvm::LoopAnalysisManager loop_am;
  llvm::FunctionAnalysisManager function_am;
  llvm::CGSCCAnalysisManager cgscc_am;
  llvm::ModuleAnalysisManager module_am;

  // Registered before the PassBuilder adds its own function analyses.
  function_am.registerPass(
      [&] { return target_machine->getTargetIRAnalysis(); });

  llvm::PipelineTuningOptions tuning;
  tuning.SLPVectorization = true;
  tuning.InlinerThreshold = inline_threshold;

  llvm::PassInstrumentationCallbacks callbacks;

  llvm::StandardInstrumentations std_instrumentations(module->getContext(),
                                                      false);
  std_instrumentations.registerCallbacks(callbacks, &module_am);

  llvm::PassBuilder builder(target_machine, tuning, std::nullopt, &callbacks);
  builder.registerModuleAnalyses(module_am);
  builder.registerCGSCCAnalyses(cgscc_am);
  builder.registerFunctionAnalyses(function_am);
  builder.registerLoopAnalyses(loop_am);
  builder.crossRegisterProxies(loop_am, function_am, cgscc_am, module_am);

  // Optionally dump the IR before every non-skipped pass.
  if (hlo_module_config.debug_options().xla_gpu_dump_llvmir()) {
    std::string dump_dir;
    if (!tsl::io::GetTestUndeclaredOutputsDir(&dump_dir)) {
      dump_dir = hlo_module_config.debug_options().xla_dump_to();
    }
    if (dump_dir.empty()) {
      LOG(ERROR) << "--xla_gpu_dump_llvmir is set, but neither the environment "
                 << "variable TEST_UNDECLARED_OUTPUTS_DIR nor the flag "
                 << "--xla_dump_to is set, so the llvm dumps are disabled.";
    } else {
      callbacks.registerBeforeNonSkippedPassCallback(
          DumpCallbackForModule(module->getModuleIdentifier(), dump_dir));
    }
  }

  const int32_t requested_level =
      hlo_module_config.debug_options().xla_backend_optimization_level();

  if (requested_level < 2) {
    LOG(ERROR) << std::string(80, '*');
    LOG(ERROR) << "The XLA GPU backend doesn't support unoptimized code "
                  "generation but ";
    LOG(ERROR) << "--xla_backend_optimization_level is set to "
               << requested_level << "!";
    LOG(ERROR) << "(Supported configuration is "
                  "--xla_backend_optimization_level >= 2.)";
    LOG(ERROR) << std::string(80, '*');
  }

  // Translate the XLA level into the LLVM one; unknown values keep the
  // default-constructed level.
  llvm::OptimizationLevel llvm_level;
  if (requested_level == 0) {
    llvm_level = llvm::OptimizationLevel::O0;
  } else if (requested_level == 1) {
    llvm_level = llvm::OptimizationLevel::O1;
  } else if (requested_level == 2) {
    llvm_level = llvm::OptimizationLevel::O2;
  } else if (requested_level == 3) {
    llvm_level = llvm::OptimizationLevel::O3;
  }

  // Verify, optimize, verify again.
  llvm::ModulePassManager pipeline;
  pipeline.addPass(llvm::VerifierPass());
  if (llvm_level != llvm::OptimizationLevel::O0) {
    pipeline.addPass(builder.buildPerModuleDefaultPipeline(llvm_level));
  } else {
    pipeline.addPass(builder.buildO0DefaultPipeline(llvm_level));
  }
  pipeline.addPass(llvm::VerifierPass());

  pipeline.run(*module, module_am);

  return OkStatus();
}
533+
534+
#endif
441535
// One-time module initializer.
442536
// Must be called only once -- DO NOT CALL DIRECTLY.
443537
void NVPTXBackendInit(const HloModuleConfig& hlo_module_config) {
@@ -491,7 +585,7 @@ void NVPTXBackendInit(const HloModuleConfig& hlo_module_config) {
491585
} // namespace
492586

493587
namespace nvptx {
494-
588+
#ifdef GOOGLE_CUDA
495589
std::string CantFindCudaMessage(absl::string_view msg,
496590
absl::string_view xla_gpu_cuda_data_dir) {
497591
return absl::StrCat(
@@ -588,7 +682,7 @@ StatusOr<std::string> CompileToPtx(
588682
TF_RETURN_IF_ERROR(LinkAndOptimizeModule(
589683
module, gpu_version, hlo_module_config, libdevice_dir_path,
590684
NVPTXTargetModuleLinker, default_target_triple, target_machine.get(),
591-
kDefaultInlineThreshold, "", "", ""));
685+
kDefaultInlineThreshold));
592686

593687
uint64_t end_usecs = tsl::Env::Default()->NowMicros();
594688
RecordLlvmPassesDuration(end_usecs - start_usecs);
@@ -603,7 +697,7 @@ StatusOr<std::string> CompileToPtx(
603697
}
604698
return ptx;
605699
}
606-
700+
#endif // GOOGLE_CUDA
607701
} // namespace nvptx
608702

609703
namespace {
@@ -685,6 +779,8 @@ Status ReadHsaco(std::string hsaco_path, std::vector<uint8_t>& hsaco) {
685779

686780
// Emits the given module to HSA Code Object. target_machine is an initialized
687781
// TargetMachine for the AMDGPU target.
782+
#ifdef TENSORFLOW_HSACO_USE_ROCM_LLVM
783+
688784
StatusOr<std::vector<uint8_t>> EmitModuleToHsaco(
689785
llvm::Module* module, llvm::TargetMachine* target_machine,
690786
const std::string& optimized_ir_path, const std::string& isabin_path,
@@ -751,6 +847,118 @@ StatusOr<std::vector<uint8_t>> EmitModuleToHsaco(
751847
return hsaco;
752848
}
753849

850+
#else
851+
852+
// Lowers `module` to an HSA code object using `target_machine` for code
// generation and an external `ld.lld` for linking.
//
// Steps: dump the IR to a temporary file, emit a GCN object file, link it
// into a shared object with `ld.lld` (looked up under $ROCM_PATH/llvm/bin, or
// /opt/rocm/llvm/bin when ROCM_PATH is unset), then read the result back into
// memory. Intermediate files are placed in the first local temp directory and
// removed afterwards unless TF_ROCM_KEEP_XLA_TEMPFILES is true; in that case
// the post-codegen IR is dumped as well.
//
// Returns the HSACO bytes, or an InternalError when no temp directory exists,
// an output file cannot be created, the target cannot emit object files,
// `ld.lld` is missing or fails, or the HSACO cannot be read back.
StatusOr<std::vector<uint8_t>> EmitModuleToHsaco(
    llvm::Module* module, llvm::TargetMachine* target_machine) {
  auto* env = tsl::Env::Default();
  std::vector<std::string> tempdir_vector;
  env->GetLocalTempDirectories(&tempdir_vector);
  if (tempdir_vector.empty()) {
    return xla::InternalError(
        "Unable to locate a temporary directory for compile-time artifacts.");
  }
  const std::string tempdir_name = tempdir_vector.front();
  VLOG(1) << "Compile-time artifacts located at: " << tempdir_name;

  bool keep_tempfiles = false;
  TF_CHECK_OK(tsl::ReadBoolFromEnvVar("TF_ROCM_KEEP_XLA_TEMPFILES",
                                      /*default_val=*/false, &keep_tempfiles));
  // Prepare filenames for all stages of compilation:
  // IR, binary ISA, and HSACO.
  const std::string random_number = std::to_string(tsl::random::New64());
  const std::string ir_filename =
      absl::StrCat(module->getModuleIdentifier(), random_number + ".ll");
  const std::string ir_path = tsl::io::JoinPath(tempdir_name, ir_filename);

  const std::string ir_opt_filename =
      absl::StrCat(module->getModuleIdentifier(), random_number + "_opt.ll");
  const std::string ir_opt_path =
      tsl::io::JoinPath(tempdir_name, ir_opt_filename);

  const std::string isabin_filename =
      absl::StrCat(module->getModuleIdentifier(), random_number + ".o");
  const std::string isabin_path =
      tsl::io::JoinPath(tempdir_name, isabin_filename);

  const std::string hsaco_filename =
      absl::StrCat(module->getModuleIdentifier(), random_number + ".hsaco");
  const std::string hsaco_path =
      tsl::io::JoinPath(tempdir_name, hsaco_filename);

  // Deletes the intermediate artifacts unless the user asked to keep them.
  // Called on the success path and on every error path past the IR dump so
  // failures do not leak files into the temp directory.
  auto remove_tempfiles = [&] {
    if (keep_tempfiles) return;
    remove(ir_path.c_str());
    remove(isabin_path.c_str());
    remove(hsaco_path.c_str());
  };

  // Dump LLVM IR. The stream is scoped so the file is closed right away.
  {
    std::error_code ec;
    llvm::raw_fd_ostream ir_fs(ir_path, ec, llvm::sys::fs::OF_None);
    if (ec) {
      return xla::InternalError("unable to open %s for writing: %s", ir_path,
                                ec.message());
    }
    module->print(ir_fs, nullptr);
    ir_fs.flush();
  }

  // Emit GCN ISA binary. The stream is declared before the pass manager so it
  // outlives it, and the scope closes the object file before ld.lld reads it.
  {
    std::error_code ec;
    // An object file is binary data, so it is not opened in text mode.
    llvm::raw_fd_ostream isabin_fs(isabin_path, ec, llvm::sys::fs::OF_None);
    if (ec) {
      remove_tempfiles();
      return xla::InternalError("unable to open %s for writing: %s",
                                isabin_path, ec.message());
    }
    llvm::legacy::PassManager pm;
    pm.add(new llvm::TargetLibraryInfoWrapperPass(
        llvm::Triple(module->getTargetTriple())));
    module->setDataLayout(target_machine->createDataLayout());
    // addPassesToEmitFile returns true when the target cannot emit the
    // requested file type.
    if (target_machine->addPassesToEmitFile(pm, isabin_fs, nullptr,
                                            llvm::CGFT_ObjectFile)) {
      remove_tempfiles();
      return xla::InternalError(
          "target machine cannot emit an object file for %s", isabin_path);
    }
    pm.run(*module);
    isabin_fs.flush();
  }

  // When temp files are kept, also dump the module as it looks after code
  // generation has run. This is a debugging aid, so a failure only warns.
  if (keep_tempfiles) {
    std::error_code ec;
    llvm::raw_fd_ostream ir_opt_fs(ir_opt_path, ec, llvm::sys::fs::OF_None);
    if (ec) {
      LOG(WARNING) << "unable to open " << ir_opt_path
                   << " for writing: " << ec.message();
    } else {
      module->print(ir_opt_fs, nullptr);
      ir_opt_fs.flush();
    }
  }

  // Locate lld.
  const char* rocm_path = std::getenv("ROCM_PATH");
  const std::string lld_path =
      tsl::io::JoinPath(rocm_path ? rocm_path : "/opt/rocm", "llvm/bin");

  auto lld_program = llvm::sys::findProgramByName("ld.lld", {lld_path});
  if (!lld_program) {
    remove_tempfiles();
    return xla::InternalError("unable to find ld.lld in %s: %s", lld_path,
                              lld_program.getError().message());
  }
  std::vector<llvm::StringRef> lld_args{
      llvm_ir::AsStringRef("ld.lld"),    llvm_ir::AsStringRef("-flavor"),
      llvm_ir::AsStringRef("gnu"),       llvm_ir::AsStringRef("-shared"),
      llvm_ir::AsStringRef(isabin_path), llvm_ir::AsStringRef("-o"),
      llvm_ir::AsStringRef(hsaco_path),
  };

  std::string error_message;
  const int lld_result =
      llvm::sys::ExecuteAndWait(*lld_program, llvm_ir::AsArrayRef(lld_args),
                                std::nullopt, {}, 0, 0, &error_message);
  if (lld_result) {
    remove_tempfiles();
    return xla::InternalError("ld.lld execute fail: %s, error code %d",
                              error_message, lld_result);
  }

  // Read HSACO. The file is opened at its end so tellg() yields the size.
  std::ifstream hsaco_file(hsaco_path, std::ios::binary | std::ios::ate);
  if (!hsaco_file) {
    remove_tempfiles();
    return xla::InternalError("unable to open HSACO file %s", hsaco_path);
  }
  const std::streamoff hsaco_file_size = hsaco_file.tellg();
  if (hsaco_file_size < 0) {
    hsaco_file.close();
    remove_tempfiles();
    return xla::InternalError("unable to determine the size of HSACO file %s",
                              hsaco_path);
  }

  std::vector<uint8_t> hsaco(static_cast<size_t>(hsaco_file_size));
  hsaco_file.seekg(0, std::ios::beg);
  if (!hsaco.empty()) {
    hsaco_file.read(reinterpret_cast<char*>(hsaco.data()),
                    static_cast<std::streamsize>(hsaco.size()));
  }
  const bool read_ok = static_cast<bool>(hsaco_file);
  hsaco_file.close();
  remove_tempfiles();
  if (!read_ok) {
    return xla::InternalError("unable to read HSACO file %s", hsaco_path);
  }
  return hsaco;
}
959+
960+
#endif // TENSORFLOW_HSACO_USE_ROCM_LLVM
961+
754962
// Links ROCm-Device-Libs into the given module if the module needs it.
755963
Status LinkROCDLIfNecessary(llvm::Module* module, std::string gcn_arch_name,
756964
const std::string& rocdl_dir_path,
@@ -850,7 +1058,13 @@ std::pair<std::string, std::string> GetFeatureStrFromGCNArchName(
8501058
std::unique_ptr<llvm::TargetMachine> AMDGPUGetTargetMachine(
8511059
llvm::Triple target_triple, GpuVersion gpu_version,
8521060
const HloModuleConfig& hlo_module_config) {
853-
return {};
1061+
auto compute_capability =
1062+
std::get_if<se::RocmComputeCapability>(&gpu_version);
1063+
1064+
std::string gcn_arch_name = compute_capability->gcn_arch_name();
1065+
auto arch = GetFeatureStrFromGCNArchName(gcn_arch_name);
1066+
return GetTargetMachine(std::move(target_triple), arch.first,
1067+
hlo_module_config, arch.second);
8541068
}
8551069

8561070
void AMDGPUBackendInit(const HloModuleConfig& hlo_module_config) {
@@ -860,22 +1074,21 @@ void AMDGPUBackendInit(const HloModuleConfig& hlo_module_config) {
8601074
// Initialize the AMDGPU target; it's the only target we link with, so call
8611075
// its specific initialization functions instead of the catch-all
8621076
// InitializeAll*.
863-
#if TENSORFLOW_USE_ROCM
1077+
8641078
InitHsacoCacheDir();
8651079
LLVMInitializeAMDGPUTarget();
8661080
LLVMInitializeAMDGPUTargetInfo();
8671081
LLVMInitializeAMDGPUTargetMC();
8681082
LLVMInitializeAMDGPUAsmPrinter();
8691083

870-
#endif
871-
8721084
llvm::PassRegistry* registry = llvm::PassRegistry::getPassRegistry();
8731085
InitializePasses(registry);
8741086
}
8751087

8761088
} // namespace
8771089

8781090
namespace amdgpu {
1091+
#ifdef TENSORFLOW_USE_ROCM
8791092
StatusOr<std::vector<uint8_t>> CompileToHsaco(
8801093
llvm::Module* module, GpuVersion gpu_version,
8811094
const HloModuleConfig& hlo_module_config,
@@ -900,7 +1113,7 @@ StatusOr<std::vector<uint8_t>> CompileToHsaco(
9001113
auto pos = str.find('\n');
9011114
if (pos != std::string::npos) str = str.substr(pos + 1);
9021115
}
903-
1116+
// str += hlo_module_config.compilation_cache_key();
9041117
{
9051118
tsl::profiler::TraceMe activity(
9061119
[&] { return absl::StrCat("Compiling IR", module->getName().str()); },
@@ -933,7 +1146,7 @@ StatusOr<std::vector<uint8_t>> CompileToHsaco(
9331146
std::unique_ptr<llvm::TargetMachine> target_machine =
9341147
AMDGPUGetTargetMachine(default_target_triple, gpu_version,
9351148
hlo_module_config);
936-
1149+
#ifdef TENSORFLOW_HSACO_USE_ROCM_LLVM
9371150
auto* env = tsl::Env::Default();
9381151
// Prepare filenames for all stages of compilation:
9391152
// IR, binary ISA, and HSACO.
@@ -971,9 +1184,19 @@ StatusOr<std::vector<uint8_t>> CompileToHsaco(
9711184
std::vector<std::string>{ir_path, linked_ir_path, optimized_ir_path,
9721185
isabin_path});
9731186
}
1187+
#else
1188+
// Link with ROCm-Device-Libs, and optimize the LLVM module.
1189+
TF_RETURN_IF_ERROR(LinkAndOptimizeModule(
1190+
module, gpu_version, hlo_module_config, rocdl_dir_path,
1191+
AMDGPUTargetModuleLinker, default_target_triple, target_machine.get(),
1192+
kAMDGPUInlineThreshold));
1193+
1194+
// Lower optimized LLVM module to HSA code object.
1195+
TF_ASSIGN_OR_RETURN(hsaco, EmitModuleToHsaco(module, target_machine.get()));
1196+
#endif // TENSORFLOW_HSACO_USE_ROCM_LLVM
9741197
return hsaco;
9751198
}
976-
1199+
#endif // TENSORFLOW_USE_ROCM
9771200
} // namespace amdgpu
9781201

9791202
} // namespace gpu

0 commit comments

Comments
 (0)