
Commit aa5f54e

unit test compiles
1 parent 1885ff4 commit aa5f54e

19 files changed (+760, -881 lines)

tensorflow/compiler/xla/debug_options_flags.cc

Lines changed: 11 additions & 2 deletions

@@ -74,7 +74,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
 
   // Note: CublasLt will be used for FP8 GEMMs regardless of the value of this
   // flag.
-  opts.set_xla_gpu_enable_cublaslt(false);
+  opts.set_xla_gpu_enable_cublaslt(true);
 
   // TODO(b/258036887): Enable once CUDA Graphs are fully supported.
   opts.set_xla_gpu_cuda_graph_level(0);
@@ -122,7 +122,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
   opts.set_xla_partitioning_algorithm(
       DebugOptions::PARTITIONING_ALGORITHM_NOOP);
 
-  opts.set_xla_gpu_enable_triton_gemm(true);
+  opts.set_xla_gpu_enable_triton_gemm(false);
   opts.set_xla_gpu_enable_cudnn_int8x32_convolution_reordering(true);
   opts.set_xla_gpu_triton_gemm_any(false);
 
@@ -131,6 +131,15 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
   opts.set_xla_gpu_enable_while_loop_reduce_scatter_code_motion(false);
 
   opts.set_xla_gpu_collective_inflation_factor(1);
+
+  // Minimum combined size of matrices in matrix multiplication to
+  // be rewritten to cuBLAS or Triton kernel call.
+  // This threshold is a conservative estimate and has been measured
+  // to be always beneficial (up to generally several times faster)
+  // on V100 and H100 GPUs. See openxla/xla #9319 for details.
+  const int64_t kDefaultMinGemmRewriteSize = 100;
+  opts.set_xla_gpu_gemm_rewrite_size_threshold(kDefaultMinGemmRewriteSize);
+
   return opts;
 }
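As a usage note (not part of this commit), the sketch below shows how a client could still opt back into the Triton GEMM rewriter through the DebugOptions proto while keeping the new size threshold. It assumes the usual xla::GetDebugOptionsFromFlags() entry point from debug_options_flags.h; the wrapper function name is illustrative only.

    #include "tensorflow/compiler/xla/debug_options_flags.h"

    // Hypothetical helper: starts from the flag-derived defaults, re-enables the
    // Triton GEMM rewriter (turned off by default in this commit), and keeps the
    // new rewrite-size threshold.
    xla::DebugOptions MakeTritonGemmOptions() {
      xla::DebugOptions opts = xla::GetDebugOptionsFromFlags();
      // Re-enable the Triton GEMM rewriter.
      opts.set_xla_gpu_enable_triton_gemm(true);
      // Matmuls whose combined size is below this value are left alone instead
      // of being rewritten to cuBLAS/Triton calls.
      opts.set_xla_gpu_gemm_rewrite_size_threshold(100);
      return opts;
    }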

tensorflow/compiler/xla/mlir_hlo/lhlo_gpu/IR/lhlo_gpu_ops.td

Lines changed: 1 addition & 0 deletions

@@ -168,6 +168,7 @@ def LHLOGPU_CublasLtMatmulOp : LHLOGPU_Op<"cublas.lt.matmul", [AttrSizedOperandS
     Arg<LHLO_Buffer, "", [MemWrite]>:$d,
     Arg<Optional<LHLO_Buffer>, "", [MemRead]>:$bias,
     Arg<Optional<LHLO_Buffer>, "", [MemRead, MemWrite]>:$aux,
+    Arg<Optional<LHLO_Buffer>, "", [MemRead, MemWrite]>:$workspace,
     MHLO_DotDimensionNumbers:$dot_dimension_numbers,
     MHLO_PrecisionConfigAttr:$precision_config,
     F64Attr:$alpha_real,

tensorflow/compiler/xla/service/gpu/cublas_cudnn.cc

Lines changed: 0 additions & 6 deletions

@@ -34,14 +34,8 @@ bool IsCublasLtMatmul(const HloInstruction& hlo) {
          hlo.custom_call_target() == kCublasLtMatmulCallTarget;
 }
 
-bool IsCublasLtMatmulF8(const HloInstruction& hlo) {
-  return hlo.opcode() == HloOpcode::kCustomCall &&
-         hlo.custom_call_target() == kCublasLtMatmulF8CallTarget;
-}
-
 const absl::string_view kGemmCallTarget = "__cublas$gemm";
 const absl::string_view kCublasLtMatmulCallTarget = "__cublas$lt$matmul";
-const absl::string_view kCublasLtMatmulF8CallTarget = "__cublas$lt$matmul$f8";
 const absl::string_view kTriangularSolveCallTarget = "__cublas$triangularSolve";
 
 const absl::string_view kCudnnConvBackwardInputCallTarget =
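For context, here is a minimal sketch (an assumption, not code from this commit) of how a caller might use the remaining predicate now that the FP8-specific variant is gone; the helper name and the exact include paths are illustrative.

    #include "tensorflow/compiler/xla/hlo/ir/hlo_computation.h"
    #include "tensorflow/compiler/xla/service/gpu/cublas_cudnn.h"

    // Hypothetical helper: counts cublasLt matmul custom calls in a computation.
    // With IsCublasLtMatmulF8 removed, IsCublasLtMatmul is the single check for
    // the __cublas$lt$matmul target.
    int CountCublasLtMatmuls(const xla::HloComputation& computation) {
      int count = 0;
      for (const xla::HloInstruction* instr : computation.instructions()) {
        if (xla::gpu::IsCublasLtMatmul(*instr)) {
          ++count;
        }
      }
      return count;
    }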

tensorflow/compiler/xla/service/gpu/cublas_cudnn.h

Lines changed: 0 additions & 6 deletions

@@ -61,18 +61,12 @@ bool IsLegacyCublasMatmul(const HloInstruction& hlo);
 // Matrix multiplication that calls into cublasLt.
 bool IsCublasLtMatmul(const HloInstruction& hlo);
 
-// Scaled matrix multiplication in FP8. Calls into cublasLt.
-bool IsCublasLtMatmulF8(const HloInstruction& hlo);
-
 // A call to cuBLAS general matrix multiplication API.
 extern const absl::string_view kGemmCallTarget;
 
 // A call to cuBLAS Lt API matrix multiplication.
 extern const absl::string_view kCublasLtMatmulCallTarget;
 
-// A call to cuBLASLt for scaled matrix multiplication in FP8.
-extern const absl::string_view kCublasLtMatmulF8CallTarget;
-
 // A call to cuBLAS for a triangular solve.
 //
 // Like cudnn convolutions, this op returns a tuple (result, scratch_memory).
