Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 16 additions & 3 deletions QEfficient/cloud/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,20 @@

from QEfficient.base.common import QEFFCommonLoader
from QEfficient.utils import check_and_assign_cache_dir
from QEfficient.utils.custom_yaml import generate_custom_io
from QEfficient.utils.logging_utils import logger

# Specifically for Docker images.
ROOT_DIR = os.path.dirname(os.path.abspath(""))


def get_onnx_model_path(
def get_onnx_path_and_setup_customIO(
model_name: str,
cache_dir: Optional[str] = None,
hf_token: Optional[str] = None,
full_batch_size: Optional[int] = None,
local_model_dir: Optional[str] = None,
mxint8_kv_cache: Optional[int] = False,
):
"""
Exports the PyTorch model to ONNX format if a pre-exported file is not found,
Expand Down Expand Up @@ -63,6 +65,9 @@ def get_onnx_model_path(
)
onnx_model_path = qeff_model.export()
logger.info(f"Generated onnx_path: {onnx_model_path}")

    # Generate the custom IO configuration needed later at compile time.
generate_custom_io(qeff_model, mxint8_kv_cache=mxint8_kv_cache)
return onnx_model_path


Expand All @@ -72,13 +77,14 @@ def main(
hf_token: Optional[str] = None,
local_model_dir: Optional[str] = None,
full_batch_size: Optional[int] = None,
mxint8_kv_cache: Optional[bool] = False,
) -> None:
"""
Main function for the QEfficient ONNX export CLI application.

This function serves as the entry point for exporting a PyTorch model, loaded
via QEFFCommonLoader, to the ONNX format. It prepares the necessary
paths and calls `get_onnx_model_path`.
paths and calls `get_onnx_path_and_setup_customIO`.

Parameters
----------
Expand Down Expand Up @@ -106,12 +112,13 @@ def main(

"""
cache_dir = check_and_assign_cache_dir(local_model_dir, cache_dir)
get_onnx_model_path(
get_onnx_path_and_setup_customIO(
model_name=model_name,
cache_dir=cache_dir,
hf_token=hf_token,
full_batch_size=full_batch_size,
local_model_dir=local_model_dir,
mxint8_kv_cache=mxint8_kv_cache,
)


Expand All @@ -137,5 +144,11 @@ def main(
default=None,
help="Set full batch size to enable continuous batching mode, default is None",
)
# Boolean flag: presence on the command line enables MXINT8 KV-cache
# compression; absence leaves it at the documented default of False.
# `action="store_true"` is required here — without it argparse treats the
# option as string-valued, so omitting it yields None (not False) and any
# supplied string (even "False") is truthy when passed to main().
parser.add_argument(
    "--mxint8_kv_cache",
    "--mxint8-kv-cache",
    action="store_true",
    default=False,
    help="Compress Present/Past KV to MXINT8 using CustomIO config, default is False",
)
args = parser.parse_args()
main(**args.__dict__)
16 changes: 13 additions & 3 deletions QEfficient/compile/compile_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,7 @@ def compile(
This method will be removed soon; use `QEFFAutoModelForCausalLM.compile` instead.

"""

if full_batch_size and batch_size != 1:
raise ValueError("Only either batch_size or full_batch_size should be greater than one")

Expand All @@ -284,11 +285,20 @@ def compile(
full_batch_size=full_batch_size,
)

# Select the customIO config based on the mx flag.
custom_io_file_name = "custom_io_int8.yaml" if mxint8 else "custom_io_fp16.yaml"
dtype_suffix = "int8" if mxint8 else "fp16"
source_path = f"./custom_io_{dtype_suffix}.yaml"
destination_path = os.path.join(os.path.dirname(qpc_path), f"custom_io_{dtype_suffix}.yaml")

# Move the custom YAML file to the cache/qeff_model directory
try:
shutil.move(source_path, destination_path)
print(f"Successfully moved '{source_path}' to '{destination_path}'.")
except Exception as e:
print(f"Error while moving file '{source_path}': {e}")

custom_io_file_name = f"custom_io_{dtype_suffix}.yaml"
if custom_io_file_path is None:
custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name)
custom_io_file_path = os.path.join(os.path.dirname(qpc_path), custom_io_file_name)

if not os.path.isfile(custom_io_file_path):
raise FileNotFoundError(
Expand Down
Loading