Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 16 additions & 3 deletions QEfficient/cloud/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,20 @@

from QEfficient.base.common import QEFFCommonLoader
from QEfficient.utils import check_and_assign_cache_dir
from QEfficient.utils.custom_yaml import generate_custom_io
from QEfficient.utils.logging_utils import logger

# Specifically for Docker images.
ROOT_DIR = os.path.dirname(os.path.abspath(""))


def get_onnx_model_path(
def get_onnx_path_and_setup_customIO(
model_name: str,
cache_dir: Optional[str] = None,
hf_token: Optional[str] = None,
full_batch_size: Optional[int] = None,
local_model_dir: Optional[str] = None,
mxint8_kv_cache: Optional[int] = False,
):
"""
Exports the PyTorch model to ONNX format if a pre-exported file is not found,
Expand Down Expand Up @@ -63,6 +65,9 @@ def get_onnx_model_path(
)
onnx_model_path = qeff_model.export()
logger.info(f"Generated onnx_path: {onnx_model_path}")

    # Generate the custom IO configuration needed later at compile time.
generate_custom_io(qeff_model, mxint8_kv_cache=mxint8_kv_cache)
return onnx_model_path


Expand All @@ -72,13 +77,14 @@ def main(
hf_token: Optional[str] = None,
local_model_dir: Optional[str] = None,
full_batch_size: Optional[int] = None,
mxint8_kv_cache: Optional[bool] = False,
) -> None:
"""
Main function for the QEfficient ONNX export CLI application.

This function serves as the entry point for exporting a PyTorch model, loaded
via QEFFCommonLoader, to the ONNX format. It prepares the necessary
paths and calls `get_onnx_model_path`.
paths and calls `get_onnx_path_and_setup_customIO`.

Parameters
----------
Expand Down Expand Up @@ -106,12 +112,13 @@ def main(

"""
cache_dir = check_and_assign_cache_dir(local_model_dir, cache_dir)
get_onnx_model_path(
get_onnx_path_and_setup_customIO(
model_name=model_name,
cache_dir=cache_dir,
hf_token=hf_token,
full_batch_size=full_batch_size,
local_model_dir=local_model_dir,
mxint8_kv_cache=mxint8_kv_cache,
)


Expand All @@ -137,5 +144,11 @@ def main(
default=None,
help="Set full batch size to enable continuous batching mode, default is None",
)
# Boolean flag: presence on the command line enables MXINT8 KV-cache
# compression; absence leaves it at the documented default of False.
# `action="store_true"` is required here — without it argparse treats the
# option as string-valued, so omitting it yields None (not False) and any
# supplied string (even "False") is truthy when passed to main().
parser.add_argument(
    "--mxint8_kv_cache",
    "--mxint8-kv-cache",
    action="store_true",
    default=False,
    help="Compress Present/Past KV to MXINT8 using CustomIO config, default is False",
)
args = parser.parse_args()
main(**args.__dict__)
16 changes: 13 additions & 3 deletions QEfficient/compile/compile_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,7 @@ def compile(
This method will be removed soon; use `QEFFAutoModelForCausalLM.compile` instead.

"""

if full_batch_size and batch_size != 1:
raise ValueError("Only either batch_size or full_batch_size should be greater than one")

Expand All @@ -284,11 +285,20 @@ def compile(
full_batch_size=full_batch_size,
)

# Select the customIO config based on the mx flag.
custom_io_file_name = "custom_io_int8.yaml" if mxint8 else "custom_io_fp16.yaml"
dtype_suffix = "int8" if mxint8 else "fp16"
source_path = f"./custom_io_{dtype_suffix}.yaml"
destination_path = os.path.join(os.path.dirname(qpc_path), f"custom_io_{dtype_suffix}.yaml")

# Move the custom YAML file to the cache/qeff_model directory
try:
shutil.move(source_path, destination_path)
print(f"Successfully moved '{source_path}' to '{destination_path}'.")
except Exception as e:
print(f"Error while moving file '{source_path}': {e}")

custom_io_file_name = f"custom_io_{dtype_suffix}.yaml"
if custom_io_file_path is None:
custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name)
custom_io_file_path = os.path.join(os.path.dirname(qpc_path), custom_io_file_name)

if not os.path.isfile(custom_io_file_path):
raise FileNotFoundError(
Expand Down
Loading