
Commit 0ef6829

Peft Lora implementation (quic#85)
AutoPeftModelForCausalLM for loading LoRA models
Better export code that can be utilized for other auto classes
Hashing model cache location to avoid exporting again
Hashing model compile location to avoid compiling again

Signed-off-by: Ilango Rajagopal <[email protected]>
1 parent: 67922d7
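A condensed usage sketch of what this commit adds, lifted from examples/peft_models.py below; the caching comments restate the commit message rather than verified behaviour:

from QEfficient import QEffAutoPeftModelForCausalLM

# Load a LoRA adapter together with its base model: (adapter repo, adapter name)
m = QEffAutoPeftModelForCausalLM.from_pretrained("predibase/magicoder", "magicoder")
m.export()                                   # per the commit message, re-runs hit the hashed export cache
m.compile(prefill_seq_len=32, ctx_len=1024)  # likewise, re-runs hit the hashed compile cache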

13 files changed: +1,112 -0 lines


QEfficient/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -9,6 +9,7 @@
 from QEfficient.compile.compile_helper import compile
 from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
 from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
+from QEfficient.peft import QEffAutoPeftModelForCausalLM
 from QEfficient.transformers.transform import transform

 # Users can use QEfficient.export for exporting models to ONNX
@@ -22,5 +23,6 @@
     "cloud_ai_100_exec_kv",
     "QEffAutoModel",
     "QEFFAutoModelForCausalLM",
+    "QEffAutoPeftModelForCausalLM",
     "QEFFCommonLoader",
 ]

QEfficient/peft/__init__.py

Lines changed: 14 additions & 0 deletions

# -----------------------------------------------------------------------------
#
# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------

from QEfficient.peft.auto import QEffAutoPeftModelForCausalLM
from QEfficient.peft.peft_model import QEffPeftModelForCausalLM

__all__ = [
    "QEffAutoPeftModelForCausalLM",
    "QEffPeftModelForCausalLM",
]

QEfficient/peft/auto.py

Lines changed: 559 additions & 0 deletions
Large diffs are not rendered by default.
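Although auto.py itself is collapsed in this view, its adapter-management surface can be seen in examples/peft_models.py below: adapters are registered once and switched without re-exporting or re-compiling. A minimal sketch, continuing the `m` object from the sketch above:

# Register a second adapter on the same compiled model, then make it active.
m.load_adapter("predibase/gsm8k", "gsm8k")  # (HF adapter repo, local adapter name)
m.set_adapter("gsm8k")
m.generate(**tokenizer("James runs 3 sprints...", return_tensors="pt"), max_new_tokens=1024)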

QEfficient/peft/onnx_transforms.py

Lines changed: 56 additions & 0 deletions

# -----------------------------------------------------------------------------
#
# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# ----------------------------------------------------------------------------

from typing import Tuple

import onnx

from QEfficient.base.onnx_transforms import OnnxTransform


class AdapterWeightsToInputsTransform(OnnxTransform):
    @classmethod
    def apply(cls, model: onnx.ModelProto, *, adapter_name: str, **kwargs) -> Tuple[onnx.ModelProto, bool]:
        transformed = False
        removed_initializers = []

        # Find nodes with lora weights as inputs
        weight_suffix = f".{adapter_name}.weight"
        lora_weight_nodes = {
            inp: node for node in model.graph.node for inp in node.input if inp.endswith(weight_suffix)
        }

        for i, weight in enumerate(model.graph.initializer):
            if weight.name.endswith(weight_suffix):
                transformed = True

                # Create input/output for lora weights
                new_weight_name = weight.name[: -len(weight_suffix)] + ".weight"
                type_proto = onnx.helper.make_tensor_type_proto(weight.data_type, shape=list(weight.dims))
                inp = onnx.ValueInfoProto(name=new_weight_name, type=type_proto)
                out = onnx.ValueInfoProto(name=new_weight_name + "_RetainedState", type=type_proto)
                model.graph.input.append(inp)
                model.graph.output.append(out)

                # Create a node that connects input -> output
                node = onnx.helper.make_node("Identity", [inp.name], [out.name], new_weight_name + "_identity")
                model.graph.node.append(node)

                # Rename weight input
                lora_weight_node = lora_weight_nodes[weight.name]
                for j, inp in enumerate(lora_weight_node.input):
                    if inp == weight.name:
                        lora_weight_node.input[j] = new_weight_name

                # Remove weight initializers
                removed_initializers.append(i)

        if transformed:
            for i in sorted(removed_initializers, reverse=True):
                model.graph.initializer.pop(i)

        return model, transformed
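A toy self-check of the transform above (not part of the commit): a one-node graph whose MatMul weight is a LoRA initializer for an adapter named "default". After the transform, the initializer is gone, replaced by a runtime input and a matching "_RetainedState" output, which is what lets adapter weights be fed and swapped at run time instead of being baked into the exported graph.

import numpy as np
import onnx
from onnx import helper, numpy_helper

# One MatMul whose weight initializer follows the "<prefix>.<adapter>.weight" pattern.
w = numpy_helper.from_array(np.zeros((4, 4), dtype=np.float32), "proj.lora_A.default.weight")
x = helper.make_tensor_value_info("x", onnx.TensorProto.FLOAT, [1, 4])
y = helper.make_tensor_value_info("y", onnx.TensorProto.FLOAT, [1, 4])
matmul = helper.make_node("MatMul", ["x", "proj.lora_A.default.weight"], ["y"])
model = helper.make_model(helper.make_graph([matmul], "toy", [x], [y], [w]))

model, transformed = AdapterWeightsToInputsTransform.apply(model, adapter_name="default")
assert transformed
print([i.name for i in model.graph.input])   # ['x', 'proj.lora_A.weight']
print([o.name for o in model.graph.output])  # ['y', 'proj.lora_A.weight_RetainedState']
print(len(model.graph.initializer))          # 0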

QEfficient/peft/peft_model.py

Lines changed: 61 additions & 0 deletions

# -----------------------------------------------------------------------------
#
# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# ----------------------------------------------------------------------------

from peft import PeftModelForCausalLM, PeftType


class QEffPeftModelForCausalLM(PeftModelForCausalLM):
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        position_ids=None,
        past_key_values=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        task_ids=None,
        **kwargs,
    ):
        peft_config = self.active_peft_config
        if not peft_config.is_prompt_learning:
            if self.base_model.config.model_type == "mpt":
                if inputs_embeds is not None:
                    raise AssertionError("forward in MPTForCausalLM does not support inputs_embeds")
                return self.base_model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    past_key_values=past_key_values,
                    labels=labels,
                    output_attentions=output_attentions,
                    output_hidden_states=output_hidden_states,
                    return_dict=return_dict,
                    **kwargs,
                )

            if peft_config.peft_type == PeftType.POLY:
                kwargs["task_ids"] = task_ids

            with self._enable_peft_forward_hooks(**kwargs):
                kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args}
                return self.base_model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    past_key_values=past_key_values,
                    inputs_embeds=inputs_embeds,
                    labels=labels,
                    output_attentions=output_attentions,
                    output_hidden_states=output_hidden_states,
                    return_dict=return_dict,
                    **kwargs,
                )

        raise NotImplementedError("Prompt learning methods are not supported from QEfficient")
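The apparent difference from peft's own PeftModelForCausalLM.forward is that position_ids and past_key_values are accepted as explicit arguments and threaded through to the base model, which KV-cache-style ONNX export needs as named inputs. A hedged sketch of the decode-step call shape (the model and state names are illustrative, not from this diff):

import torch

# One decode step: a single new token at an explicit position, reusing retained KV state.
out = qeff_peft_model(
    input_ids=torch.tensor([[42]]),    # next token id
    position_ids=torch.tensor([[7]]),  # explicit position for KV-cache decode
    past_key_values=past_key_values,   # retained state from the prefill step
)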
QEfficient/peft/pytorch_transforms.py

Lines changed: 15 additions & 0 deletions

# -----------------------------------------------------------------------------
#
# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# ----------------------------------------------------------------------------

from peft import PeftModelForCausalLM

from QEfficient.base.pytorch_transforms import ModuleMappingTransform
from QEfficient.peft.peft_model import QEffPeftModelForCausalLM


class PeftModelInputsTransform(ModuleMappingTransform):
    _module_mapping = {PeftModelForCausalLM: QEffPeftModelForCausalLM}
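A hedged sketch of applying this mapping, assuming ModuleMappingTransform exposes the same classmethod apply(model) -> (model, transformed) convention as the ONNX transforms above:

from peft import AutoPeftModelForCausalLM

peft_model = AutoPeftModelForCausalLM.from_pretrained("predibase/magicoder")
peft_model, transformed = PeftModelInputsTransform.apply(peft_model)
# If transformed, the top-level PeftModelForCausalLM now behaves as
# QEffPeftModelForCausalLM, whose forward() threads position_ids and
# past_key_values through to the base model.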

QEfficient/utils/cache.py

Lines changed: 41 additions & 0 deletions

# -----------------------------------------------------------------------------
#
# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# ----------------------------------------------------------------------------

import json
import os
from pathlib import Path

QEFF_HOME: Path = None
if "QEFF_HOME" in os.environ:
    QEFF_HOME = Path(os.environ["QEFF_HOME"])
elif "XDG_CACHE_HOME" in os.environ:
    QEFF_HOME = Path(os.environ["XDG_CACHE_HOME"]) / "qeff_models"
else:
    QEFF_HOME = Path("~/.cache/qeff_models").expanduser()


def json_serializable(obj):
    if isinstance(obj, set):
        return sorted(obj)
    raise TypeError(f"Object of type {obj.__class__.__name__} is not JSON serializable")


def to_hashable(obj) -> bytes:
    """
    Converts obj to bytes such that same object will result in same hash
    """
    return json.dumps(
        obj,
        skipkeys=False,
        ensure_ascii=True,
        check_circular=True,
        allow_nan=False,
        indent=None,
        separators=(",", ":"),
        default=json_serializable,
        sort_keys=True,
    ).encode()
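This is presumably what backs the "hashing model cache location" behaviour in the commit message; a sketch of how such a cache key could be derived (the sha256 choice and the key layout are illustrative assumptions, not taken from this diff):

import hashlib

export_args = {"model_name": "predibase/magicoder", "batch_size": 1, "seq_len": 32}
cache_dir = QEFF_HOME / hashlib.sha256(to_hashable(export_args)).hexdigest()
# Identical arguments always serialize to the same bytes (sort_keys=True,
# fixed separators), so a previous export/compile at cache_dir can be reused.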

QEfficient/utils/constants.py

Lines changed: 3 additions & 0 deletions

@@ -43,6 +43,9 @@ def get_models_dir():
 
 QEFF_MODELS_DIR = get_models_dir()
 
+ONNX_EXPORT_EXAMPLE_BATCH_SIZE = 1
+ONNX_EXPORT_EXAMPLE_SEQ_LEN = 32
+
 
 class Constants:
     # Export Constants.

docs/source/hl_api.md

Lines changed: 7 additions & 0 deletions

@@ -9,6 +9,13 @@
    :undoc-members:
    :exclude-members: QEffAutoModel,QEFFTransformersBase, run_ort, run_pytorch, get_tokenizer, run_cloud_ai_100, execute
 ```
+
+## `QEffAutoPeftModelForCausalLM`
+```{eval-rst}
+.. autoclass:: QEfficient.peft.auto.QEffAutoPeftModelForCausalLM
+   :members:
+```
+
 ## `export`
 ```{eval-rst}
 .. automodule:: QEfficient.exporter.export_hf_to_cloud_ai_100

examples/peft_models.py

Lines changed: 68 additions & 0 deletions

# -----------------------------------------------------------------------------
#
# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------

from transformers import AutoTokenizer, TextStreamer

from QEfficient import QEffAutoPeftModelForCausalLM

base_model_name = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
streamer = TextStreamer(tokenizer)

m = QEffAutoPeftModelForCausalLM.from_pretrained("predibase/magicoder", "magicoder")
m.export()
m.compile(prefill_seq_len=32, ctx_len=1024)

# Magicoder adapter
m.set_adapter("magicoder")
inputs = tokenizer("def fibonacci", return_tensors="pt")
m.generate(**inputs, streamer=streamer, max_new_tokens=1024)

# TLDR, summary generator
m.load_adapter("predibase/tldr_headline_gen", "tldr_headline_gen")
m.set_adapter("tldr_headline_gen")
inputs = tokenizer(
    """Summarize this passage in one sentence or less: Jeffrey Berns, CEO of Blockchains LLC, wants the Nevada government to allow companies like \
his to form local governments on land they own, granting them power over everything from \
schools to law enforcement. Berns envisions a city based on digital currencies and \
blockchain storage. His company is proposing to build a 15,000 home town 12 miles east of \
Reno. Nevada Lawmakers have responded with intrigue and skepticism. The proposed \
legislation has yet to be formally filed or discussed in public hearings.

Summary: """,
    return_tensors="pt",
)
m.generate(**inputs, streamer=streamer, max_new_tokens=1024)

# Math problems
m.load_adapter("predibase/gsm8k", "gsm8k")
m.set_adapter("gsm8k")
inputs = tokenizer(
    "James decides to run 3 sprints 3 times a week. He runs 60 meters each sprint. \
How many total meters does he run a week?",
    return_tensors="pt",
)
m.generate(**inputs, streamer=streamer, max_new_tokens=1024)

# News explanation
m.load_adapter("predibase/agnews_explained", "agnews_explained")
m.set_adapter("agnews_explained")
inputs = tokenizer(
    """Below is a news article. Please classify it under one of the following \
classes (World, Business, Sports, Sci/Tech) and provide a reasonable coherent explanation for \
why the article is classified as such. Please format your response as a JSON payload.

### Article: US poverty rate climbs, along with number lacking health coverage (AFP) AFP - The \
number of Americans living in poverty or without health insurance grew last year, a government \
survey showed, adding potential dynamite in the battle for the White House.

### JSON Response

""",
    return_tensors="pt",
)
m.generate(**inputs, streamer=streamer, max_new_tokens=1024)
