
Commit d0b6355

Initial gpt-oss support for turbomind (#3839)
* initial gpt-oss support
* fix lint
* fix missing include
* fix cu12.8 build
* guard cuda data types
* guard data type
1 parent a199415 commit d0b6355

81 files changed (+1076, −4076 lines)


CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -43,6 +43,8 @@ if (BUILD_TEST)
   Catch2
   GIT_REPOSITORY https://github.com/catchorg/Catch2.git
   GIT_TAG v3.8.0
+  GIT_SHALLOW ON
+  EXCLUDE_FROM_ALL
 )
 FetchContent_MakeAvailable(Catch2)
 endif()

lmdeploy/turbomind/deploy/config.py

Lines changed: 6 additions & 1 deletion
@@ -1,7 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import inspect
 import json
-from dataclasses import asdict, fields
+from dataclasses import asdict, field, fields
 from typing import List
 
 # use pydantic.dataclasses.dataclass to check data type
@@ -61,6 +61,9 @@ class ModelConfig:
     inter_size: List[int] = None
     norm_eps: float = None
     attn_bias: int = 0
+    mlp_bias: bool = False
+    window_size: List[int] = field(default_factory=list)
+    attn_sink: bool = False
     qk_norm: bool = False
     size_per_head: int = 128
     group_size: int = 64
@@ -70,8 +73,10 @@ class ModelConfig:
     mlp_tp_size: int = 1
     model_format: str = 'hf'
     expert_num: List[int] = ()
+    expert_router_bias: bool = False
     expert_inter_size: int = 0
     experts_per_token: int = 0
+    activation_type: str = ''
     moe_shared_gate: bool = False
     norm_topk_prob: bool = False
     routed_scale: float = 1.0
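For orientation, a minimal sketch (not part of the commit) of how the new ModelConfig fields describe a gpt-oss style checkpoint; all values below are illustrative:

from lmdeploy.turbomind.deploy.config import ModelConfig

# Illustrative values only: window_size gives a per-layer sliding window
# (0 = full attention), attn_sink marks per-head sink logits, and
# expert_router_bias says the MoE router carries a bias term.
cfg = ModelConfig(mlp_bias=True,
                  window_size=[128, 0, 128, 0],
                  attn_sink=True,
                  expert_router_bias=True,
                  activation_type='gpt-oss')
assert cfg.window_size[1] == 0  # layer 1 attends over the full context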

lmdeploy/turbomind/deploy/loader.py

Lines changed: 16 additions & 6 deletions
@@ -23,10 +23,11 @@
 
 class BaseLoader(ABC):
 
-    def __init__(self, model_path: str, pattern):
+    def __init__(self, model_path: str, pattern, mappings: list):
         self.model_path = model_path
         self.pattern = pattern
         self.item_count = defaultdict(int)
+        self.mappings = mappings
 
     def get_index(self, index_name: str, file_pattern: str) -> Tuple[dict, list]:
         """Get shards and weight map (if possible) for the model."""
@@ -44,15 +45,24 @@ def get_index(self, index_name: str, file_pattern: str) -> Tuple[dict, list]:
             raise RuntimeError(f'failed to locate weight files for {self.model_path}')
         return sorted(shards), index
 
+    def map_key(self, key: str):
+        if self.mappings:
+            k = str(key)
+            for f in self.mappings:
+                k = f(k)
+            return k
+        else:
+            return key
+
     @abstractmethod
     def items(self) -> Iterator[Tuple[int, dict]]:
         pass
 
 
 class SafetensorsLoader(BaseLoader):
 
-    def __init__(self, model_path: str, pattern: str, index_name=None, file_pattern=None):
-        super().__init__(model_path, pattern)
+    def __init__(self, model_path: str, pattern: str, mappings: list, index_name=None, file_pattern=None):
+        super().__init__(model_path, pattern, mappings)
         self.shards, index = self.get_index(index_name, file_pattern)
         if not index:
             # there is no model.safetensors.index.json in the model_path,
@@ -87,7 +97,7 @@ def items(self):
             else:
                 idx = int(match[0])
                 param = params[idx]
-                param[k] = f.get_tensor(k)
+                param[self.map_key(k)] = f.get_tensor(k)
             if len(param) == self.item_count[idx]:
                 yield (idx, params.pop(idx))
         if misc:
@@ -164,8 +174,8 @@ def items(self):
         self.que.task_done()
 
 
-def create_loader(model_path: Union[str, Queue], pattern: str) -> BaseLoader:
-    args = (model_path, pattern)
+def create_loader(model_path: Union[str, Queue], pattern: str, mappings: list) -> BaseLoader:
+    args = (model_path, pattern, mappings)
 
     if isinstance(model_path, Queue):
         # used for `update_params`
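To see what the new mappings hook buys, here is a self-contained sketch of map_key at work, reusing the map_experts regexes this commit adds for gpt-oss (see gpt_oss.py below); the sample keys are illustrative:

import re

def map_experts(s):
    # normalize expert parameter names to canonical '.weight'/'.bias' suffixes
    s = re.sub(r'(experts.*proj)$', r'\1.weight', s)
    s = re.sub(r'(experts.*proj)_bias$', r'\1.bias', s)
    return s

mappings = [map_experts]

def map_key(key):
    # BaseLoader.map_key applies every mapping in order
    k = str(key)
    for f in mappings:
        k = f(k)
    return k

print(map_key('model.layers.0.mlp.experts.down_proj'))       # ...down_proj.weight
print(map_key('model.layers.0.mlp.experts.down_proj_bias'))  # ...down_proj.bias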

lmdeploy/turbomind/deploy/module.py

Lines changed: 16 additions & 5 deletions
@@ -50,6 +50,8 @@ def pad_out_dims(x: torch.Tensor, dims: int):
 
 
 def pad_in_dims(x: torch.Tensor, dims: int):
+    if x.dim() == 1:  # 1-dim object does not have input dim (e.g. bias)
+        return x
     pad = dims - x.size(0)
     assert x.dim() == 2
     assert pad >= 0
@@ -119,6 +121,8 @@ def _export(self, inter_size: int, fmt: str, idx: int, w123, kind: str, pack_fn,
         self.model.save_split(w2, fmt.format(idx, 'w2', kind), split_dim=0, split_num=self.tp, copy=is_lora_b)
 
     def apply(self, i: int, r: BaseReader):
+        if not self.inter_size[i]:
+            return
         for e in get_params(r.ffn(i, None)):
             e(partial(self._export, self.inter_size[i], self._ffn), partial(r.ffn, i), i)
 
@@ -132,7 +136,7 @@ class MoeFfn(Ffn):
     """
 
     _moe_ffn_expert = 'layers.{0}.moe_ffn.experts.E.{1}.{2}'
-    _moe_ffn_gate = 'layers.{0}.moe_ffn.gate.weight'
+    _moe_ffn_gate = 'layers.{0}.moe_ffn.gate.{1}'
     _moe_ffn_shared_gate = 'layers.{0}.moe_ffn.shared_gate.weight'
 
     def __init__(self, model: BaseOutputModel):
@@ -144,17 +148,20 @@ def __init__(self, model: BaseOutputModel):
     def apply(self, i: int, r: BaseReader):
         if self.expert_num[i] == 0:
             return
-        for p in get_params(r.moe_ffn_expert()):
+        for p in get_params(r.moe_ffn_expert(), 1):
             for e in range(self.expert_num[i]):
                 fmt = self._moe_ffn_expert.replace('E', str(e))
                 p(partial(self._export, self.inter_size, fmt), partial(r.moe_ffn_expert, e, i), i)
 
-        gate = transpose(r.moe_ffn_gate(i))
-        self.model.save_split(gate, self._moe_ffn_gate.format(i))
+        # router
+        gate = transpose(r.moe_ffn_gate(i, 'weight'))
+        self.model.save_split(gate, self._moe_ffn_gate.format(i, 'weight'))
+        bias = r.moe_ffn_gate(i, 'bias')
+        if bias is not None:
+            self.model.save_split(bias, self._moe_ffn_gate.format(i, 'bias'))
 
         if self.shared_gate:
             shared_gate = transpose(r.moe_ffn_shared_gate(i))
-            # print(shared_gate)
             self.model.save_split(shared_gate, self._moe_ffn_shared_gate.format(i))
 
 
@@ -172,6 +179,7 @@ def __init__(self, model: BaseOutputModel):
         self.head_dim = model.model_config.size_per_head
         self.attn_bias = model.model_config.attn_bias
         self.qk_norm = model.model_config.qk_norm
+        self.attn_sink = model.model_config.attn_sink
         self.group_size = max(1, model.model_config.group_size)
 
     def _reorder_and_merge(self, qkvo, gs: int):
@@ -250,6 +258,9 @@ def apply(self, i: int, r: BaseReader):
             k = permute_v2(k, self.head_dim)
             self.model.save_split(q, self._attn.format(i, 'q_norm', '')[:-1])
             self.model.save_split(k, self._attn.format(i, 'k_norm', '')[:-1])
+        if self.attn_sink:
+            sinks = r.attn_sinks(i)
+            self.model.save_split(sinks, self._attn.format(i, 'sinks', '')[:-1], split_dim=0, split_num=self.tp)
 
 
 class MLA(Module):
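The pad_in_dims guard exists because biases are 1-D and have no input dimension to pad. A runnable sketch; the hunk above only shows the head of the function, so the F.pad call here is an assumption about the hidden tail:

import torch
import torch.nn.functional as F

def pad_in_dims(x: torch.Tensor, dims: int):
    if x.dim() == 1:  # 1-dim object does not have input dim (e.g. bias)
        return x
    pad = dims - x.size(0)
    assert x.dim() == 2
    assert pad >= 0
    return F.pad(x, (0, 0, 0, pad))  # assumed: zero-pad the input dim

w = torch.randn(6, 4)
b = torch.randn(4)
assert pad_in_dims(w, 8).shape == (8, 4)  # input dim padded 6 -> 8
assert pad_in_dims(b, 8).shape == (4,)    # bias passes through untouched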

lmdeploy/turbomind/deploy/source_model/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -3,6 +3,7 @@
 from .deepseek2 import DeepSeek2Model  # noqa: F401
 from .deepseek_vl import DeepSeekVLModel  # noqa: F401
 from .glm4 import Glm4Model  # noqa: F401
+from .gpt_oss import GptOssModel  # noqa: F401
 from .internlm2 import InternLM2Model  # noqa: F401
 from .internvl import InternVLModel  # noqa: F401
 from .llama import LlamaModel  # noqa: F401

lmdeploy/turbomind/deploy/source_model/deepseek2.py

Lines changed: 2 additions & 2 deletions
@@ -8,8 +8,8 @@
 
 class DeepSeek2Reader(LlamaReader):
 
-    def moe_ffn_gate(self, i):
-        return self.params.get(f'model.layers.{i}.mlp.gate.weight')
+    def moe_ffn_gate(self, i, kind):
+        return self.params.get(f'model.layers.{i}.mlp.gate.{kind}')
 
     def moe_ffn_expert(self, e=None, i=None, kind=None):
         if not kind:
lmdeploy/turbomind/deploy/source_model/gpt_oss.py

Lines changed: 64 additions & 0 deletions

@@ -0,0 +1,64 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+import re
+
+from .base import INPUT_MODELS
+from .llama import LlamaModel, LlamaReader
+
+
+def map_experts(str):
+    s = re.sub(r'(experts.*proj)$', r'\1.weight', str)
+    s = re.sub(r'(experts.*proj)_bias$', r'\1.bias', s)
+    return s
+
+
+class GptOssReader(LlamaReader):
+
+    mappings = [map_experts]
+
+    def moe_ffn_expert(self, e=None, i=None, kind=None):
+        if not kind:
+            return self.filter(r'experts')
+        result = []
+        for key in ['gate_up', 'down']:
+            name = f'{self.attn_layer_prefix}.{i}.mlp.experts.{key}_proj.{kind}'
+            tensor = self.params.get(name)[e]
+            if tensor.ndim == 2:
+                tensor = tensor.cuda().t()  # experts in unsloth/gpt-oss-20b-BF16 are transposed
+            if key == 'gate_up':
+                gate, up = tensor[::2], tensor[1::2]
+                result.append(self.transform(gate, kind))
+                result.append(self.transform(up, kind))
+            else:
+                result.append(self.transform(tensor, kind))
+        return (result[0], result[2], result[1])
+
+    def moe_ffn_gate(self, i, kind):
+        return self.transform(self.params.get(f'{self.attn_layer_prefix}.{i}.mlp.router.{kind}'), kind)
+
+    def attn_sinks(self, i):
+        return self.params.get(f'{self.attn_layer_prefix}.{i}.self_attn.sinks')
+
+
+@INPUT_MODELS.register_module(name='gpt-oss')
+class GptOssModel(LlamaModel):
+
+    Reader = GptOssReader
+
+    def model_info(self):
+        cfg = self.model_config
+        types = cfg['layer_types']
+        sliding_window = cfg['sliding_window']
+        info = super().model_info()
+        info.update(attn_bias=int(cfg['attention_bias']),
+                    mlp_bias=True,
+                    expert_router_bias=True,
+                    expert_num=cfg['num_local_experts'],
+                    expert_inter_size=cfg['intermediate_size'],
+                    experts_per_token=cfg['experts_per_token'],
+                    norm_topk_prob=True,
+                    inter_size=0,
+                    window_size=[sliding_window if x == 'sliding_attention' else 0 for x in types],
+                    attn_sink=True,
+                    activation_type='gpt-oss')
+        return info
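The gate_up handling is the subtle part of GptOssReader: the fused projection stores gate and up rows interleaved along the output dimension, so even rows form the gate and odd rows the up projection. A sketch with illustrative shapes:

import torch

inter_size, hidden = 4, 8
# one expert's fused projection, already oriented [out, in] after the
# transpose in moe_ffn_expert
gate_up = torch.randn(2 * inter_size, hidden)

gate, up = gate_up[::2], gate_up[1::2]  # even rows -> gate, odd rows -> up
assert gate.shape == up.shape == (inter_size, hidden)

The final (result[0], result[2], result[1]) then hands back (gate, down, up), matching the gate/down/up ordering the other readers produce.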

lmdeploy/turbomind/deploy/source_model/llama.py

Lines changed: 2 additions & 1 deletion
@@ -108,7 +108,8 @@ def __init__(self, model_path: str, tokenizer_path: str, **kwargs: dict):
         self.model_config = self.model_config.to_dict()
 
     def readers(self):
-        loader = create_loader(self.model_path, self.Reader.attn_layer_patten)
+        mappings = getattr(self.Reader, 'mappings', [])
+        loader = create_loader(self.model_path, self.Reader.attn_layer_patten, mappings)
         for i, param in loader.items():
             reader = self.Reader(param, {}, False, self.model_config, policy=self.policy)
             yield i, reader
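Readers opt in to key remapping by declaring a class-level mappings list; readers() falls back to an empty list for everything else. A minimal sketch with stub classes (not from the commit):

class PlainReader:
    pass  # no remapping declared

class RemappingReader:
    mappings = [lambda k: k.replace('_bias', '.bias')]  # illustrative

for Reader in (PlainReader, RemappingReader):
    mappings = getattr(Reader, 'mappings', [])
    print(Reader.__name__, '->', len(mappings), 'mapping(s)')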

lmdeploy/turbomind/deploy/source_model/mixtral.py

Lines changed: 2 additions & 2 deletions
@@ -17,8 +17,8 @@ def moe_ffn_expert(self, e=None, i=None, kind=None):
             result.append(tensor)
         return (*result, )
 
-    def moe_ffn_gate(self, i):
-        return self.params.get(f'model.layers.{i}.block_sparse_moe.gate.weight')
+    def moe_ffn_gate(self, i, kind):
+        return self.params.get(f'model.layers.{i}.block_sparse_moe.gate.{kind}')
 
 
 @INPUT_MODELS.register_module(name='mixtral')

lmdeploy/turbomind/deploy/source_model/qwen.py

Lines changed: 2 additions & 2 deletions
@@ -130,8 +130,8 @@ def moe_ffn_expert(self, e=None, i=None, kind=None):
             result.append(tensor)
         return (*result, )
 
-    def moe_ffn_gate(self, i):
-        return self.transform(self.params.get(f'{self.attn_layer_prefix}.{i}.mlp.gate.weight'), 'weight')
+    def moe_ffn_gate(self, i, kind):
+        return self.transform(self.params.get(f'{self.attn_layer_prefix}.{i}.mlp.gate.{kind}'), kind)
 
     def _ffn(self, i: int, kind: str):
         """Get ffn kind for layer i."""
