@@ -50,6 +50,8 @@ def pad_out_dims(x: torch.Tensor, dims: int):
 
 
 def pad_in_dims(x: torch.Tensor, dims: int):
+    if x.dim() == 1:  # 1-dim object does not have input dim (e.g. bias)
+        return x
     pad = dims - x.size(0)
     assert x.dim() == 2
     assert pad >= 0
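For context, a minimal sketch of how the padding helper behaves with the early return added above; the final `F.pad` call is an assumption standing in for the rest of the body, which is outside this hunk:

```python
import torch
import torch.nn.functional as F

def pad_in_dims(x: torch.Tensor, dims: int):
    if x.dim() == 1:  # 1-dim object does not have input dim (e.g. bias)
        return x
    pad = dims - x.size(0)
    assert x.dim() == 2
    assert pad >= 0
    # zero-pad the input (row) dimension up to `dims` -- assumed, not shown in the hunk
    return F.pad(x, (0, 0, 0, pad))

w = torch.randn(48, 64)  # 2-D weight: padded to (56, 64)
b = torch.randn(64)      # 1-D bias: returned unchanged
print(pad_in_dims(w, 56).shape, pad_in_dims(b, 56).shape)
```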
@@ -119,6 +121,8 @@ def _export(self, inter_size: int, fmt: str, idx: int, w123, kind: str, pack_fn,
         self.model.save_split(w2, fmt.format(idx, 'w2', kind), split_dim=0, split_num=self.tp, copy=is_lora_b)
 
     def apply(self, i: int, r: BaseReader):
+        if not self.inter_size[i]:
+            return
         for e in get_params(r.ffn(i, None)):
             e(partial(self._export, self.inter_size[i], self._ffn), partial(r.ffn, i), i)
 
@@ -132,7 +136,7 @@ class MoeFfn(Ffn):
     """
 
     _moe_ffn_expert = 'layers.{0}.moe_ffn.experts.E.{1}.{2}'
-    _moe_ffn_gate = 'layers.{0}.moe_ffn.gate.weight'
+    _moe_ffn_gate = 'layers.{0}.moe_ffn.gate.{1}'
    _moe_ffn_shared_gate = 'layers.{0}.moe_ffn.shared_gate.weight'
 
     def __init__(self, model: BaseOutputModel):
@@ -144,17 +148,20 @@ def __init__(self, model: BaseOutputModel):
     def apply(self, i: int, r: BaseReader):
         if self.expert_num[i] == 0:
             return
-        for p in get_params(r.moe_ffn_expert()):
+        for p in get_params(r.moe_ffn_expert(), 1):
             for e in range(self.expert_num[i]):
                 fmt = self._moe_ffn_expert.replace('E', str(e))
                 p(partial(self._export, self.inter_size, fmt), partial(r.moe_ffn_expert, e, i), i)
 
-        gate = transpose(r.moe_ffn_gate(i))
-        self.model.save_split(gate, self._moe_ffn_gate.format(i))
+        # router
+        gate = transpose(r.moe_ffn_gate(i, 'weight'))
+        self.model.save_split(gate, self._moe_ffn_gate.format(i, 'weight'))
+        bias = r.moe_ffn_gate(i, 'bias')
+        if bias is not None:
+            self.model.save_split(bias, self._moe_ffn_gate.format(i, 'bias'))
 
         if self.shared_gate:
             shared_gate = transpose(r.moe_ffn_shared_gate(i))
-            # print(shared_gate)
             self.model.save_split(shared_gate, self._moe_ffn_shared_gate.format(i))
 
 
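A rough usage sketch of the parameterized gate template introduced above, using a hypothetical stand-in reader (`FakeReader` and its shapes are made up) to show the `gate.weight` / `gate.bias` keys the exporter would now emit, with the bias skipped when the checkpoint has none:

```python
import torch

class FakeReader:
    """Hypothetical stand-in for BaseReader; only the router lookup is modeled."""

    def __init__(self, num_experts=8, hidden=64, with_bias=True):
        self.w = torch.randn(hidden, num_experts)                 # router weight (assumed shape)
        self.b = torch.zeros(num_experts) if with_bias else None  # optional router bias

    def moe_ffn_gate(self, i, kind):
        return self.w if kind == 'weight' else self.b

r = FakeReader()
fmt = 'layers.{0}.moe_ffn.gate.{1}'
for kind in ('weight', 'bias'):
    t = r.moe_ffn_gate(0, kind)
    if t is not None:
        # prints layers.0.moe_ffn.gate.weight and layers.0.moe_ffn.gate.bias
        print(fmt.format(0, kind), tuple(t.shape))
```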
@@ -172,6 +179,7 @@ def __init__(self, model: BaseOutputModel):
         self.head_dim = model.model_config.size_per_head
         self.attn_bias = model.model_config.attn_bias
         self.qk_norm = model.model_config.qk_norm
+        self.attn_sink = model.model_config.attn_sink
         self.group_size = max(1, model.model_config.group_size)
 
     def _reorder_and_merge(self, qkvo, gs: int):
@@ -250,6 +258,9 @@ def apply(self, i: int, r: BaseReader):
                 k = permute_v2(k, self.head_dim)
             self.model.save_split(q, self._attn.format(i, 'q_norm', '')[:-1])
             self.model.save_split(k, self._attn.format(i, 'k_norm', '')[:-1])
+        if self.attn_sink:
+            sinks = r.attn_sinks(i)
+            self.model.save_split(sinks, self._attn.format(i, 'sinks', '')[:-1], split_dim=0, split_num=self.tp)
 
 
 class MLA(Module):
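A small sketch of what splitting the sinks tensor with `split_dim=0, split_num=self.tp` amounts to; `save_split` itself belongs to the exporter and is not shown here, and the head count and TP degree below are illustrative only:

```python
import torch

num_heads, tp = 32, 4
sinks = torch.randn(num_heads)          # one attention-sink logit per head (assumed layout)
shards = torch.chunk(sinks, tp, dim=0)  # rank r keeps heads [r*8, (r+1)*8)
print([s.shape for s in shards])
```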