Merged
86 commits
d998bb6
scripts : update sync scripts
ggerganov Aug 18, 2025
74ed8d8
vulkan : fix 32-bit builds (ggml/1313)
dg0yt Jul 30, 2025
c6d170b
cmake : Fix BLAS link interface (ggml/1316)
dg0yt Jul 30, 2025
aa662f2
sycl: refactor quantization to q8_1 (llama/14815)
Alcpz Jul 28, 2025
5cc0448
CUDA: fix pointer incrementation in FA (llama/14916)
JohannesGaessler Jul 28, 2025
8ed9582
SYCL: Add set_rows support for quantized types (llama/14883)
qnixsynapse Jul 28, 2025
689060e
ggml-cpu : deduplicate scalar implementations (llama/14897)
xctan Jul 28, 2025
60dbdab
CUDA: add roll (llama/14919)
am17an Jul 29, 2025
b66cc86
cuda : add softcap fusion (llama/14907)
CISC Jul 29, 2025
2bd2aff
CANN: Add ggml_set_rows (llama/14943)
hipudding Jul 29, 2025
f7d3ac6
HIP: Ignore unsupported unroll transformation in fattn-vec (llama/14931)
IMbackK Jul 29, 2025
01e1614
HIP: add GGML_HIP_MMQ_MFMA option to allow disableing the MFMA path. …
IMbackK Jul 29, 2025
980d19e
HIP: remove the use of __HIP_PLATFORM_AMD__, explicitly support only …
IMbackK Jul 29, 2025
c716281
CUDA: skip masked KV slices for all FA kernels (llama/14924)
JohannesGaessler Jul 30, 2025
7ec71a0
HIP: enable mfma mmq on gfx908 and gfx90a for select datatypes and sh…
IMbackK Jul 30, 2025
cdfefa3
opencl: add `mul_mat_f32_f32_l4_lm` and `mul_mat_f16_f32_l4_lm` (llam…
lhez Jul 30, 2025
f9cba92
CANN: Improve loading efficiency after converting weights to NZ forma…
hipudding Jul 31, 2025
1685817
Vulkan: Fix minor debug mode issues (llama/14899)
0cc4m Jul 31, 2025
a3145e5
docker : add cann build pipline (llama/14591)
diannaojiang Aug 1, 2025
6eb52a2
ggml : Q2k interleaving implementation - x86/x64 SIMD (llama/14373)
Srihari-mcw Aug 1, 2025
a97a323
opencl: add f16 for `add`, `sub`, `mul`, `div` (llama/14984)
lhez Aug 1, 2025
10f523c
CUDA: fix MMQ nwarps for AMD with warp_size==32 (llama/15014)
JohannesGaessler Aug 1, 2025
eb1dd35
vulkan: optimizations for direct convolution (llama/14933)
jeffbolznv Aug 2, 2025
3dd7756
vulkan: Support ne[3]>1 in noncontig matrix-vector multiply (llama/15…
jeffbolznv Aug 2, 2025
d77de84
vulkan: coopmat2 mul_mat optimizations (llama/14934)
jeffbolznv Aug 2, 2025
2fd457c
cuda, sycl : fix batched gemm when ne02 == 1 && ne03 > 1 (llama/15038)
ggerganov Aug 2, 2025
f35b9e1
cuda: make im2col a little faster (llama/15025)
leejet Aug 2, 2025
1cbee87
CUDA: use mma FA kernel for gqa > 4 on RTX 4000 (llama/15035)
JohannesGaessler Aug 2, 2025
3a868d8
opencl: fix adreno compiler detection logic (llama/15029)
lhez Aug 2, 2025
883a32e
vulkan: Use coopmat2 for conv2d (llama/14982)
jeffbolznv Aug 3, 2025
1071eb7
vulkan: fix build when using glslang that does not support coopmat2 (…
jeffbolznv Aug 4, 2025
c9386c1
cmake: Add GGML_BACKEND_DIR option (llama/15074)
ckastner Aug 4, 2025
e0e122d
sycl: fix mul_mat selection (llama/15092)
Rbiessy Aug 5, 2025
1141265
llama : add gpt-oss (llama/15091)
ggerganov Aug 5, 2025
ea0666c
CANN: add support for ACL Graph (llama/15065)
noemotiovon Aug 6, 2025
e6fd650
ggml : fix fallback to CPU for ununsupported ops (llama/15118)
slaren Aug 6, 2025
c4a26c9
opencl: add `swiglu_oai` and `add_id` (llama/15121)
lhez Aug 6, 2025
bbb0da7
fix profiling crash (llama/15072)
rmatif Aug 6, 2025
75dbb4d
CUDA: GEMM for FP32/FP16/BF16 and ne11 <= 16 (llama/15131)
JohannesGaessler Aug 7, 2025
b8ac343
ggml: Skip backend library linking code when GGML_BACKEND_DL=ON (llam…
ckastner Aug 7, 2025
6642ea8
HIP: add cmake option to enable compiler output of kernel resource us…
IMbackK Aug 7, 2025
ba6238a
vulkan: Add env var to disable host visible vidmem (llama/15109)
jeffbolznv Aug 7, 2025
244c494
vulkan: support fattn sinks (llama/15126)
jeffbolznv Aug 7, 2025
6485910
opencl: support sink in `soft_max` (attn sinks) (llama/15152)
lhez Aug 8, 2025
42e295c
CUDA: attention sinks for mma FlashAttention (llama/15157)
JohannesGaessler Aug 8, 2025
2dc56f8
ggml : fix field name when new ggml_backend (llama/14944)
aisk Aug 8, 2025
430cbcc
gguf-py : add Numpy MXFP4 de/quantization support (llama/15111)
compilade Aug 8, 2025
46e8145
CUDA: add attention sinks for tile and wmma (llama/15178)
am17an Aug 9, 2025
b65235d
cuda: refactored ssm_scan and use CUB (llama/13291)
Your-Cheese Aug 9, 2025
2229462
kleidiai: fix unsigned overflow bug (llama/15150)
chaxu01 Aug 11, 2025
77c05f0
CANN: Add broadcast for softmax and FA (llama/15208)
hipudding Aug 11, 2025
74cabac
musa: fix failures in test-backend-ops for mul_mat_id op (llama/15236)
yeahdongcn Aug 12, 2025
5691b54
CANN: GGML_OP_CPY optimization (llama/15070)
noemotiovon Aug 12, 2025
ce7f91a
CUDA cmake: add `-lineinfo` for easier debug (llama/15260)
am17an Aug 12, 2025
e081517
opencl: allow mixed f16/f32 `add` (llama/15140)
rmatif Aug 12, 2025
2114917
sycl: Fix and disable more configurations of mul_mat (llama/15151)
Rbiessy Aug 12, 2025
c9a94ea
HIP: disable sync warp shuffel operators from clr amd_warp_sync_funct…
IMbackK Aug 12, 2025
77938f9
ggml-rpc: chunk send()/recv() to avoid EINVAL for very large tensors …
Tak-RS Aug 13, 2025
0b0748f
CUDA: Optimize `reduce_rows_f32` kernel, leading up to 25x perf impro…
ORippler Aug 13, 2025
16088b5
ggml : repack block_iq4_nlx8 (llama/14904)
ggerganov Aug 13, 2025
37102a5
ggml : update `ggml_rope_multi` (llama/12665)
foldl Aug 13, 2025
843095f
HIP: bump requirement to rocm 6.1 (llama/15296)
IMbackK Aug 13, 2025
164545a
finetune: SGD optimizer, more CLI args (llama/13873)
graehl Aug 14, 2025
da2d294
cuda : fix GGML_CUDA_GRAPHS=OFF (llama/15300)
CISC Aug 14, 2025
8059545
ggml: fix ggml_conv_1d_dw bug (ggml/1323)
jason-ni Aug 14, 2025
3e89862
vulkan: perf_logger improvements (llama/15246)
jeffbolznv Aug 14, 2025
9c16ccd
HIP: Cleanup hipification header (llama/15285)
IMbackK Aug 14, 2025
2b7be1b
CUDA: fix negative KV_max values in FA (llama/15321)
JohannesGaessler Aug 14, 2025
aea26be
ggml: initial IBM zDNN backend (llama/14975)
taronaeo Aug 15, 2025
651cfcc
vulkan : fix compile warnings on macos (llama/15340)
ggerganov Aug 15, 2025
9b55c3c
vulkan : fix out-of-bounds access in argmax kernel (llama/15342)
ggerganov Aug 15, 2025
b765e0e
opencl: add initial mxfp4 support via mv (llama/15270)
lhez Aug 15, 2025
4f12ef3
OpenCL: add initial FA support (llama/14987)
rmatif Aug 16, 2025
812ccab
vulkan: Add missing bounds checking to scalar/coopmat1 mul_mat_id (ll…
jeffbolznv Aug 16, 2025
b759bbc
vulkan: Support mul_mat_id with f32 accumulators (llama/15337)
jeffbolznv Aug 16, 2025
3bb4185
vulkan: fuse adds (llama/15252)
jeffbolznv Aug 16, 2025
786d86b
vulkan: Optimize argsort (llama/15354)
jeffbolznv Aug 17, 2025
185db9b
vulkan: support sqrt (llama/15370)
ddwkim Aug 17, 2025
2eefbe1
vulkan: Use larger workgroups for mul_mat_vec when M is small (llama/…
jeffbolznv Aug 17, 2025
cb061b8
vulkan: disable spirv-opt for bfloat16 shaders (llama/15352)
jeffbolznv Aug 18, 2025
50a3a0c
ggml-quants : fix make_qp_quants NANs and IQ1 assertion errors (llama…
compilade Aug 18, 2025
b309fad
common : handle mxfp4 enum
ggerganov Aug 18, 2025
1dee2ff
ggml : initial zDNN backend (llama/14975)
taronaeo Aug 18, 2025
3a8f07e
ggml: Add initial WebGPU backend (llama/14521)
reeselevine Aug 18, 2025
293c178
sync : ggml
ggerganov Aug 18, 2025
b74528f
talk-llama : sync llama.cpp
ggerganov Aug 18, 2025
2 changes: 2 additions & 0 deletions examples/common-ggml.cpp
@@ -72,6 +72,7 @@ bool ggml_common_quantize_0(
case GGML_FTYPE_MOSTLY_IQ4_XS:
case GGML_FTYPE_MOSTLY_IQ1_M:
case GGML_FTYPE_MOSTLY_BF16:
case GGML_FTYPE_MOSTLY_MXFP4:
{
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
return false;
@@ -211,6 +212,7 @@ bool ggml_common_quantize_0(
case GGML_TYPE_BF16:
case GGML_TYPE_TQ1_0:
case GGML_TYPE_TQ2_0:
case GGML_TYPE_MXFP4:
case GGML_TYPE_COUNT:
{
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
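For context: MXFP4 is the 4-bit microscaling float format that arrives with the gpt-oss commits in this sync; the hunks above simply add it to the list of types that ggml_common_quantize_0 refuses to quantize to. Below is a rough storage sketch, assuming the OCP Microscaling convention of 32 FP4 (E2M1) elements sharing one 8-bit (E8M0) scale — the struct and field names are assumptions for illustration, not taken from this diff.

```cpp
// Illustrative only: storage cost of one MXFP4 block under the assumed
// 32-element / shared-8-bit-scale layout. The real ggml block type may differ.
#include <cstdint>
#include <cstdio>

#define QK_MXFP4 32 // assumed block size

struct block_mxfp4_sketch {
    uint8_t e;                // shared E8M0 scale (assumed field name)
    uint8_t qs[QK_MXFP4 / 2]; // 32 x 4-bit elements, packed two per byte
};

int main() {
    const size_t bytes_per_block  = sizeof(block_mxfp4_sketch);        // 17 bytes
    const double bits_per_weight  = 8.0 * bytes_per_block / QK_MXFP4;  // 4.25 bits
    std::printf("bytes/block = %zu, bits/weight = %.2f\n", bytes_per_block, bits_per_weight);
    return 0;
}
```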
127 changes: 127 additions & 0 deletions examples/talk-llama/llama-arch.cpp
@@ -62,6 +62,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_DEEPSEEK2, "deepseek2" },
{ LLM_ARCH_CHATGLM, "chatglm" },
{ LLM_ARCH_GLM4, "glm4" },
{ LLM_ARCH_GLM4_MOE, "glm4moe" },
{ LLM_ARCH_BITNET, "bitnet" },
{ LLM_ARCH_T5, "t5" },
{ LLM_ARCH_T5ENCODER, "t5encoder" },
@@ -85,9 +86,13 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_ERNIE4_5, "ernie4_5" },
{ LLM_ARCH_ERNIE4_5_MOE, "ernie4_5-moe" },
{ LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
{ LLM_ARCH_HUNYUAN_DENSE, "hunyuan-dense" },
{ LLM_ARCH_SMOLLM3, "smollm3" },
{ LLM_ARCH_OPENAI_MOE, "gpt-oss" },
{ LLM_ARCH_LFM2, "lfm2" },
{ LLM_ARCH_DREAM, "dream" },
{ LLM_ARCH_SMALLTHINKER, "smallthinker" },
{ LLM_ARCH_LLADA, "llada" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};

@@ -124,6 +129,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
{ LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
{ LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
{ LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
@@ -1388,6 +1394,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
},
},
{
LLM_ARCH_GLM4_MOE,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
{ LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
// NextN/MTP tensors - preserved but unused (in final layer, dynamic layer number)
{ LLM_TENSOR_NEXTN_EH_PROJ, "blk.%d.nextn.eh_proj" },
{ LLM_TENSOR_NEXTN_EMBED_TOKENS, "blk.%d.nextn.embed_tokens" },
{ LLM_TENSOR_NEXTN_ENORM, "blk.%d.nextn.enorm" },
{ LLM_TENSOR_NEXTN_HNORM, "blk.%d.nextn.hnorm" },
{ LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "blk.%d.nextn.shared_head_head" },
{ LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" },
},
},
{
LLM_ARCH_BITNET,
{
@@ -1895,6 +1935,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
{
LLM_ARCH_HUNYUAN_DENSE,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },

},
},
{
LLM_ARCH_SMOLLM3,
{
@@ -1912,6 +1972,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_OPENAI_MOE,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_SINKS, "blk.%d.attn_sinks" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
{
LLM_ARCH_LFM2,
{
@@ -1933,6 +2012,27 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
}
},
{
LLM_ARCH_SMALLTHINKER,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }
},
},
{
LLM_ARCH_DREAM,
{
@@ -1950,6 +2050,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_LLADA,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_UNKNOWN,
{
@@ -1989,6 +2106,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_K_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_V_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_SINKS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SCALE}},
{LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_DEC_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
@@ -2120,6 +2238,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_SHORTCONV_CONV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
{LLM_TENSOR_SHORTCONV_INPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_SHORTCONV_OUTPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
// NextN/MTP tensors are currently ignored (reserved for future MTP support)
// These tensors only exist in the last layer(s) and are treated as output tensors
{LLM_TENSOR_NEXTN_EH_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
{LLM_TENSOR_NEXTN_EMBED_TOKENS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
{LLM_TENSOR_NEXTN_ENORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
{LLM_TENSOR_NEXTN_HNORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
{LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
{LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
};

LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
@@ -2202,6 +2328,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
bool llm_arch_is_diffusion(const llm_arch & arch) {
switch (arch) {
case LLM_ARCH_DREAM:
case LLM_ARCH_LLADA:
return true;
default:
return false;
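The tensor tables added above ("blk.%d.attn_q", "blk.%d.nextn.eh_proj", ...) are printf-style format strings: the layer index is substituted when a tensor is looked up. A minimal sketch of that expansion, using a hypothetical helper rather than llama.cpp's actual formatting code (which also handles suffixes such as ".weight" and is not part of this diff):

```cpp
// Sketch only: expand a per-layer tensor name pattern for a given layer index.
#include <cstdio>
#include <string>

static std::string tensor_name(const char * fmt, int layer) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), fmt, layer); // fmt contains a single %d
    return buf;
}

int main() {
    // Patterns taken from the LLM_ARCH_GLM4_MOE table above.
    std::printf("%s\n", tensor_name("blk.%d.attn_q", 12).c_str());        // blk.12.attn_q
    std::printf("%s\n", tensor_name("blk.%d.nextn.eh_proj", 45).c_str()); // blk.45.nextn.eh_proj
    return 0;
}
```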
13 changes: 13 additions & 0 deletions examples/talk-llama/llama-arch.h
@@ -66,6 +66,7 @@ enum llm_arch {
LLM_ARCH_DEEPSEEK2,
LLM_ARCH_CHATGLM,
LLM_ARCH_GLM4,
LLM_ARCH_GLM4_MOE,
LLM_ARCH_BITNET,
LLM_ARCH_T5,
LLM_ARCH_T5ENCODER,
@@ -89,9 +90,13 @@ enum llm_arch {
LLM_ARCH_ERNIE4_5,
LLM_ARCH_ERNIE4_5_MOE,
LLM_ARCH_HUNYUAN_MOE,
LLM_ARCH_HUNYUAN_DENSE,
LLM_ARCH_SMOLLM3,
LLM_ARCH_OPENAI_MOE,
LLM_ARCH_LFM2,
LLM_ARCH_DREAM,
LLM_ARCH_SMALLTHINKER,
LLM_ARCH_LLADA,
LLM_ARCH_UNKNOWN,
};

@@ -128,6 +133,7 @@ enum llm_kv {
LLM_KV_EXPERT_WEIGHTS_NORM,
LLM_KV_EXPERT_GATING_FUNC,
LLM_KV_MOE_EVERY_N_LAYERS,
LLM_KV_NEXTN_PREDICT_LAYERS,
LLM_KV_POOLING_TYPE,
LLM_KV_LOGIT_SCALE,
LLM_KV_DECODER_START_TOKEN_ID,
@@ -260,6 +266,7 @@ enum llm_tensor {
LLM_TENSOR_ATTN_OUT_NORM,
LLM_TENSOR_ATTN_POST_NORM,
LLM_TENSOR_ATTN_ROT_EMBD,
LLM_TENSOR_ATTN_SINKS,
LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_GATE_INP_SHEXP,
LLM_TENSOR_FFN_NORM,
@@ -406,6 +413,12 @@ enum llm_tensor {
LLM_TENSOR_SHORTCONV_CONV,
LLM_TENSOR_SHORTCONV_INPROJ,
LLM_TENSOR_SHORTCONV_OUTPROJ,
LLM_TENSOR_NEXTN_EH_PROJ,
LLM_TENSOR_NEXTN_EMBED_TOKENS,
LLM_TENSOR_NEXTN_ENORM,
LLM_TENSOR_NEXTN_HNORM,
LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
};

enum llm_tensor_layer {
4 changes: 2 additions & 2 deletions examples/talk-llama/llama-batch.cpp
@@ -59,7 +59,7 @@ bool llama_batch_allocr::init(
for (int32_t i = 0; i < batch.n_tokens; ++i) {
for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
if (batch.seq_id && (batch.seq_id[i][s] < 0 || batch.seq_id[i][s] >= (llama_seq_id) n_seq_max)) {
LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d > %d\n", __func__, i, s, batch.seq_id[i][s], (llama_seq_id) n_seq_max);
LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d >= %d\n", __func__, i, s, batch.seq_id[i][s], (llama_seq_id) n_seq_max);
return false;
}
}
@@ -477,7 +477,7 @@ llama_ubatch llama_batch_allocr::split_simple(uint32_t n_ubatch) {

llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch, bool sequential) {
if (sequential && has_cpl) {
LLAMA_LOG_ERROR("%s: sequential split is not supported when there are coupled sequences in the input batch\n", __func__);
LLAMA_LOG_ERROR("%s: sequential split is not supported when there are coupled sequences in the input batch (you may need to use the -kvu flag)\n", __func__);

return {};
}
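The first hunk only corrects the comparison symbol in the log message, but the rule it reports is easy to state: a sequence id is valid only when 0 <= seq_id < n_seq_max, so the boundary case seq_id == n_seq_max must be printed with ">=". A small illustration with a hypothetical helper, not code from the file:

```cpp
// Sketch of the validity rule behind the corrected error message.
#include <cassert>
#include <cstdint>

static bool seq_id_in_range(int32_t seq_id, uint32_t n_seq_max) {
    return seq_id >= 0 && seq_id < (int32_t) n_seq_max;
}

int main() {
    assert( seq_id_in_range(0, 4));
    assert( seq_id_in_range(3, 4));
    assert(!seq_id_in_range(4, 4));  // equality case: hence ">=" in the message
    assert(!seq_id_in_range(-1, 4));
    return 0;
}
```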
38 changes: 34 additions & 4 deletions examples/talk-llama/llama-chat.cpp
@@ -66,6 +66,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "llama4", LLM_CHAT_TEMPLATE_LLAMA4 },
{ "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM },
{ "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
{ "gpt-oss", LLM_CHAT_TEMPLATE_OPENAI_MOE },
{ "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
{ "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
};

@@ -191,8 +193,12 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
return LLM_CHAT_TEMPLATE_LLAMA4;
} else if (tmpl_contains("<|endofuserprompt|>")) {
return LLM_CHAT_TEMPLATE_DOTS1;
} else if (tmpl_contains("<|startoftext|>") && tmpl_contains("<|extra_4|>")) {
} else if (tmpl_contains("<|extra_0|>") && tmpl_contains("<|extra_4|>")) {
return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
} else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) {
return LLM_CHAT_TEMPLATE_OPENAI_MOE;
} else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_place▁holder▁no▁3|>")) {
return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
} else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
return LLM_CHAT_TEMPLATE_KIMI_K2;
}
@@ -619,8 +625,6 @@ int32_t llm_chat_apply_template(
} else if (tmpl == LLM_CHAT_TEMPLATE_YANDEX) {
// Yandex template ("\n\n" is defined as EOT token)

ss << "<s>";

for (size_t i = 0; i < chat.size(); i++) {
std::string role(chat[i]->role);
if (role == "user") {
@@ -698,11 +702,37 @@ int32_t llm_chat_apply_template(
if (role == "system") {
ss << "<|startoftext|>" << message->content << "<|extra_4|>";
} else if (role == "assistant") {
ss << "<|startoftext|>" << message->content << "<|eos|>";
ss << message->content << "<|eos|>";
} else {
ss << "<|startoftext|>" << message->content << "<|extra_0|>";
}
}
} else if (tmpl == LLM_CHAT_TEMPLATE_OPENAI_MOE) {
// OpenAI MoE (based on Harmony chat template)
for (auto message : chat) {
std::string role(message->role);
ss << "<|start|>" << role << "<|message|>" << message->content;
ss << (role == "assistant" ? "<|return|>" : "<|end|>");
}
if (add_ass) {
ss << "<|start|>assistant";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_DENSE) {
// tencent/Hunyuan-4B-Instruct
for (size_t i = 0; i < chat.size(); i++) {
std::string role(chat[i]->role);
if (i == 0) {
if (role == "system") {
ss << chat[i]->content << "<|hy_place▁holder▁no▁3|>";
}
}

if (role == "assistant") {
ss << "<|hy_Assistant|>" << chat[i]->content << "<|hy_place▁holder▁no▁2|>";
} else if (role == "user") {
ss << "<|hy_User|>" << chat[i]->content << "<|hy_Assistant|>";
}
}
} else if (tmpl == LLM_CHAT_TEMPLATE_KIMI_K2) {
// moonshotai/Kimi-K2-Instruct
for (auto message : chat) {
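The new gpt-oss branch is compact enough to trace by hand. Below is a standalone sketch of just that branch, using a local message type instead of llama.cpp's llama_chat_message; it mirrors the LLM_CHAT_TEMPLATE_OPENAI_MOE logic shown above for illustration only.

```cpp
// Standalone sketch of the Harmony-style gpt-oss rendering added above.
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

struct msg { std::string role, content; };

static std::string render_gpt_oss(const std::vector<msg> & chat, bool add_ass) {
    std::ostringstream ss;
    for (const auto & m : chat) {
        ss << "<|start|>" << m.role << "<|message|>" << m.content;
        ss << (m.role == "assistant" ? "<|return|>" : "<|end|>");
    }
    if (add_ass) {
        ss << "<|start|>assistant"; // prompt the model to produce the next turn
    }
    return ss.str();
}

int main() {
    const std::vector<msg> chat = {
        {"system", "You are a helpful assistant."},
        {"user",   "Hello!"},
    };
    // Expected output:
    // <|start|>system<|message|>You are a helpful assistant.<|end|><|start|>user<|message|>Hello!<|end|><|start|>assistant
    std::cout << render_gpt_oss(chat, /*add_ass=*/true) << "\n";
    return 0;
}
```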
2 changes: 2 additions & 0 deletions examples/talk-llama/llama-chat.h
@@ -46,6 +46,8 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_SMOLVLM,
LLM_CHAT_TEMPLATE_DOTS1,
LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
LLM_CHAT_TEMPLATE_OPENAI_MOE,
LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
LLM_CHAT_TEMPLATE_KIMI_K2,
LLM_CHAT_TEMPLATE_UNKNOWN,
};