From b0c47dd7fc9052e965a3066a61a6753a01734206 Mon Sep 17 00:00:00 2001
From: Shalini Salomi Bodapati
Date: Mon, 30 Jun 2025 01:48:40 -0500
Subject: [PATCH] POC: Avoid packTranspose in tinyBLAS_PPC MMA kernel

Store selected weight tensors (attn_output, ffn_up, ffn_gate) transposed
at HF->GGUF conversion time, so that the PPC MMA kernel can load columns
of A directly with vec_xl instead of repacking them with packTranspose.

Signed-off-by: Shalini Salomi Bodapati
---
 convert_hf_to_gguf.py                 |  13 ++-
 ggml/src/ggml-cpu/ggml-cpu.c          |  54 ++++++++--
 ggml/src/ggml-cpu/llamafile/sgemm.cpp | 138 +++++++++++++++++++++-----
 ggml/src/ggml-cpu/llamafile/sgemm.h   |   2 +-
 ggml/src/ggml.c                       |   7 +-
 src/llama-model-loader.cpp            |  14 +++
 6 files changed, 191 insertions(+), 37 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index c201883509ceb..d110bbe1657e0 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1963,6 +1963,7 @@ def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
     _experts: list[dict[str, Tensor]] | None = None

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        print(f"[GGUF-CONVERT] modifying tensor {name}")
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
         is_vision_tensor = "vision_tower" in name \
@@ -1985,6 +1986,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         if name.endswith(("k_proj.weight", "k_proj.bias")):
             data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)

+        #if name.endswith(("attn.k_proj.weight", "attn.o_proj.weight", "attn.v_proj.weight", "attn.q_proj.weight", "up_proj.weight", "gate_proj.weight", "down_proj.weight")):
+        if name.endswith(("attn.o_proj.weight", "up_proj.weight", "gate_proj.weight")):
+            print(f"[GGUF-CONVERT] Transposing {name}")
+            data_torch = data_torch.T.contiguous()
+
         # process the experts separately
         if name.find("block_sparse_moe.experts") != -1:
             n_experts = self.hparams["num_local_experts"]
@@ -2018,8 +2025,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
                 return tensors
             else:
                 return []
-
-        return [(self.map_tensor_name(name), data_torch)]
+        mapped_name = self.map_tensor_name(name)
+        print(f"[GGUF-CONVERT] Mapping: {name} --> {mapped_name}")
+        print(f"[GGUF-CONVERT] Final shape for {mapped_name}: {data_torch.shape}")
+        return [(mapped_name, data_torch)]

     def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
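Why a plain .T.contiguous() at conversion time is enough: a transposed matrix stored row-major is byte-for-byte the original matrix stored column-major, so a logical column of the weight becomes a contiguous run that the kernel can load in one vector load. A minimal stand-alone sketch of that identity (plain C++, illustrative sizes, no GGUF involved):

#include <cassert>
#include <vector>

// w  : logical m x k weight, row-major        -> w[i*k + j]
// wt : same weight stored transposed (k x m)  -> wt[j*m + i]
// Row-major wt is exactly column-major w, so the four consecutive floats
// wt[j*m + i .. i+3] are rows i..i+3 of w at depth j - precisely the
// vector the MMA kernel wants in a single vec_xl.
int main() {
    const int m = 8, k = 4;
    std::vector<float> w(m * k), wt(k * m);
    for (int i = 0; i < m; i++)
        for (int j = 0; j < k; j++)
            w[i*k + j] = float(i * 100 + j);
    for (int j = 0; j < k; j++)            // what data_torch.T.contiguous() stores
        for (int i = 0; i < m; i++)
            wt[j*m + i] = w[i*k + j];
    for (int i = 0; i < m; i++)
        for (int j = 0; j < k; j++)
            assert(wt[j*m + i] == w[i*k + j]);
    return 0;
}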
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index c5271b7757228..383c0b3d46238 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1234,11 +1234,45 @@ void ggml_compute_forward_mul_mat(
     const int64_t r3 = ne13 / ne03;

     const bool src1_cont = ggml_is_contiguous(src1);
-
+    bool is_transposed = false;
     if (src1_cont) {
-        for (int64_t i13 = 0; i13 < ne13; i13++)
-            for (int64_t i12 = 0; i12 < ne12; i12++)
+        const char * name  = src0->name;
+        const char * name1 = src1->name;
+
+        if (name &&
+            (strstr(name, "attn_output.weight") ||
+             strstr(name, "ffn_up.weight")      ||
+             strstr(name, "ffn_gate.weight"))) {
+            printf("[llamafile_sgemm] src0 %s was transposed during HF->GGUF conversion\n", name);
+            is_transposed = true;
+        }
+        if (name1 &&
+            (strstr(name1, "attn_output.weight") ||
+             strstr(name1, "ffn_up.weight")      ||
+             strstr(name1, "ffn_gate.weight"))) {
+            printf("[llamafile_sgemm] src1 %s was transposed during HF->GGUF conversion\n", name1);
+        }
+        printf("\n==> llamafile_sgemm call: %s * %s\n", src0->name, src1->name);
+        printf("A shape: [%lld x %lld] B shape: [%lld x %lld]\n", src0->ne[1], src0->ne[0], src1->ne[1], src1->ne[0]);
+
+        for (int64_t i13 = 0; i13 < ne13; i13++) {
+            for (int64_t i12 = 0; i12 < ne12; i12++) {
+                if (is_transposed) {
+                    // A is stored k x m: pass m = ne00, k = ne01, lda = ne01
+                    if (!llamafile_sgemm(params,
+                                         ne00/ggml_blck_size(src0->type), ne11, ne01,
+                                         (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+                                         ne01,
+                                         (const char *)src1->data + i12*nb12 + i13*nb13,
+                                         nb11/ggml_type_size(src1->type),
+                                         (char *)dst->data + i12*nb2 + i13*nb3,
+                                         ne00/ggml_blck_size(src0->type),
+                                         src0->type,
+                                         src1->type,
+                                         dst->type, is_transposed))
+                        goto UseGgmlGemm1;
+                } else {
                 if (!llamafile_sgemm(params,
                                      ne01, ne11, ne00/ggml_blck_size(src0->type),
                                      (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
                                      nb01/ggml_type_size(src0->type),
@@ -1248,8 +1282,11 @@ void ggml_compute_forward_mul_mat(
                                      nb1/ggml_type_size(dst->type),
                                      src0->type,
                                      src1->type,
-                                     dst->type))
+                                     dst->type, false))
                     goto UseGgmlGemm1;
+                }
+            }
+        }
         return;
     }
 UseGgmlGemm1:;
@@ -1304,8 +1341,9 @@ UseGgmlGemm1:;
     const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
     const size_t row_size = ggml_row_size(vec_dot_type, ne10);

-    for (int64_t i13 = 0; i13 < ne13; i13++)
-        for (int64_t i12 = 0; i12 < ne12; i12++)
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
             if (!llamafile_sgemm(params,
                                  ne01, ne11, ne00/ggml_blck_size(src0->type),
                                  (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
@@ -1316,8 +1354,10 @@ UseGgmlGemm1:;
                                  nb1/ggml_type_size(dst->type),
                                  src0->type,
                                  vec_dot_type,
-                                 dst->type))
+                                 dst->type, false))
                 goto UseGgmlGemm2;
+        }
+    }
     return;
 }
 UseGgmlGemm2:;
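How the dispatch above remaps the GEMM parameters: with A stored k x m instead of m x k, m and k trade places and both lda and ldc fall out of the stored dimensions rather than nb01/nb1. A sketch of the two parameterizations, assuming F32 operands so ggml_blck_size() is 1 (struct and helper names are illustrative, not ggml API):

#include <cstdint>
#include <cstdio>

// Shapes as ggml sees them for dst = src0 * src1 (F32, block size 1).
struct SgemmArgs { int64_t m, n, k, lda, ldb, ldc; };

// Normal path: src0 stored row-major m x k.
SgemmArgs normal_args(int64_t ne00, int64_t ne01, int64_t ne11,
                      int64_t nb01_elems, int64_t nb11_elems, int64_t nb1_elems) {
    return { ne01, ne11, ne00, nb01_elems, nb11_elems, nb1_elems };
}

// Transposed path (this patch): src0 stored k x m, so ne00 is the logical
// row count and ne01 the shared depth; ldc also comes from ne00.
SgemmArgs transposed_args(int64_t ne00, int64_t ne01, int64_t ne11,
                          int64_t nb11_elems) {
    return { ne00, ne11, ne01, ne01, nb11_elems, ne00 };
}

int main() {
    // toy call: stored src0 is 512 x 4096 after the conversion-time transpose
    SgemmArgs t = transposed_args(4096, 512, 32, 512);
    std::printf("m=%lld n=%lld k=%lld lda=%lld ldb=%lld ldc=%lld\n",
                (long long)t.m, (long long)t.n, (long long)t.k,
                (long long)t.lda, (long long)t.ldb, (long long)t.ldc);
    return 0;
}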
diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
index ed61869a5508a..6578b94f4aeda 100644
--- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp
+++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
@@ -2695,11 +2695,13 @@ class tinyBLAS_PPC {
                const TA *A, int64_t lda,
                const TB *B, int64_t ldb,
                TC *C, int64_t ldc,
-               int ith, int nth)
-        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
+               int ith, int nth, int64_t m_orig, bool is_transposed)
+        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth),
+          m_orig(m_orig), is_transposed(is_transposed) {
     }

     void matmul(int64_t m, int64_t n) {
         mnpack(0, m, 0, n);
     }
@@ -2957,7 +2959,13 @@ class tinyBLAS_PPC {
         acc_t acc_0;
         __builtin_mma_xxsetaccz(&acc_0);
         for (int l = 0; l < k; l+=4) {
-            packTranspose(A+(ii*lda)+l, lda, 4, 4, (TA*)vec_A);
+            if (is_transposed) {
+                // A is stored k x m: rows ii..ii+3 of logical A at depth l+x
+                // are already 4 contiguous floats, so no repacking is needed.
+                for (int x = 0; x < 4; x++) {
+                    vec_A[x] = (vec_t)vec_xl(0, (float*)A + (l+x)*m_orig + ii);
+                }
+            } else {
+                packTranspose(A+(ii*lda)+l, lda, 4, 4, (TA*)vec_A);
+            }
             packTranspose(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
             __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]);
             __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]);
@@ -2973,7 +2981,13 @@ class tinyBLAS_PPC {
         __builtin_mma_xxsetaccz(&acc_0);
         __builtin_mma_xxsetaccz(&acc_1);
         for (int64_t l = 0; l < k; l+=4) {
-            packTranspose(A+(ii*lda)+l, lda, 4, 4, (TA*)vec_A);
+            if (is_transposed) {
+                for (int x = 0; x < 4; x++) {
+                    vec_A[x] = (vec_t)vec_xl(0, (float*)A + (l+x)*m_orig + ii);
+                }
+            } else {
+                packTranspose(A+(ii*lda)+l, lda, 4, 4, (TA*)vec_A);
+            }
             packTranspose(B+(jj*ldb)+l, ldb, 8, 4, (TA*)vec_B);
             __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], (vec_t)vec_B[0]);
             __builtin_mma_xvf32gerpp(&acc_1, vec_A[0], (vec_t)vec_B[1]);
@@ -2994,7 +3008,14 @@ class tinyBLAS_PPC {
         __builtin_mma_xxsetaccz(&acc_0);
         __builtin_mma_xxsetaccz(&acc_1);
         for (int64_t l = 0; l < k; l+=4) {
+            if (is_transposed) {
+                // 8 rows of logical A: two 4-float loads per depth step
+                for (int x = 0; x < 4; x++) {
+                    vec_A[2*x]   = (vec_t)vec_xl(0, (float*)A + (l+x)*m_orig + ii);
+                    vec_A[2*x+1] = (vec_t)vec_xl(0, (float*)A + (l+x)*m_orig + ii + 4);
+                }
+            } else {
                 packTranspose(A+(ii*lda)+l, lda, 8, 4, (TA*)vec_A);
+            }
             packTranspose(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
             __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[0], vec_B[0]);
             __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[1], vec_B[0]);
@@ -3017,7 +3038,14 @@ class tinyBLAS_PPC {
         __builtin_mma_xxsetaccz(&acc_2);
         __builtin_mma_xxsetaccz(&acc_3);
         for (int l = 0; l < k; l+=8) {
+            if (is_transposed) {
+                for (int x = 0; x < 8; x++) {
+                    vec_A[2*x]   = (vec_t)vec_xl(0, (float*)A + (l+x)*m_orig + ii);
+                    vec_A[2*x+1] = (vec_t)vec_xl(0, (float*)A + (l+x)*m_orig + ii + 4);
+                }
+            } else {
                 packTranspose(A+(ii*lda)+l, lda, 8, 8, (TA*)vec_A);
+            }
             packTranspose(B+(jj*ldb)+l, ldb, 8, 8, (TA*)vec_B);
             for(int x = 0; x < 16; x+=2) {
                 __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[x], vec_B[x]);
@@ -3205,24 +3233,31 @@ class tinyBLAS_PPC {
          * broadcasted, instead of using packing routine to prepack the
          * matrix elements.
          */
-        if (RM == 1) {
-            TA* a = const_cast<TA*>(A+(ii)*lda+l);
+        if (is_transposed) {
+            // full 4-float column loads; lanes past RM are ignored when the
+            // RM x RN result tile is stored
+            for (int x = 0; x < 4; x++) {
+                vec_A[x] = (vec_t)vec_xl(0, (float*)A + (l+x)*m_orig + ii);
+            }
             packTranspose(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B);
-            vec_A[0] = (vec_t)vec_xl(0,a);
-            vec_A[1] = (vec_t)vec_splats(*((TA*)&vec_A+1));
-            vec_A[2] = (vec_t)vec_splats(*((TA*)&vec_A+2));
-            vec_A[3] = (vec_t)vec_splats(*((TA*)&vec_A+3));
-        } else if (RN == 1) {
-            packTranspose(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A);
-            TB* b = const_cast<TB*>(B+(jj)*ldb+l);
-            vec_B[0] = (vec_t)vec_xl(0,b);
-            vec_B[1] = (vec_t)vec_splats(*((TB*)&vec_B+1));
-            vec_B[2] = (vec_t)vec_splats(*((TB*)&vec_B+2));
-            vec_B[3] = (vec_t)vec_splats(*((TB*)&vec_B+3));
         } else {
-            packTranspose(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A);
-            packTranspose(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B);
+            if (RM == 1) {
+                TA* a = const_cast<TA*>(A+(ii)*lda+l);
+                packTranspose(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B);
+                vec_A[0] = (vec_t)vec_xl(0,a);
+                vec_A[1] = (vec_t)vec_splats(*((TA*)&vec_A+1));
+                vec_A[2] = (vec_t)vec_splats(*((TA*)&vec_A+2));
+                vec_A[3] = (vec_t)vec_splats(*((TA*)&vec_A+3));
+            } else if (RN == 1) {
+                packTranspose(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A);
+                TB* b = const_cast<TB*>(B+(jj)*ldb+l);
+                vec_B[0] = (vec_t)vec_xl(0,b);
+                vec_B[1] = (vec_t)vec_splats(*((TB*)&vec_B+1));
+                vec_B[2] = (vec_t)vec_splats(*((TB*)&vec_B+2));
+                vec_B[3] = (vec_t)vec_splats(*((TB*)&vec_B+3));
+            } else {
+                packTranspose(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A);
+                packTranspose(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B);
+            }
         }
         __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]);
         __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]);
         __builtin_mma_xvf32gerpp(&acc_0, vec_A[2], vec_B[2]);
@@ -3274,6 +3309,8 @@ class tinyBLAS_PPC {
     const int64_t ldc;
     const int ith;
     const int nth;
+    int64_t m_orig;
+    bool is_transposed;
 };
 #endif
 } // namespace
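What the vec_xl loads replace: packTranspose gathers a 4x4 tile of row-major A and transposes it into vec_A, while the pre-transposed layout already has each needed 4-float group contiguous in memory. A scalar model of that equivalence (fill_from_packTranspose and fill_from_transposed are hypothetical helpers that mimic the data movement, not the real packTranspose):

#include <cassert>
#include <cstdint>
#include <vector>

// Both fill out[x][y] = A[ii+y][l+x], which is the lane layout the MMA
// rank-1 updates expect in each 4-float group.
static void fill_from_packTranspose(const float* a_rm, int64_t lda,
                                    int64_t ii, int64_t l, float out[4][4]) {
    for (int x = 0; x < 4; x++)          // depth within the tile
        for (int y = 0; y < 4; y++)      // row of logical A: strided gather
            out[x][y] = a_rm[(ii + y)*lda + (l + x)];
}

static void fill_from_transposed(const float* a_tr, int64_t m_orig,
                                 int64_t ii, int64_t l, float out[4][4]) {
    for (int x = 0; x < 4; x++)
        for (int y = 0; y < 4; y++)      // 4 contiguous floats: one vec_xl
            out[x][y] = a_tr[(l + x)*m_orig + (ii + y)];
}

int main() {
    const int64_t m = 8, k = 8, lda = k, ii = 4, l = 4;
    std::vector<float> a_rm(m * k), a_tr(k * m);
    for (int64_t i = 0; i < m; i++)
        for (int64_t j = 0; j < k; j++) {
            a_rm[i*k + j] = float(i*10 + j);
            a_tr[j*m + i] = a_rm[i*k + j];
        }
    float t0[4][4], t1[4][4];
    fill_from_packTranspose(a_rm.data(), lda, ii, l, t0);
    fill_from_transposed(a_tr.data(), m, ii, l, t1);
    for (int x = 0; x < 4; x++)
        for (int y = 0; y < 4; y++)
            assert(t0[x][y] == t1[x][y]);
    return 0;
}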
@@ -3310,13 +3347,16 @@ class tinyBLAS_PPC {
  */
 bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64_t n, int64_t k,
                      const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
-                     int64_t ldc, int Atype, int Btype, int Ctype) {
-
+                     int64_t ldc, int Atype, int Btype, int Ctype, bool is_transposed) {
+    printf("m=%ld n=%ld k=%ld lda=%ld ldb=%ld ldc=%ld\n", m, n, k, lda, ldb, ldc);
     assert(m >= 0);
     assert(n >= 0);
     assert(k >= 0);
-    assert(lda >= k);
-    assert(ldb >= k);
+    /* lda/ldb checks relaxed while experimenting with transposed A;
+       a per-mode sketch would be:
+       if (is_transposed) assert(lda >= m); else assert(lda >= k);
+       // assert(ldb >= k); */
     assert(ldc >= m);
     assert(params->nth > 0);
     assert(params->ith < params->nth);
@@ -3366,12 +3406,58 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
 #elif defined(__MMA__)
         if (k % 8)
             return false;
+        // POC-only debug dump: prints every operand in full; even small
+        // models will flood stdout, so remove before timing anything.
+        float * Ap = (float*)A;
+        float * Bp = (float*)B;
+        float * Cp = (float*)C;
+        printf("Matrix AT in column major\n");
+        for (int r = 0; r < k; r++) {
+            printf("| ");
+            for (int c = 0; c < m; c++) {
+                printf("%.2f ", Ap[c*k + r]);
+            }
+            printf(" |\n");
+        }
+        printf("A memory layout\n");
+        for (int i = 0; i < (m*k); i++) {
+            printf("%.2f ", *(Ap++));
+        }
+        printf("\n");
+        printf("B in column major\n");
+        for (int r = 0; r < k; r++) {
+            printf("| ");
+            for (int c = 0; c < n; c++) {
+                printf("%.2f ", Bp[c*k + r]);
+            }
+            printf(" |\n");
+        }
+        printf("B memory layout\n");
+        for (int i = 0; i < (n*k); i++) {
+            printf("%.2f ", *(Bp++));
+        }
+        printf("\n");
         tinyBLAS_PPC<float, float, float> tb{
             k, (const float *)A, lda,
             (const float *)B, ldb,
             (float *)C, ldc,
-            params->ith, params->nth};
+            params->ith, params->nth, m, is_transposed};
         tb.matmul(m, n);
+        printf("C Matrix\n");
+        for (int r = 0; r < m; r++) {
+            printf("| ");
+            for (int c = 0; c < n; c++) {
+                printf("%.2f ", Cp[c*m + r]);
+            }
+            printf(" |\n");
+        }
+        printf("C memory layout\n");
+        for (int i = 0; i < (m*n); i++) {
+            printf("%.2f ", *(Cp++));
+        }
+        printf("\n");
         return true;
 #else
         return false;
diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.h b/ggml/src/ggml-cpu/llamafile/sgemm.h
index 729e8853d516c..d79199eece106 100644
--- a/ggml/src/ggml-cpu/llamafile/sgemm.h
+++ b/ggml/src/ggml-cpu/llamafile/sgemm.h
@@ -12,7 +12,7 @@ extern "C" {
 bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t, int64_t, int64_t,
                      const void *, int64_t, const void *, int64_t, void *, int64_t,
-                     int, int, int);
+                     int, int, int, bool is_transposed);

 #ifdef __cplusplus
 }
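A quick numeric cross-check of the core claim, independent of VSX/MMA: accumulating the dot products out of the transposed storage must reproduce exactly the same C as the row-major path, since each factor is the identical float read through a different index. A self-contained sketch (toy sizes, k a multiple of 8 as the MMA path requires; B is stored n x k as the sgemm convention expects):

#include <cassert>
#include <vector>

int main() {
    const int m = 8, n = 8, k = 8;
    std::vector<float> A(m*k), At(k*m), B(n*k), C0(m*n), C1(m*n);
    for (int i = 0; i < m*k; i++) A[i] = float(i % 7) - 3.0f;
    for (int i = 0; i < n*k; i++) B[i] = float(i % 5) - 2.0f;
    for (int j = 0; j < k; j++)          // the converter's .T.contiguous()
        for (int i = 0; i < m; i++)
            At[j*m + i] = A[i*k + j];
    for (int i = 0; i < m; i++)
        for (int jn = 0; jn < n; jn++) {
            float s0 = 0.0f, s1 = 0.0f;
            for (int l = 0; l < k; l++) {
                s0 += A[i*k + l]  * B[jn*k + l];   // normal path: rows of A
                s1 += At[l*m + i] * B[jn*k + l];   // transposed path: columns
            }
            C0[jn*m + i] = s0;                     // column-major C, as the kernel stores it
            C1[jn*m + i] = s1;
        }
    for (int i = 0; i < m*n; i++) assert(C0[i] == C1[i]);
    return 0;
}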
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 5ae1c527df639..58e8f17e920e1 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1904,6 +1904,7 @@ static struct ggml_tensor * ggml_add_impl(
         struct ggml_tensor * a,
         struct ggml_tensor * b,
         bool inplace) {
+    //printf("%s %s\n", a->name, b->name);
     GGML_ASSERT(ggml_can_repeat(b, a));

     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -2972,7 +2973,7 @@ struct ggml_tensor * ggml_mul_mat(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b) {
-    GGML_ASSERT(ggml_can_mul_mat(a, b));
+    //GGML_ASSERT(ggml_can_mul_mat(a, b)); // POC: transposed weights fail the a->ne[0] == b->ne[0] check
     GGML_ASSERT(!ggml_is_transposed(a));

     const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
@@ -3365,6 +3366,10 @@ struct ggml_tensor * ggml_reshape_3d(
         int64_t ne1,
         int64_t ne2) {
     GGML_ASSERT(ggml_is_contiguous(a));
+    //printf("%s\n", a->name);
+    //printf("a->ne[] = [%lld %lld %lld %lld]\n", a->ne[0], a->ne[1], a->ne[2], a->ne[3]);
+    //printf("ggml_nelements=%lld ne0=%lld ne1=%lld ne2=%lld\n", ggml_nelements(a), ne0, ne1, ne2);
     GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2);

     const int64_t ne[3] = { ne0, ne1, ne2 };
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index bd9e6da8832b7..569b15cbef5bf 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -774,6 +774,20 @@ const struct ggml_tensor * llama_model_loader::check_tensor_dims(const std::stri
                 break;
             }
         }
+        // if the direct match fails, try a transposed match (2D tensors only)
+        if (!is_ok && ne.size() == 2) {
+            bool is_transposed_ok = (cur->ne[0] == ne[1] && cur->ne[1] == ne[0]);
+            for (size_t i = 2; i < GGML_MAX_DIMS; ++i) {
+                if (cur->ne[i] != 1) {
+                    is_transposed_ok = false;
+                    break;
+                }
+            }
+            if (is_transposed_ok) {
+                is_ok = true;
+            }
+        }
+
         if (!is_ok) {
             throw std::runtime_error(
                     format("%s: tensor '%s' has wrong shape; expected %s, got %s",