13 changes: 11 additions & 2 deletions convert_hf_to_gguf.py
@@ -1963,6 +1963,7 @@ def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
_experts: list[dict[str, Tensor]] | None = None

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
print(f"[GGUF-CONVERT] modifying tensor {name}")
n_head = self.hparams["num_attention_heads"]
n_kv_head = self.hparams.get("num_key_value_heads")
is_vision_tensor = "vision_tower" in name \
@@ -1985,6 +1986,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
if name.endswith(("k_proj.weight", "k_proj.bias")):
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)

#if name.endswith(("attn.k_proj.weight", "attn.o_proj.weight", "attn.v_proj.weight","attn.q_proj.weight","up_proj.weight", "gate_proj.weight", "down_proj.weight")):
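# NOTE: only attn.o_proj / up_proj / gate_proj are stored transposed for now; the
# CPU mul_mat path matches the corresponding GGUF names (attn_output.weight,
# ffn_up.weight, ffn_gate.weight) to decide when to take the transposed sgemm path.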
if name.endswith(( "attn.o_proj.weight", "up_proj.weight", "gate_proj.weight")):
print(f"[GGUF-CONVERT] Transposing {name}")
data_torch = data_torch.T.contiguous()
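# .T alone only swaps strides; .contiguous() materializes the transposed
# layout so the GGUF file actually stores the swapped data order.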


# process the experts separately
if name.find("block_sparse_moe.experts") != -1:
n_experts = self.hparams["num_local_experts"]
@@ -2018,8 +2025,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
return tensors
else:
return []

return [(self.map_tensor_name(name), data_torch)]
mapped_name = self.map_tensor_name(name)
print(f"[GGUF-CONVERT] Mapping: {name} --> {mapped_name}")
print(f"[GGUF-CONVERT] Final shape for {mapped_name}: {data_torch.shape}")
return [(mapped_name, data_torch)]

def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
54 changes: 47 additions & 7 deletions ggml/src/ggml-cpu/ggml-cpu.c
@@ -1234,11 +1234,45 @@ void ggml_compute_forward_mul_mat(
const int64_t r3 = ne13 / ne03;

const bool src1_cont = ggml_is_contiguous(src1);

bool is_transposed = false;
if (src1_cont) {
for (int64_t i13 = 0; i13 < ne13; i13++)
for (int64_t i12 = 0; i12 < ne12; i12++)
const char * name = src0->name;
const char * name1 = src1->name;

if (name && (strstr(name, "attn_output.weight") ||
             strstr(name, "ffn_up.weight") ||
             strstr(name, "ffn_gate.weight"))) {
printf("[llamafile_sgemm] src0 %s was transposed during HF->GGUF conversion\n", name);
is_transposed = true;
//is_transposed = false;
}
if (name1 && (strstr(name1, "attn_output.weight") ||
              strstr(name1, "ffn_up.weight") ||
              strstr(name1, "ffn_gate.weight"))) {
printf("[llamafile_sgemm] src1 %s was transposed during HF->GGUF conversion\n", name1);
}
printf("\n==> llamafile_sgemm call: %s * %s\n", src0->name, src1->name);
printf("A shape: [%lld x %lld] B shape: [%lld x %lld]\n",
       (long long) src0->ne[1], (long long) src0->ne[0],
       (long long) src1->ne[1], (long long) src1->ne[0]);

for (int64_t i13 = 0; i13 < ne13; i13++) {
for (int64_t i12 = 0; i12 < ne12; i12++) {
if (is_transposed) {
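// src0 was stored transposed at conversion time, so its ne[0]/ne[1] are
// swapped relative to the logical weight: ne00 becomes the row count and
// ne01 the inner dimension, and tinyBLAS_PPC indexes A column-wise via
// m_orig instead of packing row-major tiles (see sgemm.cpp).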
if (!llamafile_sgemm(params,
ne00/ggml_blck_size(src0->type), ne11, ne01,
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
ne01,
(const char *)src1->data + i12*nb12 + i13*nb13,
nb11/ggml_type_size(src1->type),
(char *)dst->data + i12*nb2 + i13*nb3,
ne00/ggml_blck_size(src0->type),
src0->type,
src1->type,
dst->type, is_transposed))
goto UseGgmlGemm1;
} else {
if (!llamafile_sgemm(params,
ne01, ne11, ne00/ggml_blck_size(src0->type),
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
nb01/ggml_type_size(src0->type),
@@ -1248,8 +1282,11 @@ void ggml_compute_forward_mul_mat(
nb1/ggml_type_size(dst->type),
src0->type,
src1->type,
dst->type))
dst->type, false))
goto UseGgmlGemm1;
}
}
}
return;
}
UseGgmlGemm1:;
@@ -1304,8 +1341,9 @@ UseGgmlGemm1:;
const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
const size_t row_size = ggml_row_size(vec_dot_type, ne10);

for (int64_t i13 = 0; i13 < ne13; i13++)
for (int64_t i12 = 0; i12 < ne12; i12++)
for (int64_t i13 = 0; i13 < ne13; i13++) {
for (int64_t i12 = 0; i12 < ne12; i12++) {
//printf("calling from 2nd site here \n");
if (!llamafile_sgemm(params,
ne01, ne11, ne00/ggml_blck_size(src0->type),
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
Expand All @@ -1316,8 +1354,10 @@ UseGgmlGemm1:;
nb1/ggml_type_size(dst->type),
src0->type,
vec_dot_type,
dst->type))
dst->type, false))
goto UseGgmlGemm2;
}
}
return;
}
UseGgmlGemm2:;
138 changes: 112 additions & 26 deletions ggml/src/ggml-cpu/llamafile/sgemm.cpp
@@ -2695,11 +2695,13 @@ class tinyBLAS_PPC {
const TA *A, int64_t lda,
const TB *B, int64_t ldb,
TC *C, int64_t ldc,
int ith, int nth)
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
int ith, int nth, int64_t m_orig, bool is_transposed)
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth), m_orig(m_orig), is_transposed(is_transposed) {
}

void matmul(int64_t m, int64_t n) {
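// remember the logical row count of A: when A arrives transposed
// (column-major), element (i, l) is read from A[l*m_orig + i]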
m_orig = m;
mnpack(0, m, 0, n);
}

@@ -2957,7 +2959,13 @@ class tinyBLAS_PPC {
acc_t acc_0;
__builtin_mma_xxsetaccz(&acc_0);
for (int l = 0; l < k; l+=4) {
packTranspose<vector float>(A+(ii*lda)+l, lda, 4, 4, (TA*)vec_A);
if (is_transposed) {
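// A is already column-major here, so a 4x4 tile can be loaded directly:
// vec_A[x] holds rows ii..ii+3 of column l+x, matching the layout that
// packTranspose produces from row-major A in the else branch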
for (int x = 0; x < 4; x++) {
vec_A[x] = (vec_t)vec_xl(0, (float*)A+ (l+x)*m_orig+ii);
}
} else {
packTranspose<vector float>(A+(ii*lda)+l, lda, 4, 4, (TA*)vec_A);
}
packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
__builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]);
__builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]);
@@ -2973,7 +2981,13 @@ class tinyBLAS_PPC {
__builtin_mma_xxsetaccz(&acc_0);
__builtin_mma_xxsetaccz(&acc_1);
for (int64_t l = 0; l < k; l+=4) {
packTranspose<vector float>(A+(ii*lda)+l, lda, 4, 4, (TA*)vec_A);
if (is_transposed) {
for (int x = 0; x < 4; x++) {
    vec_A[x] = (vec_t) vec_xl(0, (float*)A+(l+x)*m_orig+ii);
}
} else {
packTranspose<vector float>(A+(ii*lda)+l, lda, 4, 4, (TA*)vec_A);
}
packTranspose<vector float>(B+(jj*ldb)+l, ldb, 8, 4, (TA*)vec_B);
__builtin_mma_xvf32gerpp(&acc_0, vec_A[0], (vec_t)vec_B[0]);
__builtin_mma_xvf32gerpp(&acc_1, vec_A[0], (vec_t)vec_B[1]);
@@ -2994,7 +3008,14 @@ class tinyBLAS_PPC {
__builtin_mma_xxsetaccz(&acc_0);
__builtin_mma_xxsetaccz(&acc_1);
for (int64_t l = 0; l < k; l+=4) {
if (is_transposed) {
for (int x = 0; x < 4; x++) {
vec_A[2*x] = (vec_t)vec_xl(0, (float*)A+(l+x)*m_orig+ii);
vec_A[2*x+1] = (vec_t)vec_xl(0, (float*)A+(l+x)*m_orig+ii+4);
}
} else {
packTranspose<vector float>(A+(ii*lda)+l, lda, 8, 4, (TA*)vec_A);
}
packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
__builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[0], vec_B[0]);
__builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[1], vec_B[0]);
@@ -3017,7 +3038,14 @@ class tinyBLAS_PPC {
__builtin_mma_xxsetaccz(&acc_2);
__builtin_mma_xxsetaccz(&acc_3);
for (int l = 0; l < k; l+=8) {
if (is_transposed) {
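// 8-row tiles: two adjacent vec_xl loads per k-slice cover rows ii..ii+3
// and ii+4..ii+7 of column l+x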
for (int x = 0; x < 8; x++) {
vec_A[2*x] = (vec_t)vec_xl(0, (float*)A+(l+x)*m_orig+ii);
vec_A[2*x+1] = (vec_t)vec_xl(0, (float*)A+(l+x)*m_orig+ii+4);
}
} else {
packTranspose<vector float>(A+(ii*lda)+l, lda, 8, 8, (TA*)vec_A);
}
packTranspose<vector float>(B+(jj*ldb)+l, ldb, 8, 8, (TA*)vec_B);
for(int x = 0; x < 16; x+=2) {
__builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[x], vec_B[x]);
@@ -3205,24 +3233,31 @@ class tinyBLAS_PPC {
* broadcasted, instead of using packing routine to prepack the
* matrix elements.
*/
if (RM == 1) {
TA* a = const_cast<TA*>(A+(ii)*lda+l);
if (is_transposed) {
for (int x = 0; x < 4; x++) {
vec_A[x] = (vec_t)vec_xl(0, (float*)A+(l+x)*m_orig+ii);
}
packTranspose<vector float>(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B);
vec_A[0] = (vec_t)vec_xl(0,a);
vec_A[1] = (vec_t)vec_splats(*((TA*)&vec_A+1));
vec_A[2] = (vec_t)vec_splats(*((TA*)&vec_A+2));
vec_A[3] = (vec_t)vec_splats(*((TA*)&vec_A+3));
} else if (RN == 1) {
packTranspose<vector float>(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A);
TB* b = const_cast<TB*>(B+(jj)*ldb+l);
vec_B[0] = (vec_t)vec_xl(0,b);
vec_B[1] = (vec_t)vec_splats(*((TB*)&vec_B+1));
vec_B[2] = (vec_t)vec_splats(*((TB*)&vec_B+2));
vec_B[3] = (vec_t)vec_splats(*((TB*)&vec_B+3));
} else {
packTranspose<vector float>(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A);
packTranspose<vector float>(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B);
}
} else {
if (RM == 1) {
TA* a = const_cast<TA*>(A+(ii)*lda+l);
packTranspose<vector float>(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B);
vec_A[0] = (vec_t)vec_xl(0,a);
vec_A[1] = (vec_t)vec_splats(*((TA*)&vec_A+1));
vec_A[2] = (vec_t)vec_splats(*((TA*)&vec_A+2));
vec_A[3] = (vec_t)vec_splats(*((TA*)&vec_A+3));
} else if (RN == 1) {
packTranspose<vector float>(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A);
TB* b = const_cast<TB*>(B+(jj)*ldb+l);
vec_B[0] = (vec_t)vec_xl(0,b);
vec_B[1] = (vec_t)vec_splats(*((TB*)&vec_B+1));
vec_B[2] = (vec_t)vec_splats(*((TB*)&vec_B+2));
vec_B[3] = (vec_t)vec_splats(*((TB*)&vec_B+3));
} else {
packTranspose<vector float>(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A);
packTranspose<vector float>(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B);
}
}
__builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]);
__builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]);
__builtin_mma_xvf32gerpp(&acc_0, vec_A[2], vec_B[2]);
@@ -3274,6 +3309,8 @@ class tinyBLAS_PPC {
const int64_t ldc;
const int ith;
const int nth;
int64_t m_orig;
bool is_transposed;
};
#endif
} // namespace
@@ -3310,13 +3347,16 @@ class tinyBLAS_PPC {
*/
bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64_t n, int64_t k,
const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
int64_t ldc, int Atype, int Btype, int Ctype) {

int64_t ldc, int Atype, int Btype, int Ctype, bool is_transposed) {
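// is_transposed marks weights that were stored transposed during the
// HF->GGUF conversion; only the PPC __MMA__ path below acts on it, the
// remaining backends keep their original row-major assumptions.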
printf("m=%ld n=%ld k=%ld lda=%ld ldb=%ld ldc=%ld\n", m, n, k, lda, ldb, ldc);
assert(m >= 0);
assert(n >= 0);
assert(k >= 0);
assert(lda >= k);
assert(ldb >= k);
/* the lda/ldb checks below hold only for the regular row-major layout;
 * weights stored transposed during HF->GGUF conversion are exempt */
if (!is_transposed) {
assert(lda >= k);
assert(ldb >= k);
}
assert(ldc >= m);
assert(params->nth > 0);
assert(params->ith < params->nth);
@@ -3366,12 +3406,58 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
#elif defined(__MMA__)
if (k % 8)
return false;
//if (is_transposed)
//printf("A was transposed during GGUF; m = %d n = %d k = %d\n", m, n, k);
float * Ap = (float*)A;
float * Bp = (float*)B;
float * Cp = (float*)C;
printf("Matrix AT in column major\n");
for (int r = 0; r < k; r ++) {
printf("| ");
for (int c = 0; c< m; c++) {
printf("%.2f ", Ap[c*k + r]);
}
printf(" |\n");
}
printf("A memory layout n");
for (int i = 0; i < (m*k); i++){
printf("%.2f ", *(Ap++));
}
printf("\n");
printf("B in column major\n");
for (int r = 0; r < k; r ++) {
printf("| ");
for (int c = 0; c< n; c++) {
printf("%.2f ", Bp[c*k + r]);
}
printf(" |\n");
}

printf("B memory layout n");
for (int i = 0; i < (n*k); i++){
printf("%.2f ", *(Bp++));
}
printf("\n");
tinyBLAS_PPC<float, float, float> tb{
k, (const float *)A, lda,
(const float *)B, ldb,
(float *)C, ldc,
params->ith, params->nth};
params->ith, params->nth, m, is_transposed};
tb.matmul(m, n);
printf("C Matrix\n");
for (int r = 0; r < m; r ++) {
printf("| ");
for (int c = 0; c< n; c++) {
printf("%.2f ", Cp[c*m + r]);
}
printf(" |\n");
}

for (int i = 0; i < (m*n); i++){
printf("%.2f ", *(Cp++));
}
printf("\n");
//printf("completd llamafile_Sgemm\n");
return true;
#else
return false;
2 changes: 1 addition & 1 deletion ggml/src/ggml-cpu/llamafile/sgemm.h
@@ -12,7 +12,7 @@ extern "C" {

bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t, int64_t, int64_t,
const void *, int64_t, const void *, int64_t, void *, int64_t,
int, int, int);
int, int, int, bool is_transposed);

#ifdef __cplusplus
}
7 changes: 6 additions & 1 deletion ggml/src/ggml.c
@@ -1904,6 +1904,7 @@ static struct ggml_tensor * ggml_add_impl(
struct ggml_tensor * a,
struct ggml_tensor * b,
bool inplace) {
//printf("%s %s\n", a->name, b->name);
GGML_ASSERT(ggml_can_repeat(b, a));

struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -2972,7 +2973,7 @@ struct ggml_tensor * ggml_mul_mat(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
GGML_ASSERT(ggml_can_mul_mat(a, b));
//GGML_ASSERT(ggml_can_mul_mat(a, b));
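// disabled for this experiment: a weight stored transposed in the GGUF no
// longer satisfies ggml_can_mul_mat()'s a->ne[0] == b->ne[0] check; the
// loader accepts the swapped shape instead (see llama_model_loader::check_tensor_dims)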
GGML_ASSERT(!ggml_is_transposed(a));

const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
@@ -3365,6 +3366,10 @@ struct ggml_tensor * ggml_reshape_3d(
int64_t ne1,
int64_t ne2) {
GGML_ASSERT(ggml_is_contiguous(a));
//printf("%s\n", a->name);
//printf("a->ne[] = [%lld %lld %lld %lld]\n", a->ne[0], a->ne[1], a->ne[2], a->ne[3]);

//printf("ggml_nelements=%d ne0=%d ne1=%d ne2=%d\n", ggml_nelements(a), ne0, ne1, ne2);
GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2);

const int64_t ne[3] = { ne0, ne1, ne2 };
14 changes: 14 additions & 0 deletions src/llama-model-loader.cpp
@@ -774,6 +774,20 @@ const struct ggml_tensor * llama_model_loader::check_tensor_dims(const std::stri
break;
}
}
// if direct match fails, try transposed match (only for 2D tensors)
if (!is_ok && ne.size() == 2) {
bool is_transposed_ok = (cur->ne[0] == ne[1] && cur->ne[1] == ne[0]);
for (size_t i = 2; i < GGML_MAX_DIMS; ++i) {
if (cur->ne[i] != 1) {
is_transposed_ok = false;
break;
}
}
if (is_transposed_ok) {
is_ok = true;
}
}

if (!is_ok) {
throw std::runtime_error(
format("%s: tensor '%s' has wrong shape; expected %s, got %s",