
Commit 7f4e47f

Merge branch 'master' of https://github.com/unslothai/llama.cpp
2 parents: 9b1fae8 + a994ffa

8 files changed: 34 additions, 3 deletions

ggml/src/ggml-cann/ggml-cann.cpp
Lines changed: 1 addition & 0 deletions

@@ -2090,6 +2090,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
             {
                 // TODO: add support
                 // ref: https://github.com/ggml-org/llama.cpp/pull/14274
+#pragma message("TODO: implement F32, F16, BF16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
                 return false;
             } break;
         case GGML_OP_CPY: {
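
The #pragma message directive used here surfaces the TODO at build time: unlike a plain // TODO comment, the compiler (GCC, Clang, and MSVC all support this directive) prints the string as a diagnostic every time the translation unit is compiled. A minimal sketch of the pattern, with a hypothetical function name chosen just for illustration:

#include <cstdio>

// Printed as a compiler note on every build, so the missing feature
// cannot be silently forgotten the way a source comment can.
#pragma message("TODO: implement quantized SET_ROWS support")

// Hypothetical stand-in for a backend supports_op check: until the kernels
// exist, report no support so ggml can schedule the op on another backend.
static bool backend_supports_set_rows(void) {
    return false;
}

int main(void) {
    std::printf("supported: %d\n", backend_supports_set_rows());
}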

ggml/src/ggml-cuda/ggml-cuda.cu
Lines changed: 6 additions & 1 deletion

@@ -2303,6 +2303,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_UNARY_OP_EXP:
             ggml_cuda_op_exp(ctx, dst);
             break;
+        case GGML_UNARY_OP_ELU:
+            ggml_cuda_op_elu(ctx, dst);
+            break;
         default:
             return false;
     }
@@ -3116,6 +3119,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_UNARY_OP_GELU_QUICK:
         case GGML_UNARY_OP_TANH:
         case GGML_UNARY_OP_EXP:
+        case GGML_UNARY_OP_ELU:
             return ggml_is_contiguous(op->src[0]);
         default:
             return false;
@@ -3222,7 +3226,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
             } break;
         case GGML_OP_SET_ROWS:
             {
-                return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) &&
+#pragma message("TODO: implement Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
+                return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16) &&
                        op->src[0]->type == GGML_TYPE_F32 &&
                        op->src[1]->type == GGML_TYPE_I64;
             } break;
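
For context, GGML_OP_SET_ROWS writes rows of src0 into dst at the row positions listed in the I64 index tensor src1; the predicate above only advertises the destination types the CUDA kernels can currently produce (F32, F16, and now BF16). A minimal host-side sketch of the row-scatter semantics, simplified to 2D and using hypothetical names:

#include <cstdint>
#include <cstring>

// Simplified SET_ROWS reference: dst[rows[i]] = src[i] for each source row.
// src holds n_src rows of n_cols floats; rows holds n_src destination indices.
static void set_rows_ref(float * dst, const float * src,
                         const int64_t * rows, int n_src, int n_cols) {
    for (int i = 0; i < n_src; ++i) {
        std::memcpy(dst + rows[i] * n_cols, src + i * n_cols,
                    n_cols * sizeof(float));
    }
}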

ggml/src/ggml-cuda/set-rows.cu
Lines changed: 15 additions & 0 deletions

@@ -10,6 +10,11 @@ __device__ __forceinline__ void set_rows_1<float, half>(const float * src_f, hal
     *dst_h = __float2half(*src_f);
 }
 
+template<>
+__device__ __forceinline__ void set_rows_1<float, nv_bfloat16>(const float * src_f, nv_bfloat16 * dst_b) {
+    *dst_b = *src_f;
+}
+
 template<>
 __device__ __forceinline__ void set_rows_1<float, float>(const float * src_f, float * dst_f) {
     *dst_f = *src_f;
@@ -124,6 +129,16 @@ void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
             nb1, nb2, nb3,
             stream
         );
+    } else if (dst->type == GGML_TYPE_BF16) {
+        set_rows_cuda(
+            src0_d, src1_d, (nv_bfloat16*)dst->data,
+            ne00, ne01, ne02, ne03,
+            ne10, ne11, ne12, ne13,
+            nb01, nb02, nb03,
+            nb10, nb11, nb12,
+            nb1, nb2, nb3,
+            stream
+        );
     } else {
         GGML_ABORT("unsupported type");
     }
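
The new nv_bfloat16 specialization can be a plain assignment because <cuda_bf16.h> provides a converting constructor from float. Conceptually, bfloat16 keeps float32's sign and 8 exponent bits and truncates the mantissa to 7 bits, so the conversion amounts to taking the top 16 bits of the IEEE-754 word with rounding. A host-side illustration of that idea (a sketch only, not the CUDA intrinsic; NaN handling omitted):

#include <cstdint>
#include <cstring>

// Illustrative float32 -> bfloat16: keep the high 16 bits of the IEEE-754
// representation, rounding the discarded low mantissa bits to nearest-even.
static uint16_t f32_to_bf16(float f) {
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    const uint32_t rounding = 0x7fff + ((bits >> 16) & 1); // nearest-even tie-break
    return (uint16_t)((bits + rounding) >> 16);
}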

ggml/src/ggml-cuda/unary.cu
Lines changed: 7 additions & 0 deletions

@@ -83,6 +83,10 @@ static __device__ __forceinline__ float op_log(float x) {
     return logf(x);
 }
 
+static __device__ __forceinline__ float op_elu(float x) {
+    return (x > 0.f) ? x : expm1f(x);
+}
+
 template <float (*op)(float), typename T>
 static __global__ void unary_op_kernel(const T * x, T * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
@@ -196,6 +200,9 @@ void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     ggml_cuda_op_unary<op_log>(ctx, dst);
 }
 
+void ggml_cuda_op_elu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_elu>(ctx, dst);
+}
 /* gated ops */
 
 template <float (*op)(float), typename T>
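
op_elu implements ELU with alpha = 1: f(x) = x for x > 0 and e^x - 1 otherwise, so the function is continuous at zero and saturates toward -1 for large negative inputs. Using expm1f(x) instead of expf(x) - 1.0f avoids the catastrophic cancellation the latter suffers for x near zero. A host-side reference that mirrors the device function:

#include <cassert>
#include <cmath>

// Reference ELU (alpha = 1), matching the device-side op_elu above.
static float elu_ref(float x) {
    return (x > 0.f) ? x : std::expm1(x); // expm1(x) = e^x - 1, accurate near 0
}

int main() {
    assert(elu_ref(2.0f)  == 2.0f);   // identity on the positive side
    assert(elu_ref(0.0f)  == 0.0f);   // continuous at zero
    assert(elu_ref(-100.f) >= -1.0f); // saturates toward -1 for very negative x
    return 0;
}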

ggml/src/ggml-cuda/unary.cuh
Lines changed: 2 additions & 0 deletions

@@ -59,6 +59,8 @@ void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
 void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
+void ggml_cuda_op_elu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
 void ggml_cuda_op_reglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
 void ggml_cuda_op_geglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

ggml/src/ggml-opencl/ggml-opencl.cpp
Lines changed: 1 addition & 0 deletions

@@ -2280,6 +2280,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
             {
                 // TODO: add support
                 // ref: https://github.com/ggml-org/llama.cpp/pull/14274
+#pragma message("TODO: implement BF16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
                 if (op->src[0]->type != GGML_TYPE_F32) {
                     return false;
                 }

ggml/src/ggml-sycl/ggml-sycl.cpp
Lines changed: 1 addition & 0 deletions

@@ -4303,6 +4303,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
             {
                 // TODO: add support
                 // ref: https://github.com/ggml-org/llama.cpp/pull/14274
+#pragma message("TODO: implement BF16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
                 return (op->type == GGML_TYPE_F32 || (op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_I64));
             } break;
         case GGML_OP_CPY:

src/llama-quant.cpp
Lines changed: 1 addition & 2 deletions

@@ -884,8 +884,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
                 if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
                     if (qtype != new_type) {
                         LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
-                        new_type = qtype;
-                        break; // if two or more types are specified for the tensor, first match wins
+                        new_type = qtype; // if two or more types are specified for the same tensor, the last match wins
                     }
                 }
             }
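
This changes the semantics of per-tensor quantization type overrides: the old break exited the loop after the first regex that matched a tensor name (first match wins), while the new code keeps scanning, so later patterns overwrite earlier ones and the last match wins. A standalone sketch of the new behavior, with hypothetical patterns and type names:

#include <cstdio>
#include <regex>
#include <string>
#include <utility>
#include <vector>

int main() {
    // Hypothetical overrides: regex on the tensor name -> quantization type.
    const std::vector<std::pair<std::string, std::string>> overrides = {
        { "attn_.*", "Q4_0" },   // broad pattern, matches first
        { "attn_q",  "Q8_0" },   // narrower pattern, matches later
    };
    const std::string tensor_name = "attn_q.weight";

    std::string new_type = "Q4_K"; // default before any override
    for (const auto & [tname, qtype] : overrides) {
        if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
            new_type = qtype; // no break: a later match overwrites an earlier one
        }
    }
    std::printf("%s -> %s\n", tensor_name.c_str(), new_type.c_str()); // prints Q8_0
}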
