diff --git a/README.md b/README.md
index 17b8b42..59110ac 100644
--- a/README.md
+++ b/README.md
@@ -78,6 +78,32 @@ const App = () => {
## Language Model
+### Model Options
+
+Choose the model quantization and enable NPU acceleration with Pro models.
+
+```typescript
+import { CactusLM } from 'cactus-react-native';
+
+// Use int4 for faster performance and smaller file size
+const cactusLM = new CactusLM({
+ model: 'lfm2-vl-450m',
+ options: {
+ quantization: 'int4', // 'int4' or 'int8'
+ pro: false
+ }
+});
+
+// Use pro models for NPU acceleration
+const cactusPro = new CactusLM({
+ model: 'lfm2-vl-450m',
+ options: {
+ quantization: 'int4',
+ pro: true
+ }
+});
+```
+
### Completion
Generate text responses from the model by providing a conversation history.
@@ -559,6 +585,60 @@ const App = () => {
};
```
+### Streaming Transcription
+
+Transcribe audio in real-time with incremental results.
+
+#### Class
+
+```typescript
+import { CactusSTT } from 'cactus-react-native';
+
+const cactusSTT = new CactusSTT({ model: 'whisper-small' });
+
+await cactusSTT.streamTranscribeInit();
+
+const audioChunk: number[] = [/* PCM samples */];
+await cactusSTT.streamTranscribeInsert({ audio: audioChunk });
+
+const result = await cactusSTT.streamTranscribeProcess({
+ options: { confirmationThreshold: 0.95 }
+});
+
+console.log('Confirmed:', result.confirmed);
+console.log('Pending:', result.pending);
+
+const final = await cactusSTT.streamTranscribeFinalize();
+await cactusSTT.streamTranscribeDestroy();
+```
+
+#### Hook
+
+```tsx
+import { useCactusSTT } from 'cactus-react-native';
+
+const App = () => {
+ const cactusSTT = useCactusSTT({ model: 'whisper-small' });
+
+ const handleStream = async () => {
+ await cactusSTT.streamTranscribeInit();
+
+ const audioChunk: number[] = [/* PCM samples */];
+ await cactusSTT.streamTranscribeInsert({ audio: audioChunk });
+
+ await cactusSTT.streamTranscribeProcess();
+ };
+
+ return (
+ <>
+
+ {cactusSTT.streamTranscribeConfirmed}
+ {cactusSTT.streamTranscribePending}
+ >
+ );
+};
+```
+
### Audio Embedding
Generate embeddings from audio files for audio understanding.
@@ -854,9 +934,12 @@ const App = () => {
**`new CactusLM(params?: CactusLMParams)`**
**Parameters:**
-- `model` - Model slug or absolute path to Cactus model (default: `'qwen3-0.6'`).
+- `model` - Model slug or absolute path to Cactus model (default: `'qwen3-0.6b'`).
- `contextSize` - Context window size (default: `2048`).
- `corpusDir` - Directory containing text files for RAG (default: `undefined`).
+- `options` - Model options for quantization and NPU acceleration:
+ - `quantization` - Quantization type: `'int4'` | `'int8'` (default: `'int4'`).
+ - `pro` - Enable NPU-accelerated models (default: `false`).
#### Methods
@@ -932,13 +1015,13 @@ Resets the model's internal state, clearing any cached context. Automatically ca
Releases all resources associated with the model. Automatically calls `stop()` first. Safe to call even if the model is not initialized.
-**`getModels(): Promise`**
+**`getModels(): CactusModel[]`**
-Fetches available models from the database and checks their download status.
+Returns available models.
### useCactusLM Hook
-The `useCactusLM` hook manages a `CactusLM` instance with reactive state. When model parameters (`model`, `contextSize`, or `corpusDir`) change, the hook creates a new instance and resets all state. The hook automatically cleans up resources when the component unmounts.
+The `useCactusLM` hook manages a `CactusLM` instance with reactive state. When model parameters (`model`, `contextSize`, `corpusDir`, `options`) change, the hook creates a new instance and resets all state. The hook automatically cleans up resources when the component unmounts.
#### State
@@ -962,7 +1045,7 @@ The `useCactusLM` hook manages a `CactusLM` instance with reactive state. When m
- `stop(): Promise` - Stops ongoing generation. Clears any errors.
- `reset(): Promise` - Resets the model's internal state, clearing cached context. Also clears the `completion` state.
- `destroy(): Promise` - Releases all resources associated with the model. Clears the `completion` state. Automatically called when the component unmounts.
-- `getModels(): Promise` - Fetches available models from the database and checks their download status.
+- `getModels(): CactusModel[]` - Returns available models.
### CactusSTT Class
@@ -971,8 +1054,11 @@ The `useCactusLM` hook manages a `CactusLM` instance with reactive state. When m
**`new CactusSTT(params?: CactusSTTParams)`**
**Parameters:**
-- `model` - Model slug or absolute path to Cactus model (default: `'qwen3-0.6'`).
+- `model` - Model slug or absolute path to Cactus model (default: `'whisper-small'`).
- `contextSize` - Context window size (default: `2048`).
+- `options` - Model options for quantization and NPU acceleration:
+ - `quantization` - Quantization type: `'int4'` | `'int8'` (default: `'int4'`).
+ - `pro` - Enable NPU-accelerated models (default: `false`).
#### Methods
@@ -1009,6 +1095,33 @@ Generates embeddings for the given audio file. Automatically calls `init()` if n
**Parameters:**
- `audioPath` - Path to the audio file.
+**`streamTranscribeInit(): Promise<void>`**
+
+Initializes a streaming transcription session. Automatically calls `init()` if not already initialized.
+
+**`streamTranscribeInsert(params: CactusSTTStreamTranscribeInsertParams): Promise<void>`**
+
+Inserts PCM audio samples into the streaming buffer.
+
+**Parameters:**
+- `audio` - Array of PCM audio samples.
+
+**`streamTranscribeProcess(params?: CactusSTTStreamTranscribeProcessParams): Promise<CactusSTTStreamTranscribeProcessResult>`**
+
+Processes accumulated audio and returns incremental transcription results.
+
+**Parameters:**
+- `options` - Processing options:
+ - `confirmationThreshold` - Confidence threshold for confirming text.
+
+**`streamTranscribeFinalize(): Promise<CactusSTTStreamTranscribeFinalizeResult>`**
+
+Finalizes the streaming session and returns remaining transcription text.
+
+**`streamTranscribeDestroy(): Promise<void>`**
+
+Destroys the streaming session and releases resources.
+
**`stop(): Promise`**
Stops ongoing transcription or embedding generation.
@@ -1021,18 +1134,21 @@ Resets the model's internal state. Automatically calls `stop()` first.
Releases all resources associated with the model. Automatically calls `stop()` first. Safe to call even if the model is not initialized.
-**`getModels(): Promise`**
+**`getModels(): CactusModel[]`**
-Fetches available STT models from the database and checks their download status.
+Returns available speech-to-text models.
### useCactusSTT Hook
-The `useCactusSTT` hook manages a `CactusSTT` instance with reactive state. When model parameters (`model`, `contextSize`) change, the hook creates a new instance and resets all state. The hook automatically cleans up resources when the component unmounts.
+The `useCactusSTT` hook manages a `CactusSTT` instance with reactive state. When model parameters (`model`, `contextSize`, `options`) change, the hook creates a new instance and resets all state. The hook automatically cleans up resources when the component unmounts.
#### State
- `transcription: string` - Current transcription text. Automatically accumulated during streaming. Cleared before each new transcription and when calling `reset()` or `destroy()`.
+- `streamTranscribeConfirmed: string` - Accumulated confirmed text from streaming transcription.
+- `streamTranscribePending: string` - Current pending text from streaming transcription.
- `isGenerating: boolean` - Whether the model is currently generating (transcription or embedding). Both operations share this flag.
+- `isStreamTranscribing: boolean` - Whether a streaming transcription session is active.
- `isInitializing: boolean` - Whether the model is initializing.
- `isDownloaded: boolean` - Whether the model is downloaded locally. Automatically checked when the hook mounts or model changes.
- `isDownloading: boolean` - Whether the model is being downloaded.
@@ -1045,10 +1161,15 @@ The `useCactusSTT` hook manages a `CactusSTT` instance with reactive state. When
- `init(): Promise` - Initializes the model for inference. Sets `isInitializing` to `true` during initialization.
- `transcribe(params: CactusSTTTranscribeParams): Promise` - Transcribes audio to text. Automatically accumulates tokens in the `transcription` state during streaming. Sets `isGenerating` to `true` while generating. Clears `transcription` before starting.
- `audioEmbed(params: CactusSTTAudioEmbedParams): Promise` - Generates embeddings for the given audio. Sets `isGenerating` to `true` during operation.
+- `streamTranscribeInit(): Promise<void>` - Initializes a streaming transcription session. Sets `isStreamTranscribing` to `true`.
+- `streamTranscribeInsert(params: CactusSTTStreamTranscribeInsertParams): Promise<void>` - Inserts audio chunks into the streaming buffer.
+- `streamTranscribeProcess(params?: CactusSTTStreamTranscribeProcessParams): Promise<CactusSTTStreamTranscribeProcessResult>` - Processes audio and returns results. Automatically accumulates confirmed text in `streamTranscribeConfirmed` and updates `streamTranscribePending`.
+- `streamTranscribeFinalize(): Promise<CactusSTTStreamTranscribeFinalizeResult>` - Finalizes streaming and returns remaining text.
+- `streamTranscribeDestroy(): Promise<void>` - Destroys the streaming session. Sets `isStreamTranscribing` to `false`.
- `stop(): Promise` - Stops ongoing generation. Clears any errors.
- `reset(): Promise` - Resets the model's internal state. Also clears the `transcription` state.
- `destroy(): Promise` - Releases all resources associated with the model. Clears the `transcription` state. Automatically called when the component unmounts.
-- `getModels(): Promise` - Fetches available STT models from the database and checks their download status.
+- `getModels(): CactusModel[]` - Returns available speech-to-text models.
### CactusIndex Class
@@ -1137,6 +1258,7 @@ interface CactusLMParams {
model?: string;
contextSize?: number;
corpusDir?: string;
+ options?: ModelOptions;
}
```
@@ -1293,28 +1415,36 @@ interface CactusLMImageEmbedResult {
```typescript
interface CactusModel {
- name: string;
- slug: string;
- quantization: number;
- sizeMb: number;
- downloadUrl: string;
- supportsToolCalling: boolean;
- supportsVision: boolean;
- supportsCompletion: boolean;
- createdAt: Date;
- isDownloaded: boolean;
+ completion: boolean;
+ tools: boolean;
+ vision: boolean;
+ embed: boolean;
+ speech: boolean;
+ quantization: {
+ int4: {
+ sizeMb: number;
+ url: string;
+ pro?: {
+ apple: string;
+ };
+ };
+ int8: {
+ sizeMb: number;
+ url: string;
+ pro?: {
+ apple: string;
+ };
+ };
+ };
}
```
-### CactusSTTModel
+### ModelOptions
```typescript
-interface CactusSTTModel {
- slug: string;
- sizeMb: number;
- downloadUrl: string;
- createdAt: Date;
- isDownloaded: boolean;
+interface ModelOptions {
+ quantization: 'int4' | 'int8';
+ pro: boolean;
}
```
@@ -1324,6 +1454,7 @@ interface CactusSTTModel {
interface CactusSTTParams {
model?: string;
contextSize?: number;
+ options?: ModelOptions;
}
```
@@ -1391,6 +1522,49 @@ interface CactusSTTAudioEmbedResult {
}
```
+### CactusSTTStreamTranscribeInsertParams
+
+```typescript
+interface CactusSTTStreamTranscribeInsertParams {
+ audio: number[];
+}
+```
+
+### StreamTranscribeProcessOptions
+
+```typescript
+interface StreamTranscribeProcessOptions {
+ confirmationThreshold?: number;
+}
+```
+
+### CactusSTTStreamTranscribeProcessParams
+
+```typescript
+interface CactusSTTStreamTranscribeProcessParams {
+ options?: StreamTranscribeProcessOptions;
+}
+```
+
+### CactusSTTStreamTranscribeProcessResult
+
+```typescript
+interface CactusSTTStreamTranscribeProcessResult {
+ success: boolean;
+ confirmed: string;
+ pending: string;
+}
+```
+
+### CactusSTTStreamTranscribeFinalizeResult
+
+```typescript
+interface CactusSTTStreamTranscribeFinalizeResult {
+ success: boolean;
+ confirmed: string;
+}
+```
+
### CactusIndexParams
```typescript
@@ -1491,6 +1665,17 @@ import { CactusConfig } from 'cactus-react-native';
CactusConfig.cactusToken = 'your-cactus-token-here';
```
+### Cactus Pro
+
+Enable NPU-accelerated models for enhanced performance.
+
+```typescript
+import { CactusConfig } from 'cactus-react-native';
+
+// Set your Cactus Pro key
+CactusConfig.cactusProKey = 'your-cactus-pro-key-here';
+```
+
## Performance Tips
- **Model Selection** - Choose smaller models for faster inference on mobile devices.
diff --git a/android/src/main/jniLibs/arm64-v8a/libcactus.a b/android/src/main/jniLibs/arm64-v8a/libcactus.a
index 0a72e9d..91ce826 100644
Binary files a/android/src/main/jniLibs/arm64-v8a/libcactus.a and b/android/src/main/jniLibs/arm64-v8a/libcactus.a differ
diff --git a/cpp/HybridCactus.cpp b/cpp/HybridCactus.cpp
index 7cce17a..a6282d3 100644
--- a/cpp/HybridCactus.cpp
+++ b/cpp/HybridCactus.cpp
@@ -325,9 +325,128 @@ std::shared_ptr> HybridCactus::destroy() {
throw std::runtime_error("Cactus model is not initialized");
}
+ if (this->_streamTranscribe) {
+ cactus_stream_transcribe_destroy(this->_streamTranscribe);
+ this->_streamTranscribe = nullptr;
+ }
+
cactus_destroy(this->_model);
this->_model = nullptr;
});
}
+std::shared_ptr<Promise<void>> HybridCactus::streamTranscribeInit() {
+ return Promise::async([this]() -> void {
+ std::lock_guard<std::mutex> lock(this->_modelMutex);
+
+ if (!this->_model) {
+ throw std::runtime_error("Cactus model is not initialized");
+ }
+
+ if (this->_streamTranscribe) {
+ throw std::runtime_error(
+ "Cactus stream transcribe is already initialized");
+ }
+
+ this->_streamTranscribe = cactus_stream_transcribe_init(this->_model);
+ if (!this->_streamTranscribe) {
+ throw std::runtime_error("Cactus stream transcribe init failed: " +
+ std::string(cactus_get_last_error()));
+ }
+ });
+}
+
+std::shared_ptr<Promise<void>>
+HybridCactus::streamTranscribeInsert(const std::vector<double> &audio) {
+ return Promise::async([this, audio]() -> void {
+ std::lock_guard<std::mutex> lock(this->_modelMutex);
+
+ if (!this->_streamTranscribe) {
+ throw std::runtime_error("Cactus stream transcribe is not initialized");
+ }
+
+ std::vector<uint8_t> audioBytes;
+ audioBytes.reserve(audio.size());
+ for (double d : audio) {
+ d = std::clamp(d, 0.0, 255.0);
+ audioBytes.emplace_back(static_cast<uint8_t>(d));
+ }
+
+ int result = cactus_stream_transcribe_insert(
+ this->_streamTranscribe, audioBytes.data(), audioBytes.size());
+
+ if (result < 0) {
+ throw std::runtime_error("Cactus stream transcribe insert failed: " +
+ std::string(cactus_get_last_error()));
+ }
+ });
+}
+
+std::shared_ptr<Promise<std::string>> HybridCactus::streamTranscribeProcess(
+ const std::optional<std::string> &optionsJson) {
+ return Promise::async([this, optionsJson]() -> std::string {
+ std::lock_guard<std::mutex> lock(this->_modelMutex);
+
+ if (!this->_streamTranscribe) {
+ throw std::runtime_error("Cactus stream transcribe is not initialized");
+ }
+
+ std::string responseBuffer;
+ responseBuffer.resize(32768);
+
+ int result = cactus_stream_transcribe_process(
+ this->_streamTranscribe, responseBuffer.data(), responseBuffer.size(),
+ optionsJson ? optionsJson->c_str() : nullptr);
+
+ if (result < 0) {
+ throw std::runtime_error("Cactus stream transcribe process failed: " +
+ std::string(cactus_get_last_error()));
+ }
+
+ // Remove null terminator
+ responseBuffer.resize(strlen(responseBuffer.c_str()));
+
+ return responseBuffer;
+ });
+}
+
+std::shared_ptr<Promise<std::string>> HybridCactus::streamTranscribeFinalize() {
+ return Promise::async([this]() -> std::string {
+ std::lock_guard<std::mutex> lock(this->_modelMutex);
+
+ if (!this->_streamTranscribe) {
+ throw std::runtime_error("Cactus stream transcribe is not initialized");
+ }
+
+ std::string responseBuffer;
+ responseBuffer.resize(32768);
+
+ int result = cactus_stream_transcribe_finalize(
+ this->_streamTranscribe, responseBuffer.data(), responseBuffer.size());
+
+ if (result < 0) {
+ throw std::runtime_error("Cactus stream transcribe finalize failed: " +
+ std::string(cactus_get_last_error()));
+ }
+
+ // Remove null terminator
+ responseBuffer.resize(strlen(responseBuffer.c_str()));
+
+ return responseBuffer;
+ });
+}
+
+std::shared_ptr<Promise<void>> HybridCactus::streamTranscribeDestroy() {
+ return Promise::async([this]() -> void {
+ std::lock_guard<std::mutex> lock(this->_modelMutex);
+
+ if (!this->_streamTranscribe) {
+ throw std::runtime_error("Cactus stream transcribe is not initialized");
+ }
+
+ cactus_stream_transcribe_destroy(this->_streamTranscribe);
+ this->_streamTranscribe = nullptr;
+ });
+}
+
} // namespace margelo::nitro::cactus
diff --git a/cpp/HybridCactus.hpp b/cpp/HybridCactus.hpp
index fd49a1e..27e84e3 100644
--- a/cpp/HybridCactus.hpp
+++ b/cpp/HybridCactus.hpp
@@ -38,6 +38,18 @@ class HybridCactus : public HybridCactusSpec {
double /* tokenId */)>> &callback)
override;
+ std::shared_ptr<Promise<void>> streamTranscribeInit() override;
+
+ std::shared_ptr<Promise<void>>
+ streamTranscribeInsert(const std::vector<double> &audio) override;
+
+ std::shared_ptr<Promise<std::string>> streamTranscribeProcess(
+ const std::optional<std::string> &optionsJson) override;
+
+ std::shared_ptr<Promise<std::string>> streamTranscribeFinalize() override;
+
+ std::shared_ptr<Promise<void>> streamTranscribeDestroy() override;
+
std::shared_ptr>>
embed(const std::string &text, double embeddingBufferSize,
bool normalize) override;
@@ -56,6 +68,7 @@ class HybridCactus : public HybridCactusSpec {
private:
cactus_model_t _model = nullptr;
+ cactus_stream_transcribe_t _streamTranscribe = nullptr;
size_t _contextSize;
std::mutex _modelMutex;
diff --git a/cpp/cactus_ffi.h b/cpp/cactus_ffi.h
index e00b391..bb57657 100644
--- a/cpp/cactus_ffi.h
+++ b/cpp/cactus_ffi.h
@@ -67,6 +67,30 @@ CACTUS_FFI_EXPORT int cactus_transcribe(
size_t pcm_buffer_size
);
+typedef void* cactus_stream_transcribe_t;
+
+CACTUS_FFI_EXPORT cactus_stream_transcribe_t cactus_stream_transcribe_init(cactus_model_t model);
+
+CACTUS_FFI_EXPORT int cactus_stream_transcribe_insert(
+ cactus_stream_transcribe_t stream,
+ const uint8_t* pcm_buffer,
+ size_t pcm_buffer_size
+);
+
+CACTUS_FFI_EXPORT int cactus_stream_transcribe_process(
+ cactus_stream_transcribe_t stream,
+ char* response_buffer,
+ size_t buffer_size,
+ const char* options_json
+);
+
+CACTUS_FFI_EXPORT int cactus_stream_transcribe_finalize(
+ cactus_stream_transcribe_t stream,
+ char* response_buffer,
+ size_t buffer_size
+);
+
+CACTUS_FFI_EXPORT void cactus_stream_transcribe_destroy(cactus_stream_transcribe_t stream);
CACTUS_FFI_EXPORT int cactus_embed(
cactus_model_t model,
diff --git a/example/ios/Podfile.lock b/example/ios/Podfile.lock
index f1bbe42..993c199 100644
--- a/example/ios/Podfile.lock
+++ b/example/ios/Podfile.lock
@@ -1,6 +1,6 @@
PODS:
- boost (1.84.0)
- - Cactus (1.4.0):
+ - Cactus (1.5.0):
- boost
- DoubleConversion
- fast_float
@@ -2643,7 +2643,7 @@ EXTERNAL SOURCES:
SPEC CHECKSUMS:
boost: 7e761d76ca2ce687f7cc98e698152abd03a18f90
- Cactus: 83c36f3d76eb2102a79020b41201a3aae8b71956
+ Cactus: 336fab89b318d196bcc0f87cf4634acd57c83bad
DoubleConversion: cb417026b2400c8f53ae97020b2be961b59470cb
fast_float: b32c788ed9c6a8c584d114d0047beda9664e7cc6
FBLazyVector: b8f1312d48447cca7b4abc21ed155db14742bd03
diff --git a/example/src/App.tsx b/example/src/App.tsx
index 0033107..33e24e1 100644
--- a/example/src/App.tsx
+++ b/example/src/App.tsx
@@ -12,8 +12,14 @@ import VisionScreen from './VisionScreen';
import ToolCallingScreen from './ToolCallingScreen';
import RAGScreen from './RAGScreen';
import STTScreen from './STTScreen';
+import StreamSTTScreen from './StreamSTTScreen';
import ChatScreen from './ChatScreen';
import IndexScreen from './IndexScreen';
+import { CactusConfig } from 'cactus-react-native';
+
+// Set your Cactus Pro API key here
+// To enable NPU acceleration, also set the pro option in model options where applicable
+CactusConfig.cactusProKey = '';
type Screen =
| 'Home'
@@ -22,6 +28,7 @@ type Screen =
| 'ToolCalling'
| 'RAG'
| 'STT'
+ | 'StreamSTT'
| 'Chat'
| 'Index';
@@ -52,6 +59,10 @@ const App = () => {
setSelectedScreen('STT');
};
+ const handleGoToStreamSTT = () => {
+ setSelectedScreen('StreamSTT');
+ };
+
const handleGoToChat = () => {
setSelectedScreen('Chat');
};
@@ -72,6 +83,8 @@ const App = () => {
return ;
case 'STT':
return ;
+ case 'StreamSTT':
+ return ;
case 'Chat':
return ;
case 'Index':
@@ -142,6 +155,16 @@ const App = () => {
+
+ Stream Transcription
+
+ Real-time streaming audio transcription
+
+
+
Chat
diff --git a/example/src/CompletionScreen.tsx b/example/src/CompletionScreen.tsx
index a3a4656..b66d1ce 100644
--- a/example/src/CompletionScreen.tsx
+++ b/example/src/CompletionScreen.tsx
@@ -16,7 +16,7 @@ import {
} from 'cactus-react-native';
const CompletionScreen = () => {
- const cactusLM = useCactusLM({ model: 'qwen3-0.6' });
+ const cactusLM = useCactusLM({ model: 'qwen3-0.6b' });
const [input, setInput] = useState('What is the capital of France?');
const [result, setResult] = useState(null);
const [embedResult, setEmbedResult] = useState(
diff --git a/example/src/StreamSTTScreen.tsx b/example/src/StreamSTTScreen.tsx
new file mode 100644
index 0000000..47b2f0e
--- /dev/null
+++ b/example/src/StreamSTTScreen.tsx
@@ -0,0 +1,284 @@
+import { useEffect, useState } from 'react';
+import {
+ View,
+ Text,
+ TouchableOpacity,
+ ScrollView,
+ StyleSheet,
+ ActivityIndicator,
+} from 'react-native';
+import { useCactusSTT } from 'cactus-react-native';
+import * as DocumentPicker from '@react-native-documents/picker';
+import * as RNFS from '@dr.pogodin/react-native-fs';
+
+// 3 seconds of 16kHz audio (2 bytes per sample)
+const CHUNK_SIZE = 16000 * 2 * 3;
+
+const StreamSTTScreen = () => {
+ const cactusSTT = useCactusSTT({ model: 'whisper-small' });
+ const [audioFile, setAudioFile] = useState<string | null>(null);
+ const [audioFileName, setAudioFileName] = useState('');
+
+ useEffect(() => {
+ if (!cactusSTT.isDownloaded) {
+ cactusSTT.download();
+ }
+ // eslint-disable-next-line react-hooks/exhaustive-deps
+ }, [cactusSTT.isDownloaded]);
+
+ const handleSelectAudio = async () => {
+ try {
+ const res = await DocumentPicker.pick({
+ type: [DocumentPicker.types.audio],
+ });
+ if (res && res.length > 0) {
+ const fileName = `audio_${Date.now()}.wav`;
+ const destPath = `${RNFS.CachesDirectoryPath}/${fileName}`;
+ await RNFS.copyFile(res[0].uri, destPath);
+ setAudioFile(destPath);
+ setAudioFileName(res[0].name || 'Unknown');
+ }
+ } catch (err) {
+ console.error(err);
+ }
+ };
+
+ const readAudioFile = async (filePath: string): Promise<Uint8Array> => {
+ const base64Audio = await RNFS.readFile(filePath, 'base64');
+ const binaryString = atob(base64Audio);
+ const bytes = new Uint8Array(binaryString.length);
+ for (let i = 0; i < binaryString.length; i++) {
+ bytes[i] = binaryString.charCodeAt(i);
+ }
+ // Skip WAV header (44 bytes)
+ return bytes.slice(44);
+ };
+
+ const handleStreamTranscribe = async () => {
+ if (!audioFile) return;
+ try {
+ // Initialize streaming
+ await cactusSTT.streamTranscribeInit();
+
+ // Read audio file
+ const pcmData = await readAudioFile(audioFile);
+
+ // Stream audio in 3-second chunks
+ for (let i = 0; i < pcmData.length; i += CHUNK_SIZE) {
+ const chunk = pcmData.slice(i, i + CHUNK_SIZE);
+ const pcmSamples = Array.from(chunk);
+
+ // Insert chunk
+ await cactusSTT.streamTranscribeInsert({ audio: pcmSamples });
+
+ // Process and get results
+ await cactusSTT.streamTranscribeProcess({
+ options: { confirmationThreshold: 0.95 },
+ });
+ }
+
+ // Finalize to get remaining text
+ await cactusSTT.streamTranscribeFinalize();
+ } catch (err) {
+ console.error('Stream error:', err);
+ }
+ };
+
+ const handleStop = async () => {
+ try {
+ await cactusSTT.streamTranscribeDestroy();
+ } catch (err) {
+ console.error('Stop error:', err);
+ }
+ };
+
+ if (cactusSTT.isDownloading) {
+ return (
+
+
+
+ Downloading: {Math.round(cactusSTT.downloadProgress * 100)}%
+
+
+ );
+ }
+
+ return (
+
+
+
+ {audioFile ? `Selected: ${audioFileName}` : 'Select Audio File'}
+
+
+
+
+
+
+ {cactusSTT.isStreamTranscribing
+ ? 'Streaming...'
+ : 'Stream Transcribe'}
+
+
+
+
+ Stop
+
+
+
+ {cactusSTT.isStreamTranscribing && (
+
+ ● Streaming...
+
+ )}
+
+ {cactusSTT.streamTranscribeConfirmed && (
+
+ Confirmed Text:
+
+
+ {cactusSTT.streamTranscribeConfirmed}
+
+
+
+ )}
+
+ {cactusSTT.streamTranscribePending && (
+
+ Pending Text:
+
+
+ {cactusSTT.streamTranscribePending}
+
+
+
+ )}
+
+ {cactusSTT.error && (
+
+ {cactusSTT.error}
+
+ )}
+
+ );
+};
+
+export default StreamSTTScreen;
+
+const styles = StyleSheet.create({
+ container: {
+ flex: 1,
+ backgroundColor: '#fff',
+ },
+ content: {
+ padding: 20,
+ },
+ centerContainer: {
+ flex: 1,
+ justifyContent: 'center',
+ alignItems: 'center',
+ padding: 20,
+ },
+ progressText: {
+ marginTop: 16,
+ fontSize: 16,
+ color: '#000',
+ },
+ selectButton: {
+ padding: 16,
+ backgroundColor: '#f3f3f3',
+ borderRadius: 8,
+ marginBottom: 16,
+ alignItems: 'center',
+ },
+ selectButtonText: {
+ fontSize: 16,
+ color: '#000',
+ },
+ buttonContainer: {
+ flexDirection: 'row',
+ flexWrap: 'wrap',
+ gap: 8,
+ marginBottom: 16,
+ },
+ button: {
+ backgroundColor: '#000',
+ paddingVertical: 12,
+ paddingHorizontal: 16,
+ borderRadius: 8,
+ alignItems: 'center',
+ },
+ buttonDisabled: {
+ backgroundColor: '#ccc',
+ },
+ buttonText: {
+ color: '#fff',
+ fontSize: 16,
+ fontWeight: '600',
+ },
+ statusContainer: {
+ backgroundColor: '#f3f3f3',
+ padding: 12,
+ borderRadius: 8,
+ marginBottom: 16,
+ },
+ statusText: {
+ fontSize: 14,
+ color: '#2e7d32',
+ fontWeight: '600',
+ },
+ resultContainer: {
+ marginTop: 16,
+ },
+ resultLabel: {
+ fontSize: 16,
+ fontWeight: '600',
+ marginBottom: 8,
+ color: '#000',
+ },
+ resultBox: {
+ backgroundColor: '#f3f3f3',
+ padding: 12,
+ borderRadius: 8,
+ minHeight: 60,
+ },
+ resultText: {
+ fontSize: 14,
+ color: '#000',
+ lineHeight: 20,
+ },
+ pendingBox: {
+ backgroundColor: '#f3f3f3',
+ padding: 12,
+ borderRadius: 8,
+ minHeight: 60,
+ opacity: 0.7,
+ },
+ pendingText: {
+ fontStyle: 'italic',
+ },
+ errorContainer: {
+ backgroundColor: '#000',
+ padding: 12,
+ borderRadius: 8,
+ marginTop: 16,
+ },
+ errorText: {
+ color: '#fff',
+ fontSize: 14,
+ },
+});
diff --git a/example/src/ToolCallingScreen.tsx b/example/src/ToolCallingScreen.tsx
index 60d5282..e66bbd4 100644
--- a/example/src/ToolCallingScreen.tsx
+++ b/example/src/ToolCallingScreen.tsx
@@ -33,7 +33,7 @@ const tools: Tool[] = [
];
const ToolCallingScreen = () => {
- const cactusLM = useCactusLM({ model: 'qwen3-0.6' });
+ const cactusLM = useCactusLM({ model: 'qwen3-0.6b' });
const [input, setInput] = useState("What's the weather in San Francisco?");
const [result, setResult] = useState(null);
diff --git a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_ffi.h b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_ffi.h
index e00b391..bb57657 100644
--- a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_ffi.h
+++ b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_ffi.h
@@ -67,6 +67,30 @@ CACTUS_FFI_EXPORT int cactus_transcribe(
size_t pcm_buffer_size
);
+typedef void* cactus_stream_transcribe_t;
+
+CACTUS_FFI_EXPORT cactus_stream_transcribe_t cactus_stream_transcribe_init(cactus_model_t model);
+
+CACTUS_FFI_EXPORT int cactus_stream_transcribe_insert(
+ cactus_stream_transcribe_t stream,
+ const uint8_t* pcm_buffer,
+ size_t pcm_buffer_size
+);
+
+CACTUS_FFI_EXPORT int cactus_stream_transcribe_process(
+ cactus_stream_transcribe_t stream,
+ char* response_buffer,
+ size_t buffer_size,
+ const char* options_json
+);
+
+CACTUS_FFI_EXPORT int cactus_stream_transcribe_finalize(
+ cactus_stream_transcribe_t stream,
+ char* response_buffer,
+ size_t buffer_size
+);
+
+CACTUS_FFI_EXPORT void cactus_stream_transcribe_destroy(cactus_stream_transcribe_t stream);
CACTUS_FFI_EXPORT int cactus_embed(
cactus_model_t model,
diff --git a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_utils.h b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_utils.h
index 06dfebe..bd03313 100644
--- a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_utils.h
+++ b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_utils.h
@@ -63,6 +63,14 @@ struct ToolFunction {
std::unordered_map parameters;
};
+} // namespace ffi
+} // namespace cactus
+
+#include "gemma_tools.h"
+
+namespace cactus {
+namespace ffi {
+
inline void handle_error_response(const std::string& error_message, char* response_buffer, size_t buffer_size) {
std::string sanitized_msg = error_message;
for (auto& c : sanitized_msg) {
@@ -303,11 +311,43 @@ inline void parse_function_calls_from_response(const std::string& response_text,
regular_response = response_text;
function_calls.clear();
+ gemma::parse_function_calls(regular_response, function_calls);
+
+ // Parse Qwen-style function calls: {"name": "...", "arguments": {...}}
+ const std::string QWEN_TOOL_START = "<tool_call>";
+ const std::string QWEN_TOOL_END = "</tool_call>";
+ size_t qwen_start_pos = 0;
+
+ while ((qwen_start_pos = regular_response.find(QWEN_TOOL_START, qwen_start_pos)) != std::string::npos) {
+ size_t content_start = qwen_start_pos + QWEN_TOOL_START.length();
+ size_t qwen_end_pos = regular_response.find(QWEN_TOOL_END, content_start);
+
+ if (qwen_end_pos != std::string::npos) {
+ std::string json_content = regular_response.substr(content_start, qwen_end_pos - content_start);
+
+ size_t first = json_content.find_first_not_of(" \t\n\r");
+ size_t last = json_content.find_last_not_of(" \t\n\r");
+ if (first != std::string::npos && last != std::string::npos) {
+ json_content = json_content.substr(first, last - first + 1);
+ }
+
+ if (json_content.size() > 2 && json_content[0] == '{' &&
+ json_content.find("\"name\"") != std::string::npos) {
+ function_calls.push_back(json_content);
+ }
+
+ regular_response.erase(qwen_start_pos, qwen_end_pos + QWEN_TOOL_END.length() - qwen_start_pos);
+ } else {
+ break;
+ }
+ }
+
+ // Parse LFM2-style function calls: <|tool_call_start|>[name(args)]<|tool_call_end|>
const std::string TOOL_CALL_START = "<|tool_call_start|>";
const std::string TOOL_CALL_END = "<|tool_call_end|>";
size_t tool_start_pos = 0;
- while ((tool_start_pos = response_text.find(TOOL_CALL_START, tool_start_pos)) != std::string::npos) {
+ while ((tool_start_pos = regular_response.find(TOOL_CALL_START, tool_start_pos)) != std::string::npos) {
size_t content_start = tool_start_pos + TOOL_CALL_START.length();
size_t tool_end_pos = response_text.find(TOOL_CALL_END, content_start);
diff --git a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/engine.h b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/engine.h
index 601e818..a7ef002 100644
--- a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/engine.h
+++ b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/engine.h
@@ -131,9 +131,12 @@ struct MergeRule {
struct ChatMessage {
std::string role;
std::string content;
+ std::string name;
std::vector images;
};
+
+
class Tokenizer {
public:
virtual ~Tokenizer() = default;
@@ -329,6 +332,8 @@ struct KVCache {
struct LayerCache {
std::vector keys;
std::vector values;
+ std::vector key_scales;
+ std::vector value_scales;
};
std::vector layer_caches;
@@ -354,13 +359,11 @@ struct KVCache {
const std::vector& v_nodes, size_t seq_len,
size_t num_layers, size_t kv_heads, size_t head_dim);
- // Update KV cache from NPU prefill outputs
- // NPU outputs are in shape [num_tokens, num_kv_heads, head_dim]
- // This handles transposition to cache format and sliding window
void update_from_npu(size_t layer_idx, const __fp16* k_data, const __fp16* v_data,
size_t num_tokens, size_t kv_heads, size_t head_dim);
bool is_empty() const { return current_seq_len == 0; }
+ bool is_int8() const { return precision == Precision::INT8; }
void* get_key_ptr(size_t layer);
void* get_value_ptr(size_t layer);
@@ -374,33 +377,44 @@ struct KVCache {
CircularView get_key_view(size_t layer);
CircularView get_value_view(size_t layer);
+
+ const int8_t* get_keys_int8(size_t layer) const;
+ const int8_t* get_values_int8(size_t layer) const;
+ const float* get_key_scales(size_t layer) const;
+ const float* get_value_scales(size_t layer) const;
};
class ToolCallConstrainer {
public:
enum class State {
- START, // -> expect {
- EXPECT_FC_KEY, // -> expect "function_call"
- EXPECT_FC_COLON, // -> expect :
- EXPECT_FC_OPEN_BRACE, // -> expect {
- EXPECT_NAME_KEY, // -> expect "name"
- EXPECT_NAME_COLON, // -> expect :
- EXPECT_NAME_VALUE, // -> expect ""
- EXPECT_COMMA, // -> expect ,
- EXPECT_ARGS_KEY, // -> expect "arguments"
- EXPECT_ARGS_COLON, // -> expect :
- IN_ARGUMENTS, // -> free JSON, track brace depth
- EXPECT_INNER_CLOSE, // -> expect } to close inner object
- EXPECT_OUTER_CLOSE, // -> expect } to close outer object
- DONE, // complete
-
- LFM_START, // -> expect <|tool_call_start|>
- LFM_EXPECT_BRACKET, // -> expect [
- LFM_IN_FUNC_NAME, // -> expect function name
- LFM_EXPECT_PAREN, // -> expect (
- LFM_IN_ARGUMENTS, // -> arguments until )
- LFM_EXPECT_BRACKET_CLOSE, // -> expect ]
- LFM_EXPECT_END // -> expect <|tool_call_end|>
+ DONE,
+
+ QWEN_START,
+ QWEN_EXPECT_OPEN_BRACE,
+ QWEN_EXPECT_NAME_KEY,
+ QWEN_EXPECT_NAME_COLON,
+ QWEN_EXPECT_NAME_VALUE,
+ QWEN_EXPECT_COMMA,
+ QWEN_EXPECT_ARGS_KEY,
+ QWEN_EXPECT_ARGS_COLON,
+ QWEN_IN_ARGUMENTS,
+ QWEN_EXPECT_CLOSE_BRACE,
+ QWEN_EXPECT_END,
+
+ LFM_START,
+ LFM_EXPECT_BRACKET,
+ LFM_IN_FUNC_NAME,
+ LFM_EXPECT_PAREN,
+ LFM_IN_ARGUMENTS,
+ LFM_EXPECT_BRACKET_CLOSE,
+ LFM_EXPECT_END,
+
+ GEMMA_START,
+ GEMMA_EXPECT_CALL,
+ GEMMA_IN_FUNC_NAME,
+ GEMMA_EXPECT_BRACE,
+ GEMMA_IN_ARGUMENTS,
+ GEMMA_EXPECT_END
};
void init(Config::ModelType model_type,
@@ -417,36 +431,40 @@ class ToolCallConstrainer {
private:
bool active_ = false;
- State state_ = State::START;
+ State state_ = State::QWEN_START;
Config::ModelType model_type_ = Config::ModelType::QWEN;
Tokenizer* tokenizer_ = nullptr;
    std::vector<std::string> function_names_;
std::string generated_text_;
- int brace_depth_ = 0; // Track nested braces in arguments
-
- // Pre-tokenized token sets for each grammar element
-    std::unordered_set<uint32_t> open_brace_tokens_;       // {
-    std::unordered_set<uint32_t> close_brace_tokens_;      // }
-    std::unordered_set<uint32_t> colon_tokens_;            // :
-    std::unordered_set<uint32_t> comma_tokens_;            // ,
-    std::unordered_set<uint32_t> fc_key_tokens_;           // "function_call"
-    std::unordered_set<uint32_t> name_key_tokens_;         // "name"
-    std::unordered_set<uint32_t> args_key_tokens_;         // "arguments"
-    std::unordered_set<uint32_t> quote_tokens_;            // "
-    std::unordered_set<uint32_t> backtick_tokens_;         // ` (to block markdown code fences)
-    std::unordered_set<uint32_t> response_starter_tokens_; // Common response starters to block (I, I'm, Sorry, etc.)
-    std::unordered_set<uint32_t> all_func_name_tokens_;    // All function name tokens combined
-    std::unordered_map<std::string, std::vector<uint32_t>> func_name_sequences_; // Full token sequence per function
-
- // LFM2-specific tokens
+ int brace_depth_ = 0;
+
+    std::unordered_set<uint32_t> qwen_tool_call_start_tokens_;
+    std::unordered_set<uint32_t> qwen_tool_call_end_tokens_;
+    std::unordered_set<uint32_t> open_brace_tokens_;
+    std::unordered_set<uint32_t> close_brace_tokens_;
+    std::unordered_set<uint32_t> colon_tokens_;
+    std::unordered_set<uint32_t> comma_tokens_;
+    std::unordered_set<uint32_t> name_key_tokens_;
+    std::unordered_set<uint32_t> args_key_tokens_;
+    std::unordered_set<uint32_t> quote_tokens_;
+    std::unordered_set<uint32_t> backtick_tokens_;
+    std::unordered_set<uint32_t> all_func_name_tokens_;
+    std::unordered_map<std::string, std::vector<uint32_t>> func_name_sequences_;
+
    std::unordered_set<uint32_t> tool_start_tokens_;
    std::unordered_set<uint32_t> tool_end_tokens_;
-    std::unordered_set<uint32_t> bracket_open_tokens_;  // [
-    std::unordered_set<uint32_t> bracket_close_tokens_; // ]
-    std::unordered_set<uint32_t> paren_open_tokens_;    // (
-    std::unordered_set<uint32_t> paren_close_tokens_;   // )
-    std::unordered_set<uint32_t> equals_tokens_;        // =
+    std::unordered_set<uint32_t> bracket_open_tokens_;
+    std::unordered_set<uint32_t> bracket_close_tokens_;
+    std::unordered_set<uint32_t> paren_open_tokens_;
+    std::unordered_set<uint32_t> paren_close_tokens_;
+    std::unordered_set<uint32_t> equals_tokens_;
+
+    std::unordered_set<uint32_t> gemma_call_start_tokens_;
+    std::unordered_set<uint32_t> gemma_call_end_tokens_;
+    std::unordered_set<uint32_t> gemma_response_start_tokens_;
+    std::unordered_set<uint32_t> gemma_call_prefix_tokens_;
+    std::unordered_set<uint32_t> escape_tokens_;
    std::unordered_map<uint32_t, float> current_bias_;
diff --git a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/gemma_tools.h b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/gemma_tools.h
new file mode 100644
index 0000000..912de57
--- /dev/null
+++ b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/gemma_tools.h
@@ -0,0 +1,549 @@
+#pragma once
+
+#include <string>
+#include <vector>
+#include <unordered_set>
+#include <unordered_map>
+#include <cstdint>