Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions autoload/ollama.vim
Original file line number Diff line number Diff line change
Expand Up @@ -221,11 +221,24 @@ function! ollama#GetSuggestion(timer)
\ "Connecting to Ollama on " .. g:ollama_host
\ .. " using model " .. g:ollama_model)
call ollama#logger#Debug("model_options=" .. l:model_options)

" Decide whether sampling options may be sent for the completion model:
" models listed in g:ollama_model_sampling_denylist get sampling disabled
" (the Python side then omits temperature/top_p for them).
let l:denylist = get(g:, 'ollama_model_sampling_denylist', [])
let l:sampling_enabled = index(l:denylist, g:ollama_model) >= 0 ? 0 : 1
call ollama#logger#Debug("sampling_enabled=" .. l:sampling_enabled)

" Convert plugin debug level to python logger levels
let l:log_level = ollama#logger#PythonLogLevel(g:ollama_debug)
let l:base_url = g:ollama_host
if g:ollama_model_provider =~ '^openai'
let l:base_url = g:ollama_openai_baseurl
elseif g:ollama_model_provider == 'claude'
" Claude uses default Anthropic API, don't set base_url
let l:base_url = ''
endif
" Adjust the command to use the prompt as stdin input
let l:command = [ g:ollama_python_interpreter,
Expand All @@ -234,6 +247,7 @@ function! ollama#GetSuggestion(timer)
\ "-m", g:ollama_model,
\ "-u", l:base_url,
\ "-o", l:model_options,
\ "-se", l:sampling_enabled,
\ "-l", l:log_level
\ ]
" Add optional credentialname for looking up the API key
Expand All @@ -247,6 +261,11 @@ function! ollama#GetSuggestion(timer)
" add credentialname option for Mistral
let l:command += [ '-k', g:ollama_mistral_credentialname ]
endif
elseif g:ollama_model_provider == 'claude'
if exists('g:ollama_claude_credentialname') && g:ollama_claude_credentialname != ''
" add credentialname option for Claude
let l:command += [ '-k', g:ollama_claude_credentialname ]
endif
endif
call ollama#logger#Debug("command=" .. join(l:command, " "))
let l:job_options = {
Expand Down
109 changes: 104 additions & 5 deletions autoload/ollama/review.vim
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ function! s:FindBufferWindow(bufnr)
endfunction

function! s:StartChat(lines) abort
" Counter for reducing redraw frequency
let s:token_count = 0

" Function handling a line of text that has been typed.
func! TextEntered(text)
call ollama#logger#Debug("TextEntered: " .. a:text)
Expand All @@ -65,12 +68,13 @@ function! s:StartChat(lines) abort
endif
" Send the text to a shell with Enter appended.
call ch_sendraw(s:job, a:text .. "\n")
" Reset token count for new request
let s:token_count = 0
endfunc

" Function handling output from the shell: Add it above the prompt.
func! GotOutput(channel, msg)
" OLD VERSION: Append each token as a new line (non-streaming)
func! GotOutputOld(channel, msg)
call ollama#logger#Debug("GotOutput: " .. a:msg)

" append lines
let l:lines = split(a:msg, "\n")
for l:line in l:lines
Expand All @@ -96,6 +100,91 @@ function! s:StartChat(lines) abort
endfor
endfunc

" NEW VERSION: Stream tokens on the same line with real-time cursor tracking
func! GotOutputNew(channel, msg)
" Channel out_cb for the chat job: streams a:msg into the chat buffer s:buf
" by appending it to the buffer's LAST line (inline streaming effect),
" keeps the cursor at the end of the streamed text when the chat window is
" active, and finalizes the turn when the "<EOT>" marker is seen.
" NOTE(review): assumes each a:msg is a single line (the Python side prints
" one token per line and the channel delivers per-line) — TODO confirm;
" embedded newlines in a:msg would be merged into one buffer line.
" call ollama#logger#Debug("GotOutput: [" .. a:msg .. "]")

" Check for <EOT> marker; any text before the marker is still real content
let l:idx = stridx(a:msg, "<EOT>")
let l:is_eot = l:idx != -1
let l:content = l:is_eot ? strpart(a:msg, 0, l:idx) : a:msg

" Append content to the last line for streaming effect.
" Track which line was touched and its new text so the cursor can be
" positioned after the last character below.
let l:updated_line_num = 0
let l:updated_line_content = ""
let l:line_count = 0

if !empty(l:content)
" Get buffer line count efficiently (single getbufinfo call, no line fetch)
let l:buf_info = getbufinfo(s:buf)[0]
let l:line_count = l:buf_info.linecount
" call ollama#logger#Debug("line_count=" .. l:line_count)

if l:line_count == 0
" Buffer is empty, append as new line
" call ollama#logger#Debug("Buffer empty, appending first line")
call appendbufline(s:buf, 0, l:content)
let l:updated_line_num = 1
let l:updated_line_content = l:content
else
" Get only the last line (much faster than getting all lines)
let l:last_line = getbufline(s:buf, l:line_count, l:line_count)[0]
let l:updated_line_content = l:last_line .. l:content
" call ollama#logger#Debug("Appending to line " .. l:line_count .. ": '" .. l:last_line .. "' + '" .. l:content .. "'")
call setbufline(s:buf, l:line_count, l:updated_line_content)
let l:updated_line_num = l:line_count
endif
endif

" When streaming is done, add a new line for the next input
if l:is_eot
" call ollama#logger#Debug("EOT received, adding newline")
call appendbufline(s:buf, "$", "")
" Reuse line_count if we already got it, otherwise fetch the fresh count
" (which already includes the empty line just appended)
if l:line_count > 0
let l:updated_line_num = l:line_count + 1
else
let l:buf_info = getbufinfo(s:buf)[0]
let l:updated_line_num = l:buf_info.linecount
endif
let l:updated_line_content = ""
endif

" Update cursor position if this is the active chat window
if bufname() == s:ollama_bufname " Check if current active window is Ollama Chat
let l:winid = bufwinid(s:buf)
if l:winid != -1 && l:updated_line_num > 0
" Set cursor position directly (much faster than feedkeys)
" Column is one past the last byte of the updated line
let l:col = len(l:updated_line_content) + 1
call win_execute(l:winid, 'call cursor(' . l:updated_line_num . ', ' . l:col . ')')

" Increment token counter and only redraw every N tokens (or always for EOT)
" s:token_count is reset per request in TextEntered/StartChat
let s:token_count += 1
if l:is_eot || s:token_count % 5 == 0
redraw
endif

if l:is_eot
" Streaming done, enter insert mode so the user can type the next prompt
" (Esc first when already in insert mode, then re-enter with 'a')
if mode() == 'i'
call feedkeys("\<Esc>")
endif
call feedkeys("a")
endif
endif
endif
endfunc

" Output dispatcher: routes channel output to the legacy line-based handler
" when g:ollama_use_old_output is set to a truthy value, otherwise to the
" streaming handler (the default).
func! GotOutput(channel, msg)
let l:Handler = get(g:, 'ollama_use_old_output', 0)
\ ? function('GotOutputOld')
\ : function('GotOutputNew')
call l:Handler(a:channel, a:msg)
endfunc

" Function handling output from the shell: Add it above the prompt.
func! GotErrors(channel, msg)
call ollama#logger#Debug("GotErrors: " .. a:msg)
Expand Down Expand Up @@ -136,9 +225,18 @@ function! s:StartChat(lines) abort
endfunc

let l:model_options = json_encode(g:ollama_chat_options)
call ollama#logger#Debug("Connecting to Ollama on " .. g:ollama_host .. " using model " .. g:ollama_model)
call ollama#logger#Debug("Chat Connecting to Ollama on " .. g:ollama_host .. " using model " .. g:ollama_model)
call ollama#logger#Debug("model_options=" .. l:model_options)

" Decide whether sampling options may be sent for the chat model:
" models listed in g:ollama_model_sampling_denylist get sampling disabled
" (the Python side then omits temperature/top_p for them).
let l:denylist = get(g:, 'ollama_model_sampling_denylist', [])
let l:sampling_enabled = index(l:denylist, g:ollama_chat_model) >= 0 ? 0 : 1
call ollama#logger#Debug("sampling_enabled=" .. l:sampling_enabled)

" Convert plugin debug level to python logger levels
let l:log_level = ollama#logger#PythonLogLevel(g:ollama_debug)
let l:base_url = g:ollama_host
Expand All @@ -154,6 +252,7 @@ function! s:StartChat(lines) abort
\ '-m', g:ollama_chat_model,
\ '-u', l:base_url,
\ '-o', l:model_options,
\ "-se", l:sampling_enabled,
\ '-t', g:ollama_chat_timeout,
\ '-l', l:log_level ]
" Check if a system prompt was configured
Expand Down Expand Up @@ -205,7 +304,7 @@ function! s:StartChat(lines) abort
silent execute 'new' l:bufname
endif
" Set the filetype to ollama-chat
" setlocal filetype=ollama-chat
" setlocal filetype=ollama-chat
setlocal filetype=markdown
setlocal buftype=prompt
" enable BufDelete event when closing buffer using :q!
Expand Down
8 changes: 8 additions & 0 deletions plugin/ollama.vim
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ if !exists('g:ollama_openai_credentialname')
" UNIX Pass credential name to lookup API key for OpenAI service
let g:ollama_openai_credentialname = ''
endif
" UNIX pass credential name used to look up the API key for the Anthropic
" Claude service (empty string = no credential lookup)
let g:ollama_claude_credentialname = get(g:, 'ollama_claude_credentialname', '')
" Tab completion specific settings
if !exists('g:ollama_debounce_time')
let g:ollama_debounce_time = 500
Expand Down Expand Up @@ -98,6 +102,10 @@ if !exists('g:ollama_model_options')
\ 'max_tokens': 500
\ }
endif
" Models for which sampling options must not be sent (empty by default)
let g:ollama_model_sampling_denylist = get(g:, 'ollama_model_sampling_denylist', [])
" Chat specific settings
if !exists('g:ollama_chat_provider')
" Provider for chat models: 'ollama' or 'openai'
Expand Down
3 changes: 3 additions & 0 deletions python/OllamaCredentials.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def GetApiKey(self, provider: str, credentialname: str | None) -> str:
- 'openai' → use OPENAI_API_KEY env var or pass entry
- 'openai_legacy' → same as 'openai', kept for compatibility
- 'mistral' → use MISTRAL_API_KEY env var or pass entry
- 'anthropic' → use ANTHROPIC_API_KEY env var or pass entry

Priority:
1. Environment variable override
Expand All @@ -36,6 +37,8 @@ def GetApiKey(self, provider: str, credentialname: str | None) -> str:
env_var = "OPENAI_API_KEY"
elif provider == "mistral":
env_var = "MISTRAL_API_KEY"
elif provider == "anthropic":
env_var = "ANTHROPIC_API_KEY"
else:
raise ValueError(f"Unknown provider: {provider}")

Expand Down
43 changes: 28 additions & 15 deletions python/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,9 @@ async def stream_chat_message_ollama(messages, endpoint, model, options, timeout
if "message" in message and "content" in message["message"]:
content = message["message"]["content"]
assistant_message += content
print(content, end="", flush=True)
# Print each token followed by newline so Vim's out_cb receives it immediately
# VimScript will need to handle concatenating tokens on the same line
print(content, flush=True)

# If <EOT> is detected, stop processing
if "<EOT>" in content:
Expand All @@ -90,7 +92,7 @@ async def stream_chat_message_ollama(messages, endpoint, model, options, timeout
messages.append({"role": "assistant", "content": assistant_message.strip()})


async def stream_chat_message_openai(messages, endpoint, model, options, credentialname):
async def stream_chat_message_openai(messages, endpoint, model, options, sampling_enabled, credentialname):
"""Stream chat responses from OpenAI API."""
if AsyncOpenAI is None:
raise ImportError("OpenAI package not found. Please install via 'pip install openai'.")
Expand All @@ -114,20 +116,30 @@ async def stream_chat_message_openai(messages, endpoint, model, options, credent
top_p = options.get('top_p', 1.0)

try:
stream = await client.chat.completions.create(
model=model,
messages=messages,
temperature=temperature,
max_tokens=max_tokens,
top_p=top_p,
stream=True,
)
# Build request parameters
request_params = {
'model': model,
'messages': messages,
'stream': True,
}

# Check if model supports sampling parameters
if sampling_enabled:
request_params['temperature'] = temperature
request_params['top_p'] = top_p
request_params['max_tokens'] = max_tokens
else:
request_params['max_completion_tokens'] = max_tokens

stream = await client.chat.completions.create(**request_params)

async for chunk in stream:
if chunk.choices and chunk.choices[0].delta and chunk.choices[0].delta.content:
token = chunk.choices[0].delta.content
assistant_message += token
print(token, end="", flush=True)
# Print each token followed by newline so Vim's out_cb receives it immediately
# VimScript will need to handle concatenating tokens on the same line
print(token, flush=True)

print("<EOT>", flush=True)

Expand All @@ -139,7 +151,7 @@ async def stream_chat_message_openai(messages, endpoint, model, options, credent
messages.append({"role": "assistant", "content": assistant_message.strip()})


async def main(provider, endpoint, model, options, systemprompt, timeout, credentialname):
async def main(provider, endpoint, model, options, sampling_enabled, systemprompt, timeout, credentialname):
conversation_history = []
log.debug("endpoint: " + str(endpoint))

Expand Down Expand Up @@ -169,7 +181,7 @@ async def main(provider, endpoint, model, options, systemprompt, timeout, creden
)
else:
task = asyncio.create_task(
stream_chat_message_openai(conversation_history, endpoint, model, options, credentialname)
stream_chat_message_openai(conversation_history, endpoint, model, options, sampling_enabled, credentialname)
)
await task
else:
Expand All @@ -189,7 +201,7 @@ async def main(provider, endpoint, model, options, systemprompt, timeout, creden
)
else:
task = asyncio.create_task(
stream_chat_message_openai(conversation_history, endpoint, model, options, credentialname)
stream_chat_message_openai(conversation_history, endpoint, model, options, sampling_enabled, credentialname)
)
await task

Expand All @@ -213,6 +225,7 @@ async def main(provider, endpoint, model, options, systemprompt, timeout, creden
help="Base endpoint URL.")
parser.add_argument("-o", "--options", type=str, default=DEFAULT_OPTIONS,
help="Ollama REST API options.")
parser.add_argument("-se", "--sampling-enabled", type=int, default=1, help="Enable or disable sampling.")
parser.add_argument("-s", "--system-prompt", type=str, default="", help="Specify system prompt.")
parser.add_argument("-t", "--timeout", type=int, default=DEFAULT_TIMEOUT, help="Timeout in seconds.")
parser.add_argument("-l", "--log-level", type=int, default=OllamaLogger.ERROR, help="Log level.")
Expand Down Expand Up @@ -243,7 +256,7 @@ async def main(provider, endpoint, model, options, systemprompt, timeout, creden
try:
while True:
try:
asyncio.run(main(args.provider, endpoint, model, options, args.system_prompt, args.timeout, args.keyname))
asyncio.run(main(args.provider, endpoint, model, options, args.sampling_enabled, args.system_prompt, args.timeout, args.keyname))
except KeyboardInterrupt:
print("Canceled.")
break
Expand Down
Loading