
Commit f83bbe6

Authored by mishig25, Vaibhavs10, and gary149
[Local App Snippet] support non conversational LLMs (#954)
## Description

Most GGUF files on the Hub are instruct/conversational, but not all of them. Previously, the local app snippets assumed that all GGUFs are instruct/conversational. This change makes the snippets check the model's `conversational` tag and fall back to plain text-completion examples when it is absent.

### vLLM

https://huggingface.co/meta-llama/Llama-3.2-3B?local-app=vllm

```sh
mishig@machine:~$ curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "meta-llama/Llama-3.2-3B",
    "prompt": "Once upon a time",
    "max_tokens": 150,
    "temperature": 0.5
  }'
{"id":"cmpl-157aad50ba6d45a5a7e2641a3c8157dd","object":"text_completion","created":1728293162,"model":"meta-llama/Llama-3.2-3B","choices":[{"index":0,"text":" there was a man who was very generous and kind to everyone. He was a good man and a good person. One day he was walking down the street and he saw a man who was very poor and starving. The man was so hungry that he was crying and shaking. The man was so hungry that he was crying and shaking. The man was so hungry that he was crying and shaking. The man was so hungry that he was crying and shaking. The man was so hungry that he was crying and shaking. The man was so hungry that he was crying and shaking. The man was so hungry that he was crying and shaking. The man was so hungry that he was crying and shaking. The man was so hungry that he was crying and shaking","logprobs":null,"finish_reason":"length","stop_reason":null,"prompt_logprobs":null}],"usage":{"prompt_tokens":5,"total_tokens":155,"completion_tokens":150}}
```

### llama.cpp

https://huggingface.co/mlabonne/gemma-2b-GGUF?local-app=llama.cpp

```sh
llama-cli \
  --hf-repo "mlabonne/gemma-2b-GGUF" \
  --hf-file gemma-2b.Q2_K.gguf \
  -p "Once upon a time "
```

### llama-cpp-python

```python
from llama_cpp import Llama

llm = Llama.from_pretrained(
    repo_id="mlabonne/gemma-2b-GGUF",
    filename="gemma-2b.Q2_K.gguf",
)

output = llm(
    "Once upon a time ",
    max_tokens=512,
    echo=True
)
print(output)
```

---------

Co-authored-by: vb <[email protected]>
Co-authored-by: Victor Muštar <[email protected]>
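At a high level, the snippet generators now branch on the model's `conversational` tag. A minimal TypeScript sketch of the idea (names like `MinimalModelData` and `vllmCurlEndpoint` are illustrative helpers for this note, not actual exports of the package):

```ts
// Sketch only: simplified stand-in for the real ModelData type.
type MinimalModelData = { id: string; tags: string[] };

// Conversational (instruct) models are detected via the "conversational" tag.
function isConversational(model: MinimalModelData): boolean {
	return model.tags.includes("conversational");
}

// Hypothetical helper: chat endpoint for conversational models,
// plain text-completions endpoint for base models.
function vllmCurlEndpoint(model: MinimalModelData): string {
	return isConversational(model)
		? "http://localhost:8000/v1/chat/completions"
		: "http://localhost:8000/v1/completions";
}

console.log(vllmCurlEndpoint({ id: "meta-llama/Llama-3.2-3B", tags: [] }));
// -> http://localhost:8000/v1/completions
```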
1 parent 04fa3bb commit f83bbe6

File tree

4 files changed: +238, -29 lines

Lines changed: 123 additions & 0 deletions (new test file)

```ts
import { describe, expect, it } from "vitest";
import { LOCAL_APPS } from "./local-apps.js";
import type { ModelData } from "./model-data.js";

describe("local-apps", () => {
	it("llama.cpp conversational", async () => {
		const { snippet: snippetFunc } = LOCAL_APPS["llama.cpp"];
		const model: ModelData = {
			id: "bartowski/Llama-3.2-3B-Instruct-GGUF",
			tags: ["conversational"],
			inference: "",
		};
		const snippet = snippetFunc(model);

		expect(snippet[0].content).toEqual(`# Load and run the model:
llama-cli \\
  --hf-repo "bartowski/Llama-3.2-3B-Instruct-GGUF" \\
  --hf-file {{GGUF_FILE}} \\
  -p "You are a helpful assistant" \\
  --conversation`);
	});

	it("llama.cpp non-conversational", async () => {
		const { snippet: snippetFunc } = LOCAL_APPS["llama.cpp"];
		const model: ModelData = {
			id: "mlabonne/gemma-2b-GGUF",
			tags: [],
			inference: "",
		};
		const snippet = snippetFunc(model);

		expect(snippet[0].content).toEqual(`# Load and run the model:
llama-cli \\
  --hf-repo "mlabonne/gemma-2b-GGUF" \\
  --hf-file {{GGUF_FILE}} \\
  -p "Once upon a time,"`);
	});

	it("vLLM conversational llm", async () => {
		const { snippet: snippetFunc } = LOCAL_APPS["vllm"];
		const model: ModelData = {
			id: "meta-llama/Llama-3.2-3B-Instruct",
			pipeline_tag: "text-generation",
			tags: ["conversational"],
			inference: "",
		};
		const snippet = snippetFunc(model);

		expect((snippet[0].content as string[]).join("\n")).toEqual(`# Load and run the model:
vllm serve "meta-llama/Llama-3.2-3B-Instruct"
# Call the server using curl:
curl -X POST "http://localhost:8000/v1/chat/completions" \\
	-H "Content-Type: application/json" \\
	--data '{
		"model": "meta-llama/Llama-3.2-3B-Instruct",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'`);
	});

	it("vLLM non-conversational llm", async () => {
		const { snippet: snippetFunc } = LOCAL_APPS["vllm"];
		const model: ModelData = {
			id: "meta-llama/Llama-3.2-3B",
			tags: [""],
			inference: "",
		};
		const snippet = snippetFunc(model);

		expect((snippet[0].content as string[]).join("\n")).toEqual(`# Load and run the model:
vllm serve "meta-llama/Llama-3.2-3B"
# Call the server using curl:
curl -X POST "http://localhost:8000/v1/completions" \\
	-H "Content-Type: application/json" \\
	--data '{
		"model": "meta-llama/Llama-3.2-3B",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'`);
	});

	it("vLLM conversational vlm", async () => {
		const { snippet: snippetFunc } = LOCAL_APPS["vllm"];
		const model: ModelData = {
			id: "meta-llama/Llama-3.2-11B-Vision-Instruct",
			pipeline_tag: "image-text-to-text",
			tags: ["conversational"],
			inference: "",
		};
		const snippet = snippetFunc(model);

		expect((snippet[0].content as string[]).join("\n")).toEqual(`# Load and run the model:
vllm serve "meta-llama/Llama-3.2-11B-Vision-Instruct"
# Call the server using curl:
curl -X POST "http://localhost:8000/v1/chat/completions" \\
	-H "Content-Type: application/json" \\
	--data '{
		"model": "meta-llama/Llama-3.2-11B-Vision-Instruct",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'`);
	});
});
```

packages/tasks/src/local-apps.ts

Lines changed: 37 additions & 18 deletions
```diff
@@ -1,6 +1,9 @@
 import { parseGGUFQuantLabel } from "./gguf.js";
 import type { ModelData } from "./model-data.js";
 import type { PipelineType } from "./pipelines.js";
+import { stringifyMessages } from "./snippets/common.js";
+import { getModelInputSnippet } from "./snippets/inputs.js";
+import type { ChatCompletionInputMessage } from "./tasks/index.js";
 
 export interface LocalAppSnippet {
 	/**
@@ -92,15 +95,20 @@ function isMlxModel(model: ModelData) {
 }
 
 const snippetLlamacpp = (model: ModelData, filepath?: string): LocalAppSnippet[] => {
-	const command = (binary: string) =>
-		[
+	const command = (binary: string) => {
+		const snippet = [
 			"# Load and run the model:",
 			`${binary} \\`,
 			`  --hf-repo "${model.id}" \\`,
 			`  --hf-file ${filepath ?? "{{GGUF_FILE}}"} \\`,
-			'  -p "You are a helpful assistant" \\',
-			"  --conversation",
-		].join("\n");
+			`  -p "${model.tags.includes("conversational") ? "You are a helpful assistant" : "Once upon a time,"}"`,
+		];
+		if (model.tags.includes("conversational")) {
+			snippet[snippet.length - 1] += " \\";
+			snippet.push("  --conversation");
+		}
+		return snippet.join("\n");
+	};
 	return [
 		{
 			title: "Install from brew",
@@ -178,22 +186,33 @@ const snippetLocalAI = (model: ModelData, filepath?: string): LocalAppSnippet[]
 };
 
 const snippetVllm = (model: ModelData): LocalAppSnippet[] => {
-	const runCommand = [
-		"# Call the server using curl:",
-		`curl -X POST "http://localhost:8000/v1/chat/completions" \\`,
-		`	-H "Content-Type: application/json" \\`,
-		`	--data '{`,
-		`		"model": "${model.id}",`,
-		`		"messages": [`,
-		`			{"role": "user", "content": "Hello!"}`,
-		`		]`,
-		`	}'`,
-	];
+	const messages = getModelInputSnippet(model) as ChatCompletionInputMessage[];
+	const runCommandInstruct = `# Call the server using curl:
+curl -X POST "http://localhost:8000/v1/chat/completions" \\
+	-H "Content-Type: application/json" \\
+	--data '{
+		"model": "${model.id}",
+		"messages": ${stringifyMessages(messages, {
+			indent: "\t\t",
+			attributeKeyQuotes: true,
+			customContentEscaper: (str) => str.replace(/'/g, "'\\''"),
+		})}
+	}'`;
+	const runCommandNonInstruct = `# Call the server using curl:
+curl -X POST "http://localhost:8000/v1/completions" \\
+	-H "Content-Type: application/json" \\
+	--data '{
+		"model": "${model.id}",
+		"prompt": "Once upon a time,",
+		"max_tokens": 512,
+		"temperature": 0.5
+	}'`;
+	const runCommand = model.tags.includes("conversational") ? runCommandInstruct : runCommandNonInstruct;
 	return [
 		{
 			title: "Install from pip",
 			setup: ["# Install vLLM from pip:", "pip install vllm"].join("\n"),
-			content: [`# Load and run the model:\nvllm serve "${model.id}"`, runCommand.join("\n")],
+			content: [`# Load and run the model:\nvllm serve "${model.id}"`, runCommand],
 		},
 		{
 			title: "Use Docker images",
@@ -210,7 +229,7 @@ const snippetVllm = (model: ModelData): LocalAppSnippet[] => {
 			].join("\n"),
 			content: [
 				`# Load and run the model:\ndocker exec -it my_vllm_container bash -c "vllm serve ${model.id}"`,
-				runCommand.join("\n"),
+				runCommand,
 			],
 		},
 	];
```
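For reference, a usage sketch of the updated llama.cpp snippet via the exported `LOCAL_APPS` map, written in the same style as the new test file (the model object is illustrative and only sets the fields the snippet function reads):

```ts
import { LOCAL_APPS } from "./local-apps.js";
import type { ModelData } from "./model-data.js";

// Illustrative base (non-conversational) GGUF model: no "conversational" tag.
const model: ModelData = {
	id: "mlabonne/gemma-2b-GGUF",
	tags: [],
	inference: "",
};

const { snippet } = LOCAL_APPS["llama.cpp"];
for (const s of snippet(model)) {
	console.log(`### ${s.title}`);
	console.log(s.content); // llama-cli command with a plain "-p" prompt and no --conversation flag
}
```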
Lines changed: 54 additions & 0 deletions (new test file)

```ts
import { describe, expect, it } from "vitest";
import type { ModelData } from "./model-data.js";
import { llama_cpp_python } from "./model-libraries-snippets.js";

describe("model-libraries-snippets", () => {
	it("llama_cpp_python conversational", async () => {
		const model: ModelData = {
			id: "bartowski/Llama-3.2-3B-Instruct-GGUF",
			pipeline_tag: "text-generation",
			tags: ["conversational"],
			inference: "",
		};
		const snippet = llama_cpp_python(model);

		expect(snippet.join("\n")).toEqual(`from llama_cpp import Llama

llm = Llama.from_pretrained(
	repo_id="bartowski/Llama-3.2-3B-Instruct-GGUF",
	filename="{{GGUF_FILE}}",
)

llm.create_chat_completion(
	messages = [
		{
			"role": "user",
			"content": "What is the capital of France?"
		}
	]
)`);
	});

	it("llama_cpp_python non-conversational", async () => {
		const model: ModelData = {
			id: "mlabonne/gemma-2b-GGUF",
			tags: [""],
			inference: "",
		};
		const snippet = llama_cpp_python(model);

		expect(snippet.join("\n")).toEqual(`from llama_cpp import Llama

llm = Llama.from_pretrained(
	repo_id="mlabonne/gemma-2b-GGUF",
	filename="{{GGUF_FILE}}",
)

output = llm(
	"Once upon a time,",
	max_tokens=512,
	echo=True
)
print(output)`);
	});
});
```

packages/tasks/src/model-libraries-snippets.ts

Lines changed: 24 additions & 11 deletions
```diff
@@ -1,6 +1,9 @@
 import type { ModelData } from "./model-data.js";
 import type { WidgetExampleTextInput, WidgetExampleSentenceSimilarityInput } from "./widget-example.js";
 import { LIBRARY_TASK_MAPPING } from "./library-to-tasks.js";
+import { getModelInputSnippet } from "./snippets/inputs.js";
+import type { ChatCompletionInputMessage } from "./tasks/index.js";
+import { stringifyMessages } from "./snippets/common.js";
 
 const TAG_CUSTOM_CODE = "custom_code";
 
@@ -418,23 +421,33 @@ model = keras_hub.models.CausalLM.from_preset("hf://${model.id}", dtype="bfloat1
 `,
 ];
 
-export const llama_cpp_python = (model: ModelData): string[] => [
-	`from llama_cpp import Llama
+export const llama_cpp_python = (model: ModelData): string[] => {
+	const snippets = [
+		`from llama_cpp import Llama
 
 llm = Llama.from_pretrained(
 	repo_id="${model.id}",
 	filename="{{GGUF_FILE}}",
 )
+`,
+	];
 
-llm.create_chat_completion(
-	messages = [
-		{
-			"role": "user",
-			"content": "What is the capital of France?"
-		}
-	]
-)`,
-];
+	if (model.tags.includes("conversational")) {
+		const messages = getModelInputSnippet(model) as ChatCompletionInputMessage[];
+		snippets.push(`llm.create_chat_completion(
+	messages = ${stringifyMessages(messages, { attributeKeyQuotes: true, indent: "\t" })}
+)`);
+	} else {
+		snippets.push(`output = llm(
+	"Once upon a time,",
+	max_tokens=512,
+	echo=True
+)
+print(output)`);
+	}
+
+	return snippets;
+};
 
 export const tf_keras = (model: ModelData): string[] => [
 	`# Note: 'keras<3.x' or 'tf_keras' must be installed (legacy)
```
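And a matching usage sketch for the reworked `llama_cpp_python` helper, mirroring the new tests (the model objects are illustrative):

```ts
import { llama_cpp_python } from "./model-libraries-snippets.js";
import type { ModelData } from "./model-data.js";

const instructModel: ModelData = {
	id: "bartowski/Llama-3.2-3B-Instruct-GGUF",
	pipeline_tag: "text-generation",
	tags: ["conversational"],
	inference: "",
};

const baseModel: ModelData = {
	id: "mlabonne/gemma-2b-GGUF",
	tags: [],
	inference: "",
};

// Conversational models get an llm.create_chat_completion(...) snippet;
// base models get a raw completion call: llm("Once upon a time,", ...).
console.log(llama_cpp_python(instructModel).join("\n"));
console.log(llama_cpp_python(baseModel).join("\n"));
```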
