Commit 652e065

compare quantization on cpu and gpu
1 parent f90900f commit 652e065

1 file changed: docs/72_quantization/quantization.ipynb (+134 −4 lines changed)
@@ -24,7 +24,8 @@
    "source": [
     "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n",
     "from utilities import calculate_model_memory_in_gb\n",
-    "import torch"
+    "import torch\n",
+    "import numpy as np"
    ]
   },
   {
@@ -65,7 +66,7 @@
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
-      "model_id": "79e5f5a054c9469f9f88b23e7c7fb962",
+      "model_id": "937bfa80eb814bc6b7848d3646777ef7",
       "version_major": 2,
       "version_minor": 0
      },
@@ -144,7 +145,7 @@
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
-      "model_id": "67b0f9a20d2149b99c7ea59304af5675",
+      "model_id": "aba28d966b9f405488417df37091bb67",
       "version_major": 2,
       "version_minor": 0
      },
@@ -185,13 +186,142 @@
     "calculate_model_memory_in_gb(quantized_model)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "a091cca6-69e6-47a0-9e97-453e989705d3",
+   "metadata": {},
+   "source": [
+    "Apparently, quantization is implemented differently for CPU and GPU devices: if we load the model into GPU memory, its size is different."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "be0c1a05-ebd8-4e5b-b294-b158f8b630ee",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "57d988878f9b40309daade21a83020b9",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "quantized_gpu_model = AutoModelForCausalLM.from_pretrained(\n",
+    "    model_name,\n",
+    "    quantization_config=bnb_config,\n",
+    "    device_map=\"cuda:0\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "d4eb965a-6bcf-4f6e-9257-13db1c43a32c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "2.822406768798828"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "calculate_model_memory_in_gb(quantized_gpu_model)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0510f950-3803-41e8-bb86-71c46146a619",
+   "metadata": {},
+   "source": [
+    "We can elaborate on this by inspecting the [element sizes in bytes](https://pytorch.org/docs/stable/generated/torch.Tensor.element_size.html) of the parameters in the models."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "6b36498b-b719-4896-a287-d34cec95022c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([4])"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "np.unique([p.element_size() for p in model.parameters()])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "13199fe3-c955-4787-9f56-53248e6793b2",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([2])"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "np.unique([p.element_size() for p in quantized_model.parameters()])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "41993337-3dd9-46ce-98b5-920ee75d0a51",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([1, 2])"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "np.unique([p.element_size() for p in quantized_gpu_model.parameters()])"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "196a61ae-2008-474f-8ccc-1b4b04b0da54",
    "metadata": {},
    "source": [
     "## Exercise\n",
-    "Explore alternative [Quantization configurations](https://huggingface.co/docs/transformers/main/en/main_classes/quantization#transformers.BitsAndBytesConfig) and try to make the model as small as possible."
+    "Explore alternative [Quantization configurations](https://huggingface.co/docs/transformers/main/en/main_classes/quantization#transformers.BitsAndBytesConfig) and try to make the model as small as possible. Hint: Compare different approaches using `device_map=\"cpu\"` and `device_map=\"cuda:0\"` on a machine with a GPU. One possible starting point is sketched below the diff."
    ]
   },
   {
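
The element sizes inspected in the new cells explain the memory numbers: the unquantized model stores 4 bytes per parameter (float32), the CPU-quantized model 2 bytes, and the GPU-quantized model a mix of 1 and 2 bytes. A minimal sketch of a memory helper built on this, assuming `calculate_model_memory_in_gb` simply sums `element_size() * nelement()` over all parameters (the actual implementation in the notebook's `utilities.py` may differ):

```python
import torch


def model_memory_in_gb(model: torch.nn.Module) -> float:
    """Estimate parameter memory: bytes per element times element count,
    summed over all parameters and converted to gigabytes."""
    total_bytes = sum(p.element_size() * p.nelement() for p in model.parameters())
    return total_bytes / 1024**3
```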
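
As a starting point for the exercise referenced in the diff: a 4-bit NF4 configuration typically shrinks the model further than the 8-bit setup used above. This is one possible configuration, not the notebook's solution; `model_name` and `calculate_model_memory_in_gb` are the objects defined earlier in the notebook, and the bitsandbytes 4-bit kernels require a CUDA device.

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# 4-bit NF4 weights; double quantization also compresses the quantization
# constants themselves for a small additional saving.
bnb_4bit_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

quantized_4bit_model = AutoModelForCausalLM.from_pretrained(
    model_name,  # the checkpoint loaded earlier in the notebook
    quantization_config=bnb_4bit_config,
    device_map="cuda:0",
)

calculate_model_memory_in_gb(quantized_4bit_model)
```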
