|
24 | 24 | "source": [ |
25 | 25 | "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n", |
26 | 26 | "from utilities import calculate_model_memory_in_gb\n", |
27 | | - "import torch" |
| 27 | + "import torch\n", |
| 28 | + "import numpy as np" |
28 | 29 | ] |
29 | 30 | }, |
30 | 31 | { |
|
65 | 66 | { |
66 | 67 | "data": { |
67 | 68 | "application/vnd.jupyter.widget-view+json": { |
68 | | - "model_id": "79e5f5a054c9469f9f88b23e7c7fb962", |
| 69 | + "model_id": "937bfa80eb814bc6b7848d3646777ef7", |
69 | 70 | "version_major": 2, |
70 | 71 | "version_minor": 0 |
71 | 72 | }, |
|
144 | 145 | { |
145 | 146 | "data": { |
146 | 147 | "application/vnd.jupyter.widget-view+json": { |
147 | | - "model_id": "67b0f9a20d2149b99c7ea59304af5675", |
| 148 | + "model_id": "aba28d966b9f405488417df37091bb67", |
148 | 149 | "version_major": 2, |
149 | 150 | "version_minor": 0 |
150 | 151 | }, |
|
185 | 186 | "calculate_model_memory_in_gb(quantized_model)" |
186 | 187 | ] |
187 | 188 | }, |
| 189 | + { |
| 190 | + "cell_type": "markdown", |
| 191 | + "id": "a091cca6-69e6-47a0-9e97-453e989705d3", |
| 192 | + "metadata": {}, |
| 193 | + "source": [ |
| 194 | + "Apparently, quantization is implemented differently for CPU and GPU devices. If we load the model into GPU-memory, its size is different." |
| 195 | + ] |
| 196 | + }, |
| 197 | + { |
| 198 | + "cell_type": "code", |
| 199 | + "execution_count": 8, |
| 200 | + "id": "be0c1a05-ebd8-4e5b-b294-b158f8b630ee", |
| 201 | + "metadata": {}, |
| 202 | + "outputs": [ |
| 203 | + { |
| 204 | + "data": { |
| 205 | + "application/vnd.jupyter.widget-view+json": { |
| 206 | + "model_id": "57d988878f9b40309daade21a83020b9", |
| 207 | + "version_major": 2, |
| 208 | + "version_minor": 0 |
| 209 | + }, |
| 210 | + "text/plain": [ |
| 211 | + "Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]" |
| 212 | + ] |
| 213 | + }, |
| 214 | + "metadata": {}, |
| 215 | + "output_type": "display_data" |
| 216 | + } |
| 217 | + ], |
| 218 | + "source": [ |
| 219 | + "quantized_gpu_model = AutoModelForCausalLM.from_pretrained(\n", |
| 220 | + " model_name,\n", |
| 221 | + " quantization_config=bnb_config,\n", |
| 222 | + " device_map=\"cuda:0\"\n", |
| 223 | + ")" |
| 224 | + ] |
| 225 | + }, |
| 226 | + { |
| 227 | + "cell_type": "code", |
| 228 | + "execution_count": 9, |
| 229 | + "id": "d4eb965a-6bcf-4f6e-9257-13db1c43a32c", |
| 230 | + "metadata": {}, |
| 231 | + "outputs": [ |
| 232 | + { |
| 233 | + "data": { |
| 234 | + "text/plain": [ |
| 235 | + "2.822406768798828" |
| 236 | + ] |
| 237 | + }, |
| 238 | + "execution_count": 9, |
| 239 | + "metadata": {}, |
| 240 | + "output_type": "execute_result" |
| 241 | + } |
| 242 | + ], |
| 243 | + "source": [ |
| 244 | + "calculate_model_memory_in_gb(quantized_gpu_model)" |
| 245 | + ] |
| 246 | + }, |
| 247 | + { |
| 248 | + "cell_type": "markdown", |
| 249 | + "id": "0510f950-3803-41e8-bb86-71c46146a619", |
| 250 | + "metadata": {}, |
| 251 | + "source": [ |
| 252 | + "We can elaborate a bit more on this by inspecting the existing [element sizes given in bytes](https://pytorch.org/docs/stable/generated/torch.Tensor.element_size.html) of parameters in the models." |
| 253 | + ] |
| 254 | + }, |
| 255 | + { |
| 256 | + "cell_type": "code", |
| 257 | + "execution_count": 10, |
| 258 | + "id": "6b36498b-b719-4896-a287-d34cec95022c", |
| 259 | + "metadata": {}, |
| 260 | + "outputs": [ |
| 261 | + { |
| 262 | + "data": { |
| 263 | + "text/plain": [ |
| 264 | + "array([4])" |
| 265 | + ] |
| 266 | + }, |
| 267 | + "execution_count": 10, |
| 268 | + "metadata": {}, |
| 269 | + "output_type": "execute_result" |
| 270 | + } |
| 271 | + ], |
| 272 | + "source": [ |
| 273 | + "np.unique([p.element_size() for p in model.parameters()])" |
| 274 | + ] |
| 275 | + }, |
| 276 | + { |
| 277 | + "cell_type": "code", |
| 278 | + "execution_count": 11, |
| 279 | + "id": "13199fe3-c955-4787-9f56-53248e6793b2", |
| 280 | + "metadata": {}, |
| 281 | + "outputs": [ |
| 282 | + { |
| 283 | + "data": { |
| 284 | + "text/plain": [ |
| 285 | + "array([2])" |
| 286 | + ] |
| 287 | + }, |
| 288 | + "execution_count": 11, |
| 289 | + "metadata": {}, |
| 290 | + "output_type": "execute_result" |
| 291 | + } |
| 292 | + ], |
| 293 | + "source": [ |
| 294 | + "np.unique([p.element_size() for p in quantized_model.parameters()])" |
| 295 | + ] |
| 296 | + }, |
| 297 | + { |
| 298 | + "cell_type": "code", |
| 299 | + "execution_count": 12, |
| 300 | + "id": "41993337-3dd9-46ce-98b5-920ee75d0a51", |
| 301 | + "metadata": {}, |
| 302 | + "outputs": [ |
| 303 | + { |
| 304 | + "data": { |
| 305 | + "text/plain": [ |
| 306 | + "array([1, 2])" |
| 307 | + ] |
| 308 | + }, |
| 309 | + "execution_count": 12, |
| 310 | + "metadata": {}, |
| 311 | + "output_type": "execute_result" |
| 312 | + } |
| 313 | + ], |
| 314 | + "source": [ |
| 315 | + "np.unique([p.element_size() for p in quantized_gpu_model.parameters()])" |
| 316 | + ] |
| 317 | + }, |
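| 318 | + { |
| 319 | + "cell_type": "markdown", |
| 320 | + "id": "c3b1d5e8-7a42-4f0b-9c11-2d8e6f4a9b35", |
| 321 | + "metadata": {}, |
| 322 | + "source": [ |
| 323 | + "The 1-byte elements on the GPU are the quantized weight tensors: bitsandbytes stores them in 8-bit containers, packing two values per byte in the 4-bit case, while a few layers such as the norms stay in 16-bit precision (the 2-byte elements). On the CPU, all parameters apparently remain at 2 bytes, which explains the larger footprint there. As a rough cross-check (assuming `calculate_model_memory_in_gb` simply sums up parameter bytes), we can recompute the GPU model's size directly from the element sizes:" |
| 324 | + ] |
| 325 | + }, |
| 326 | + { |
| 327 | + "cell_type": "code", |
| 328 | + "execution_count": null, |
| 329 | + "id": "9f7e2a4c-5b18-4d36-8a90-e1c4b7d2f653", |
| 330 | + "metadata": {}, |
| 331 | + "outputs": [], |
| 332 | + "source": [ |
| 333 | + "# sum of parameter bytes in GiB; should match the helper's result above\n", |
| 334 | + "sum(p.numel() * p.element_size() for p in quantized_gpu_model.parameters()) / 1024**3" |
| 335 | + ] |
| 336 | + }, |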
188 | 318 | { |
189 | 319 | "cell_type": "markdown", |
190 | 320 | "id": "196a61ae-2008-474f-8ccc-1b4b04b0da54", |
191 | 321 | "metadata": {}, |
192 | 322 | "source": [ |
193 | 323 | "## Exercise\n", |
194 | | - "Explore alternative [Quantization configurations](https://huggingface.co/docs/transformers/main/en/main_classes/quantization#transformers.BitsAndBytesConfig) and try to make the model as small as possible." |
| 324 | + "Explore alternative [Quantization configurations](https://huggingface.co/docs/transformers/main/en/main_classes/quantization#transformers.BitsAndBytesConfig) and try to make the model as small as possible. Hint: Compare different approaches using `device_map=\"cpu\"` and `device_map=\"cuda:0\"` using a GPU." |
195 | 325 | ] |
196 | 326 | }, |
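| 327 | + { |
| 328 | + "cell_type": "markdown", |
| 329 | + "id": "5a2e8c14-9d6b-4f72-b3e0-7c1f4d8a2e96", |
| 330 | + "metadata": {}, |
| 331 | + "source": [ |
| 332 | + "As one possible starting point, here is a sketch of a 4-bit configuration with NF4 weights and double quantization (`small_config` and `smaller_model` are illustrative names; this is not necessarily the smallest achievable setup):" |
| 333 | + ] |
| 334 | + }, |
| 335 | + { |
| 336 | + "cell_type": "code", |
| 337 | + "execution_count": null, |
| 338 | + "id": "e6d3b9f2-1c48-4a75-90de-8b5a2f7c4d13", |
| 339 | + "metadata": {}, |
| 340 | + "outputs": [], |
| 341 | + "source": [ |
| 342 | + "# candidate config: 4-bit NF4 weights, with the quantization constants quantized as well\n", |
| 343 | + "small_config = BitsAndBytesConfig(\n", |
| 344 | + "    load_in_4bit=True,\n", |
| 345 | + "    bnb_4bit_quant_type=\"nf4\",\n", |
| 346 | + "    bnb_4bit_use_double_quant=True\n", |
| 347 | + ")\n", |
| 348 | + "smaller_model = AutoModelForCausalLM.from_pretrained(\n", |
| 349 | + "    model_name,\n", |
| 350 | + "    quantization_config=small_config,\n", |
| 351 | + "    device_map=\"cuda:0\"\n", |
| 352 | + ")\n", |
| 353 | + "calculate_model_memory_in_gb(smaller_model)" |
| 354 | + ] |
| 355 | + }, |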
197 | 327 | { |
|