|
148 | 148 | "import os\n", |
149 | 149 | "\n", |
150 | 150 | "\n", |
151 | | - "def split_image(pdf_path):\n", |
152 | | - " if hasattr(pdf_path, 'unpack'):\n", |
153 | | - " pdf_path = pdf_path.unpack()\n", |
154 | | - " \n", |
155 | | - " logging.info(f\"Splitting images from {pdf_path}\")\n", |
| 151 | + "class SplitImage:\n", |
| 152 | + " def __hash__(self):\n", |
| 153 | + " return 1234567890\n", |
| 154 | + "\n", |
| 155 | + " def __call__(self, pdf_path):\n", |
| 156 | + " if hasattr(pdf_path, 'unpack'):\n", |
| 157 | + " pdf_path = pdf_path.unpack()\n", |
| 158 | + " \n", |
| 159 | + " logging.info(f\"Splitting images from {pdf_path}\")\n", |
156 | 160 | "\n", |
157 | | - " image_folders = \"data/pdf-images\"\n", |
158 | | - " pdf_name = os.path.basename(pdf_path)\n", |
159 | | - " images = convert_from_path(pdf_path)\n", |
160 | | - " logging.info(f\"Number of images: {len(images)}\")\n", |
| 161 | + " image_folders = \"data/pdf-images\"\n", |
| 162 | + " pdf_name = os.path.basename(pdf_path)\n", |
| 163 | + " images = convert_from_path(pdf_path)\n", |
| 164 | + " logging.info(f\"Number of images: {len(images)}\")\n", |
161 | 165 | "\n", |
162 | | - " image_folder = os.path.join(image_folders, pdf_name)\n", |
163 | | - " if not os.path.exists(image_folder):\n", |
164 | | - " os.makedirs(image_folder)\n", |
| 166 | + " image_folder = os.path.join(image_folders, pdf_name)\n", |
| 167 | + " if not os.path.exists(image_folder):\n", |
| 168 | + " os.makedirs(image_folder)\n", |
165 | 169 | "\n", |
166 | | - " data = []\n", |
167 | | - " for i, image in enumerate(images):\n", |
168 | | - " path = os.path.join(image_folder, f\"{i}.jpg\")\n", |
169 | | - " image.save(os.path.join(path))\n", |
170 | | - " data.append(path)\n", |
171 | | - " return data\n", |
| 170 | + " data = []\n", |
| 171 | + " for i, image in enumerate(images):\n", |
| 172 | + " path = os.path.join(image_folder, f\"{i}.jpg\")\n", |
| 173 | + " image.save(os.path.join(path))\n", |
| 174 | + " data.append(path)\n", |
| 175 | + " return data\n", |
172 | 176 | "\n", |
173 | 177 | "\n", |
174 | 178 | "model_split_image = ObjectModel(\n", |
175 | 179 | " identifier=\"split_image\",\n", |
176 | | - " object=split_image,\n", |
| 180 | + " object=SplitImage(),\n", |
177 | 181 | " datatype='file',\n", |
178 | 182 | ")\n", |
179 | 183 | "\n", |
|
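Reviewer note on the hunk above: converting the `split_image` function into a callable `SplitImage` class with a pinned `__hash__` presumably gives the wrapped object a hash that stays stable across interpreter sessions, since a plain function falls back to the default identity-based hash, which changes on every restart. That motivation is an assumption; the sketch below illustrates only the pattern, and all names in it are illustrative.

```python
# Minimal sketch, assuming the framework hashes `object=` to decide whether a
# component changed between runs.

class StableSplitter:
    def __hash__(self):
        # The default object.__hash__ derives from id(), so a plain function
        # (or a class without this override) hashes differently in every
        # process; a constant keeps the component identity stable.
        return 1234567890

    def __call__(self, pdf_path):
        return pdf_path  # stand-in for the real page-splitting body


assert hash(StableSplitter()) == hash(StableSplitter())
```

|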
287 | 291 | " return datas\n", |
288 | 292 | "\n", |
289 | 293 | "\n", |
290 | | - "def get_chunks(pdf):\n", |
291 | | - " from collections import defaultdict\n", |
292 | | - " from unstructured.documents.coordinates import RelativeCoordinateSystem\n", |
293 | | - " from unstructured.partition.pdf import partition_pdf\n", |
| 294 | + "class GetChunks:\n", |
| 295 | + " def __hash__(self):\n", |
| 296 | + " return 24681012\n", |
| 297 | + "\n", |
| 298 | + " def __call__(self, pdf):\n", |
| 299 | + " from collections import defaultdict\n", |
| 300 | + " from unstructured.documents.coordinates import RelativeCoordinateSystem\n", |
| 301 | + " from unstructured.partition.pdf import partition_pdf\n", |
| 302 | + "\n", |
| 303 | + " if hasattr(pdf, 'unpack'):\n", |
| 304 | + " pdf = pdf.unpack()\n", |
294 | 305 | "\n", |
295 | | - " if hasattr(pdf, 'unpack'):\n", |
296 | | - " pdf = pdf.unpack()\n", |
| 306 | + " elements = partition_pdf(pdf)\n", |
| 307 | + " elements = remove_annotation(elements)\n", |
297 | 308 | "\n", |
298 | | - " elements = partition_pdf(pdf)\n", |
299 | | - " elements = remove_annotation(elements)\n", |
| 309 | + " pages_elements = defaultdict(list)\n", |
| 310 | + " for element in elements:\n", |
| 311 | + " element.convert_coordinates_to_new_system(\n", |
| 312 | + " RelativeCoordinateSystem(), in_place=True\n", |
| 313 | + " )\n", |
| 314 | + " pages_elements[element.metadata.page_number].append(element)\n", |
300 | 315 | "\n", |
301 | | - " pages_elements = defaultdict(list)\n", |
302 | | - " for element in elements:\n", |
303 | | - " element.convert_coordinates_to_new_system(\n", |
304 | | - " RelativeCoordinateSystem(), in_place=True\n", |
| 316 | + " all_chunks_and_links = sum(\n", |
| 317 | + " [\n", |
| 318 | + " create_chunk_and_metadatas(page_elements)\n", |
| 319 | + " for _, page_elements in pages_elements.items()\n", |
| 320 | + " ],\n", |
| 321 | + " [],\n", |
305 | 322 | " )\n", |
306 | | - " pages_elements[element.metadata.page_number].append(element)\n", |
307 | | - "\n", |
308 | | - " all_chunks_and_links = sum(\n", |
309 | | - " [\n", |
310 | | - " create_chunk_and_metadatas(page_elements)\n", |
311 | | - " for _, page_elements in pages_elements.items()\n", |
312 | | - " ],\n", |
313 | | - " [],\n", |
314 | | - " )\n", |
315 | | - " return all_chunks_and_links" |
| 323 | + " return all_chunks_and_links" |
316 | 324 | ] |
317 | 325 | }, |
318 | 326 | { |
|
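Aside on `all_chunks_and_links` in the hunk above: `sum(list_of_lists, [])` flattens one level, but it rebuilds the accumulator list on every addition, so it is quadratic in the total number of chunks. `itertools.chain.from_iterable` produces the same result in linear time; a small sketch with illustrative data:

```python
from itertools import chain

per_page_chunks = [["chunk-a", "chunk-b"], ["chunk-c"]]  # illustrative data

# Same result as sum(per_page_chunks, []), without re-copying the
# accumulator on every addition.
flattened = list(chain.from_iterable(per_page_chunks))
assert flattened == ["chunk-a", "chunk-b", "chunk-c"]
```

|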
324 | 332 | "source": [ |
325 | 333 | "model_chunk = ObjectModel(\n", |
326 | 334 | " identifier=\"chunk\",\n", |
327 | | - " object=get_chunks,\n", |
| 335 | + " object=GetChunks(),\n", |
328 | 336 | " datatype='json',\n", |
329 | 337 | ")\n", |
330 | 338 | "\n", |
|
367 | 375 | "metadata": {}, |
368 | 376 | "outputs": [], |
369 | 377 | "source": [ |
370 | | - "from superduper_openai.model import OpenAIEmbedding\n", |
371 | 378 | "from superduper import VectorIndex\n", |
372 | 379 | "\n", |
373 | 380 | "listener_embedding = Listener(\n", |
|
413 | 420 | " identifier=\"processor\",\n", |
414 | 421 | " chunk_key=listener_chunk.outputs,\n", |
415 | 422 | " split_image_key=listener_split_image.outputs,\n", |
416 | | - " upstream=[Plugin(path=\"./utils.py\")],\n", |
417 | 423 | ")" |
418 | 424 | ] |
419 | 425 | }, |
|
446 | 452 | "metadata": {}, |
447 | 453 | "outputs": [], |
448 | 454 | "source": [ |
449 | | - "from superduper_openai.model import OpenAIChatCompletion\n", |
450 | 455 | "from utils import Rag\n", |
451 | 456 | "\n", |
452 | 457 | "prompt_template = (\n", |
|
468 | 473 | ")" |
469 | 474 | ] |
470 | 475 | }, |
471 | | - { |
472 | | - "cell_type": "code", |
473 | | - "execution_count": null, |
474 | | - "id": "09e04c4c-c932-4358-ae2f-61cc482f0ff4", |
475 | | - "metadata": {}, |
476 | | - "outputs": [], |
477 | | - "source": [ |
478 | | - "from utils import Rag\n", |
479 | | - "\n", |
480 | | - "Rag.__module__" |
481 | | - ] |
482 | | - }, |
483 | 476 | { |
484 | 477 | "cell_type": "markdown", |
485 | 478 | "id": "fde11162-e994-4621-af36-b5fa9bc3f258", |
|
505 | 498 | " listener_chunk,\n", |
506 | 499 | " vector_index,\n", |
507 | 500 | " rag\n", |
508 | | - " ]\n", |
509 | | - " \n", |
| 501 | + " ],\n", |
| 502 | + " upstream=[Plugin(path=\"./utils.py\")],\n", |
510 | 503 | ")" |
511 | 504 | ] |
512 | 505 | }, |
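Reviewer note on the two `upstream` hunks: `Plugin(path="./utils.py")` is dropped from the `processor` component and re-declared once on the parent that stacks `listener_chunk`, `vector_index`, and `rag`, so the local `utils.py` (the module `Rag` is imported from) travels with the application as a whole rather than with a single child component.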
|