|
148 | 148 | "import os\n", |
149 | 149 | "\n", |
150 | 150 | "\n", |
151 | | - "def split_image(pdf_path):\n", |
152 | | - " if hasattr(pdf_path, 'unpack'):\n", |
153 | | - " pdf_path = pdf_path.unpack()\n", |
154 | | - " \n", |
155 | | - " logging.info(f\"Splitting images from {pdf_path}\")\n", |
| 151 | + "class SplitImage:\n", |
| 152 | + " def __hash__(self):\n", |
| 153 | + " return 1234567890\n", |
| 154 | + "\n", |
| 155 | + " def __call__(self, pdf_path):\n", |
| 156 | + " if hasattr(pdf_path, 'unpack'):\n", |
| 157 | + " pdf_path = pdf_path.unpack()\n", |
| 158 | + " \n", |
| 159 | + " logging.info(f\"Splitting images from {pdf_path}\")\n", |
156 | 160 | "\n", |
157 | | - " image_folders = \"data/pdf-images\"\n", |
158 | | - " pdf_name = os.path.basename(pdf_path)\n", |
159 | | - " images = convert_from_path(pdf_path)\n", |
160 | | - " logging.info(f\"Number of images: {len(images)}\")\n", |
| 161 | + " image_folders = \"data/pdf-images\"\n", |
| 162 | + " pdf_name = os.path.basename(pdf_path)\n", |
| 163 | + " images = convert_from_path(pdf_path)\n", |
| 164 | + " logging.info(f\"Number of images: {len(images)}\")\n", |
161 | 165 | "\n", |
162 | | - " image_folder = os.path.join(image_folders, pdf_name)\n", |
163 | | - " if not os.path.exists(image_folder):\n", |
164 | | - " os.makedirs(image_folder)\n", |
| 166 | + " image_folder = os.path.join(image_folders, pdf_name)\n", |
| 167 | + " if not os.path.exists(image_folder):\n", |
| 168 | + " os.makedirs(image_folder)\n", |
165 | 169 | "\n", |
166 | | - " data = []\n", |
167 | | - " for i, image in enumerate(images):\n", |
168 | | - " path = os.path.join(image_folder, f\"{i}.jpg\")\n", |
169 | | - " image.save(os.path.join(path))\n", |
170 | | - " data.append(path)\n", |
171 | | - " return data\n", |
| 170 | + " data = []\n", |
| 171 | + " for i, image in enumerate(images):\n", |
| 172 | + " path = os.path.join(image_folder, f\"{i}.jpg\")\n", |
| 173 | + " image.save(os.path.join(path))\n", |
| 174 | + " data.append(path)\n", |
| 175 | + " return data\n", |
172 | 176 | "\n", |
173 | 177 | "\n", |
174 | 178 | "model_split_image = ObjectModel(\n", |
175 | 179 | " identifier=\"split_image\",\n", |
176 | | - " object=split_image,\n", |
| 180 | + " object=SplitImage(),\n", |
177 | 181 | " datatype='file',\n", |
178 | 182 | ")\n", |
179 | 183 | "\n", |
|
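Reviewer note on the hunk above: converting the `split_image` function into a callable `SplitImage` class with a pinned `__hash__` presumably gives the wrapped object a hash that stays stable across interpreter sessions, since a plain function falls back to the default identity-based hash, which changes on every restart. That motivation is an assumption; the sketch below illustrates only the pattern, and all names in it are illustrative.

```python
# Minimal sketch, assuming the framework hashes `object=` to decide whether a
# component changed between runs.

class StableSplitter:
    def __hash__(self):
        # The default object.__hash__ derives from id(), so a plain function
        # (or a class without this override) hashes differently in every
        # process; a constant keeps the component identity stable.
        return 1234567890

    def __call__(self, pdf_path):
        return pdf_path  # stand-in for the real page-splitting body


assert hash(StableSplitter()) == hash(StableSplitter())
```

|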
287 | 291 | " return datas\n", |
288 | 292 | "\n", |
289 | 293 | "\n", |
290 | | - "def get_chunks(pdf):\n", |
291 | | - " from collections import defaultdict\n", |
292 | | - " from unstructured.documents.coordinates import RelativeCoordinateSystem\n", |
293 | | - " from unstructured.partition.pdf import partition_pdf\n", |
| 294 | + "class GetChunks:\n", |
| 295 | + " def __hash__(self):\n", |
| 296 | + " return 24681012\n", |
| 297 | + "\n", |
| 298 | + " def __call__(self, pdf):\n", |
| 299 | + " from collections import defaultdict\n", |
| 300 | + " from unstructured.documents.coordinates import RelativeCoordinateSystem\n", |
| 301 | + " from unstructured.partition.pdf import partition_pdf\n", |
| 302 | + "\n", |
| 303 | + " if hasattr(pdf, 'unpack'):\n", |
| 304 | + " pdf = pdf.unpack()\n", |
294 | 305 | "\n", |
295 | | - " if hasattr(pdf, 'unpack'):\n", |
296 | | - " pdf = pdf.unpack()\n", |
| 306 | + " elements = partition_pdf(pdf)\n", |
| 307 | + " elements = remove_annotation(elements)\n", |
297 | 308 | "\n", |
298 | | - " elements = partition_pdf(pdf)\n", |
299 | | - " elements = remove_annotation(elements)\n", |
| 309 | + " pages_elements = defaultdict(list)\n", |
| 310 | + " for element in elements:\n", |
| 311 | + " element.convert_coordinates_to_new_system(\n", |
| 312 | + " RelativeCoordinateSystem(), in_place=True\n", |
| 313 | + " )\n", |
| 314 | + " pages_elements[element.metadata.page_number].append(element)\n", |
300 | 315 | "\n", |
301 | | - " pages_elements = defaultdict(list)\n", |
302 | | - " for element in elements:\n", |
303 | | - " element.convert_coordinates_to_new_system(\n", |
304 | | - " RelativeCoordinateSystem(), in_place=True\n", |
| 316 | + " all_chunks_and_links = sum(\n", |
| 317 | + " [\n", |
| 318 | + " create_chunk_and_metadatas(page_elements)\n", |
| 319 | + " for _, page_elements in pages_elements.items()\n", |
| 320 | + " ],\n", |
| 321 | + " [],\n", |
305 | 322 | " )\n", |
306 | | - " pages_elements[element.metadata.page_number].append(element)\n", |
307 | | - "\n", |
308 | | - " all_chunks_and_links = sum(\n", |
309 | | - " [\n", |
310 | | - " create_chunk_and_metadatas(page_elements)\n", |
311 | | - " for _, page_elements in pages_elements.items()\n", |
312 | | - " ],\n", |
313 | | - " [],\n", |
314 | | - " )\n", |
315 | | - " return all_chunks_and_links" |
| 323 | + " return all_chunks_and_links" |
316 | 324 | ] |
317 | 325 | }, |
318 | 326 | { |
|
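Aside on `all_chunks_and_links` in the hunk above: `sum(list_of_lists, [])` flattens one level, but it rebuilds the accumulator list on every addition, so it is quadratic in the total number of chunks. `itertools.chain.from_iterable` produces the same result in linear time; a small sketch with illustrative data:

```python
from itertools import chain

per_page_chunks = [["chunk-a", "chunk-b"], ["chunk-c"]]  # illustrative data

# Same result as sum(per_page_chunks, []), without re-copying the
# accumulator on every addition.
flattened = list(chain.from_iterable(per_page_chunks))
assert flattened == ["chunk-a", "chunk-b", "chunk-c"]
```

|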
324 | 332 | "source": [ |
325 | 333 | "model_chunk = ObjectModel(\n", |
326 | 334 | " identifier=\"chunk\",\n", |
327 | | - " object=get_chunks,\n", |
| 335 | + " object=GetChunks(),\n", |
328 | 336 | " datatype='json',\n", |
329 | 337 | ")\n", |
330 | 338 | "\n", |
|
367 | 375 | "metadata": {}, |
368 | 376 | "outputs": [], |
369 | 377 | "source": [ |
370 | | - "from superduper_openai.model import OpenAIEmbedding\n", |
371 | 378 | "from superduper import VectorIndex\n", |
372 | 379 | "\n", |
373 | 380 | "listener_embedding = Listener(\n", |
|
413 | 420 | " identifier=\"processor\",\n", |
414 | 421 | " chunk_key=listener_chunk.outputs,\n", |
415 | 422 | " split_image_key=listener_split_image.outputs,\n", |
416 | | - " upstream=[Plugin(path=\"./utils.py\")],\n", |
417 | 423 | ")" |
418 | 424 | ] |
419 | 425 | }, |
|
446 | 452 | "metadata": {}, |
447 | 453 | "outputs": [], |
448 | 454 | "source": [ |
449 | | - "from superduper_openai.model import OpenAIChatCompletion\n", |
450 | 455 | "from utils import Rag\n", |
451 | 456 | "\n", |
452 | 457 | "prompt_template = (\n", |
|
468 | 473 | ")" |
469 | 474 | ] |
470 | 475 | }, |
471 | | - { |
472 | | - "cell_type": "code", |
473 | | - "execution_count": null, |
474 | | - "id": "09e04c4c-c932-4358-ae2f-61cc482f0ff4", |
475 | | - "metadata": {}, |
476 | | - "outputs": [], |
477 | | - "source": [ |
478 | | - "from utils import Rag\n", |
479 | | - "\n", |
480 | | - "Rag.__module__" |
481 | | - ] |
482 | | - }, |
483 | 476 | { |
484 | 477 | "cell_type": "markdown", |
485 | 478 | "id": "fde11162-e994-4621-af36-b5fa9bc3f258", |
|
505 | 498 | " listener_chunk,\n", |
506 | 499 | " vector_index,\n", |
507 | 500 | " rag\n", |
508 | | - " ]\n", |
509 | | - " \n", |
| 501 | + " ],\n", |
| 502 | + " upstream=[Plugin(path=\"./utils.py\")],\n", |
510 | 503 | ")" |
511 | 504 | ] |
512 | 505 | }, |
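Reviewer note on the two `upstream` hunks: `Plugin(path="./utils.py")` is dropped from the `processor` component and re-declared once on the parent that stacks `listener_chunk`, `vector_index`, and `rag`, so the local `utils.py` (the module `Rag` is imported from) travels with the application as a whole rather than with a single child component.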
|