Skip to content

Commit a77c235

Browse files
authored
Merge pull request #64 from TogetherCrew/fix/63-ingestion-pipline-remove-caching
Fix/63 ingestion pipeline remove caching
2 parents: d61e8d4 + 754d262 · commit a77c235

File tree

8 files changed

+11
-7
lines changed

8 files changed

+11
-7
lines changed

.github/workflows/production.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,6 @@ on:
99

1010
jobs:
1111
ci:
12-
uses: TogetherCrew/operations/.github/workflows/ci.yml@main
12+
uses: TogetherCrew/operations/.github/workflows/ci2.yml@main
1313
secrets:
1414
CC_TEST_REPORTER_ID: ${{ secrets.CC_TEST_REPORTER_ID }}

.github/workflows/start.staging.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,6 @@ on: pull_request
66

77
jobs:
88
ci:
9-
uses: TogetherCrew/operations/.github/workflows/ci.yml@main
9+
uses: TogetherCrew/operations/.github/workflows/ci2.yml@main
1010
secrets:
1111
CC_TEST_REPORTER_ID: ${{ secrets.CC_TEST_REPORTER_ID }}

hivemind_etl/mediawiki/etl.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ def transform(self) -> list[Document]:
9999
def load(self, documents: list[Document]) -> None:
100100
logging.info(f"Loading {len(documents)} documents into Qdrant!")
101101
ingestion_pipeline = CustomIngestionPipeline(
102-
self.community_id, collection_name=self.platform_id
102+
self.community_id, collection_name=self.platform_id, use_cache=False,
103103
)
104104

105105
# Process batches in parallel using ThreadPoolExecutor

hivemind_etl/simple_ingestion/pipeline.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ async def process_document(
147147
pipeline = CustomIngestionPipeline(
148148
community_id=ingestion_request.communityId,
149149
collection_name=collection_name,
150+
use_cache=False,
150151
)
151152

152153
document = Document(
@@ -188,6 +189,7 @@ async def process_documents_batch(
188189
pipeline = CustomIngestionPipeline(
189190
community_id=batch_chunk.communityId,
190191
collection_name=collection_name,
192+
use_cache=False,
191193
)
192194

193195
# Convert all documents in this chunk to Document objects

hivemind_etl/website/website_etl.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def __init__(
3030

3131
# preparing the ingestion pipeline
3232
self.ingestion_pipeline = CustomIngestionPipeline(
33-
self.community_id, collection_name=self.platform_id
33+
self.community_id, collection_name=self.platform_id, use_cache=False,
3434
)
3535

3636
async def extract(

hivemind_summarizer/activities.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ async def fetch_platform_summaries_by_date(
9797
pipeline = CustomIngestionPipeline(
9898
community_id=community_id,
9999
collection_name=f"{input.platform_id}_summary",
100+
use_cache=False,
100101
)
101102
# get the latest date from the collection
102103
latest_date = pipeline.get_latest_document_date(
@@ -211,6 +212,7 @@ async def fetch_platform_summaries_by_date_range(
211212
extract_text_only=extract_text_only,
212213
platform_id=input.platform_id,
213214
community_id=community_id,
215+
use_cache=False,
214216
)
215217
summaries = await fetch_platform_summaries_by_date(date_input)
216218
result[date] = summaries

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
python-dotenv>=1.0.0, <2.0.0
2-
tc-hivemind-backend==1.4.3
2+
tc-hivemind-backend==1.4.6
33
llama-index-storage-docstore-redis==0.1.2
44
llama-index-storage-docstore-mongodb==0.1.3
55
crawlee[playwright]==0.3.8

tests/unit/test_mediawiki_etl.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ def test_load_with_dump_deletion(self, mock_ingestion_pipeline_class):
164164

165165
# Verify that methods were called correctly
166166
mock_ingestion_pipeline_class.assert_called_once_with(
167-
self.community_id, collection_name=self.platform_id
167+
self.community_id, collection_name=self.platform_id, use_cache=False
168168
)
169169
mock_pipeline.run_pipeline.assert_called_once_with(documents)
170170
self.assertFalse(os.path.exists(etl.dump_dir))
@@ -192,7 +192,7 @@ def test_load_without_dump_deletion(self, mock_ingestion_pipeline_class):
192192

193193
# Verify that methods were called correctly
194194
mock_ingestion_pipeline_class.assert_called_once_with(
195-
self.community_id, collection_name=self.platform_id
195+
self.community_id, collection_name=self.platform_id, use_cache=False
196196
)
197197
mock_pipeline.run_pipeline.assert_called_once_with(documents)
198198
self.assertTrue(os.path.exists(etl.dump_dir))

0 commit comments

Comments (0)