Skip to content

Commit a77c235

Browse files
authored
Merge pull request #64 from TogetherCrew/fix/63-ingestion-pipline-remove-caching
Fix/63 ingestion pipeline remove caching
2 parents: d61e8d4 + 754d262 · commit a77c235

File tree

8 files changed

+11
-7
lines changed

8 files changed

+11
-7
lines changed

.github/workflows/production.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,6 @@ on:
99

1010
jobs:
1111
ci:
12-
uses: TogetherCrew/operations/.github/workflows/ci.yml@main
12+
uses: TogetherCrew/operations/.github/workflows/ci2.yml@main
1313
secrets:
1414
CC_TEST_REPORTER_ID: ${{ secrets.CC_TEST_REPORTER_ID }}

.github/workflows/start.staging.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,6 @@ on: pull_request
66

77
jobs:
88
ci:
9-
uses: TogetherCrew/operations/.github/workflows/ci.yml@main
9+
uses: TogetherCrew/operations/.github/workflows/ci2.yml@main
1010
secrets:
1111
CC_TEST_REPORTER_ID: ${{ secrets.CC_TEST_REPORTER_ID }}

hivemind_etl/mediawiki/etl.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ def transform(self) -> list[Document]:
9999
def load(self, documents: list[Document]) -> None:
100100
logging.info(f"Loading {len(documents)} documents into Qdrant!")
101101
ingestion_pipeline = CustomIngestionPipeline(
102-
self.community_id, collection_name=self.platform_id
102+
self.community_id, collection_name=self.platform_id, use_cache=False,
103103
)
104104

105105
# Process batches in parallel using ThreadPoolExecutor

hivemind_etl/simple_ingestion/pipeline.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ async def process_document(
147147
pipeline = CustomIngestionPipeline(
148148
community_id=ingestion_request.communityId,
149149
collection_name=collection_name,
150+
use_cache=False,
150151
)
151152

152153
document = Document(
@@ -188,6 +189,7 @@ async def process_documents_batch(
188189
pipeline = CustomIngestionPipeline(
189190
community_id=batch_chunk.communityId,
190191
collection_name=collection_name,
192+
use_cache=False,
191193
)
192194

193195
# Convert all documents in this chunk to Document objects

hivemind_etl/website/website_etl.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def __init__(
3030

3131
# preparing the ingestion pipeline
3232
self.ingestion_pipeline = CustomIngestionPipeline(
33-
self.community_id, collection_name=self.platform_id
33+
self.community_id, collection_name=self.platform_id, use_cache=False,
3434
)
3535

3636
async def extract(

hivemind_summarizer/activities.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ async def fetch_platform_summaries_by_date(
9797
pipeline = CustomIngestionPipeline(
9898
community_id=community_id,
9999
collection_name=f"{input.platform_id}_summary",
100+
use_cache=False,
100101
)
101102
# get the latest date from the collection
102103
latest_date = pipeline.get_latest_document_date(
@@ -211,6 +212,7 @@ async def fetch_platform_summaries_by_date_range(
211212
extract_text_only=extract_text_only,
212213
platform_id=input.platform_id,
213214
community_id=community_id,
215+
use_cache=False,
214216
)
215217
summaries = await fetch_platform_summaries_by_date(date_input)
216218
result[date] = summaries

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
python-dotenv>=1.0.0, <2.0.0
2-
tc-hivemind-backend==1.4.3
2+
tc-hivemind-backend==1.4.6
33
llama-index-storage-docstore-redis==0.1.2
44
llama-index-storage-docstore-mongodb==0.1.3
55
crawlee[playwright]==0.3.8

tests/unit/test_mediawiki_etl.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ def test_load_with_dump_deletion(self, mock_ingestion_pipeline_class):
164164

165165
# Verify that methods were called correctly
166166
mock_ingestion_pipeline_class.assert_called_once_with(
167-
self.community_id, collection_name=self.platform_id
167+
self.community_id, collection_name=self.platform_id, use_cache=False
168168
)
169169
mock_pipeline.run_pipeline.assert_called_once_with(documents)
170170
self.assertFalse(os.path.exists(etl.dump_dir))
@@ -192,7 +192,7 @@ def test_load_without_dump_deletion(self, mock_ingestion_pipeline_class):
192192

193193
# Verify that methods were called correctly
194194
mock_ingestion_pipeline_class.assert_called_once_with(
195-
self.community_id, collection_name=self.platform_id
195+
self.community_id, collection_name=self.platform_id, use_cache=False
196196
)
197197
mock_pipeline.run_pipeline.assert_called_once_with(documents)
198198
self.assertTrue(os.path.exists(etl.dump_dir))

0 commit comments

Comments (0)