Skip to content

Commit 4d07bb1

Browse files
authored
Merge pull request #31 from TogetherCrew/feat/29-fetch-summaries
feat: updated to get latest summary docs!
2 parents 73808cc + ab0b4c0 commit 4d07bb1

File tree

4 files changed

+73
-41
lines changed

4 files changed

+73
-41
lines changed

hivemind_summarizer/activities.py

Lines changed: 58 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,11 @@
66
from bson import ObjectId
77
from tc_hivemind_backend.db.qdrant import QdrantSingleton
88
from tc_hivemind_backend.db.mongo import MongoSingleton
9+
from tc_hivemind_backend.ingest_qdrant import CustomIngestionPipeline
910

1011
from temporalio import activity, workflow
1112
from qdrant_client.models import Filter, FieldCondition, MatchValue
13+
from qdrant_client.http import models
1214

1315
with workflow.unsafe.imports_passed_through():
1416
from hivemind_summarizer.schema import (
@@ -40,7 +42,7 @@ def extract_summary_text(node_content: dict[str, Any]) -> str:
4042

4143

4244
@activity.defn
43-
async def get_collection_name(input: TelegramGetCollectionNameInput) -> str:
45+
async def get_platform_name(input: TelegramGetCollectionNameInput) -> str:
4446
"""
4547
Activity that extracts the platform name from MongoDB based on platform_id and community_id.
4648
@@ -52,7 +54,7 @@ async def get_collection_name(input: TelegramGetCollectionNameInput) -> str:
5254
Returns
5355
-------
5456
str
55-
The collection name in format [communityId]_[platformName]_summary
57+
The platform name
5658
5759
Raises
5860
------
@@ -83,11 +85,7 @@ async def get_collection_name(input: TelegramGetCollectionNameInput) -> str:
8385
if not platform_name:
8486
raise Exception(f"Platform name not found for platform_id {platform_id}")
8587

86-
# Construct collection name
87-
collection_name = f"{community_id}_{platform_name}_summary"
88-
89-
logging.info(f"Generated collection name: {collection_name}")
90-
return collection_name
88+
return platform_name
9189

9290
except Exception as e:
9391
logging.error(f"Error getting collection name: {str(e)}")
@@ -113,11 +111,13 @@ async def fetch_telegram_summaries_by_date(
113111
"""
114112
date = input.date
115113
extract_text_only = input.extract_text_only
116-
collection_name = input.collection_name
114+
collection_name = f"{input.community_id}_{input.platform_name}_summary"
115+
community_id = input.community_id
117116

118117
logging.info("Started fetch_telegram_summaries_by_date!")
119-
if not collection_name:
120-
raise ValueError("Collection name is required but was not provided")
118+
119+
if not input.platform_name:
120+
raise ValueError("Platform name is required but was not provided")
121121

122122
logging.info(
123123
f"Fetching summaries for date: {date} from collection: {collection_name}"
@@ -128,19 +128,46 @@ async def fetch_telegram_summaries_by_date(
128128
qdrant_client = QdrantSingleton.get_instance().get_client()
129129

130130
# Create filter for the specified date
131-
filter_conditions = [FieldCondition(key="date", match=MatchValue(value=date))]
132-
133-
date_filter = Filter(must=filter_conditions)
134-
135-
# Query Qdrant for all summaries matching the date using the provided collection name
136-
search_results = qdrant_client.search(
137-
collection_name=collection_name,
138-
query_vector=[0] * 1024,
139-
query_filter=date_filter,
140-
limit=100,
141-
with_payload=True,
142-
with_vectors=False,
143-
)
131+
if date is not None:
132+
filter_conditions = [
133+
FieldCondition(key="date", match=MatchValue(value=date))
134+
]
135+
date_filter = Filter(must=filter_conditions)
136+
137+
# Query Qdrant for all summaries matching the date, using the collection name built from community_id and platform_name
138+
search_results = qdrant_client.search(
139+
collection_name=collection_name,
140+
query_vector=[0] * 1024,
141+
query_filter=date_filter,
142+
limit=100,
143+
with_payload=True,
144+
with_vectors=False,
145+
)
146+
else:
147+
# pipeline requires a different format for the collection name
148+
pipeline = CustomIngestionPipeline(
149+
community_id=community_id,
150+
collection_name=f"{input.platform_name}_summary",
151+
)
152+
# get the latest date from the collection
153+
latest_date = pipeline.get_latest_document_date(
154+
field_name="date", field_schema=models.PayloadSchemaType.DATETIME
155+
)
156+
157+
filter_conditions = [
158+
FieldCondition(
159+
key="date", match=MatchValue(value=latest_date.strftime("%Y-%m-%d"))
160+
)
161+
]
162+
date_filter = Filter(must=filter_conditions)
163+
search_results = qdrant_client.search(
164+
collection_name=collection_name,
165+
query_vector=[0] * 1024,
166+
query_filter=date_filter,
167+
limit=100,
168+
with_payload=True,
169+
with_vectors=False,
170+
)
144171

145172
summaries = []
146173
for point in search_results:
@@ -189,7 +216,7 @@ async def fetch_telegram_summaries_by_date_range(
189216
Parameters
190217
----------
191218
input : TelegramSummariesRangeActivityInput
192-
Input object containing start_date, end_date, collection_name and extract_text_only
219+
Input object containing start_date, end_date, extract_text_only, platform_name and community_id
193220
194221
Returns
195222
-------
@@ -199,15 +226,15 @@ async def fetch_telegram_summaries_by_date_range(
199226
Raises
200227
------
201228
ValueError
202-
If end_date is before start_date or collection_name is not provided
229+
If end_date is before start_date or platform_name is not provided
203230
"""
204231
start_date = input.start_date
205232
end_date = input.end_date
206233
extract_text_only = input.extract_text_only
207-
collection_name = input.collection_name
208-
209-
if not collection_name:
210-
raise ValueError("Collection name is required but was not provided")
234+
platform_name = input.platform_name
235+
community_id = input.community_id
236+
if not platform_name:
237+
raise ValueError("Platform name is required but was not provided")
211238

212239
logging.info(
213240
f"Fetching summaries for date range: {start_date} to {end_date} from collection: {collection_name}"
@@ -235,7 +262,8 @@ async def fetch_telegram_summaries_by_date_range(
235262
date_input = TelegramSummariesActivityInput(
236263
date=date,
237264
extract_text_only=extract_text_only,
238-
collection_name=collection_name,
265+
platform_name=input.platform_name,
266+
community_id=community_id,
239267
)
240268
summaries = await fetch_telegram_summaries_by_date(date_input)
241269
result[date] = summaries

hivemind_summarizer/schema.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,18 @@
22

33

44
class TelegramSummariesActivityInput(BaseModel):
5-
date: str
5+
date: str | None = None
66
extract_text_only: bool = True
7-
collection_name: str | None = None
7+
platform_name: str | None = None
8+
community_id: str | None = None
89

910

1011
class TelegramSummariesRangeActivityInput(BaseModel):
1112
start_date: str
1213
end_date: str
1314
extract_text_only: bool = True
14-
collection_name: str | None = None
15+
platform_name: str | None = None
16+
community_id: str | None = None
1517

1618

1719
class TelegramGetCollectionNameInput(BaseModel):
@@ -22,6 +24,6 @@ class TelegramGetCollectionNameInput(BaseModel):
2224
class TelegramFetchSummariesWorkflowInput(BaseModel):
2325
platform_id: str
2426
community_id: str
25-
start_date: str
27+
start_date: str | None = None
2628
end_date: str | None = None
2729
extract_text_only: bool = True

hivemind_summarizer/workflows.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from .activities import (
1010
fetch_telegram_summaries_by_date,
1111
fetch_telegram_summaries_by_date_range,
12-
get_collection_name,
12+
get_platform_name,
1313
)
1414
from .schema import (
1515
TelegramSummariesActivityInput,
@@ -54,8 +54,8 @@ async def run(
5454

5555
logging.info("Getting collection name!")
5656
# First, get the platform name
57-
collection_name = await workflow.execute_activity(
58-
get_collection_name,
57+
platform_name = await workflow.execute_activity(
58+
get_platform_name,
5959
TelegramGetCollectionNameInput(
6060
platform_id=input.platform_id, community_id=input.community_id
6161
),
@@ -70,7 +70,8 @@ async def run(
7070
fetch_telegram_summaries_by_date,
7171
TelegramSummariesActivityInput(
7272
date=input.start_date,
73-
collection_name=collection_name,
73+
platform_name=platform_name,
74+
community_id=input.community_id,
7475
extract_text_only=input.extract_text_only,
7576
),
7677
schedule_to_close_timeout=timedelta(minutes=2),
@@ -84,7 +85,8 @@ async def run(
8485
TelegramSummariesRangeActivityInput(
8586
start_date=input.start_date,
8687
end_date=input.end_date,
87-
collection_name=collection_name,
88+
platform_name=platform_name,
89+
community_id=input.community_id,
8890
extract_text_only=input.extract_text_only,
8991
),
9092
schedule_to_close_timeout=timedelta(minutes=2),

registry.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from hivemind_summarizer.activities import (
1313
fetch_telegram_summaries_by_date,
1414
fetch_telegram_summaries_by_date_range,
15-
get_collection_name,
15+
get_platform_name,
1616
)
1717
from workflows import (
1818
CommunityWebsiteWorkflow,
@@ -42,5 +42,5 @@
4242
say_hello,
4343
fetch_telegram_summaries_by_date,
4444
fetch_telegram_summaries_by_date_range,
45-
get_collection_name,
45+
get_platform_name,
4646
]

0 commit comments

Comments
 (0)