 import gc
 import json
 import dataclasses
+from datetime import timedelta
 from typing import Any, TypedDict
 
 from django.db import connection
 from django.db.models import QuerySet
+from django.utils import timezone
 
 from posthog.models.activity_logging.activity_log import ActivityLog, Change
 from posthog.models.utils import UUIDT
@@ -96,8 +98,49 @@ def _analyze_detail_fields_memory(self) -> DetailFieldsResult:
     def _get_org_record_count(self) -> int:
         return ActivityLog.objects.filter(organization_id=self.organization_id).count()
 
-    def process_batch_for_large_org(self, offset: int, limit: int) -> None:
-        batch_fields = self._process_batch_memory(offset, limit, use_sampling=True)
+    def get_activity_logs_queryset(self, hours_back: int | None = None) -> QuerySet:
+        """Get the base queryset for activity logs, optionally filtered by time."""
+        queryset = ActivityLog.objects.filter(organization_id=self.organization_id, detail__isnull=False)
+
+        if hours_back is not None:
+            cutoff_time = timezone.now() - timedelta(hours=hours_back)
+            queryset = queryset.filter(created_at__gte=cutoff_time)
+
+        return queryset
+
+    def get_sampled_records(self, limit: int, offset: int = 0) -> list[dict]:
+        """Get sampled records using SQL TABLESAMPLE for large datasets."""
+        query = f"""
+            SELECT scope, detail
+            FROM posthog_activitylog TABLESAMPLE SYSTEM ({SAMPLING_PERCENTAGE})
+            WHERE organization_id = %s
+            AND detail IS NOT NULL
+            ORDER BY created_at DESC
+            LIMIT %s OFFSET %s
+        """
+
+        with connection.cursor() as cursor:
+            cursor.execute(query, [str(self.organization_id), limit, offset])
+            records = []
+            for row in cursor.fetchall():
+                scope, detail = row
+                if isinstance(detail, str):
+                    try:
+                        detail = json.loads(detail)
+                    except (json.JSONDecodeError, TypeError):
+                        detail = None
+                records.append({"scope": scope, "detail": detail})
+            return records
+
+    def process_batch_for_large_org(self, records: list[dict], hours_back: int | None = None) -> None:
+        """Process a batch of records for large organizations.
+
+        Args:
+            records: List of activity log records to process
+            hours_back: If provided, used to get appropriate static filters for the time range
+        """
+        batch_fields = self._extract_fields_from_records(records)
         batch_converted = self._convert_to_discovery_format(batch_fields)
 
         existing_cache = get_cached_fields(str(self.organization_id))
@@ -108,11 +151,21 @@ def process_batch_for_large_org(self, offset: int, limit: int) -> None:
             current_detail_fields = {}
         self._merge_fields_into_result(current_detail_fields, batch_converted)
 
-        static_filters = (
-            existing_cache.get("static_filters")
-            if existing_cache
-            else self._get_static_filters(self._get_base_queryset())
-        )
+        # Static filters: for a time-bounded refresh, compute them from the
+        # recent window only and merge into the cached set; otherwise reuse
+        # the cache, falling back to the full base queryset when it is empty.
+        if hours_back is not None:
+            recent_queryset = self.get_activity_logs_queryset(hours_back=hours_back)
+            new_static_filters = self._get_static_filters(recent_queryset)
+
+            # Merge with existing static filters
+            if existing_cache and "static_filters" in existing_cache:
+                static_filters = self._merge_static_filters(existing_cache["static_filters"], new_static_filters)
+            else:
+                static_filters = new_static_filters
+        else:
+            if existing_cache and existing_cache.get("static_filters"):
+                static_filters = existing_cache["static_filters"]
+            else:
+                static_filters = self._get_static_filters(self._get_base_queryset())
 
         cache_data = {
             "static_filters": static_filters,
@@ -181,38 +234,8 @@ def _discover_fields_memory(
 
         return all_fields
 
-    def _process_batch_memory(
-        self, offset: int, limit: int, use_sampling: bool = True
-    ) -> dict[str, set[tuple[str, str]]]:
-        if use_sampling:
-            query = f"""
-                SELECT scope, detail
-                FROM posthog_activitylog TABLESAMPLE SYSTEM ({SAMPLING_PERCENTAGE})
-                WHERE organization_id = %s
-                AND detail IS NOT NULL
-                ORDER BY created_at DESC
-                LIMIT %s OFFSET %s
-            """
-
-            with connection.cursor() as cursor:
-                cursor.execute(query, [str(self.organization_id), limit, offset])
-                records = []
-                for row in cursor.fetchall():
-                    scope, detail = row
-                    if isinstance(detail, str):
-                        try:
-                            detail = json.loads(detail)
-                        except (json.JSONDecodeError, TypeError):
-                            detail = None
-                    records.append({"scope": scope, "detail": detail})
-        else:
-            records = [
-                {"scope": record["scope"], "detail": record["detail"]}
-                for record in ActivityLog.objects.filter(
-                    organization_id=self.organization_id, detail__isnull=False
-                ).values("scope", "detail")[offset : offset + limit]
-            ]
-
+    def _extract_fields_from_records(self, records: list[dict]) -> dict[str, set[tuple[str, str]]]:
+        """Extract field information from a list of activity log records."""
         batch_fields: dict[str, set[tuple[str, str]]] = {}
 
         for record in records:
@@ -231,6 +254,20 @@ def _process_batch_memory(
 
         return batch_fields
 
+    def _process_batch_memory(
+        self, offset: int, limit: int, use_sampling: bool = True
+    ) -> dict[str, set[tuple[str, str]]]:
+        """Legacy method kept for backward compatibility; delegates to the new helpers."""
+        if use_sampling:
+            records = self.get_sampled_records(limit, offset)
+        else:
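+            # Slicing the queryset below compiles to SQL LIMIT/OFFSET, matching
+            # the paging behaviour of the old implementation.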
+            records = [
+                {"scope": record["scope"], "detail": record["detail"]}
+                for record in self.get_activity_logs_queryset().values("scope", "detail")[offset : offset + limit]
+            ]
+
+        return self._extract_fields_from_records(records)
+
     def _extract_json_paths(self, obj: Any, prefix: str = "") -> set[tuple[str, str]]:
         paths = set()
 
@@ -304,3 +341,31 @@ def _convert_to_discovery_format(self, fields: dict[str, set[tuple[str, str]]])
             result.append((scope, field_path, sorted(types)))
 
         return result
+
+    def _merge_static_filters(self, existing: dict, new: dict) -> dict:
+        """Merge static filters additively, de-duplicating entries by their "value" key."""
+        merged = {
+            "users": list(existing.get("users", [])),
+            "scopes": list(existing.get("scopes", [])),
+            "activities": list(existing.get("activities", [])),
+        }
+
+        # Merge users (by uuid)
+        existing_user_ids = {u["value"] for u in merged["users"]}
+        for user in new.get("users", []):
+            if user["value"] not in existing_user_ids:
+                merged["users"].append(user)
+
+        # Merge scopes
+        existing_scopes = {s["value"] for s in merged["scopes"]}
+        for scope in new.get("scopes", []):
+            if scope["value"] not in existing_scopes:
+                merged["scopes"].append(scope)
+
+        # Merge activities
+        existing_activities = {a["value"] for a in merged["activities"]}
+        for activity in new.get("activities", []):
+            if activity["value"] not in existing_activities:
+                merged["activities"].append(activity)
+
+        return merged