Skip to content

Commit ee1105f

Browse files
committed
WIP ✨(backend) improve search filtering & ordering
Filter deleted documents from visited ones. Add ordering to the Find API search call. BaseDocumentIndexer.search now returns a list of document ids instead of models. Do not call the indexer in signals when SEARCH_INDEXER_CLASS is not defined or properly configured. Signed-off-by: Fabre Florian <[email protected]>
1 parent 3aff599 commit ee1105f

File tree

8 files changed

+212
-56
lines changed

8 files changed

+212
-56
lines changed

src/backend/core/api/serializers.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -827,3 +827,7 @@ class FindDocumentSerializer(serializers.Serializer):
827827
"""Serializer for Find search requests"""
828828

829829
q = serializers.CharField(required=True, allow_blank=False, trim_whitespace=True)
830+
page_size = serializers.IntegerField(
831+
required=False, min_value=1, max_value=50, default=20
832+
)
833+
page = serializers.IntegerField(required=False, min_value=1, default=1)

src/backend/core/api/viewsets.py

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,10 @@
4949
from core.services.converter_services import (
5050
YdocConverter,
5151
)
52-
from core.services.search_indexers import get_document_indexer_class
52+
from core.services.search_indexers import (
53+
get_document_indexer_class,
54+
get_visited_document_ids_of,
55+
)
5356
from core.tasks.mail import send_ask_for_access_mail
5457
from core.utils import extract_attachments, filter_descendants
5558

@@ -1042,23 +1045,37 @@ def search(self, request, *args, **kwargs):
10421045
The filtering allows full text search through the opensearch indexation app "find".
10431046
"""
10441047
access_token = request.session.get("oidc_access_token")
1048+
user = request.user
10451049

10461050
serializer = serializers.FindDocumentSerializer(data=request.query_params)
10471051
serializer.is_valid(raise_exception=True)
10481052

1053+
queryset = self.get_queryset()
1054+
10491055
try:
10501056
indexer = get_document_indexer_class()()
1051-
queryset = indexer.search(
1057+
ids = indexer.search(
10521058
text=serializer.validated_data.get("q", ""),
1053-
user=request.user,
1059+
user=user,
10541060
token=access_token,
1061+
visited=get_visited_document_ids_of(queryset, user),
1062+
page_number=serializer.validated_data.get("page"),
1063+
page_size=serializer.validated_data.get("page_size"),
10551064
)
10561065
except ImproperlyConfigured:
10571066
return drf.response.Response(
10581067
{"detail": "The service is not properly configured."},
1059-
status=status.HTTP_401_UNAUTHORIZED,
1068+
status=status.HTTP_500_INTERNAL_SERVER_ERROR,
10601069
)
10611070

1071+
queryset = queryset.filter(pk__in=ids)
1072+
queryset = queryset.annotate_user_roles(user)
1073+
1074+
# Apply ordering only now that everything is filtered and annotated
1075+
queryset = filters.OrderingFilter().filter_queryset(
1076+
self.request, queryset, self
1077+
)
1078+
10621079
return self.get_response_for_queryset(
10631080
queryset,
10641081
context={

src/backend/core/services/search_indexers.py

Lines changed: 29 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from django.conf import settings
99
from django.contrib.auth.models import AnonymousUser
1010
from django.core.exceptions import ImproperlyConfigured
11+
from django.db.models import Subquery
1112
from django.utils.module_loading import import_string
1213

1314
import requests
@@ -17,6 +18,17 @@
1718
logger = logging.getLogger(__name__)
1819

1920

21+
@cache
22+
def is_document_indexer_configured() -> bool:
23+
"""Returns true if the indexer service is enabled and properly configured."""
24+
try:
25+
get_document_indexer_class()
26+
except ImproperlyConfigured:
27+
return False
28+
29+
return True
30+
31+
2032
@cache
2133
def get_document_indexer_class() -> "BaseDocumentIndexer":
2234
"""Return the indexer backend class based on the settings."""
@@ -65,7 +77,7 @@ def get_batch_accesses_by_users_and_teams(paths):
6577
return dict(access_by_document_path)
6678

6779

68-
def get_visited_document_ids_of(user):
80+
def get_visited_document_ids_of(queryset, user):
6981
"""
7082
Returns the ids of the documents that have a linktrace to the user and NOT owned.
7183
It will be use to limit the opensearch responses to the public documents already
@@ -74,11 +86,18 @@ def get_visited_document_ids_of(user):
7486
if isinstance(user, AnonymousUser):
7587
return []
7688

77-
qs = models.LinkTrace.objects.filter(user=user).exclude(
78-
document__accesses__user=user,
89+
qs = models.LinkTrace.objects.filter(user=user)
90+
91+
docs = (
92+
queryset.exclude(accesses__user=user)
93+
.filter(
94+
deleted_at__isnull=True,
95+
ancestors_deleted_at__isnull=True,
96+
)
97+
.filter(pk__in=Subquery(qs.values("document_id")))
7998
)
8099

81-
return list({str(id) for id in qs.values_list("document_id", flat=True)})
100+
return list({str(id) for id in docs.values_list("pk", flat=True)})
82101

83102

84103
class BaseDocumentIndexer(ABC):
@@ -159,22 +178,23 @@ def push(self, data):
159178
Must be implemented by subclasses.
160179
"""
161180

162-
def search(self, text, user, token):
181+
# pylint: disable-next=too-many-arguments,too-many-positional-arguments
182+
def search(self, text, token, visited=(), page=1, page_size=50):
163183
"""
164184
Search for documents in Find app.
165185
"""
166-
visited_ids = get_visited_document_ids_of(user)
167-
168186
response = self.search_query(
169187
data={
170188
"q": text,
171-
"visited": visited_ids,
189+
"visited": visited,
172190
"services": ["docs"],
191+
"page_number": page,
192+
"page_size": page_size,
173193
},
174194
token=token,
175195
)
176196

177-
return self.format_response(response)
197+
return [d["_id"] for d in response]
178198

179199
@abstractmethod
180200
def search_query(self, data, token) -> dict:
@@ -184,14 +204,6 @@ def search_query(self, data, token) -> dict:
184204
Must be implemented by subclasses.
185205
"""
186206

187-
@abstractmethod
188-
def format_response(self, data: dict):
189-
"""
190-
Convert the JSON response from Find app as document queryset.
191-
192-
Must be implemented by subclasses.
193-
"""
194-
195207

196208
class FindDocumentIndexer(BaseDocumentIndexer):
197209
"""
@@ -253,12 +265,6 @@ def search_query(self, data, token) -> requests.Response:
253265
logger.error("HTTPError: %s", e)
254266
raise
255267

256-
def format_response(self, data: dict):
257-
"""
258-
Retrieve documents ids from Find app response and return a queryset.
259-
"""
260-
return models.Document.objects.filter(pk__in=[d["_id"] for d in data])
261-
262268
def push(self, data):
263269
"""
264270
Push a batch of documents to the Find backend.

src/backend/core/signals.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from django.dispatch import receiver
77

88
from . import models
9+
from .services.search_indexers import is_document_indexer_configured
910
from .tasks.find import trigger_document_indexer
1011

1112

@@ -16,13 +17,14 @@ def document_post_save(sender, instance, **kwargs): # pylint: disable=unused-ar
1617
Note : Within the transaction we can have an empty content and a serialization
1718
error.
1819
"""
19-
trigger_document_indexer(instance, on_commit=True)
20+
if is_document_indexer_configured():
21+
trigger_document_indexer(instance, on_commit=True)
2022

2123

2224
@receiver(signals.post_save, sender=models.DocumentAccess)
2325
def document_access_post_save(sender, instance, created, **kwargs): # pylint: disable=unused-argument
2426
"""
2527
Asynchronous call to the document indexer at the end of the transaction.
2628
"""
27-
if not created:
29+
if not created and is_document_indexer_configured():
2830
trigger_document_indexer(instance.document, on_commit=True)

src/backend/core/tasks/find.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -72,15 +72,14 @@ def trigger_document_indexer(document, on_commit=False):
7272
on_commit (bool): Wait for the end of the transaction before starting the task
7373
(some fields may be in wrong state within the transaction)
7474
"""
75-
76-
if document.deleted_at or document.ancestors_deleted_at:
77-
pass
78-
7975
if on_commit:
8076
transaction.on_commit(
8177
partial(trigger_document_indexer, document, on_commit=False)
8278
)
8379
else:
80+
if document.deleted_at or document.ancestors_deleted_at:
81+
return
82+
8483
key = document_indexer_debounce_key(document.pk)
8584
countdown = getattr(settings, "SEARCH_INDEXER_COUNTDOWN", 1)
8685

src/backend/core/tests/documents/test_api_documents_search.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,3 +140,6 @@ def test_api_documents_search_format(settings):
140140
"updated_at": document.updated_at.isoformat().replace("+00:00", "Z"),
141141
"user_role": access.role,
142142
}
143+
144+
145+
# TODO : add pagination tests

src/backend/core/tests/test_models_documents.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1460,6 +1460,59 @@ def test_models_documents_post_save_indexer(mock_push, settings):
14601460
assert cache.get(document_indexer_debounce_key(doc3.pk)) == 0
14611461

14621462

1463+
@mock.patch.object(FindDocumentIndexer, "push")
1464+
@pytest.mark.django_db(transaction=True)
1465+
def test_models_documents_post_save_indexer_deleted(mock_push, settings):
1466+
"""Skip indexation task on deleted or ancestor_deleted documents"""
1467+
settings.SEARCH_INDEXER_COUNTDOWN = 0
1468+
1469+
user = factories.UserFactory()
1470+
1471+
with transaction.atomic():
1472+
doc = factories.DocumentFactory()
1473+
doc_deleted = factories.DocumentFactory()
1474+
doc_ancestor_deleted = factories.DocumentFactory(parent=doc_deleted)
1475+
doc_deleted.soft_delete()
1476+
doc_ancestor_deleted.ancestors_deleted_at = doc_deleted.deleted_at
1477+
1478+
factories.UserDocumentAccessFactory(document=doc, user=user)
1479+
factories.UserDocumentAccessFactory(document=doc_deleted, user=user)
1480+
factories.UserDocumentAccessFactory(document=doc_ancestor_deleted, user=user)
1481+
1482+
doc_deleted.refresh_from_db()
1483+
doc_ancestor_deleted.refresh_from_db()
1484+
1485+
assert doc_deleted.deleted_at is not None
1486+
assert doc_deleted.ancestors_deleted_at is not None
1487+
1488+
assert doc_ancestor_deleted.deleted_at is None
1489+
assert doc_ancestor_deleted.ancestors_deleted_at is not None
1490+
1491+
time.sleep(0.2) # waits for the end of the tasks
1492+
1493+
accesses = {
1494+
str(doc.path): {"users": [user.sub]},
1495+
str(doc_deleted.path): {"users": [user.sub]},
1496+
str(doc_ancestor_deleted.path): {"users": [user.sub]},
1497+
}
1498+
1499+
data = [call.args[0] for call in mock_push.call_args_list]
1500+
1501+
indexer = FindDocumentIndexer()
1502+
1503+
# Only the not deleted document is indexed
1504+
assert data == [
1505+
indexer.serialize_document(doc, accesses),
1506+
]
1507+
1508+
# The debounce counters should be reset
1509+
assert cache.get(document_indexer_debounce_key(doc.pk)) == 0
1510+
1511+
# These caches are not filled
1512+
assert cache.get(document_indexer_debounce_key(doc_deleted.pk)) is None
1513+
assert cache.get(document_indexer_debounce_key(doc_ancestor_deleted.pk)) is None
1514+
1515+
14631516
@pytest.mark.django_db(transaction=True)
14641517
def test_models_documents_post_save_indexer_debounce(settings):
14651518
"""Test indexation task skipping on document update"""

0 commit comments

Comments
 (0)