Skip to content

Commit dd0a056

Browse files
committed
New function util.paraphrase_mining_embeddings to find most similar embeddings in a matrix
1 parent b1ef004 commit dd0a056

File tree

1 file changed

+25
-7
lines changed

1 file changed

+25
-7
lines changed

sentence_transformers/util.py

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -73,11 +73,7 @@ def paraphrase_mining(model,
7373
sentences: List[str],
7474
show_progress_bar: bool = False,
7575
batch_size:int = 32,
76-
query_chunk_size: int = 5000,
77-
corpus_chunk_size: int = 100000,
78-
max_pairs: int = 500000,
79-
top_k: int = 100,
80-
score_function: Callable[[Tensor, Tensor], Tensor] = cos_sim):
76+
**kwargs):
8177
"""
8278
Given a list of sentences / texts, this function performs paraphrase mining. It compares all sentences against all
8379
other sentences and returns a list with the pairs that have the highest cosine similarity score.
@@ -94,11 +90,33 @@ def paraphrase_mining(model,
9490
:return: Returns a list of triplets with the format [score, id1, id2]
9591
"""
9692

97-
top_k += 1 #A sentence has the highest similarity to itself. Increase +1 as we are interest in distinct pairs
98-
9993
# Compute embedding for the sentences
10094
embeddings = model.encode(sentences, show_progress_bar=show_progress_bar, batch_size=batch_size, convert_to_tensor=True)
10195

96+
return paraphrase_mining_embeddings(embeddings, **kwargs)
97+
98+
99+
def paraphrase_mining_embeddings(embeddings: Tensor,
100+
query_chunk_size: int = 5000,
101+
corpus_chunk_size: int = 100000,
102+
max_pairs: int = 500000,
103+
top_k: int = 100,
104+
score_function: Callable[[Tensor, Tensor], Tensor] = cos_sim):
105+
"""
106+
Given a tensor of sentence embeddings, this function performs paraphrase mining. It compares all embeddings against all
106+
other embeddings and returns a list with the pairs that have the highest similarity score (cosine similarity by default).
108+
109+
:param embeddings: A tensor with the embeddings
110+
:param query_chunk_size: Search for most similar pairs for #query_chunk_size at the same time. Decrease, to lower memory footprint (increases run-time).
111+
:param corpus_chunk_size: Compare a sentence simultaneously against #corpus_chunk_size other sentences. Decrease, to lower memory footprint (increases run-time).
112+
:param max_pairs: Maximal number of text pairs returned.
113+
:param top_k: For each sentence, we retrieve up to top_k other sentences
114+
:param score_function: Function for computing scores. By default, cosine similarity.
115+
:return: Returns a list of triplets with the format [score, id1, id2]
116+
"""
117+
118+
top_k += 1 # A sentence has the highest similarity to itself. Increase by 1 as we are interested in distinct pairs
119+
102120
# Mine for duplicates
103121
pairs = queue.PriorityQueue()
104122
min_score = -1

0 commit comments

Comments
 (0)