Skip to content

Commit f4cd097

Browse files
committed
extracts snippets logic into service object
1 parent b8a1ba1 commit f4cd097

File tree

2 files changed

+120
-95
lines changed

2 files changed

+120
-95
lines changed

app/models/document.rb

Lines changed: 1 addition & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
# frozen_string_literal: true
2-
require 'fuzzystringmatch'
32

43
# app/models/document.rb
54
class Document < ApplicationRecord
@@ -93,99 +92,6 @@ def formatted_last_crawl_date
9392
end
9493

9594
def retrieve_snippets(text_given)
96-
text_to_scan = text_given || ''
97-
jarow = FuzzyStringMatch::JaroWinkler.create(:native)
98-
match_threshold = 0.85
99-
100-
quotes = []
101-
snippets = []
102-
points_with_quote_text_to_restore_in_doc = []
103-
points_no_longer_in_text = []
104-
105-
normalized_text = text_to_scan.gsub(/\s+/, ' ')
106-
107-
points.each do |p|
108-
next if p.status == 'declined'
109-
next if p.quote_text.nil? || (p.quote_start.nil? && p.quote_end.nil?)
110-
111-
normalized_quote = p.quote_text.gsub(/\s+/, ' ')
112-
quote_start = normalized_text.index(normalized_quote)
113-
114-
if quote_start.nil?
115-
# Fuzzy match fallback
116-
best_match_index = nil
117-
best_match_score = 0.0
118-
best_candidate = nil
119-
window_size = normalized_quote.length
120-
121-
(0..(normalized_text.length - window_size)).each do |i|
122-
candidate = normalized_text[i, window_size]
123-
score = jarow.getDistance(normalized_quote, candidate)
124-
125-
if score > best_match_score
126-
best_match_score = score
127-
best_match_index = i
128-
best_candidate = candidate
129-
end
130-
end
131-
132-
if best_match_score >= match_threshold
133-
puts "🔍 Fuzzy match for Point ##{p.id} with score #{best_match_score.round(3)}"
134-
puts "→ Original: #{p.quote_text[0..80].inspect}"
135-
puts "→ Match: #{best_candidate[0..80].inspect}"
136-
quote_start = best_match_index
137-
else
138-
puts "❌ No match for Point ##{p.id}"
139-
if p.status != 'approved-not-found' && p.status != 'pending-not-found'
140-
old_status = p.status
141-
p.status = old_status == 'approved' ? 'approved-not-found' : 'pending-not-found'
142-
p.save!
143-
puts "⚠️ Point ##{p.id} status changed from #{old_status} to #{p.status}"
144-
end
145-
points_no_longer_in_text << p
146-
next
147-
end
148-
end
149-
150-
quote_end = quote_start + normalized_quote.length
151-
quote_start_changed = p.quote_start != quote_start
152-
quote_end_changed = p.quote_end != quote_end
153-
154-
if p.status == 'approved-not-found'
155-
puts "✅ Point ##{p.id} (#{p.title}) restored. Status updated to 'approved'."
156-
p.status = 'approved'
157-
p.quote_start = quote_start
158-
p.quote_end = quote_end
159-
p.save!
160-
quotes << p
161-
elsif !quote_start_changed && !quote_end_changed
162-
quotes << p
163-
else
164-
puts "🛠️ Point ##{p.id} position mismatch. Old: #{p.quote_start}-#{p.quote_end}, New: #{quote_start}-#{quote_end}"
165-
points_with_quote_text_to_restore_in_doc << p
166-
end
167-
end
168-
169-
cursor = 0
170-
quotes.sort_by!(&:quote_start)
171-
172-
quotes.each do |q|
173-
next unless q.quote_start > cursor
174-
175-
snippets << { text: text_to_scan[cursor, q.quote_start - cursor] }
176-
snippets << {
177-
pointId: q.id,
178-
text: text_to_scan[q.quote_start, q.quote_end - q.quote_start],
179-
title: q.title
180-
}
181-
cursor = q.quote_end
182-
end
183-
184-
snippets << { text: text_to_scan[cursor, text_to_scan.length - cursor] }
185-
186-
{
187-
snippets: snippets,
188-
points_needing_restoration: points_with_quote_text_to_restore_in_doc
189-
}
95+
SnippetRetriever.new(text: text_given, points: points).call
19096
end
19197
end

app/services/snippet_retriever.rb

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
# frozen_string_literal: true
2+
3+
require 'fuzzystringmatch'
4+
5+
# Splits a document's text into plain and point-annotated snippets.
#
# For every point that is not declined and carries a quote, the quote is
# located in the whitespace-normalized text (exact match first, Jaro-Winkler
# fuzzy match as a fallback). Points whose quote cannot be found are moved to
# a "*-not-found" status; points found at a different offset than the one
# stored are reported back to the caller instead of being rendered.
#
# NOTE(review): offsets are computed against the whitespace-normalized text
# but are later used to slice the raw text. The two only agree when the raw
# text contains no whitespace runs that normalization collapses - confirm
# whether callers pre-normalize the text.
class SnippetRetriever
  # Minimum Jaro-Winkler score for a fuzzy candidate to be accepted.
  MATCH_THRESHOLD = 0.85

  # Statuses that already mark a point's quote as missing from the text.
  NOT_FOUND_STATUSES = %w[approved-not-found pending-not-found].freeze

  # @param text [String, nil] document text to scan; nil is treated as ''
  # @param points [Enumerable] objects responding to id, title, status,
  #   quote_text, quote_start, quote_end and save!
  def initialize(text:, points:)
    @text = text || ''
    @points = points
    @normalized_text = @text.gsub(/\s+/, ' ')
  end

  # Runs the retrieval. May persist status/offset changes on points.
  #
  # @return [Hash] :snippets is an ordered array of hashes ({ text: } for
  #   plain text, { pointId:, text:, title: } for quotes);
  #   :points_needing_restoration lists points found at a different offset
  #   than the stored one.
  def call
    quotes = []
    points_needing_restoration = []

    @points.each do |point|
      next if skippable?(point)

      normalized_quote = point.quote_text.gsub(/\s+/, ' ')
      quote_start = @normalized_text.index(normalized_quote) || fuzzy_match(normalized_quote)

      if quote_start.nil?
        handle_not_found(point)
        next
      end

      quote_end = quote_start + normalized_quote.length

      if point.status == 'approved-not-found'
        restore(point, quote_start, quote_end)
        quotes << point
      elsif point.quote_start == quote_start && point.quote_end == quote_end
        quotes << point
      else
        puts "🛠️ Point ##{point.id} position mismatch. Old: #{point.quote_start}-#{point.quote_end}, New: #{quote_start}-#{quote_end}"
        points_needing_restoration << point
      end
    end

    {
      snippets: extract_snippets(quotes),
      points_needing_restoration: points_needing_restoration
    }
  end

  private

  # Declined points and points without a quote or without any stored offset
  # are ignored entirely.
  def skippable?(point)
    return true if point.status == 'declined'

    point.quote_text.nil? || (point.quote_start.nil? && point.quote_end.nil?)
  end

  # Built on first use so that runs where every quote matches exactly never
  # construct the matcher.
  def jarow
    @jarow ||= FuzzyStringMatch::JaroWinkler.create(:native)
  end

  # Slides a window of the quote's length over the normalized text and
  # returns the start index of the best-scoring window, or nil when no window
  # reaches MATCH_THRESHOLD. On ties the earliest window wins.
  def fuzzy_match(quote)
    best_match_index = nil
    best_score = 0.0
    window_size = quote.length

    # The range is empty (no iterations) when the quote is longer than the text.
    (0..(@normalized_text.length - window_size)).each do |i|
      score = jarow.getDistance(quote, @normalized_text[i, window_size])
      next unless score > best_score

      best_score = score
      best_match_index = i
    end

    return nil if best_score < MATCH_THRESHOLD

    puts "🔍 Fuzzy match accepted with score #{best_score.round(3)}"
    best_match_index
  end

  # Marks a previously missing approved point as approved again and stores
  # the offsets at which its quote was found.
  def restore(point, quote_start, quote_end)
    puts "✅ Point ##{point.id} restored. Status updated to 'approved'."
    point.status = 'approved'
    point.quote_start = quote_start
    point.quote_end = quote_end
    point.save!
  end

  # Moves the point to the matching "*-not-found" status. Points already in
  # a not-found status are left alone so repeated runs do not re-save them.
  def handle_not_found(point)
    puts "❌ No match for Point ##{point.id} (#{point.title})"
    return if NOT_FOUND_STATUSES.include?(point.status)

    point.status = point.status == 'approved' ? 'approved-not-found' : 'pending-not-found'
    point.save!
  end

  # Builds the ordered snippet list from the accepted quotes.
  def extract_snippets(quotes)
    snippets = []
    cursor = 0

    quotes.sort_by(&:quote_start).each do |q|
      # Skip quotes overlapping an already emitted one, and empty quotes.
      # A quote starting exactly at the cursor (offset 0, or directly after
      # the previous quote) is kept.
      next if q.quote_start < cursor || q.quote_end <= q.quote_start

      # Only emit the plain-text gap when there is one.
      snippets << { text: @text[cursor, q.quote_start - cursor] } if q.quote_start > cursor
      snippets << {
        pointId: q.id,
        text: @text[q.quote_start, q.quote_end - q.quote_start],
        title: q.title
      }

      cursor = q.quote_end
    end

    # The remainder is always emitted, even when empty, so the result is
    # never an empty list.
    snippets << { text: @text[cursor, @text.length - cursor] }
    snippets
  end
end

0 commit comments

Comments
 (0)