|
1 | 1 | # frozen_string_literal: true |
2 | | -require 'fuzzystringmatch' |
3 | 2 |
|
4 | 3 | # app/models/document.rb |
5 | 4 | class Document < ApplicationRecord |
@@ -93,99 +92,6 @@ def formatted_last_crawl_date |
93 | 92 | end |
94 | 93 |
|
95 | 94 | def retrieve_snippets(text_given) |
96 | | - text_to_scan = text_given || '' |
97 | | - jarow = FuzzyStringMatch::JaroWinkler.create(:native) |
98 | | - match_threshold = 0.85 |
99 | | - |
100 | | - quotes = [] |
101 | | - snippets = [] |
102 | | - points_with_quote_text_to_restore_in_doc = [] |
103 | | - points_no_longer_in_text = [] |
104 | | - |
105 | | - normalized_text = text_to_scan.gsub(/\s+/, ' ') |
106 | | - |
107 | | - points.each do |p| |
108 | | - next if p.status == 'declined' |
109 | | - next if p.quote_text.nil? || (p.quote_start.nil? && p.quote_end.nil?) |
110 | | - |
111 | | - normalized_quote = p.quote_text.gsub(/\s+/, ' ') |
112 | | - quote_start = normalized_text.index(normalized_quote) |
113 | | - |
114 | | - if quote_start.nil? |
115 | | - # Fuzzy match fallback |
116 | | - best_match_index = nil |
117 | | - best_match_score = 0.0 |
118 | | - best_candidate = nil |
119 | | - window_size = normalized_quote.length |
120 | | - |
121 | | - (0..(normalized_text.length - window_size)).each do |i| |
122 | | - candidate = normalized_text[i, window_size] |
123 | | - score = jarow.getDistance(normalized_quote, candidate) |
124 | | - |
125 | | - if score > best_match_score |
126 | | - best_match_score = score |
127 | | - best_match_index = i |
128 | | - best_candidate = candidate |
129 | | - end |
130 | | - end |
131 | | - |
132 | | - if best_match_score >= match_threshold |
133 | | - puts "🔍 Fuzzy match for Point ##{p.id} with score #{best_match_score.round(3)}" |
134 | | - puts "→ Original: #{p.quote_text[0..80].inspect}" |
135 | | - puts "→ Match: #{best_candidate[0..80].inspect}" |
136 | | - quote_start = best_match_index |
137 | | - else |
138 | | - puts "❌ No match for Point ##{p.id}" |
139 | | - if p.status != 'approved-not-found' && p.status != 'pending-not-found' |
140 | | - old_status = p.status |
141 | | - p.status = old_status == 'approved' ? 'approved-not-found' : 'pending-not-found' |
142 | | - p.save! |
143 | | - puts "⚠️ Point ##{p.id} status changed from #{old_status} to #{p.status}" |
144 | | - end |
145 | | - points_no_longer_in_text << p |
146 | | - next |
147 | | - end |
148 | | - end |
149 | | - |
150 | | - quote_end = quote_start + normalized_quote.length |
151 | | - quote_start_changed = p.quote_start != quote_start |
152 | | - quote_end_changed = p.quote_end != quote_end |
153 | | - |
154 | | - if p.status == 'approved-not-found' |
155 | | - puts "✅ Point ##{p.id} (#{p.title}) restored. Status updated to 'approved'." |
156 | | - p.status = 'approved' |
157 | | - p.quote_start = quote_start |
158 | | - p.quote_end = quote_end |
159 | | - p.save! |
160 | | - quotes << p |
161 | | - elsif !quote_start_changed && !quote_end_changed |
162 | | - quotes << p |
163 | | - else |
164 | | - puts "🛠️ Point ##{p.id} position mismatch. Old: #{p.quote_start}-#{p.quote_end}, New: #{quote_start}-#{quote_end}" |
165 | | - points_with_quote_text_to_restore_in_doc << p |
166 | | - end |
167 | | - end |
168 | | - |
169 | | - cursor = 0 |
170 | | - quotes.sort_by!(&:quote_start) |
171 | | - |
172 | | - quotes.each do |q| |
173 | | - next unless q.quote_start > cursor |
174 | | - |
175 | | - snippets << { text: text_to_scan[cursor, q.quote_start - cursor] } |
176 | | - snippets << { |
177 | | - pointId: q.id, |
178 | | - text: text_to_scan[q.quote_start, q.quote_end - q.quote_start], |
179 | | - title: q.title |
180 | | - } |
181 | | - cursor = q.quote_end |
182 | | - end |
183 | | - |
184 | | - snippets << { text: text_to_scan[cursor, text_to_scan.length - cursor] } |
185 | | - |
186 | | - { |
187 | | - snippets: snippets, |
188 | | - points_needing_restoration: points_with_quote_text_to_restore_in_doc |
189 | | - } |
| 95 | + SnippetRetriever.new(text: text_given, points: points).call |
190 | 96 | end |
191 | 97 | end |
0 commit comments