@@ -179,6 +179,94 @@ jobs:
179179 exit 1
180180 fi
181181
- name: Remove Public Suffix List entries
  id: remove_psl_entries
  run: |
    # Purpose: drop every row of tranco.csv ("rank,domain") whose domain is an
    # exact entry of the Public Suffix List (PSL), then print a summary.
    # Inputs:  tranco.csv in the working directory; network access to
    #          publicsuffix.org.
    # Outputs: tranco.csv rewritten in place (carriage returns removed, PSL
    #          rows dropped); removed domains are logged to stderr.
    # Fails (exit 1) when the PSL cannot be downloaded or yields no entries.
    set -euo pipefail

    # Remove scratch files on every exit path, including failures.
    trap 'rm -f psl_raw.dat psl.txt removal_count.txt tranco_filtered.csv' EXIT

    echo "Fetching Public Suffix List..."

    # Download to a file instead of piping curl into the filters: in a
    # pipeline the `if` only sees the status of the LAST command unless the
    # shell runs with `pipefail`, so a failed download would go unnoticed.
    # -f turns HTTP 4xx/5xx responses into a non-zero exit status instead of
    # saving the error page as if it were the list.
    if ! curl -fsSL --retry 3 -o psl_raw.dat \
        https://publicsuffix.org/list/public_suffix_list.dat; then
      echo "Error: Failed to download or process Public Suffix List"
      exit 1
    fi

    # Normalize each line FIRST (strip carriage returns and leading/trailing
    # whitespace), THEN drop comments (//), empty lines, wildcard rules (*)
    # and exception rules (!). Filtering before trimming would let indented
    # comments or whitespace-only lines slip through.
    awk '{
      gsub(/^[ \t\r]+|[ \t\r]+$/, "")
      if ($0 == "" || $0 ~ /^\/\// || $0 ~ /^[*!]/) next
      print
    }' psl_raw.dat > psl.txt

    # Verify the PSL file has content
    if [ ! -s psl.txt ]; then
      echo "Error: Public Suffix List file is empty"
      exit 1
    fi

    PSL_COUNT=$(wc -l < psl.txt | tr -d ' ')
    echo "Loaded $PSL_COUNT public suffixes from PSL"

    echo ""
    echo "Removing PSL entries from tranco.csv..."

    # Single pass over tranco.csv: the PSL is loaded into an associative
    # array once, so each row costs one exact-string lookup (no regex
    # escaping issues, no per-suffix rescans of the file).
    awk 'BEGIN {
      # Read all PSL entries into an associative array
      while ((getline line < "psl.txt") > 0) {
        if (line != "") {
          psl[line] = 1
        }
      }
      close("psl.txt")
      removed = 0
    }
    {
      # Normalize line endings: remove all carriage returns so Unix (\n)
      # and Windows (\r\n) inputs are handled uniformly.
      gsub(/\r/, "")

      # Extract domain from "rank,domain" format (text after first comma)
      n = index($0, ",")
      if (n > 0) {
        domain = substr($0, n + 1)

        # Check if domain in PSL (exact string match)
        if (domain in psl) {
          removed++
          print "✓ Removed: " domain > "/dev/stderr"
        } else {
          # Keep this line
          print $0
        }
      } else {
        # Malformed line (no comma), keep it
        print $0
      }
    }
    END {
      # Write count to a separate file for easy extraction by the shell
      print removed > "removal_count.txt"
    }' tranco.csv > tranco_filtered.csv

    # Read the removal count
    TOTAL_REMOVED=$(cat removal_count.txt)

    # Replace original file with filtered version
    mv tranco_filtered.csv tranco.csv

    # Report final statistics (scratch files are removed by the EXIT trap)
    FINAL_COUNT=$(wc -l < tranco.csv | tr -d ' ')
    echo ""
    echo "=== Summary ==="
    echo "PSL entries checked: $PSL_COUNT"
    echo "PSL entries found in Tranco: $TOTAL_REMOVED"
    echo "Total domains removed: $TOTAL_REMOVED"
    echo "Final line count: $FINAL_COUNT"

182270 - name : Set configuration for top files
183271 id : set_config_top
184272 run : |
0 commit comments