Skip to content

Commit 8881bd0

Browse files
aidenmitchell, claude, and Copilot
authored
Add domain exclusion step to Tranco workflow (#643)
Co-authored-by: Claude <[email protected]> Co-authored-by: Copilot <[email protected]>
1 parent 04272b2 commit 8881bd0

File tree

1 file changed

+88
-0
lines changed

1 file changed

+88
-0
lines changed

.github/workflows/update-and-process-tranco.yml

Lines changed: 88 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -179,6 +179,94 @@ jobs:
179179
exit 1
180180
fi
181181
182+
- name: Remove Public Suffix List entries
  id: remove_psl_entries
  run: |
    # Purpose: drop every row of tranco.csv whose domain is itself a rule in
    # the Public Suffix List (PSL), then print a short summary.
    #
    # Reads/writes (all in the current working directory):
    #   tranco.csv          - input, "rank,domain" rows; rewritten in place
    #   psl_raw.dat         - temporary: PSL exactly as downloaded
    #   psl.txt             - temporary: one plain PSL rule per line
    #   tranco_filtered.csv - temporary: filter output before the final mv
    #   removal_count.txt   - temporary: number of rows removed
    #
    # NOTE(review): rules are compared as raw strings. If the PSL publishes
    # internationalized suffixes in Unicode form while tranco.csv uses
    # punycode, those suffixes will never match - confirm whether that matters.
    echo "Fetching Public Suffix List..."

    PSL_URL="https://publicsuffix.org/list/public_suffix_list.dat"

    # Download to a file first, then filter it. When the download is the first
    # stage of a pipeline, only the last stage's exit status is tested unless
    # pipefail is active, so a failed download can go unnoticed by the check.
    # -f makes curl exit non-zero on HTTP errors instead of saving the error
    # page; -S still prints the error although -s silences the progress meter.
    #
    # The awk pass trims surrounding whitespace (and a stray CR) BEFORE
    # deciding whether a line is blank, a "//" comment, a wildcard ("*...")
    # or an exception ("!...") rule, so indented or whitespace-only lines
    # cannot leak into psl.txt.
    if ! {
      curl -fsSL -o psl_raw.dat "$PSL_URL" &&
        awk '{
          gsub(/^[ \t\r]+|[ \t\r]+$/, "")
          if ($0 == "" || $0 ~ /^\/\// || $0 ~ /^[*!]/) next
          print
        }' psl_raw.dat > psl.txt
    }; then
      echo "Error: Failed to download or process Public Suffix List"
      exit 1
    fi

    # Verify the PSL file has content
    if [ ! -s psl.txt ]; then
      echo "Error: Public Suffix List file is empty"
      exit 1
    fi

    PSL_COUNT=$(wc -l < psl.txt | tr -d ' ')
    echo "Loaded $PSL_COUNT public suffixes from PSL"

    echo ""
    echo "Removing PSL entries from tranco.csv..."

    # Normalize line endings by deleting every carriage return, so a trailing
    # "\r" on a domain cannot defeat the exact-match lookup below.
    tr -d '\r' < tranco.csv > tranco_normalized.csv
    mv tranco_normalized.csv tranco.csv

    # Single awk pass: load the PSL into an associative array, then test each
    # Tranco domain with an exact key lookup (no regex, so nothing to escape).
    awk 'BEGIN {
      # Read all PSL entries into an associative array
      while ((getline line < "psl.txt") > 0) {
        if (line != "") {
          psl[line] = 1
        }
      }
      close("psl.txt")
      removed = 0
    }
    {
      # The domain is everything after the first comma ("rank,domain")
      n = index($0, ",")
      if (n > 0) {
        domain = substr($0, n + 1)

        # Check if domain in PSL (exact string match)
        if (domain in psl) {
          removed++
          print "✓ Removed: " domain > "/dev/stderr"
        } else {
          # Keep this line
          print $0
        }
      } else {
        # No comma at all: malformed line, pass it through untouched
        print $0
      }
    }
    END {
      # Write count to a separate file so the shell can read it back
      print removed > "removal_count.txt"
    }' tranco.csv > tranco_filtered.csv

    # Read the removal count
    TOTAL_REMOVED=$(cat removal_count.txt)

    # Replace original file with filtered version
    mv tranco_filtered.csv tranco.csv

    # Clean up temporary files
    rm -f removal_count.txt psl.txt psl_raw.dat

    # Report final statistics
    FINAL_COUNT=$(wc -l < tranco.csv | tr -d ' ')
    echo ""
    echo "=== Summary ==="
    echo "PSL entries checked: $PSL_COUNT"
    echo "PSL entries found in Tranco: $TOTAL_REMOVED"
    echo "Total domains removed: $TOTAL_REMOVED"
    echo "Final line count: $FINAL_COUNT"
269+
182270
- name: Set configuration for top files
183271
id: set_config_top
184272
run: |

0 commit comments

Comments
 (0)