Skip to content

Commit f502f63

Browse files
Add workflow to check for broken links
Signed-off-by: Alexandr Demicev <[email protected]>
1 parent 6483100 commit f502f63

File tree

2 files changed

+212
-0
lines changed

2 files changed

+212
-0
lines changed
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
# Fails a pull request when the documentation contains broken GitHub links.
name: Verify Broken Links
on:
  pull_request:
    types: [opened, edited, synchronize, reopened, labeled, unlabeled]
jobs:
  check-links:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      # A Go toolchain is required to run the checker tool below.
      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.22'
      # Scans docs/ for GitHub links and checks them with up to 10 parallel requests.
      - name: Run broken links checker
        run: go run tools/verifybrokenlinks/main.go -docs-dir=docs -max-parallel=10

tools/verifybrokenlinks/main.go

Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
package main
2+
3+
import (
	"flag"
	"fmt"
	"io/fs"
	"net/http"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strings"
	"sync"
	"sync/atomic"
	"time"
)
16+
17+
// LinkOccurrence records a single appearance of a URL in a documentation
// file, so broken links can be reported together with every file that
// contains them.
type LinkOccurrence struct {
	URL  string // cleaned GitHub link as found in the file
	File string // path of the .adoc file the link was found in
}
21+
22+
// CheckResult is the outcome of validating one unique URL over HTTP.
type CheckResult struct {
	URL        string
	IsValid    bool   // true when the final HTTP status is 2xx or 3xx
	StatusCode int    // last status received; 0 when no response was obtained
	ErrorMsg   string // failure description, set when StatusCode is 0
}
28+
29+
func main() {
30+
docsDir := flag.String("docs-dir", "docs", "directory to scan for .adoc files")
31+
maxParallel := flag.Int("max-parallel", 10, "maximum number of parallel link checks")
32+
exitOnError := flag.Bool("exit-on-error", true, "exit with non-zero code if broken links are found")
33+
flag.Parse()
34+
35+
allLinks, uniqueLinks := extractLinks(*docsDir)
36+
37+
results := checkLinks(uniqueLinks, *maxParallel, len(uniqueLinks))
38+
39+
brokenLinks := make(map[string]CheckResult)
40+
for _, result := range results {
41+
if !result.IsValid {
42+
brokenLinks[result.URL] = result
43+
}
44+
}
45+
46+
for url, result := range brokenLinks {
47+
var errorCode string
48+
if result.StatusCode != 0 {
49+
errorCode = fmt.Sprintf("HTTP %d", result.StatusCode)
50+
} else {
51+
errorCode = result.ErrorMsg
52+
}
53+
54+
fmt.Printf("❌ %s %s\n", errorCode, url)
55+
fmt.Println("Found in:")
56+
57+
for _, occurrence := range allLinks {
58+
if occurrence.URL == url {
59+
relativePath := strings.TrimPrefix(occurrence.File, *docsDir+"/")
60+
fmt.Printf("- %s\n", relativePath)
61+
}
62+
}
63+
fmt.Println()
64+
}
65+
66+
if len(brokenLinks) > 0 {
67+
fmt.Printf("Found %d broken links out of %d total links\n", len(brokenLinks), len(uniqueLinks))
68+
fmt.Println("Done")
69+
70+
if *exitOnError {
71+
os.Exit(1)
72+
}
73+
} else {
74+
fmt.Printf("Success: All %d links are valid\n", len(uniqueLinks))
75+
fmt.Println("Done")
76+
}
77+
}
78+
79+
func extractLinks(docsDir string) ([]LinkOccurrence, []string) {
80+
var allLinks []LinkOccurrence
81+
uniqueLinksMap := make(map[string]bool)
82+
83+
githubLinkRegex := regexp.MustCompile(`https?://github\.com[-a-zA-Z0-9@:%._\+~#=/]*`)
84+
issuesPRRegex := regexp.MustCompile(`/(?:issues|pull)/[0-9]+`)
85+
86+
filepath.WalkDir(docsDir, func(path string, d fs.DirEntry, err error) error {
87+
if err != nil {
88+
return err
89+
}
90+
91+
if !d.IsDir() && strings.HasSuffix(path, ".adoc") {
92+
content, err := os.ReadFile(path)
93+
if err != nil {
94+
return err
95+
}
96+
97+
links := githubLinkRegex.FindAllString(string(content), -1)
98+
99+
for _, link := range links {
100+
cleanedLink := trimTrailingPunctuation(link)
101+
if !issuesPRRegex.MatchString(cleanedLink) {
102+
allLinks = append(allLinks, LinkOccurrence{
103+
URL: cleanedLink,
104+
File: path,
105+
})
106+
uniqueLinksMap[cleanedLink] = true
107+
}
108+
}
109+
}
110+
return nil
111+
})
112+
113+
uniqueLinks := make([]string, 0, len(uniqueLinksMap))
114+
for link := range uniqueLinksMap {
115+
uniqueLinks = append(uniqueLinks, link)
116+
}
117+
118+
return allLinks, uniqueLinks
119+
}
120+
121+
// trimTrailingPunctuation strips sentence punctuation that the link regex
// may have captured from surrounding prose (e.g. "…/repo." at the end of a
// sentence).
//
// BUG FIX: the previous version removed at most ONE trailing character, so
// "…/repo.." or "…/repo.," was left partially dirty; TrimRight removes the
// whole trailing run from the punctuation set.
func trimTrailingPunctuation(url string) string {
	return strings.TrimRight(url, ".,)]>")
}
129+
130+
func checkLinks(links []string, maxParallel int, totalLinks int) []CheckResult {
131+
results := make([]CheckResult, 0, len(links))
132+
resultsChan := make(chan CheckResult, len(links))
133+
sem := make(chan struct{}, maxParallel)
134+
var wg sync.WaitGroup
135+
var progress int32
136+
137+
client := &http.Client{
138+
Timeout: 10 * time.Second,
139+
CheckRedirect: func(req *http.Request, via []*http.Request) error {
140+
return nil
141+
},
142+
}
143+
144+
for _, link := range links {
145+
wg.Add(1)
146+
sem <- struct{}{}
147+
148+
go func(url string) {
149+
defer wg.Done()
150+
defer func() { <-sem }()
151+
152+
result := CheckResult{URL: url, IsValid: false}
153+
154+
req, err := http.NewRequest("HEAD", url, nil)
155+
if err != nil {
156+
result.ErrorMsg = err.Error()
157+
resultsChan <- result
158+
return
159+
}
160+
161+
resp, err := client.Do(req)
162+
if err != nil {
163+
getResp, getErr := client.Get(url)
164+
if getErr != nil {
165+
result.ErrorMsg = "Connection failed or timed out"
166+
resultsChan <- result
167+
return
168+
}
169+
resp = getResp
170+
}
171+
172+
defer resp.Body.Close()
173+
result.StatusCode = resp.StatusCode
174+
175+
if resp.StatusCode >= 200 && resp.StatusCode < 400 {
176+
result.IsValid = true
177+
}
178+
179+
resultsChan <- result
180+
181+
current := atomic.AddInt32(&progress, 1)
182+
fmt.Fprintf(os.Stderr, "\rProgress: %d/%d links checked", current, totalLinks)
183+
}(link)
184+
}
185+
186+
go func() {
187+
wg.Wait()
188+
close(resultsChan)
189+
}()
190+
191+
for result := range resultsChan {
192+
results = append(results, result)
193+
}
194+
195+
fmt.Fprintln(os.Stderr)
196+
return results
197+
}

0 commit comments

Comments
 (0)