Compare commits

...

10 Commits

6 changed files with 225 additions and 19 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
.aider*
.env

64
README.md Normal file
View File

@ -0,0 +1,64 @@
# Link Checker
A recursive link checker that crawls websites to find broken links and redirects. It helps maintain website health by identifying:
- Broken links (HTTP 4xx, 5xx status codes)
- Network/DNS errors
- HTTP redirects (3xx status codes)
## Features
- Recursive crawling of websites
- Handles both absolute and relative URLs
- Detects and reports HTTP redirects
- Shows progress during scanning
- Normalizes URLs for consistent checking
- Stays within the same domain
- Detailed reporting of issues found
## Installation
Make sure you have Go installed (version 1.23.3 or later, matching the `go` directive in `go.mod`), then run:
```bash
go install go-mod.ewintr.nl/linkchecker@latest
```
## Usage
Run the link checker by providing a starting URL:
```bash
linkchecker -url="https://example.com"
```
The tool will:
1. Crawl all pages on the same domain
2. Check all links found (both internal and external)
3. Display progress during the scan
4. Generate a report showing:
- Total pages checked
- List of redirected links
- List of broken links
- Summary statistics
## Example Output
```
Checking page 1: https://example.com
Checking page 2: https://example.com/about
...
Total pages checked: 15
Redirects found:
- http://example.com/old-page (Redirect 301 -> https://example.com/new-page)
- http://example.com/blog (Redirect 302 -> https://blog.example.com)
Broken links found:
- https://example.com/missing-page (Status: 404)
- https://example.com/server-error (Status: 500)
- https://external-site.com/broken (Error: connection refused)
Total issues: 5 (2 redirects, 3 broken)
```

5
go.mod Normal file
View File

@ -0,0 +1,5 @@
module go-mod.ewintr.nl/linkchecker
go 1.23.3
require golang.org/x/net v0.31.0

2
go.sum Normal file
View File

@ -0,0 +1,2 @@
golang.org/x/net v0.31.0 h1:68CPQngjLL0r2AlUKiSxtQFKvzRVbnzLwMUn5SzcLHo=
golang.org/x/net v0.31.0/go.mod h1:P4fl1q7dY2hnZFxEk4pPSkDHF+QqjitcnDjUQyMM+pM=

View File

@ -9,34 +9,88 @@ import (
"golang.org/x/net/html" "golang.org/x/net/html"
) )
// userAgent is the browser-style User-Agent header value set on the requests
// this file builds, both when fetching pages and when probing links.
const userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
// LinkChecker holds the HTTP client used for fetching pages together with the
// crawl state: which pages were already visited and how many were checked.
type LinkChecker struct { type LinkChecker struct {
client *http.Client client *http.Client
// visited is keyed by normalized URL; an entry is set before a page is crawled
// so the same page is not processed twice.
visited map[string]bool
// pagesChecked counts crawled pages; it is printed in the progress output.
pagesChecked int
} }
// NewLinkChecker returns a LinkChecker with a zero-value http.Client, an empty
// visited set and a page counter of zero.
// NOTE(review): the http.Client is created without a Timeout, so a server that
// never answers can stall the crawl — consider setting one.
func NewLinkChecker() *LinkChecker { func NewLinkChecker() *LinkChecker {
return &LinkChecker{ return &LinkChecker{
client: &http.Client{}, client: &http.Client{},
visited: make(map[string]bool),
pagesChecked: 0,
} }
} }
// RedirectInfo records a single redirect: FromURL is the link that was
// requested and ToURL is the target taken from the response's Location header.
type RedirectInfo struct {
	FromURL, ToURL string
}
// BrokenLink describes one reported link. Error is filled in when the request
// itself failed, StatusCode when an HTTP response was received, and Redirect is
// non-nil when the entry records a redirect rather than a broken link.
type BrokenLink struct { type BrokenLink struct {
URL string URL string
StatusCode int StatusCode int
Error string Error string
Redirect *RedirectInfo
}
// normalizeURL returns a canonical form of rawURL suitable as a key in the
// visited set, so that trivially different spellings of the same page collapse
// to one entry.
//
// The fragment is dropped, the host is lower-cased (hostnames are
// case-insensitive), and a single trailing slash is removed from the path; an
// empty path becomes "/". If rawURL cannot be parsed it is returned unchanged.
func (lc *LinkChecker) normalizeURL(rawURL string) string {
	u, err := url.Parse(rawURL)
	if err != nil {
		// Keep unparseable input as-is so it can still be used as a key.
		return rawURL
	}

	// A fragment only selects a position inside the page.
	u.Fragment = ""

	// "Example.com" and "example.com" are the same host.
	u.Host = strings.ToLower(u.Host)

	// Trim the trailing slash from both the decoded and the encoded path.
	// If RawPath were left stale it would no longer match Path, and String()
	// would re-encode Path, turning e.g. "/a%2Fb/" into "/a/b".
	u.Path = strings.TrimSuffix(u.Path, "/")
	u.RawPath = strings.TrimSuffix(u.RawPath, "/")
	if u.Path == "" {
		u.Path = "/"
	}

	return u.String()
}
func (lc *LinkChecker) isSameDomain(baseURL, link string) bool {
base, err := url.Parse(baseURL)
if err != nil {
return false
}
target, err := url.Parse(link)
if err != nil {
return false
}
return base.Host == target.Host
} }
// CheckLinks is the public entry point of the crawl: it delegates to
// checkLinksRecursive, starting at baseURL with an empty result slice, and
// returns whatever that call returns.
func (lc *LinkChecker) CheckLinks(baseURL string) ([]BrokenLink, error) { func (lc *LinkChecker) CheckLinks(baseURL string) ([]BrokenLink, error) {
// Get all links from the page return lc.checkLinksRecursive(baseURL, make([]BrokenLink, 0))
links, err := lc.getLinks(baseURL) }
func (lc *LinkChecker) checkLinksRecursive(pageURL string, brokenLinks []BrokenLink) ([]BrokenLink, error) {
normalizedURL := lc.normalizeURL(pageURL)
if lc.visited[normalizedURL] {
return brokenLinks, nil
}
lc.visited[normalizedURL] = true
lc.pagesChecked++
fmt.Printf("Checking page %d: %s\n", lc.pagesChecked, pageURL)
links, err := lc.getLinks(pageURL)
if err != nil { if err != nil {
return nil, fmt.Errorf("error getting links: %w", err) return nil, fmt.Errorf("error getting links: %w", err)
} }
var brokenLinks []BrokenLink
// Check each link
for _, link := range links { for _, link := range links {
if status, err := lc.isLinkValid(link); status >= 400 || err != nil { status, redirect, err := lc.isLinkValid(link)
if status >= 400 || err != nil {
broken := BrokenLink{URL: link} broken := BrokenLink{URL: link}
if err != nil { if err != nil {
broken.Error = err.Error() broken.Error = err.Error()
@ -44,6 +98,23 @@ func (lc *LinkChecker) CheckLinks(baseURL string) ([]BrokenLink, error) {
broken.StatusCode = status broken.StatusCode = status
} }
brokenLinks = append(brokenLinks, broken) brokenLinks = append(brokenLinks, broken)
} else if redirect != nil {
broken := BrokenLink{
URL: link,
StatusCode: status,
Redirect: redirect,
}
brokenLinks = append(brokenLinks, broken)
}
// Recursively check links from the same domain
normalizedLink := lc.normalizeURL(link)
if lc.isSameDomain(pageURL, link) && !lc.visited[normalizedLink] {
recursiveLinks, err := lc.checkLinksRecursive(link, brokenLinks)
if err != nil {
continue // Skip this page if there's an error, but continue checking others
}
brokenLinks = recursiveLinks
} }
} }
@ -51,7 +122,13 @@ func (lc *LinkChecker) CheckLinks(baseURL string) ([]BrokenLink, error) {
} }
func (lc *LinkChecker) getLinks(pageURL string) ([]string, error) { func (lc *LinkChecker) getLinks(pageURL string) ([]string, error) {
resp, err := lc.client.Get(pageURL) req, err := http.NewRequest("GET", pageURL, nil)
if err != nil {
return nil, err
}
req.Header.Set("User-Agent", userAgent)
resp, err := lc.client.Do(req)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -99,12 +176,41 @@ func (lc *LinkChecker) getLinks(pageURL string) ([]string, error) {
return links, nil return links, nil
} }
func (lc *LinkChecker) isLinkValid(link string) (int, error) { func (lc *LinkChecker) isLinkValid(link string) (int, *RedirectInfo, error) {
resp, err := lc.client.Get(link) req, err := http.NewRequest("GET", link, nil)
if err != nil { if err != nil {
return 0, err return 0, nil, err
}
req.Header.Set("User-Agent", userAgent)
client := &http.Client{
CheckRedirect: func(req *http.Request, via []*http.Request) error {
return http.ErrUseLastResponse
},
}
resp, err := client.Do(req)
if err != nil {
return 0, nil, err
} }
defer resp.Body.Close() defer resp.Body.Close()
return resp.StatusCode, nil if resp.StatusCode >= 300 && resp.StatusCode < 400 {
location := resp.Header.Get("Location")
if location != "" {
redirectURL := location
if !strings.HasPrefix(location, "http") {
baseURL, _ := url.Parse(link)
if relative, err := baseURL.Parse(location); err == nil {
redirectURL = relative.String()
}
}
return resp.StatusCode, &RedirectInfo{
FromURL: link,
ToURL: redirectURL,
}, nil
}
}
return resp.StatusCode, nil, nil
} }

31
main.go
View File

@ -20,17 +20,44 @@ func main() {
log.Fatal(err) log.Fatal(err)
} }
fmt.Printf("\nTotal pages checked: %d\n\n", checker.pagesChecked)
if len(brokenLinks) == 0 { if len(brokenLinks) == 0 {
fmt.Println("No broken links found!") fmt.Println("No issues found!")
return return
} }
fmt.Println("Found broken links:") // First list redirects
var redirectCount int
for _, link := range brokenLinks { for _, link := range brokenLinks {
if link.Redirect != nil {
if redirectCount == 0 {
fmt.Println("Redirects found:")
}
fmt.Printf("- %s (Redirect %d -> %s)\n", link.URL, link.StatusCode, link.Redirect.ToURL)
redirectCount++
}
}
if redirectCount > 0 {
fmt.Println()
}
// Then list broken links
var brokenCount int
for _, link := range brokenLinks {
if link.Redirect == nil {
if brokenCount == 0 {
fmt.Println("Broken links found:")
}
if link.Error != "" { if link.Error != "" {
fmt.Printf("- %s (Error: %s)\n", link.URL, link.Error) fmt.Printf("- %s (Error: %s)\n", link.URL, link.Error)
} else { } else {
fmt.Printf("- %s (Status: %d)\n", link.URL, link.StatusCode) fmt.Printf("- %s (Status: %d)\n", link.URL, link.StatusCode)
} }
brokenCount++
} }
}
fmt.Printf("\nTotal issues: %d (%d redirects, %d broken)\n",
len(brokenLinks), redirectCount, brokenCount)
} }