Compare commits
10 Commits
43d7a785ab
...
1d1425aa9a
Author | SHA1 | Date |
---|---|---|
Erik Winter | 1d1425aa9a | |
Erik Winter (aider) | 6acded4b9e | |
Erik Winter (aider) | afc3d33009 | |
Erik Winter (aider) | d2c064abda | |
Erik Winter (aider) | f17c54a840 | |
Erik Winter (aider) | 99b458e40a | |
Erik Winter (aider) | 3f7cde44a2 | |
Erik Winter (aider) | d49e358c58 | |
Erik Winter (aider) | 667d102277 | |
Erik Winter (aider) | 7020dbd47d |
|
@ -0,0 +1,2 @@
|
||||||
|
.aider*
|
||||||
|
.env
|
|
@ -0,0 +1,64 @@
|
||||||
|
# Link Checker
|
||||||
|
|
||||||
|
A recursive link checker that crawls websites to find broken links and redirects. It helps maintain website health by identifying:
|
||||||
|
|
||||||
|
- Broken links (HTTP 4xx, 5xx status codes)
|
||||||
|
- Network/DNS errors
|
||||||
|
- HTTP redirects (3xx status codes)
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- Recursive crawling of websites
|
||||||
|
- Handles both absolute and relative URLs
|
||||||
|
- Detects and reports HTTP redirects
|
||||||
|
- Shows progress during scanning
|
||||||
|
- Normalizes URLs for consistent checking
|
||||||
|
- Stays within the same domain when crawling (external links are checked but not followed)
|
||||||
|
- Detailed reporting of issues found
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
Make sure you have Go installed (version 1.23 or later, as required by `go.mod`), then run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
go install go-mod.ewintr.nl/linkchecker@latest
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
Run the link checker by providing a starting URL:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
linkchecker -url="https://example.com"
|
||||||
|
```
|
||||||
|
|
||||||
|
The tool will:
|
||||||
|
1. Crawl all pages on the same domain
|
||||||
|
2. Check all links found (both internal and external)
|
||||||
|
3. Display progress during the scan
|
||||||
|
4. Generate a report showing:
|
||||||
|
- Total pages checked
|
||||||
|
- List of redirected links
|
||||||
|
- List of broken links
|
||||||
|
- Summary statistics
|
||||||
|
|
||||||
|
## Example Output
|
||||||
|
|
||||||
|
```
|
||||||
|
Checking page 1: https://example.com
|
||||||
|
Checking page 2: https://example.com/about
|
||||||
|
...
|
||||||
|
|
||||||
|
Total pages checked: 15
|
||||||
|
|
||||||
|
Redirects found:
|
||||||
|
- http://example.com/old-page (Redirect 301 -> https://example.com/new-page)
|
||||||
|
- http://example.com/blog (Redirect 302 -> https://blog.example.com)
|
||||||
|
|
||||||
|
Broken links found:
|
||||||
|
- https://example.com/missing-page (Status: 404)
|
||||||
|
- https://example.com/server-error (Status: 500)
|
||||||
|
- https://external-site.com/broken (Error: connection refused)
|
||||||
|
|
||||||
|
Total issues: 5 (2 redirects, 3 broken)
|
||||||
|
```
|
|
@ -0,0 +1,5 @@
|
||||||
|
module go-mod.ewintr.nl/linkchecker
|
||||||
|
|
||||||
|
go 1.23.3
|
||||||
|
|
||||||
|
require golang.org/x/net v0.31.0
|
|
@ -0,0 +1,2 @@
|
||||||
|
golang.org/x/net v0.31.0 h1:68CPQngjLL0r2AlUKiSxtQFKvzRVbnzLwMUn5SzcLHo=
|
||||||
|
golang.org/x/net v0.31.0/go.mod h1:P4fl1q7dY2hnZFxEk4pPSkDHF+QqjitcnDjUQyMM+pM=
|
128
linkchecker.go
128
linkchecker.go
|
@ -9,34 +9,88 @@ import (
|
||||||
"golang.org/x/net/html"
|
"golang.org/x/net/html"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
const userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
|
||||||
|
|
||||||
type LinkChecker struct {
|
type LinkChecker struct {
|
||||||
client *http.Client
|
client *http.Client
|
||||||
|
visited map[string]bool
|
||||||
|
pagesChecked int
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewLinkChecker() *LinkChecker {
|
func NewLinkChecker() *LinkChecker {
|
||||||
return &LinkChecker{
|
return &LinkChecker{
|
||||||
client: &http.Client{},
|
client: &http.Client{},
|
||||||
|
visited: make(map[string]bool),
|
||||||
|
pagesChecked: 0,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type RedirectInfo struct {
|
||||||
|
FromURL string
|
||||||
|
ToURL string
|
||||||
|
}
|
||||||
|
|
||||||
type BrokenLink struct {
|
type BrokenLink struct {
|
||||||
URL string
|
URL string
|
||||||
StatusCode int
|
StatusCode int
|
||||||
Error string
|
Error string
|
||||||
|
Redirect *RedirectInfo
|
||||||
|
}
|
||||||
|
|
||||||
|
func (lc *LinkChecker) normalizeURL(rawURL string) string {
|
||||||
|
u, err := url.Parse(rawURL)
|
||||||
|
if err != nil {
|
||||||
|
return rawURL
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove fragment
|
||||||
|
u.Fragment = ""
|
||||||
|
|
||||||
|
// Ensure consistent trailing slash handling
|
||||||
|
u.Path = strings.TrimSuffix(u.Path, "/")
|
||||||
|
if u.Path == "" {
|
||||||
|
u.Path = "/"
|
||||||
|
}
|
||||||
|
|
||||||
|
return u.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (lc *LinkChecker) isSameDomain(baseURL, link string) bool {
|
||||||
|
base, err := url.Parse(baseURL)
|
||||||
|
if err != nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
target, err := url.Parse(link)
|
||||||
|
if err != nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
return base.Host == target.Host
|
||||||
}
|
}
|
||||||
|
|
||||||
func (lc *LinkChecker) CheckLinks(baseURL string) ([]BrokenLink, error) {
|
func (lc *LinkChecker) CheckLinks(baseURL string) ([]BrokenLink, error) {
|
||||||
// Get all links from the page
|
return lc.checkLinksRecursive(baseURL, make([]BrokenLink, 0))
|
||||||
links, err := lc.getLinks(baseURL)
|
}
|
||||||
|
|
||||||
|
func (lc *LinkChecker) checkLinksRecursive(pageURL string, brokenLinks []BrokenLink) ([]BrokenLink, error) {
|
||||||
|
normalizedURL := lc.normalizeURL(pageURL)
|
||||||
|
if lc.visited[normalizedURL] {
|
||||||
|
return brokenLinks, nil
|
||||||
|
}
|
||||||
|
lc.visited[normalizedURL] = true
|
||||||
|
lc.pagesChecked++
|
||||||
|
|
||||||
|
fmt.Printf("Checking page %d: %s\n", lc.pagesChecked, pageURL)
|
||||||
|
|
||||||
|
links, err := lc.getLinks(pageURL)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("error getting links: %w", err)
|
return nil, fmt.Errorf("error getting links: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
var brokenLinks []BrokenLink
|
|
||||||
|
|
||||||
// Check each link
|
|
||||||
for _, link := range links {
|
for _, link := range links {
|
||||||
if status, err := lc.isLinkValid(link); status >= 400 || err != nil {
|
status, redirect, err := lc.isLinkValid(link)
|
||||||
|
if status >= 400 || err != nil {
|
||||||
broken := BrokenLink{URL: link}
|
broken := BrokenLink{URL: link}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
broken.Error = err.Error()
|
broken.Error = err.Error()
|
||||||
|
@ -44,6 +98,23 @@ func (lc *LinkChecker) CheckLinks(baseURL string) ([]BrokenLink, error) {
|
||||||
broken.StatusCode = status
|
broken.StatusCode = status
|
||||||
}
|
}
|
||||||
brokenLinks = append(brokenLinks, broken)
|
brokenLinks = append(brokenLinks, broken)
|
||||||
|
} else if redirect != nil {
|
||||||
|
broken := BrokenLink{
|
||||||
|
URL: link,
|
||||||
|
StatusCode: status,
|
||||||
|
Redirect: redirect,
|
||||||
|
}
|
||||||
|
brokenLinks = append(brokenLinks, broken)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Recursively check links from the same domain
|
||||||
|
normalizedLink := lc.normalizeURL(link)
|
||||||
|
if lc.isSameDomain(pageURL, link) && !lc.visited[normalizedLink] {
|
||||||
|
recursiveLinks, err := lc.checkLinksRecursive(link, brokenLinks)
|
||||||
|
if err != nil {
|
||||||
|
continue // Skip this page if there's an error, but continue checking others
|
||||||
|
}
|
||||||
|
brokenLinks = recursiveLinks
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -51,7 +122,13 @@ func (lc *LinkChecker) CheckLinks(baseURL string) ([]BrokenLink, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (lc *LinkChecker) getLinks(pageURL string) ([]string, error) {
|
func (lc *LinkChecker) getLinks(pageURL string) ([]string, error) {
|
||||||
resp, err := lc.client.Get(pageURL)
|
req, err := http.NewRequest("GET", pageURL, nil)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
req.Header.Set("User-Agent", userAgent)
|
||||||
|
|
||||||
|
resp, err := lc.client.Do(req)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
@ -99,12 +176,41 @@ func (lc *LinkChecker) getLinks(pageURL string) ([]string, error) {
|
||||||
return links, nil
|
return links, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (lc *LinkChecker) isLinkValid(link string) (int, error) {
|
func (lc *LinkChecker) isLinkValid(link string) (int, *RedirectInfo, error) {
|
||||||
resp, err := lc.client.Get(link)
|
req, err := http.NewRequest("GET", link, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return 0, err
|
return 0, nil, err
|
||||||
|
}
|
||||||
|
req.Header.Set("User-Agent", userAgent)
|
||||||
|
|
||||||
|
client := &http.Client{
|
||||||
|
CheckRedirect: func(req *http.Request, via []*http.Request) error {
|
||||||
|
return http.ErrUseLastResponse
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
resp, err := client.Do(req)
|
||||||
|
if err != nil {
|
||||||
|
return 0, nil, err
|
||||||
}
|
}
|
||||||
defer resp.Body.Close()
|
defer resp.Body.Close()
|
||||||
|
|
||||||
return resp.StatusCode, nil
|
if resp.StatusCode >= 300 && resp.StatusCode < 400 {
|
||||||
|
location := resp.Header.Get("Location")
|
||||||
|
if location != "" {
|
||||||
|
redirectURL := location
|
||||||
|
if !strings.HasPrefix(location, "http") {
|
||||||
|
baseURL, _ := url.Parse(link)
|
||||||
|
if relative, err := baseURL.Parse(location); err == nil {
|
||||||
|
redirectURL = relative.String()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return resp.StatusCode, &RedirectInfo{
|
||||||
|
FromURL: link,
|
||||||
|
ToURL: redirectURL,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return resp.StatusCode, nil, nil
|
||||||
}
|
}
|
||||||
|
|
31
main.go
31
main.go
|
@ -20,17 +20,44 @@ func main() {
|
||||||
log.Fatal(err)
|
log.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fmt.Printf("\nTotal pages checked: %d\n\n", checker.pagesChecked)
|
||||||
|
|
||||||
if len(brokenLinks) == 0 {
|
if len(brokenLinks) == 0 {
|
||||||
fmt.Println("No broken links found!")
|
fmt.Println("No issues found!")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
fmt.Println("Found broken links:")
|
// First list redirects
|
||||||
|
var redirectCount int
|
||||||
for _, link := range brokenLinks {
|
for _, link := range brokenLinks {
|
||||||
|
if link.Redirect != nil {
|
||||||
|
if redirectCount == 0 {
|
||||||
|
fmt.Println("Redirects found:")
|
||||||
|
}
|
||||||
|
fmt.Printf("- %s (Redirect %d -> %s)\n", link.URL, link.StatusCode, link.Redirect.ToURL)
|
||||||
|
redirectCount++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if redirectCount > 0 {
|
||||||
|
fmt.Println()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Then list broken links
|
||||||
|
var brokenCount int
|
||||||
|
for _, link := range brokenLinks {
|
||||||
|
if link.Redirect == nil {
|
||||||
|
if brokenCount == 0 {
|
||||||
|
fmt.Println("Broken links found:")
|
||||||
|
}
|
||||||
if link.Error != "" {
|
if link.Error != "" {
|
||||||
fmt.Printf("- %s (Error: %s)\n", link.URL, link.Error)
|
fmt.Printf("- %s (Error: %s)\n", link.URL, link.Error)
|
||||||
} else {
|
} else {
|
||||||
fmt.Printf("- %s (Status: %d)\n", link.URL, link.StatusCode)
|
fmt.Printf("- %s (Status: %d)\n", link.URL, link.StatusCode)
|
||||||
}
|
}
|
||||||
|
brokenCount++
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Printf("\nTotal issues: %d (%d redirects, %d broken)\n",
|
||||||
|
len(brokenLinks), redirectCount, brokenCount)
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue