// linkchecker/linkchecker.go

package main

import (
	"fmt"
	"net/http"
	"net/url"
	"os"
	"strings"

	"golang.org/x/net/html"
)
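
// LinkChecker crawls pages within a single domain and records broken
// links and redirects. It is not safe for concurrent use: the visited
// set and page counter are updated without synchronization.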
type LinkChecker struct {
	client       *http.Client
	visited      map[string]bool
	pagesChecked int
}
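
// NewLinkChecker returns a LinkChecker with a default HTTP client and
// an empty visited set.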
func NewLinkChecker() *LinkChecker {
	return &LinkChecker{
		client:       &http.Client{},
		visited:      make(map[string]bool),
		pagesChecked: 0,
	}
}
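
// RedirectInfo records a link that answered with a 3xx status and the
// target it redirects to.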
type RedirectInfo struct {
	FromURL string
	ToURL   string
}
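
// BrokenLink describes a problem link: a request error (Error set), a
// 4xx/5xx status (StatusCode set), or a redirect (Redirect non-nil).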
type BrokenLink struct {
	URL        string
	StatusCode int
	Error      string
	Redirect   *RedirectInfo
}
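
// normalizeURL strips the fragment and any trailing slash from rawURL
// so that equivalent URLs map to a single key in the visited set.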
func (lc *LinkChecker) normalizeURL(rawURL string) string {
	u, err := url.Parse(rawURL)
	if err != nil {
		return rawURL
	}
	// Remove fragment
	u.Fragment = ""
	// Ensure consistent trailing slash handling
	u.Path = strings.TrimSuffix(u.Path, "/")
	if u.Path == "" {
		u.Path = "/"
	}
	return u.String()
}
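
// isSameDomain reports whether link points at the same host (including
// the port, if any) as baseURL; only same-domain pages are crawled.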
func (lc *LinkChecker) isSameDomain(baseURL, link string) bool {
	base, err := url.Parse(baseURL)
	if err != nil {
		return false
	}
	target, err := url.Parse(link)
	if err != nil {
		return false
	}
	return base.Host == target.Host
}
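
// CheckLinks crawls baseURL and every same-domain page reachable from
// it, returning all broken or redirecting links found along the way.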
func (lc *LinkChecker) CheckLinks(baseURL string) ([]BrokenLink, error) {
	return lc.checkLinksRecursive(baseURL, make([]BrokenLink, 0))
}
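
// checkLinksRecursive validates every link on pageURL, appending
// problems to brokenLinks, then descends into unvisited same-domain
// links. Partial results are returned even when an error occurs.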
func (lc *LinkChecker) checkLinksRecursive(pageURL string, brokenLinks []BrokenLink) ([]BrokenLink, error) {
	normalizedURL := lc.normalizeURL(pageURL)
	if lc.visited[normalizedURL] {
		return brokenLinks, nil
	}
	lc.visited[normalizedURL] = true
	lc.pagesChecked++
	fmt.Printf("Checking page %d: %s\n", lc.pagesChecked, pageURL)
	links, err := lc.getLinks(pageURL)
	if err != nil {
		// Return the results collected so far rather than discarding them.
		return brokenLinks, fmt.Errorf("error getting links: %w", err)
	}
	for _, link := range links {
		status, redirect, err := lc.isLinkValid(link)
		if err != nil || status >= 400 {
			broken := BrokenLink{URL: link}
			if err != nil {
				broken.Error = err.Error()
			} else {
				broken.StatusCode = status
			}
			brokenLinks = append(brokenLinks, broken)
			continue // No point crawling into a dead link.
		}
		if redirect != nil {
			brokenLinks = append(brokenLinks, BrokenLink{
				URL:        link,
				StatusCode: status,
				Redirect:   redirect,
			})
		}
		// Recursively check links from the same domain
		normalizedLink := lc.normalizeURL(link)
		if lc.isSameDomain(pageURL, link) && !lc.visited[normalizedLink] {
			// checkLinksRecursive returns its partial results even on
			// error; an unreachable sub-page should not abort the rest
			// of the crawl, so the error is deliberately dropped here.
			recursiveLinks, _ := lc.checkLinksRecursive(link, brokenLinks)
			brokenLinks = recursiveLinks
		}
	}
	return brokenLinks, nil
}
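
// getLinks fetches pageURL, parses the HTML, and returns the href of
// every anchor element, resolved to an absolute URL.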
func (lc *LinkChecker) getLinks(pageURL string) ([]string, error) {
	resp, err := lc.client.Get(pageURL)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("failed to fetch page: %s", resp.Status)
	}
	doc, err := html.Parse(resp.Body)
	if err != nil {
		return nil, err
	}
	var links []string
	baseURL, err := url.Parse(pageURL)
	if err != nil {
		return nil, err
	}
	var traverse func(*html.Node)
	traverse = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, attr := range n.Attr {
				if attr.Key == "href" {
					link := attr.Val
					if !strings.HasPrefix(link, "http") {
						// Convert relative URLs to absolute
						if absURL, err := baseURL.Parse(link); err == nil {
							link = absURL.String()
						}
					}
					// Keep only http(s) links; this also filters out
					// schemes such as mailto: and javascript:.
					if strings.HasPrefix(link, "http") {
						links = append(links, link)
					}
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			traverse(c)
		}
	}
	traverse(doc)
	return links, nil
}
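
// isLinkValid requests link without following redirects and returns the
// status code, redirect details for 3xx responses, and any request error.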
func (lc *LinkChecker) isLinkValid(link string) (int, *RedirectInfo, error) {
	// Use a client that reports redirects instead of following them, so
	// 3xx responses can be surfaced to the caller.
	client := &http.Client{
		CheckRedirect: func(req *http.Request, via []*http.Request) error {
			return http.ErrUseLastResponse
		},
	}
	resp, err := client.Get(link)
	if err != nil {
		return 0, nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode >= 300 && resp.StatusCode < 400 {
		location := resp.Header.Get("Location")
		if location != "" {
			redirectURL := location
			// Resolve a relative Location header against the request URL,
			// guarding against a link that fails to parse.
			if !strings.HasPrefix(location, "http") {
				if baseURL, err := url.Parse(link); err == nil {
					if relative, err := baseURL.Parse(location); err == nil {
						redirectURL = relative.String()
					}
				}
			}
			return resp.StatusCode, &RedirectInfo{
				FromURL: link,
				ToURL:   redirectURL,
			}, nil
		}
	}
	return resp.StatusCode, nil, nil
}
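
// A minimal entry point, sketched here since the original main is not
// shown in the excerpt above: it assumes the start URL is passed as the
// first command-line argument and prints each problem link found.
func main() {
	if len(os.Args) != 2 {
		fmt.Fprintln(os.Stderr, "usage: linkchecker <url>")
		os.Exit(1)
	}
	checker := NewLinkChecker()
	brokenLinks, err := checker.CheckLinks(os.Args[1])
	if err != nil {
		// CheckLinks returns partial results on error, so report the
		// error but still print whatever was collected.
		fmt.Fprintf(os.Stderr, "error: %v\n", err)
	}
	for _, bl := range brokenLinks {
		switch {
		case bl.Redirect != nil:
			fmt.Printf("REDIRECT %d: %s -> %s\n", bl.StatusCode, bl.Redirect.FromURL, bl.Redirect.ToURL)
		case bl.Error != "":
			fmt.Printf("ERROR: %s (%s)\n", bl.URL, bl.Error)
		default:
			fmt.Printf("BROKEN %d: %s\n", bl.StatusCode, bl.URL)
		}
	}
}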