217 lines
4.6 KiB
Go
217 lines
4.6 KiB
Go
package main
|
|
|
|
import (
	"fmt"
	"net/http"
	"net/url"
	"strings"
	"time"

	"golang.org/x/net/html"
)

// userAgent is the User-Agent header value sent with every outgoing request.
// Go requires import declarations to precede all other top-level
// declarations, so this constant lives directly below the import block.
const userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
|
|
|
|
// LinkChecker carries the state of one crawl: the HTTP client used to
// download pages, the set of pages already crawled, and a running page count.
type LinkChecker struct {
	// client is the HTTP client used to download pages for link extraction.
	client *http.Client

	// visited is keyed by normalized page URL; a true entry means the page
	// has already been crawled and must not be crawled again.
	visited map[string]bool

	// pagesChecked is incremented once for every page the crawl starts on.
	pagesChecked int
}
|
|
|
|
func NewLinkChecker() *LinkChecker {
|
|
return &LinkChecker{
|
|
client: &http.Client{},
|
|
visited: make(map[string]bool),
|
|
pagesChecked: 0,
|
|
}
|
|
}
|
|
|
|
// RedirectInfo describes a single redirect hop observed while checking a link.
type RedirectInfo struct {
	// FromURL is the link that was requested.
	FromURL string

	// ToURL is the redirect target announced by the server.
	ToURL string
}
|
|
|
|
type BrokenLink struct {
|
|
URL string
|
|
StatusCode int
|
|
Error string
|
|
Redirect *RedirectInfo
|
|
}
|
|
|
|
func (lc *LinkChecker) normalizeURL(rawURL string) string {
|
|
u, err := url.Parse(rawURL)
|
|
if err != nil {
|
|
return rawURL
|
|
}
|
|
|
|
// Remove fragment
|
|
u.Fragment = ""
|
|
|
|
// Ensure consistent trailing slash handling
|
|
u.Path = strings.TrimSuffix(u.Path, "/")
|
|
if u.Path == "" {
|
|
u.Path = "/"
|
|
}
|
|
|
|
return u.String()
|
|
}
|
|
|
|
func (lc *LinkChecker) isSameDomain(baseURL, link string) bool {
|
|
base, err := url.Parse(baseURL)
|
|
if err != nil {
|
|
return false
|
|
}
|
|
|
|
target, err := url.Parse(link)
|
|
if err != nil {
|
|
return false
|
|
}
|
|
|
|
return base.Host == target.Host
|
|
}
|
|
|
|
func (lc *LinkChecker) CheckLinks(baseURL string) ([]BrokenLink, error) {
|
|
return lc.checkLinksRecursive(baseURL, make([]BrokenLink, 0))
|
|
}
|
|
|
|
func (lc *LinkChecker) checkLinksRecursive(pageURL string, brokenLinks []BrokenLink) ([]BrokenLink, error) {
|
|
normalizedURL := lc.normalizeURL(pageURL)
|
|
if lc.visited[normalizedURL] {
|
|
return brokenLinks, nil
|
|
}
|
|
lc.visited[normalizedURL] = true
|
|
lc.pagesChecked++
|
|
|
|
fmt.Printf("Checking page %d: %s\n", lc.pagesChecked, pageURL)
|
|
|
|
links, err := lc.getLinks(pageURL)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("error getting links: %w", err)
|
|
}
|
|
|
|
for _, link := range links {
|
|
status, redirect, err := lc.isLinkValid(link)
|
|
if status >= 400 || err != nil {
|
|
broken := BrokenLink{URL: link}
|
|
if err != nil {
|
|
broken.Error = err.Error()
|
|
} else {
|
|
broken.StatusCode = status
|
|
}
|
|
brokenLinks = append(brokenLinks, broken)
|
|
} else if redirect != nil {
|
|
broken := BrokenLink{
|
|
URL: link,
|
|
StatusCode: status,
|
|
Redirect: redirect,
|
|
}
|
|
brokenLinks = append(brokenLinks, broken)
|
|
}
|
|
|
|
// Recursively check links from the same domain
|
|
normalizedLink := lc.normalizeURL(link)
|
|
if lc.isSameDomain(pageURL, link) && !lc.visited[normalizedLink] {
|
|
recursiveLinks, err := lc.checkLinksRecursive(link, brokenLinks)
|
|
if err != nil {
|
|
continue // Skip this page if there's an error, but continue checking others
|
|
}
|
|
brokenLinks = recursiveLinks
|
|
}
|
|
}
|
|
|
|
return brokenLinks, nil
|
|
}
|
|
|
|
func (lc *LinkChecker) getLinks(pageURL string) ([]string, error) {
|
|
req, err := http.NewRequest("GET", pageURL, nil)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
req.Header.Set("User-Agent", userAgent)
|
|
|
|
resp, err := lc.client.Do(req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
return nil, fmt.Errorf("failed to fetch page: %s", resp.Status)
|
|
}
|
|
|
|
doc, err := html.Parse(resp.Body)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
var links []string
|
|
baseURL, err := url.Parse(pageURL)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
var traverse func(*html.Node)
|
|
traverse = func(n *html.Node) {
|
|
if n.Type == html.ElementNode && n.Data == "a" {
|
|
for _, attr := range n.Attr {
|
|
if attr.Key == "href" {
|
|
link := attr.Val
|
|
if !strings.HasPrefix(link, "http") {
|
|
// Convert relative URLs to absolute
|
|
if absURL, err := baseURL.Parse(link); err == nil {
|
|
link = absURL.String()
|
|
}
|
|
}
|
|
if strings.HasPrefix(link, "http") {
|
|
links = append(links, link)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
traverse(c)
|
|
}
|
|
}
|
|
traverse(doc)
|
|
|
|
return links, nil
|
|
}
|
|
|
|
func (lc *LinkChecker) isLinkValid(link string) (int, *RedirectInfo, error) {
|
|
req, err := http.NewRequest("GET", link, nil)
|
|
if err != nil {
|
|
return 0, nil, err
|
|
}
|
|
req.Header.Set("User-Agent", userAgent)
|
|
|
|
client := &http.Client{
|
|
CheckRedirect: func(req *http.Request, via []*http.Request) error {
|
|
return http.ErrUseLastResponse
|
|
},
|
|
}
|
|
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return 0, nil, err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode >= 300 && resp.StatusCode < 400 {
|
|
location := resp.Header.Get("Location")
|
|
if location != "" {
|
|
redirectURL := location
|
|
if !strings.HasPrefix(location, "http") {
|
|
baseURL, _ := url.Parse(link)
|
|
if relative, err := baseURL.Parse(location); err == nil {
|
|
redirectURL = relative.String()
|
|
}
|
|
}
|
|
return resp.StatusCode, &RedirectInfo{
|
|
FromURL: link,
|
|
ToURL: redirectURL,
|
|
}, nil
|
|
}
|
|
}
|
|
|
|
return resp.StatusCode, nil, nil
|
|
}
|