package main import ( "fmt" "net/http" "net/url" "strings" "golang.org/x/net/html" ) type LinkChecker struct { client *http.Client visited map[string]bool pagesChecked int } func NewLinkChecker() *LinkChecker { return &LinkChecker{ client: &http.Client{}, visited: make(map[string]bool), pagesChecked: 0, } } type RedirectInfo struct { FromURL string ToURL string } type BrokenLink struct { URL string StatusCode int Error string Redirect *RedirectInfo } func (lc *LinkChecker) normalizeURL(rawURL string) string { u, err := url.Parse(rawURL) if err != nil { return rawURL } // Remove fragment u.Fragment = "" // Ensure consistent trailing slash handling u.Path = strings.TrimSuffix(u.Path, "/") if u.Path == "" { u.Path = "/" } return u.String() } func (lc *LinkChecker) isSameDomain(baseURL, link string) bool { base, err := url.Parse(baseURL) if err != nil { return false } target, err := url.Parse(link) if err != nil { return false } return base.Host == target.Host } func (lc *LinkChecker) CheckLinks(baseURL string) ([]BrokenLink, error) { return lc.checkLinksRecursive(baseURL, make([]BrokenLink, 0)) } func (lc *LinkChecker) checkLinksRecursive(pageURL string, brokenLinks []BrokenLink) ([]BrokenLink, error) { normalizedURL := lc.normalizeURL(pageURL) if lc.visited[normalizedURL] { return brokenLinks, nil } lc.visited[normalizedURL] = true lc.pagesChecked++ fmt.Printf("Checking page %d: %s\n", lc.pagesChecked, pageURL) links, err := lc.getLinks(pageURL) if err != nil { return nil, fmt.Errorf("error getting links: %w", err) } for _, link := range links { status, redirect, err := lc.isLinkValid(link) if status >= 400 || err != nil { broken := BrokenLink{URL: link} if err != nil { broken.Error = err.Error() } else { broken.StatusCode = status } brokenLinks = append(brokenLinks, broken) } else if redirect != nil { broken := BrokenLink{ URL: link, StatusCode: status, Redirect: redirect, } brokenLinks = append(brokenLinks, broken) } // Recursively check links from the same domain normalizedLink := lc.normalizeURL(link) if lc.isSameDomain(pageURL, link) && !lc.visited[normalizedLink] { recursiveLinks, err := lc.checkLinksRecursive(link, brokenLinks) if err != nil { continue // Skip this page if there's an error, but continue checking others } brokenLinks = recursiveLinks } } return brokenLinks, nil } func (lc *LinkChecker) getLinks(pageURL string) ([]string, error) { resp, err := lc.client.Get(pageURL) if err != nil { return nil, err } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { return nil, fmt.Errorf("failed to fetch page: %s", resp.Status) } doc, err := html.Parse(resp.Body) if err != nil { return nil, err } var links []string baseURL, err := url.Parse(pageURL) if err != nil { return nil, err } var traverse func(*html.Node) traverse = func(n *html.Node) { if n.Type == html.ElementNode && n.Data == "a" { for _, attr := range n.Attr { if attr.Key == "href" { link := attr.Val if !strings.HasPrefix(link, "http") { // Convert relative URLs to absolute if absURL, err := baseURL.Parse(link); err == nil { link = absURL.String() } } if strings.HasPrefix(link, "http") { links = append(links, link) } } } } for c := n.FirstChild; c != nil; c = c.NextSibling { traverse(c) } } traverse(doc) return links, nil } func (lc *LinkChecker) isLinkValid(link string) (int, *RedirectInfo, error) { client := &http.Client{ CheckRedirect: func(req *http.Request, via []*http.Request) error { return http.ErrUseLastResponse }, } resp, err := client.Get(link) if err != nil { return 0, nil, err } defer resp.Body.Close() if resp.StatusCode >= 300 && resp.StatusCode < 400 { location := resp.Header.Get("Location") if location != "" { redirectURL := location if !strings.HasPrefix(location, "http") { baseURL, _ := url.Parse(link) if relative, err := baseURL.Parse(location); err == nil { redirectURL = relative.String() } } return resp.StatusCode, &RedirectInfo{ FromURL: link, ToURL: redirectURL, }, nil } } return resp.StatusCode, nil, nil }