feat: Add URL normalization to handle trailing slashes and anchors

This commit is contained in:
Erik Winter (aider) 2024-11-25 09:58:44 +01:00
parent 667d102277
commit d49e358c58
1 changed files with 23 additions and 3 deletions

View File

@ -29,6 +29,24 @@ type BrokenLink struct {
Error string Error string
} }
// normalizeURL returns a canonical form of rawURL so that equivalent links can
// be compared as strings (for example, as keys of a visited set).
//
// The fragment is removed and a single trailing slash is trimmed from the
// path; an empty path becomes "/". If rawURL cannot be parsed it is returned
// unchanged.
func (lc *LinkChecker) normalizeURL(rawURL string) string {
	u, err := url.Parse(rawURL)
	if err != nil {
		return rawURL
	}

	// Remove fragment.
	u.Fragment = ""

	// Ensure consistent trailing slash handling. Path and RawPath must be
	// trimmed together: if only Path changes, RawPath stops being a valid
	// encoding of it and String() re-escapes the decoded Path, which turns an
	// escaped "%2F" into a real "/" separator. A path whose only trailing slash
	// is an escaped one (RawPath ends in "%2F") is left untouched for the same
	// reason.
	if strings.HasSuffix(u.Path, "/") && (u.RawPath == "" || strings.HasSuffix(u.RawPath, "/")) {
		u.Path = strings.TrimSuffix(u.Path, "/")
		u.RawPath = strings.TrimSuffix(u.RawPath, "/")
	}
	if u.Path == "" {
		u.Path = "/"
		u.RawPath = ""
	}

	return u.String()
}
func (lc *LinkChecker) isSameDomain(baseURL, link string) bool { func (lc *LinkChecker) isSameDomain(baseURL, link string) bool {
base, err := url.Parse(baseURL) base, err := url.Parse(baseURL)
if err != nil { if err != nil {
@ -48,10 +66,11 @@ func (lc *LinkChecker) CheckLinks(baseURL string) ([]BrokenLink, error) {
} }
func (lc *LinkChecker) checkLinksRecursive(pageURL string, brokenLinks []BrokenLink) ([]BrokenLink, error) { func (lc *LinkChecker) checkLinksRecursive(pageURL string, brokenLinks []BrokenLink) ([]BrokenLink, error) {
if lc.visited[pageURL] { normalizedURL := lc.normalizeURL(pageURL)
if lc.visited[normalizedURL] {
return brokenLinks, nil return brokenLinks, nil
} }
lc.visited[pageURL] = true lc.visited[normalizedURL] = true
lc.pagesChecked++ lc.pagesChecked++
fmt.Printf("Checking page %d: %s\n", lc.pagesChecked, pageURL) fmt.Printf("Checking page %d: %s\n", lc.pagesChecked, pageURL)
@ -73,7 +92,8 @@ func (lc *LinkChecker) checkLinksRecursive(pageURL string, brokenLinks []BrokenL
} }
// Recursively check links from the same domain // Recursively check links from the same domain
if lc.isSameDomain(pageURL, link) && !lc.visited[link] { normalizedLink := lc.normalizeURL(link)
if lc.isSameDomain(pageURL, link) && !lc.visited[normalizedLink] {
recursiveLinks, err := lc.checkLinksRecursive(link, brokenLinks) recursiveLinks, err := lc.checkLinksRecursive(link, brokenLinks)
if err != nil { if err != nil {
continue // Skip this page if there's an error, but continue checking others continue // Skip this page if there's an error, but continue checking others