feat: Add URL normalization to handle trailing slashes and anchors
This commit is contained in:
parent
667d102277
commit
d49e358c58
|
@ -29,6 +29,24 @@ type BrokenLink struct {
|
|||
Error string
|
||||
}
|
||||
|
||||
func (lc *LinkChecker) normalizeURL(rawURL string) string {
|
||||
u, err := url.Parse(rawURL)
|
||||
if err != nil {
|
||||
return rawURL
|
||||
}
|
||||
|
||||
// Remove fragment
|
||||
u.Fragment = ""
|
||||
|
||||
// Ensure consistent trailing slash handling
|
||||
u.Path = strings.TrimSuffix(u.Path, "/")
|
||||
if u.Path == "" {
|
||||
u.Path = "/"
|
||||
}
|
||||
|
||||
return u.String()
|
||||
}
|
||||
|
||||
func (lc *LinkChecker) isSameDomain(baseURL, link string) bool {
|
||||
base, err := url.Parse(baseURL)
|
||||
if err != nil {
|
||||
|
@ -48,10 +66,11 @@ func (lc *LinkChecker) CheckLinks(baseURL string) ([]BrokenLink, error) {
|
|||
}
|
||||
|
||||
func (lc *LinkChecker) checkLinksRecursive(pageURL string, brokenLinks []BrokenLink) ([]BrokenLink, error) {
|
||||
if lc.visited[pageURL] {
|
||||
normalizedURL := lc.normalizeURL(pageURL)
|
||||
if lc.visited[normalizedURL] {
|
||||
return brokenLinks, nil
|
||||
}
|
||||
lc.visited[pageURL] = true
|
||||
lc.visited[normalizedURL] = true
|
||||
lc.pagesChecked++
|
||||
|
||||
fmt.Printf("Checking page %d: %s\n", lc.pagesChecked, pageURL)
|
||||
|
@ -73,7 +92,8 @@ func (lc *LinkChecker) checkLinksRecursive(pageURL string, brokenLinks []BrokenL
|
|||
}
|
||||
|
||||
// Recursively check links from the same domain
|
||||
if lc.isSameDomain(pageURL, link) && !lc.visited[link] {
|
||||
normalizedLink := lc.normalizeURL(link)
|
||||
if lc.isSameDomain(pageURL, link) && !lc.visited[normalizedLink] {
|
||||
recursiveLinks, err := lc.checkLinksRecursive(link, brokenLinks)
|
||||
if err != nil {
|
||||
continue // Skip this page if there's an error, but continue checking others
|
||||
|
|
Loading…
Reference in New Issue