feat: Add URL normalization to handle trailing slashes and anchors
This commit is contained in:
parent
667d102277
commit
d49e358c58
|
@ -29,6 +29,24 @@ type BrokenLink struct {
|
||||||
Error string
|
Error string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (lc *LinkChecker) normalizeURL(rawURL string) string {
|
||||||
|
u, err := url.Parse(rawURL)
|
||||||
|
if err != nil {
|
||||||
|
return rawURL
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove fragment
|
||||||
|
u.Fragment = ""
|
||||||
|
|
||||||
|
// Ensure consistent trailing slash handling
|
||||||
|
u.Path = strings.TrimSuffix(u.Path, "/")
|
||||||
|
if u.Path == "" {
|
||||||
|
u.Path = "/"
|
||||||
|
}
|
||||||
|
|
||||||
|
return u.String()
|
||||||
|
}
|
||||||
|
|
||||||
func (lc *LinkChecker) isSameDomain(baseURL, link string) bool {
|
func (lc *LinkChecker) isSameDomain(baseURL, link string) bool {
|
||||||
base, err := url.Parse(baseURL)
|
base, err := url.Parse(baseURL)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -48,10 +66,11 @@ func (lc *LinkChecker) CheckLinks(baseURL string) ([]BrokenLink, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (lc *LinkChecker) checkLinksRecursive(pageURL string, brokenLinks []BrokenLink) ([]BrokenLink, error) {
|
func (lc *LinkChecker) checkLinksRecursive(pageURL string, brokenLinks []BrokenLink) ([]BrokenLink, error) {
|
||||||
if lc.visited[pageURL] {
|
normalizedURL := lc.normalizeURL(pageURL)
|
||||||
|
if lc.visited[normalizedURL] {
|
||||||
return brokenLinks, nil
|
return brokenLinks, nil
|
||||||
}
|
}
|
||||||
lc.visited[pageURL] = true
|
lc.visited[normalizedURL] = true
|
||||||
lc.pagesChecked++
|
lc.pagesChecked++
|
||||||
|
|
||||||
fmt.Printf("Checking page %d: %s\n", lc.pagesChecked, pageURL)
|
fmt.Printf("Checking page %d: %s\n", lc.pagesChecked, pageURL)
|
||||||
|
@ -73,7 +92,8 @@ func (lc *LinkChecker) checkLinksRecursive(pageURL string, brokenLinks []BrokenL
|
||||||
}
|
}
|
||||||
|
|
||||||
// Recursively check links from the same domain
|
// Recursively check links from the same domain
|
||||||
if lc.isSameDomain(pageURL, link) && !lc.visited[link] {
|
normalizedLink := lc.normalizeURL(link)
|
||||||
|
if lc.isSameDomain(pageURL, link) && !lc.visited[normalizedLink] {
|
||||||
recursiveLinks, err := lc.checkLinksRecursive(link, brokenLinks)
|
recursiveLinks, err := lc.checkLinksRecursive(link, brokenLinks)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
continue // Skip this page if there's an error, but continue checking others
|
continue // Skip this page if there's an error, but continue checking others
|
||||||
|
|
Loading…
Reference in New Issue