emdb/client/imdb.go

84 lines
1.6 KiB
Go
Raw Normal View History

2023-12-29 19:10:31 +01:00
package client
import (
"fmt"
"net/http"
2023-12-30 09:19:53 +01:00
"regexp"
"strings"
2023-12-29 19:10:31 +01:00
"github.com/PuerkitoBio/goquery"
)
// Review is a plain pair of strings describing one review.
//
// NOTE(review): nothing in this file constructs or reads a Review, so the
// field descriptions below are inferred from the names; confirm with callers.
type Review struct {
	// Source presumably identifies where the review was obtained from.
	Source string

	// Review presumably holds the text of the review itself.
	Review string
}
// IMDB is a client for retrieving reviews from www.imdb.com. It has no fields
// and therefore holds no state of its own.
type IMDB struct{}
// NewIMDB returns a pointer to a freshly allocated, zero-valued IMDB client.
func NewIMDB() *IMDB {
	return new(IMDB)
}
func (i *IMDB) GetReviews(imdbID string) (map[string]string, error) {
url := fmt.Sprintf("https://www.imdb.com/title/%s/reviews", imdbID)
req, err := http.NewRequest(http.MethodGet, url, nil)
if err != nil {
return nil, err
}
res, err := http.DefaultClient.Do(req)
if err != nil {
return nil, err
}
if res.StatusCode != http.StatusOK {
return nil, fmt.Errorf("unexpected status code: %d", res.StatusCode)
}
doc, err := goquery.NewDocumentFromReader(res.Body)
if err != nil {
return nil, err
}
defer res.Body.Close()
reviews := make(map[string]string)
doc.Find(".lister-item-content").Each(func(i int, reviewNode *goquery.Selection) {
var permaLink string
reviewNode.Find("a").Each(func(i int, s *goquery.Selection) {
if s.Text() == "Permalink" {
link, exists := s.Attr("href")
if exists {
permaLink = link
}
}
})
if permaLink == "" {
return
}
2023-12-30 09:19:53 +01:00
reviews[permaLink] = ScrubIMDBReview(reviewNode.Text())
2023-12-29 19:10:31 +01:00
})
return reviews, nil
}
2023-12-30 09:19:53 +01:00
// Patterns used by ScrubIMDBReview, compiled once at package initialisation
// instead of on every call.
var (
	// imdbReviewNewlineWSRE matches a newline followed by one or more
	// whitespace characters (further newlines included).
	imdbReviewNewlineWSRE = regexp.MustCompile(`\n\s+`)

	// imdbReviewManyNewlinesRE matches a run of three or more newlines.
	imdbReviewManyNewlinesRE = regexp.MustCompile(`\n{3,}`)
)

// ScrubIMDBReview cleans the raw text of an IMDb review node.
//
// It removes every occurrence of the footer phrases "Was this review
// helpful?", "Sign in to vote." and "Permalink", then replaces each newline
// that is followed by whitespace (including further newlines) with a single
// newline. Whitespace that is not preceded by a newline is left untouched.
func ScrubIMDBReview(review string) string {
	// Remove footer phrases, one after the other in this order.
	for _, text := range []string{"Was this review helpful?", "Sign in to vote.", "Permalink"} {
		review = strings.ReplaceAll(review, text, "")
	}

	// Remove superfluous whitespace after newlines.
	review = imdbReviewNewlineWSRE.ReplaceAllString(review, "\n")

	// Remove superfluous newlines.
	// NOTE(review): because \s also matches "\n", the previous pass already
	// reduces every run of newlines to one, so this pass currently never
	// matches. It is kept so the output stays unchanged if the first pattern
	// is ever narrowed to preserve paragraph breaks.
	review = imdbReviewManyNewlinesRE.ReplaceAllString(review, "\n\n")

	return review
}