emdb/client/imdb.go

98 lines
2.1 KiB
Go
Raw Normal View History

2023-12-29 19:10:31 +01:00
package client
import (
"fmt"
"net/http"
2023-12-30 09:19:53 +01:00
"regexp"
2024-01-17 07:57:52 +01:00
"strconv"
2023-12-30 09:19:53 +01:00
"strings"
2023-12-29 19:10:31 +01:00
2024-09-17 07:43:12 +02:00
"go-mod.ewintr.nl/emdb/storage"
2023-12-29 19:10:31 +01:00
"github.com/PuerkitoBio/goquery"
2024-01-17 07:57:52 +01:00
"github.com/google/uuid"
2023-12-29 19:10:31 +01:00
)
// IMDB is a client for fetching movie reviews from imdb.com.
// It has no fields, so its zero value is ready to use.
type IMDB struct{}
// NewIMDB returns a pointer to a freshly allocated IMDB client.
func NewIMDB() *IMDB {
	return new(IMDB)
}
2024-03-09 13:18:51 +01:00
func (i *IMDB) GetReviews(m storage.Movie) ([]storage.Review, error) {
2024-01-17 07:57:52 +01:00
url := fmt.Sprintf("https://www.imdb.com/title/%s/reviews", m.IMDBID)
2023-12-29 19:10:31 +01:00
req, err := http.NewRequest(http.MethodGet, url, nil)
if err != nil {
return nil, err
}
res, err := http.DefaultClient.Do(req)
if err != nil {
return nil, err
}
if res.StatusCode != http.StatusOK {
return nil, fmt.Errorf("unexpected status code: %d", res.StatusCode)
}
doc, err := goquery.NewDocumentFromReader(res.Body)
if err != nil {
return nil, err
}
defer res.Body.Close()
2024-03-09 13:18:51 +01:00
reviews := make([]storage.Review, 0)
2023-12-29 19:10:31 +01:00
doc.Find(".lister-item-content").Each(func(i int, reviewNode *goquery.Selection) {
var permaLink string
reviewNode.Find("a").Each(func(i int, s *goquery.Selection) {
if s.Text() == "Permalink" {
link, exists := s.Attr("href")
if exists {
permaLink = link
}
}
})
if permaLink == "" {
return
}
2024-01-17 07:57:52 +01:00
rat, rev := ScrubIMDBReview(reviewNode.Text())
2024-03-09 13:18:51 +01:00
reviews = append(reviews, storage.Review{
2024-01-17 07:57:52 +01:00
ID: uuid.New().String(),
MovieID: m.ID,
2024-03-09 13:18:51 +01:00
Source: storage.ReviewSourceIMDB,
2024-01-17 07:57:52 +01:00
URL: fmt.Sprintf("https://www.imdb.com%s", permaLink),
Review: rev,
MovieRating: rat,
})
2023-12-29 19:10:31 +01:00
})
return reviews, nil
}
2023-12-30 09:19:53 +01:00
2024-01-17 07:57:52 +01:00
// Patterns used by ScrubIMDBReview. They are compiled once at package
// initialization rather than on every call, since the function runs once
// per scraped review.
var (
	// imdbReviewIndentRe matches a newline followed by any run of whitespace
	// (which may itself contain further newlines).
	imdbReviewIndentRe = regexp.MustCompile(`\n\s+`)
	// imdbReviewNewlinesRe matches three or more consecutive newlines.
	imdbReviewNewlinesRe = regexp.MustCompile(`\n{3,}`)
	// imdbReviewRatingRe matches a "<digits>/10" rating that ends a line.
	imdbReviewRatingRe = regexp.MustCompile(`(\d+)/10\n`)
)

// ScrubIMDBReview cleans the raw text of a scraped IMDB review and extracts
// its rating.
//
// It removes the footer phrases "Was this review helpful?", "Sign in to
// vote." and "Permalink", collapses every newline followed by whitespace
// into a single newline, and then looks for the first "<digits>/10" followed
// by a newline. When found, the digits are returned as the rating and every
// occurrence of that exact matched text is removed from the review.
//
// The returned rating is 0 when no rating is present or when the digits do
// not fit in an int. The second return value is the cleaned review text.
func ScrubIMDBReview(review string) (int, string) {
	// remove footer
	for _, text := range []string{"Was this review helpful?", "Sign in to vote.", "Permalink"} {
		review = strings.ReplaceAll(review, text, "")
	}

	// remove superfluous whitespace
	review = imdbReviewIndentRe.ReplaceAllString(review, "\n")

	// remove superfluous newlines
	// NOTE(review): the previous step already collapses consecutive newlines
	// into one, so this pattern should not match anymore; kept as a safety net.
	review = imdbReviewNewlinesRe.ReplaceAllString(review, "\n\n")

	var rating int
	if match := imdbReviewRatingRe.FindStringSubmatch(review); len(match) > 0 {
		// The pattern guarantees digits only, so Atoi can fail solely on
		// overflow; treat such a value as "no rating" instead of a clamped one.
		if n, err := strconv.Atoi(match[1]); err == nil {
			rating = n
		}
		review = strings.ReplaceAll(review, match[0], "")
	}

	return rating, review
}