yogai/fetcher/fetcher.go

package fetcher

import (
	"time"

	"ewintr.nl/yogai/model"
	"ewintr.nl/yogai/storage"
	"github.com/google/uuid"
	"golang.org/x/exp/slog"
)
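
// Fetcher pulls new videos from a feed reader and full channel
// histories from YouTube, then runs each video through a channel-based
// pipeline that enriches it with metadata and a summary until it is
// ready.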
type Fetcher struct {
	interval        time.Duration
	feedRepo        storage.FeedRepository
	videoRepo       storage.VideoRepository
	feedReader      FeedReader
	channelReader   ChannelReader
	metadataFetcher MetadataFetcher
	summaryFetcher  SummaryFetcher
	feedPipeline    chan *model.Feed
	videoPipeline   chan *model.Video
	needsMetadata   chan *model.Video
	needsSummary    chan *model.Video
	logger          *slog.Logger
}
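
// NewFetch assembles a Fetcher. The pipeline channels are buffered so
// the workers can run slightly ahead of each other.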
func NewFetch(feedRepo storage.FeedRepository, videoRepo storage.VideoRepository, channelReader ChannelReader, feedReader FeedReader, interval time.Duration, metadataFetcher MetadataFetcher, summaryFetcher SummaryFetcher, logger *slog.Logger) *Fetcher {
	return &Fetcher{
		interval:        interval,
		feedRepo:        feedRepo,
		videoRepo:       videoRepo,
		channelReader:   channelReader,
		feedReader:      feedReader,
		metadataFetcher: metadataFetcher,
		summaryFetcher:  summaryFetcher,
		feedPipeline:    make(chan *model.Feed, 10),
		videoPipeline:   make(chan *model.Video, 10),
		needsMetadata:   make(chan *model.Video, 10),
		needsSummary:    make(chan *model.Video, 10),
		logger:          logger,
	}
}
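
// Run launches the background workers and then consumes the video
// pipeline, advancing each video from new through metadata and summary
// to ready and persisting every status change.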
func (f *Fetcher) Run() {
	go f.FetchHistoricalVideos()
	go f.FindNewFeeds()
	go f.ReadFeeds()
	go f.MetadataFetcher()
	go f.SummaryFetcher()
	go f.FindUnprocessed()

	f.logger.Info("started videoPipeline")
	for video := range f.videoPipeline {
		switch video.Status {
		case model.StatusNew:
			f.needsMetadata <- video
		case model.StatusHasMetadata:
			f.needsSummary <- video
		case model.StatusHasSummary:
			video.Status = model.StatusReady
			f.logger.Info("video is ready", slog.String("id", video.ID.String()))
		}
		if err := f.videoRepo.Save(video); err != nil {
			f.logger.Error("failed to save video", err)
			continue
		}
	}
}
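
// FindNewFeeds queues all feeds that are still marked new for
// historical video fetching.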
func (f *Fetcher) FindNewFeeds() {
	f.logger.Info("looking for new feeds")
	feeds, err := f.feedRepo.FindByStatus(model.FeedStatusNew)
	if err != nil {
		f.logger.Error("failed to fetch feeds", err)
		return
	}
	for _, feed := range feeds {
		f.feedPipeline <- feed
	}
}
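
// FetchHistoricalVideos pages through the complete upload history of
// every feed on the feed pipeline and marks the feed ready afterwards.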
func (f *Fetcher) FetchHistoricalVideos() {
	f.logger.Info("started historical video fetcher")
	for feed := range f.feedPipeline {
		f.logger.Info("fetching historical videos", slog.String("channelid", string(feed.YoutubeChannelID)))
		token := ""
		for {
			token = f.FetchHistoricalVideoPage(feed.YoutubeChannelID, token)
			if token == "" {
				break
			}
		}
		feed.Status = model.FeedStatusReady
		if err := f.feedRepo.Save(feed); err != nil {
			f.logger.Error("failed to save feed", err)
			continue
		}
	}
}
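
// FetchHistoricalVideoPage fetches a single page of videos for a
// channel, stores and queues each of them, and returns the token of
// the next page, or "" when there are no more pages or the search
// failed.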
func (f *Fetcher) FetchHistoricalVideoPage(channelID model.YoutubeChannelID, pageToken string) string {
	f.logger.Info("fetching historical video page", slog.String("channelid", string(channelID)), slog.String("pagetoken", pageToken))
	ytIDs, pageToken, err := f.channelReader.Search(channelID, pageToken)
	if err != nil {
		f.logger.Error("failed to fetch channel", err)
		return ""
	}
	for _, ytID := range ytIDs {
		video := &model.Video{
			ID:               uuid.New(),
			Status:           model.StatusNew,
			YoutubeID:        ytID,
			YoutubeChannelID: channelID,
		}
		if err := f.videoRepo.Save(video); err != nil {
			f.logger.Error("failed to save video", err)
			continue
		}
		f.videoPipeline <- video
	}
	f.logger.Info("fetched historical video page", slog.String("channelid", string(channelID)), slog.String("pagetoken", pageToken), slog.Int("count", len(ytIDs)))
	return pageToken
}
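
// FindUnprocessed requeues videos that are stuck in an intermediate
// status, for instance after a restart.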
func (f *Fetcher) FindUnprocessed() {
	f.logger.Info("looking for unprocessed videos")
	videos, err := f.videoRepo.FindByStatus(model.StatusNew, model.StatusHasMetadata)
	if err != nil {
		f.logger.Error("failed to fetch unprocessed videos", err)
		return
	}
	f.logger.Info("found unprocessed videos", slog.Int("count", len(videos)))
	for _, video := range videos {
		f.videoPipeline <- video
	}
}
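
// ReadFeeds polls the feed reader at the configured interval, stores
// every unread entry as a new video, queues it, and marks the entry as
// read.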
func (f *Fetcher) ReadFeeds() {
	f.logger.Info("started feed reader")
	ticker := time.NewTicker(f.interval)
	for range ticker.C {
		entries, err := f.feedReader.Unread()
		if err != nil {
			f.logger.Error("failed to fetch unread entries", err)
			continue
		}
		f.logger.Info("fetched unread entries", slog.Int("count", len(entries)))
		if len(entries) == 0 {
			continue
		}
		for _, entry := range entries {
			video := &model.Video{
				ID:               uuid.New(),
				Status:           model.StatusNew,
				YoutubeID:        model.YoutubeVideoID(entry.YoutubeID),
				YoutubeChannelID: model.YoutubeChannelID(entry.YoutubeChannelID),
			}
			if err := f.videoRepo.Save(video); err != nil {
				f.logger.Error("failed to save video", err)
				continue
			}
			f.videoPipeline <- video
			if err := f.feedReader.MarkRead(entry.EntryID); err != nil {
				f.logger.Error("failed to mark entry as read", err)
				continue
			}
		}
	}
}
func (f *Fetcher) MetadataFetcher() {
	f.logger.Info("started metadata fetcher")
	buffer := []*model.Video{}
	timeout := time.NewTimer(10 * time.Second)
	fetch := make(chan []*model.Video)
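	// Worker: fetch metadata for a whole batch and save the enriched videos.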
	go func() {
		for videos := range fetch {
			f.logger.Info("fetching metadata", slog.Int("count", len(videos)))
			ids := make([]model.YoutubeVideoID, 0, len(videos))
			for _, video := range videos {
				ids = append(ids, video.YoutubeID)
			}
			mds, err := f.metadataFetcher.FetchMetadata(ids)
			if err != nil {
				f.logger.Error("failed to fetch metadata", err)
				continue
			}
			for _, video := range videos {
				md := mds[video.YoutubeID]
				video.YoutubeTitle = md.Title
				video.YoutubeDescription = md.Description
				video.YoutubeDuration = md.Duration
				video.YoutubePublishedAt = md.PublishedAt
				video.Status = model.StatusHasMetadata
				if err := f.videoRepo.Save(video); err != nil {
					f.logger.Error("failed to save video", err)
					continue
				}
			}
			f.logger.Info("fetched metadata", slog.Int("count", len(videos)))
		}
	}()
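
	// Collect incoming videos until the batch is full or the timer
	// fires, then hand a copy of the buffer to the worker.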
	for {
		select {
		case video := <-f.needsMetadata:
			timeout.Reset(10 * time.Second)
			buffer = append(buffer, video)
			if len(buffer) >= 50 {
				batch := make([]*model.Video, len(buffer))
				copy(batch, buffer)
				fetch <- batch
				buffer = []*model.Video{}
			}
		case <-timeout.C:
			if len(buffer) == 0 {
				continue
			}
			batch := make([]*model.Video, len(buffer))
			copy(batch, buffer)
			fetch <- batch
			buffer = []*model.Video{}
		}
	}
}
func (f *Fetcher) SummaryFetcher() {
	for video := range f.needsSummary {
		f.logger.Info("fetching summary", slog.String("id", video.ID.String()))
		if err := f.summaryFetcher.FetchSummary(video); err != nil {
			f.logger.Error("failed to fetch summary", err)
			continue
		}
		video.Status = model.StatusHasSummary
		f.logger.Info("fetched summary", slog.String("id", video.ID.String()))
		f.videoPipeline <- video
	}
}