split fetch and process

This commit is contained in:
Erik Winter 2023-07-06 13:25:51 +02:00
parent 3c451ae0a1
commit 5d125c0ac1
14 changed files with 173 additions and 87 deletions

View File

@ -1,4 +1,4 @@
package fetcher package fetch
import "ewintr.nl/yogai/model" import "ewintr.nl/yogai/model"

View File

@ -1,7 +1,6 @@
package fetcher package fetch
import ( import (
"context"
"time" "time"
"ewintr.nl/yogai/model" "ewintr.nl/yogai/model"
@ -12,34 +11,30 @@ import (
type Fetcher struct { type Fetcher struct {
interval time.Duration interval time.Duration
feedRepo storage.FeedRepository feedRepo storage.FeedRelRepository
videoRepo storage.VideoRepository videoRepo storage.VideoRelRepository
vectorClient *storage.Weaviate
feedReader FeedReader feedReader FeedReader
channelReader ChannelReader channelReader ChannelReader
metadataFetcher MetadataFetcher metadataFetcher MetadataFetcher
summaryFetcher SummaryFetcher
feedPipeline chan *model.Feed feedPipeline chan *model.Feed
videoPipeline chan *model.Video videoPipeline chan *model.Video
needsMetadata chan *model.Video needsMetadata chan *model.Video
needsSummary chan *model.Video out chan *model.Video
logger *slog.Logger logger *slog.Logger
} }
func NewFetch(feedRepo storage.FeedRepository, videoRepo storage.VideoRepository, channelReader ChannelReader, feedReader FeedReader, wvClient *storage.Weaviate, interval time.Duration, metadataFetcher MetadataFetcher, summaryFetcher SummaryFetcher, logger *slog.Logger) *Fetcher { func NewFetch(feedRepo storage.FeedRelRepository, videoRepo storage.VideoRelRepository, channelReader ChannelReader, feedReader FeedReader, interval time.Duration, metadataFetcher MetadataFetcher, logger *slog.Logger) *Fetcher {
return &Fetcher{ return &Fetcher{
interval: interval, interval: interval,
feedRepo: feedRepo, feedRepo: feedRepo,
videoRepo: videoRepo, videoRepo: videoRepo,
vectorClient: wvClient,
channelReader: channelReader, channelReader: channelReader,
feedReader: feedReader, feedReader: feedReader,
metadataFetcher: metadataFetcher, metadataFetcher: metadataFetcher,
summaryFetcher: summaryFetcher,
feedPipeline: make(chan *model.Feed, 10), feedPipeline: make(chan *model.Feed, 10),
videoPipeline: make(chan *model.Video, 10), videoPipeline: make(chan *model.Video, 10),
needsMetadata: make(chan *model.Video, 10), needsMetadata: make(chan *model.Video, 10),
needsSummary: make(chan *model.Video, 10), out: make(chan *model.Video),
logger: logger, logger: logger,
} }
} }
@ -50,34 +45,28 @@ func (f *Fetcher) Run() {
go f.ReadFeeds() go f.ReadFeeds()
go f.MetadataFetcher() go f.MetadataFetcher()
go f.SummaryFetcher()
go f.FindUnprocessed() go f.FindUnprocessed()
f.logger.Info("started videoPipeline") f.logger.Info("started videoPipeline")
for { for {
select { select {
case video := <-f.videoPipeline: case video := <-f.videoPipeline:
switch video.Status {
case model.StatusNew:
f.needsMetadata <- video
case model.StatusHasMetadata:
// f.needsSummary <- video
//case model.StatusHasSummary:
video.Status = model.StatusReady
f.logger.Info("video is ready", slog.String("id", video.ID.String()))
if err := f.vectorClient.Save(context.Background(), video); err != nil {
f.logger.Error("failed to save video in vector db", err)
continue
}
}
if err := f.videoRepo.Save(video); err != nil { if err := f.videoRepo.Save(video); err != nil {
f.logger.Error("failed to save video in normal db", err) f.logger.Error("failed to save video in normal db", err)
continue continue
} }
switch video.Status {
case model.StatusNew:
f.needsMetadata <- video
case model.StatusFetched:
f.out <- video
}
}
}
} }
} func (f *Fetcher) Out() chan *model.Video {
return f.out
} }
func (f *Fetcher) FindNewFeeds() { func (f *Fetcher) FindNewFeeds() {
@ -93,7 +82,7 @@ func (f *Fetcher) FindNewFeeds() {
} }
func (f *Fetcher) FetchHistoricalVideos() { func (f *Fetcher) FetchHistoricalVideos() {
f.logger.Info("started historical video fetcher") f.logger.Info("started historical video fetch")
for feed := range f.feedPipeline { for feed := range f.feedPipeline {
f.logger.Info("fetching historical videos", slog.String("channelid", string(feed.YoutubeChannelID))) f.logger.Info("fetching historical videos", slog.String("channelid", string(feed.YoutubeChannelID)))
@ -139,7 +128,7 @@ func (f *Fetcher) FetchHistoricalVideoPage(channelID model.YoutubeChannelID, pag
func (f *Fetcher) FindUnprocessed() { func (f *Fetcher) FindUnprocessed() {
f.logger.Info("looking for unprocessed videos") f.logger.Info("looking for unprocessed videos")
videos, err := f.videoRepo.FindByStatus(model.StatusNew, model.StatusHasMetadata) videos, err := f.videoRepo.FindByStatus(model.StatusNew, model.StatusFetched)
if err != nil { if err != nil {
f.logger.Error("failed to fetch unprocessed videos", err) f.logger.Error("failed to fetch unprocessed videos", err)
return return
@ -185,7 +174,7 @@ func (f *Fetcher) ReadFeeds() {
} }
func (f *Fetcher) MetadataFetcher() { func (f *Fetcher) MetadataFetcher() {
f.logger.Info("started metadata fetcher") f.logger.Info("started metadata fetch")
buffer := []*model.Video{} buffer := []*model.Video{}
timeout := time.NewTimer(10 * time.Second) timeout := time.NewTimer(10 * time.Second)
@ -209,7 +198,7 @@ func (f *Fetcher) MetadataFetcher() {
video.YoutubeDescription = md.Description video.YoutubeDescription = md.Description
video.YoutubeDuration = md.Duration video.YoutubeDuration = md.Duration
video.YoutubePublishedAt = md.PublishedAt video.YoutubePublishedAt = md.PublishedAt
video.Status = model.StatusHasMetadata video.Status = model.StatusFetched
if err := f.videoRepo.Save(video); err != nil { if err := f.videoRepo.Save(video); err != nil {
f.logger.Error("failed to save video", err) f.logger.Error("failed to save video", err)
@ -242,19 +231,3 @@ func (f *Fetcher) MetadataFetcher() {
} }
} }
} }
func (f *Fetcher) SummaryFetcher() {
for {
select {
case video := <-f.needsSummary:
f.logger.Info("fetching summary", slog.String("id", video.ID.String()))
if err := f.summaryFetcher.FetchSummary(video); err != nil {
f.logger.Error("failed to fetch summary", err)
continue
}
video.Status = model.StatusHasSummary
f.logger.Info("fetched summary", slog.String("id", video.ID.String()))
f.videoPipeline <- video
}
}
}

View File

@ -1,4 +1,4 @@
package fetcher package fetch
import "ewintr.nl/yogai/model" import "ewintr.nl/yogai/model"

View File

@ -1,4 +1,4 @@
package fetcher package fetch
import ( import (
"strings" "strings"

View File

@ -1,4 +1,4 @@
package fetcher package fetch
import "ewintr.nl/yogai/model" import "ewintr.nl/yogai/model"

View File

@ -1,4 +1,4 @@
package fetcher package fetch
import ( import (
"strings" "strings"

View File

@ -1,14 +1,15 @@
package handler package handler
import ( import (
"ewintr.nl/yogai/storage"
"fmt" "fmt"
"golang.org/x/exp/slog"
"miniflux.app/logger"
"net/http" "net/http"
"net/http/httptest" "net/http/httptest"
"path" "path"
"strings" "strings"
"ewintr.nl/yogai/storage"
"golang.org/x/exp/slog"
"miniflux.app/logger"
) )
type Server struct { type Server struct {
@ -16,7 +17,7 @@ type Server struct {
logger *slog.Logger logger *slog.Logger
} }
func NewServer(videoRepo storage.VideoRepository, logger *slog.Logger) *Server { func NewServer(videoRepo storage.VideoRelRepository, logger *slog.Logger) *Server {
return &Server{ return &Server{
apis: map[string]http.Handler{ apis: map[string]http.Handler{
"video": NewVideoAPI(videoRepo, logger), "video": NewVideoAPI(videoRepo, logger),

View File

@ -12,11 +12,11 @@ import (
) )
type VideoAPI struct { type VideoAPI struct {
videoRepo storage.VideoRepository videoRepo storage.VideoRelRepository
logger *slog.Logger logger *slog.Logger
} }
func NewVideoAPI(videoRepo storage.VideoRepository, logger *slog.Logger) *VideoAPI { func NewVideoAPI(videoRepo storage.VideoRelRepository, logger *slog.Logger) *VideoAPI {
return &VideoAPI{ return &VideoAPI{
videoRepo: videoRepo, videoRepo: videoRepo,
logger: logger, logger: logger,

View File

@ -6,8 +6,7 @@ type VideoStatus string
const ( const (
StatusNew VideoStatus = "new" StatusNew VideoStatus = "new"
StatusHasMetadata VideoStatus = "has_metadata" StatusFetched VideoStatus = "fetched"
StatusHasSummary VideoStatus = "has_summary"
StatusReady VideoStatus = "ready" StatusReady VideoStatus = "ready"
) )

View File

@ -1,4 +1,4 @@
package fetcher package process
import ( import (
"context" "context"
@ -8,23 +8,27 @@ import (
"github.com/sashabaranov/go-openai" "github.com/sashabaranov/go-openai"
) )
type OpenAISummarizer struct {
client *openai.Client
}
func NewOpenAISummarizer(client *openai.Client) *OpenAISummarizer {
return &OpenAISummarizer{
client: client,
}
}
func (sum *OpenAISummarizer) Name() string {
return "openai summarizer"
}
func (sum *OpenAISummarizer) Do(ctx context.Context, video *model.Video) error {
const summarizePrompt = `You are an helpful assistant. Your task is to extract all text that refers to the content of a yoga workout video from the description a user gives you. const summarizePrompt = `You are an helpful assistant. Your task is to extract all text that refers to the content of a yoga workout video from the description a user gives you.
You will not add introductory sentences like "This text is about", or "Summary of...". Just give the words verbatim. Trim any white space back to a simple space You will not add introductory sentences like "This text is about", or "Summary of...". Just give the words verbatim. Trim any white space back to a simple space
` `
type OpenAI struct { resp, err := sum.client.CreateChatCompletion(
client *openai.Client ctx,
}
func NewOpenAI(apiKey string) *OpenAI {
return &OpenAI{
client: openai.NewClient(apiKey),
}
}
func (o *OpenAI) FetchSummary(video *model.Video) error {
resp, err := o.client.CreateChatCompletion(
context.Background(),
openai.ChatCompletionRequest{ openai.ChatCompletionRequest{
Model: openai.GPT4, Model: openai.GPT4,
Messages: []openai.ChatCompletionMessage{ Messages: []openai.ChatCompletionMessage{

85
process/processor.go Normal file
View File

@ -0,0 +1,85 @@
package process
import (
"context"
"ewintr.nl/yogai/model"
"ewintr.nl/yogai/storage"
"github.com/sashabaranov/go-openai"
"golang.org/x/exp/slog"
)
type VideoProcessor interface {
Name() string
Do(ctx context.Context, video *model.Video) error
}
type Processors struct {
procs map[string]VideoProcessor
}
func NewProcessors(openAIClient *openai.Client) *Processors {
return &Processors{
procs: map[string]VideoProcessor{
"summarizer": NewOpenAISummarizer(openAIClient),
},
}
}
func (p *Processors) Next(video *model.Video) VideoProcessor {
if video.Summary == "" {
return p.procs["summarizer"]
}
return nil
}
type Pipeline struct {
in chan *model.Video
procs *Processors
logger *slog.Logger
relStorage storage.VideoRelRepository
vecStorage storage.VideoVecRepository
}
func NewPipeline(in chan *model.Video, processors *Processors, relDB storage.VideoRelRepository, vecDB storage.VideoVecRepository, logger *slog.Logger) *Pipeline {
return &Pipeline{
in: in,
procs: processors,
relStorage: relDB,
vecStorage: vecDB,
logger: logger,
}
}
func (p *Pipeline) Run() {
ctx := context.Background()
for video := range p.in {
p.Process(ctx, video)
}
}
func (p *Pipeline) Process(ctx context.Context, video *model.Video) {
p.logger.Info("processing video", slog.String("video", string(video.YoutubeID)))
for {
next := p.procs.Next(video)
if next == nil {
p.logger.Info("no more processors for video", slog.String("video", string(video.YoutubeID)))
return
}
p.logger.Info("processing video", slog.String("video", string(video.YoutubeID)), slog.String("processor", next.Name()))
if err := next.Do(context.Background(), video); err != nil {
p.logger.Error("failed to process video", slog.String("video", string(video.YoutubeID)), slog.String("processor", next.Name()), slog.String("error", err.Error()))
return
}
if err := p.relStorage.Save(video); err != nil {
p.logger.Error("failed to save video in rel db", slog.String("video", string(video.YoutubeID)), slog.String("error", err.Error()))
return
}
if err := p.vecStorage.Save(ctx, video); err != nil {
p.logger.Error("failed to save video in rel db", slog.String("video", string(video.YoutubeID)), slog.String("error", err.Error()))
return
}
}
}

View File

@ -9,9 +9,11 @@ import (
"strconv" "strconv"
"time" "time"
"ewintr.nl/yogai/fetcher" "ewintr.nl/yogai/fetch"
"ewintr.nl/yogai/handler" "ewintr.nl/yogai/handler"
"ewintr.nl/yogai/process"
"ewintr.nl/yogai/storage" "ewintr.nl/yogai/storage"
"github.com/sashabaranov/go-openai"
"golang.org/x/exp/slog" "golang.org/x/exp/slog"
"google.golang.org/api/option" "google.golang.org/api/option"
"google.golang.org/api/youtube/v3" "google.golang.org/api/youtube/v3"
@ -33,10 +35,10 @@ func main() {
logger.Error("unable to connect to postgres", err) logger.Error("unable to connect to postgres", err)
os.Exit(1) os.Exit(1)
} }
videoRepo := storage.NewPostgresVideoRepository(postgres) videoRelRepo := storage.NewPostgresVideoRepository(postgres)
feedRepo := storage.NewPostgresFeedRepository(postgres) feedRelRepo := storage.NewPostgresFeedRepository(postgres)
mflx := fetcher.NewMiniflux(fetcher.MinifluxInfo{ mflxClient := fetch.NewMiniflux(fetch.MinifluxInfo{
Endpoint: getParam("MINIFLUX_ENDPOINT", "http://localhost/v1"), Endpoint: getParam("MINIFLUX_ENDPOINT", "http://localhost/v1"),
ApiKey: getParam("MINIFLUX_APIKEY", ""), ApiKey: getParam("MINIFLUX_APIKEY", ""),
}) })
@ -47,39 +49,45 @@ func main() {
os.Exit(1) os.Exit(1)
} }
ytClient, err := youtube.NewService(ctx, option.WithAPIKey(getParam("YOUTUBE_API_KEY", ""))) yt, err := youtube.NewService(ctx, option.WithAPIKey(getParam("YOUTUBE_API_KEY", "")))
if err != nil { if err != nil {
logger.Error("unable to create youtube service", err) logger.Error("unable to create youtube service", err)
os.Exit(1) os.Exit(1)
} }
yt := fetcher.NewYoutube(ytClient) ytClient := fetch.NewYoutube(yt)
openaiKey := getParam("OPENAI_API_KEY", "") openaiKey := getParam("OPENAI_API_KEY", "")
openAIClient := fetcher.NewOpenAI(openaiKey) openAIClient := openai.NewClient(openaiKey)
wvResetSchema := getParam("WEAVIATE_RESET_SCHEMA", "false") == "true" wvResetSchema := getParam("WEAVIATE_RESET_SCHEMA", "false") == "true"
wv, err := storage.NewWeaviate(getParam("WEAVIATE_HOST", ""), getParam("WEAVIATE_API_KEY", ""), openaiKey) wvClient, err := storage.NewWeaviate(getParam("WEAVIATE_HOST", ""), getParam("WEAVIATE_API_KEY", ""), openaiKey)
if err != nil { if err != nil {
logger.Error("unable to create weaviate client", err) logger.Error("unable to create weaviate client", err)
os.Exit(1) os.Exit(1)
} }
if wvResetSchema { if wvResetSchema {
logger.Info("resetting weaviate schema") logger.Info("resetting weaviate schema")
if err := wv.ResetSchema(); err != nil { if err := wvClient.ResetSchema(); err != nil {
logger.Error("unable to reset weaviate schema", err) logger.Error("unable to reset weaviate schema", err)
os.Exit(1) os.Exit(1)
} }
} }
go fetcher.NewFetch(feedRepo, videoRepo, yt, mflx, wv, fetchInterval, yt, openAIClient, logger).Run() fetcher := fetch.NewFetch(feedRelRepo, videoRelRepo, ytClient, mflxClient, fetchInterval, ytClient, logger)
go fetcher.Run()
logger.Info("fetch service started") logger.Info("fetch service started")
procs := process.NewProcessors(openAIClient)
pipeline := process.NewPipeline(fetcher.Out(), procs, videoRelRepo, wvClient, logger)
go pipeline.Run()
logger.Info("processing service started")
port, err := strconv.Atoi(getParam("API_PORT", "8080")) port, err := strconv.Atoi(getParam("API_PORT", "8080"))
if err != nil { if err != nil {
logger.Error("invalid port", err) logger.Error("invalid port", err)
os.Exit(1) os.Exit(1)
} }
go http.ListenAndServe(fmt.Sprintf(":%d", port), handler.NewServer(videoRepo, logger)) go http.ListenAndServe(fmt.Sprintf(":%d", port), handler.NewServer(videoRelRepo, logger))
logger.Info("http server started") logger.Info("http server started")
done := make(chan os.Signal) done := make(chan os.Signal)

View File

@ -42,4 +42,14 @@ ADD COLUMN published_at VARCHAR(255)`,
`ALTER TABLE video RENAME COLUMN title TO youtube_title`, `ALTER TABLE video RENAME COLUMN title TO youtube_title`,
`ALTER TABLE video RENAME COLUMN description TO youtube_description`, `ALTER TABLE video RENAME COLUMN description TO youtube_description`,
`ALTER TABLE video RENAME COLUMN youtube_published_id TO youtube_published_at`, `ALTER TABLE video RENAME COLUMN youtube_published_id TO youtube_published_at`,
`UPDATE video SET status = 'new'`,
`CREATE TYPE video_status_new AS ENUM ('new', 'fetched', 'ready')`,
`BEGIN;
ALTER TABLE video ADD COLUMN status_new video_status_new;
UPDATE video SET status_new = status::text::video_status_new;
ALTER TABLE video DROP COLUMN status;
ALTER TABLE video RENAME COLUMN status_new TO status;
COMMIT;`,
`DROP TYPE video_status`,
`ALTER TYPE video_status_new RENAME TO video_status`,
} }

View File

@ -1,15 +1,21 @@
package storage package storage
import ( import (
"context"
"ewintr.nl/yogai/model" "ewintr.nl/yogai/model"
) )
type FeedRepository interface { type FeedRelRepository interface {
Save(feed *model.Feed) error Save(feed *model.Feed) error
FindByStatus(statuses ...model.FeedStatus) ([]*model.Feed, error) FindByStatus(statuses ...model.FeedStatus) ([]*model.Feed, error)
} }
type VideoRepository interface { type VideoRelRepository interface {
Save(video *model.Video) error Save(video *model.Video) error
FindByStatus(statuses ...model.VideoStatus) ([]*model.Video, error) FindByStatus(statuses ...model.VideoStatus) ([]*model.Video, error)
} }
type VideoVecRepository interface {
Save(ctx context.Context, video *model.Video) error
}