split fetch and process

This commit is contained in:
Erik Winter 2023-07-06 13:25:51 +02:00
parent 3c451ae0a1
commit 5d125c0ac1
14 changed files with 173 additions and 87 deletions

View File

@ -1,4 +1,4 @@
package fetcher
package fetch
import "ewintr.nl/yogai/model"

View File

@ -1,7 +1,6 @@
package fetcher
package fetch
import (
"context"
"time"
"ewintr.nl/yogai/model"
@ -12,34 +11,30 @@ import (
type Fetcher struct {
interval time.Duration
feedRepo storage.FeedRepository
videoRepo storage.VideoRepository
vectorClient *storage.Weaviate
feedRepo storage.FeedRelRepository
videoRepo storage.VideoRelRepository
feedReader FeedReader
channelReader ChannelReader
metadataFetcher MetadataFetcher
summaryFetcher SummaryFetcher
feedPipeline chan *model.Feed
videoPipeline chan *model.Video
needsMetadata chan *model.Video
needsSummary chan *model.Video
out chan *model.Video
logger *slog.Logger
}
func NewFetch(feedRepo storage.FeedRepository, videoRepo storage.VideoRepository, channelReader ChannelReader, feedReader FeedReader, wvClient *storage.Weaviate, interval time.Duration, metadataFetcher MetadataFetcher, summaryFetcher SummaryFetcher, logger *slog.Logger) *Fetcher {
func NewFetch(feedRepo storage.FeedRelRepository, videoRepo storage.VideoRelRepository, channelReader ChannelReader, feedReader FeedReader, interval time.Duration, metadataFetcher MetadataFetcher, logger *slog.Logger) *Fetcher {
return &Fetcher{
interval: interval,
feedRepo: feedRepo,
videoRepo: videoRepo,
vectorClient: wvClient,
channelReader: channelReader,
feedReader: feedReader,
metadataFetcher: metadataFetcher,
summaryFetcher: summaryFetcher,
feedPipeline: make(chan *model.Feed, 10),
videoPipeline: make(chan *model.Video, 10),
needsMetadata: make(chan *model.Video, 10),
needsSummary: make(chan *model.Video, 10),
out: make(chan *model.Video),
logger: logger,
}
}
@ -50,36 +45,30 @@ func (f *Fetcher) Run() {
go f.ReadFeeds()
go f.MetadataFetcher()
go f.SummaryFetcher()
go f.FindUnprocessed()
f.logger.Info("started videoPipeline")
for {
select {
case video := <-f.videoPipeline:
switch video.Status {
case model.StatusNew:
f.needsMetadata <- video
case model.StatusHasMetadata:
// f.needsSummary <- video
//case model.StatusHasSummary:
video.Status = model.StatusReady
f.logger.Info("video is ready", slog.String("id", video.ID.String()))
if err := f.vectorClient.Save(context.Background(), video); err != nil {
f.logger.Error("failed to save video in vector db", err)
continue
}
}
if err := f.videoRepo.Save(video); err != nil {
f.logger.Error("failed to save video in normal db", err)
continue
}
switch video.Status {
case model.StatusNew:
f.needsMetadata <- video
case model.StatusFetched:
f.out <- video
}
}
}
}
func (f *Fetcher) Out() chan *model.Video {
return f.out
}
func (f *Fetcher) FindNewFeeds() {
f.logger.Info("looking for new feeds")
feeds, err := f.feedRepo.FindByStatus(model.FeedStatusNew)
@ -93,7 +82,7 @@ func (f *Fetcher) FindNewFeeds() {
}
func (f *Fetcher) FetchHistoricalVideos() {
f.logger.Info("started historical video fetcher")
f.logger.Info("started historical video fetch")
for feed := range f.feedPipeline {
f.logger.Info("fetching historical videos", slog.String("channelid", string(feed.YoutubeChannelID)))
@ -139,7 +128,7 @@ func (f *Fetcher) FetchHistoricalVideoPage(channelID model.YoutubeChannelID, pag
func (f *Fetcher) FindUnprocessed() {
f.logger.Info("looking for unprocessed videos")
videos, err := f.videoRepo.FindByStatus(model.StatusNew, model.StatusHasMetadata)
videos, err := f.videoRepo.FindByStatus(model.StatusNew, model.StatusFetched)
if err != nil {
f.logger.Error("failed to fetch unprocessed videos", err)
return
@ -185,7 +174,7 @@ func (f *Fetcher) ReadFeeds() {
}
func (f *Fetcher) MetadataFetcher() {
f.logger.Info("started metadata fetcher")
f.logger.Info("started metadata fetch")
buffer := []*model.Video{}
timeout := time.NewTimer(10 * time.Second)
@ -209,7 +198,7 @@ func (f *Fetcher) MetadataFetcher() {
video.YoutubeDescription = md.Description
video.YoutubeDuration = md.Duration
video.YoutubePublishedAt = md.PublishedAt
video.Status = model.StatusHasMetadata
video.Status = model.StatusFetched
if err := f.videoRepo.Save(video); err != nil {
f.logger.Error("failed to save video", err)
@ -242,19 +231,3 @@ func (f *Fetcher) MetadataFetcher() {
}
}
}
func (f *Fetcher) SummaryFetcher() {
for {
select {
case video := <-f.needsSummary:
f.logger.Info("fetching summary", slog.String("id", video.ID.String()))
if err := f.summaryFetcher.FetchSummary(video); err != nil {
f.logger.Error("failed to fetch summary", err)
continue
}
video.Status = model.StatusHasSummary
f.logger.Info("fetched summary", slog.String("id", video.ID.String()))
f.videoPipeline <- video
}
}
}

View File

@ -1,4 +1,4 @@
package fetcher
package fetch
import "ewintr.nl/yogai/model"

View File

@ -1,4 +1,4 @@
package fetcher
package fetch
import (
"strings"

View File

@ -1,4 +1,4 @@
package fetcher
package fetch
import "ewintr.nl/yogai/model"

View File

@ -1,4 +1,4 @@
package fetcher
package fetch
import (
"strings"

View File

@ -1,14 +1,15 @@
package handler
import (
"ewintr.nl/yogai/storage"
"fmt"
"golang.org/x/exp/slog"
"miniflux.app/logger"
"net/http"
"net/http/httptest"
"path"
"strings"
"ewintr.nl/yogai/storage"
"golang.org/x/exp/slog"
"miniflux.app/logger"
)
type Server struct {
@ -16,7 +17,7 @@ type Server struct {
logger *slog.Logger
}
func NewServer(videoRepo storage.VideoRepository, logger *slog.Logger) *Server {
func NewServer(videoRepo storage.VideoRelRepository, logger *slog.Logger) *Server {
return &Server{
apis: map[string]http.Handler{
"video": NewVideoAPI(videoRepo, logger),

View File

@ -12,11 +12,11 @@ import (
)
type VideoAPI struct {
videoRepo storage.VideoRepository
videoRepo storage.VideoRelRepository
logger *slog.Logger
}
func NewVideoAPI(videoRepo storage.VideoRepository, logger *slog.Logger) *VideoAPI {
func NewVideoAPI(videoRepo storage.VideoRelRepository, logger *slog.Logger) *VideoAPI {
return &VideoAPI{
videoRepo: videoRepo,
logger: logger,

View File

@ -5,10 +5,9 @@ import "github.com/google/uuid"
type VideoStatus string
const (
StatusNew VideoStatus = "new"
StatusHasMetadata VideoStatus = "has_metadata"
StatusHasSummary VideoStatus = "has_summary"
StatusReady VideoStatus = "ready"
StatusNew VideoStatus = "new"
StatusFetched VideoStatus = "fetched"
StatusReady VideoStatus = "ready"
)
type YoutubeVideoID string

View File

@ -1,4 +1,4 @@
package fetcher
package process
import (
"context"
@ -8,23 +8,27 @@ import (
"github.com/sashabaranov/go-openai"
)
const summarizePrompt = `You are an helpful assistant. Your task is to extract all text that refers to the content of a yoga workout video from the description a user gives you.
You will not add introductory sentences like "This text is about", or "Summary of...". Just give the words verbatim. Trim any white space back to a simple space
`
type OpenAI struct {
type OpenAISummarizer struct {
client *openai.Client
}
func NewOpenAI(apiKey string) *OpenAI {
return &OpenAI{
client: openai.NewClient(apiKey),
func NewOpenAISummarizer(client *openai.Client) *OpenAISummarizer {
return &OpenAISummarizer{
client: client,
}
}
func (o *OpenAI) FetchSummary(video *model.Video) error {
resp, err := o.client.CreateChatCompletion(
context.Background(),
func (sum *OpenAISummarizer) Name() string {
return "openai summarizer"
}
func (sum *OpenAISummarizer) Do(ctx context.Context, video *model.Video) error {
const summarizePrompt = `You are an helpful assistant. Your task is to extract all text that refers to the content of a yoga workout video from the description a user gives you.
You will not add introductory sentences like "This text is about", or "Summary of...". Just give the words verbatim. Trim any white space back to a simple space
`
resp, err := sum.client.CreateChatCompletion(
ctx,
openai.ChatCompletionRequest{
Model: openai.GPT4,
Messages: []openai.ChatCompletionMessage{

85
process/processor.go Normal file
View File

@ -0,0 +1,85 @@
package process
import (
"context"
"ewintr.nl/yogai/model"
"ewintr.nl/yogai/storage"
"github.com/sashabaranov/go-openai"
"golang.org/x/exp/slog"
)
type VideoProcessor interface {
Name() string
Do(ctx context.Context, video *model.Video) error
}
type Processors struct {
procs map[string]VideoProcessor
}
func NewProcessors(openAIClient *openai.Client) *Processors {
return &Processors{
procs: map[string]VideoProcessor{
"summarizer": NewOpenAISummarizer(openAIClient),
},
}
}
func (p *Processors) Next(video *model.Video) VideoProcessor {
if video.Summary == "" {
return p.procs["summarizer"]
}
return nil
}
type Pipeline struct {
in chan *model.Video
procs *Processors
logger *slog.Logger
relStorage storage.VideoRelRepository
vecStorage storage.VideoVecRepository
}
func NewPipeline(in chan *model.Video, processors *Processors, relDB storage.VideoRelRepository, vecDB storage.VideoVecRepository, logger *slog.Logger) *Pipeline {
return &Pipeline{
in: in,
procs: processors,
relStorage: relDB,
vecStorage: vecDB,
logger: logger,
}
}
func (p *Pipeline) Run() {
ctx := context.Background()
for video := range p.in {
p.Process(ctx, video)
}
}
func (p *Pipeline) Process(ctx context.Context, video *model.Video) {
p.logger.Info("processing video", slog.String("video", string(video.YoutubeID)))
for {
next := p.procs.Next(video)
if next == nil {
p.logger.Info("no more processors for video", slog.String("video", string(video.YoutubeID)))
return
}
p.logger.Info("processing video", slog.String("video", string(video.YoutubeID)), slog.String("processor", next.Name()))
if err := next.Do(context.Background(), video); err != nil {
p.logger.Error("failed to process video", slog.String("video", string(video.YoutubeID)), slog.String("processor", next.Name()), slog.String("error", err.Error()))
return
}
if err := p.relStorage.Save(video); err != nil {
p.logger.Error("failed to save video in rel db", slog.String("video", string(video.YoutubeID)), slog.String("error", err.Error()))
return
}
if err := p.vecStorage.Save(ctx, video); err != nil {
p.logger.Error("failed to save video in rel db", slog.String("video", string(video.YoutubeID)), slog.String("error", err.Error()))
return
}
}
}

View File

@ -9,9 +9,11 @@ import (
"strconv"
"time"
"ewintr.nl/yogai/fetcher"
"ewintr.nl/yogai/fetch"
"ewintr.nl/yogai/handler"
"ewintr.nl/yogai/process"
"ewintr.nl/yogai/storage"
"github.com/sashabaranov/go-openai"
"golang.org/x/exp/slog"
"google.golang.org/api/option"
"google.golang.org/api/youtube/v3"
@ -33,10 +35,10 @@ func main() {
logger.Error("unable to connect to postgres", err)
os.Exit(1)
}
videoRepo := storage.NewPostgresVideoRepository(postgres)
feedRepo := storage.NewPostgresFeedRepository(postgres)
videoRelRepo := storage.NewPostgresVideoRepository(postgres)
feedRelRepo := storage.NewPostgresFeedRepository(postgres)
mflx := fetcher.NewMiniflux(fetcher.MinifluxInfo{
mflxClient := fetch.NewMiniflux(fetch.MinifluxInfo{
Endpoint: getParam("MINIFLUX_ENDPOINT", "http://localhost/v1"),
ApiKey: getParam("MINIFLUX_APIKEY", ""),
})
@ -47,39 +49,45 @@ func main() {
os.Exit(1)
}
ytClient, err := youtube.NewService(ctx, option.WithAPIKey(getParam("YOUTUBE_API_KEY", "")))
yt, err := youtube.NewService(ctx, option.WithAPIKey(getParam("YOUTUBE_API_KEY", "")))
if err != nil {
logger.Error("unable to create youtube service", err)
os.Exit(1)
}
yt := fetcher.NewYoutube(ytClient)
ytClient := fetch.NewYoutube(yt)
openaiKey := getParam("OPENAI_API_KEY", "")
openAIClient := fetcher.NewOpenAI(openaiKey)
openAIClient := openai.NewClient(openaiKey)
wvResetSchema := getParam("WEAVIATE_RESET_SCHEMA", "false") == "true"
wv, err := storage.NewWeaviate(getParam("WEAVIATE_HOST", ""), getParam("WEAVIATE_API_KEY", ""), openaiKey)
wvClient, err := storage.NewWeaviate(getParam("WEAVIATE_HOST", ""), getParam("WEAVIATE_API_KEY", ""), openaiKey)
if err != nil {
logger.Error("unable to create weaviate client", err)
os.Exit(1)
}
if wvResetSchema {
logger.Info("resetting weaviate schema")
if err := wv.ResetSchema(); err != nil {
if err := wvClient.ResetSchema(); err != nil {
logger.Error("unable to reset weaviate schema", err)
os.Exit(1)
}
}
go fetcher.NewFetch(feedRepo, videoRepo, yt, mflx, wv, fetchInterval, yt, openAIClient, logger).Run()
fetcher := fetch.NewFetch(feedRelRepo, videoRelRepo, ytClient, mflxClient, fetchInterval, ytClient, logger)
go fetcher.Run()
logger.Info("fetch service started")
procs := process.NewProcessors(openAIClient)
pipeline := process.NewPipeline(fetcher.Out(), procs, videoRelRepo, wvClient, logger)
go pipeline.Run()
logger.Info("processing service started")
port, err := strconv.Atoi(getParam("API_PORT", "8080"))
if err != nil {
logger.Error("invalid port", err)
os.Exit(1)
}
go http.ListenAndServe(fmt.Sprintf(":%d", port), handler.NewServer(videoRepo, logger))
go http.ListenAndServe(fmt.Sprintf(":%d", port), handler.NewServer(videoRelRepo, logger))
logger.Info("http server started")
done := make(chan os.Signal)

View File

@ -42,4 +42,14 @@ ADD COLUMN published_at VARCHAR(255)`,
`ALTER TABLE video RENAME COLUMN title TO youtube_title`,
`ALTER TABLE video RENAME COLUMN description TO youtube_description`,
`ALTER TABLE video RENAME COLUMN youtube_published_id TO youtube_published_at`,
`UPDATE video SET status = 'new'`,
`CREATE TYPE video_status_new AS ENUM ('new', 'fetched', 'ready')`,
`BEGIN;
ALTER TABLE video ADD COLUMN status_new video_status_new;
UPDATE video SET status_new = status::text::video_status_new;
ALTER TABLE video DROP COLUMN status;
ALTER TABLE video RENAME COLUMN status_new TO status;
COMMIT;`,
`DROP TYPE video_status`,
`ALTER TYPE video_status_new RENAME TO video_status`,
}

View File

@ -1,15 +1,21 @@
package storage
import (
"context"
"ewintr.nl/yogai/model"
)
type FeedRepository interface {
type FeedRelRepository interface {
Save(feed *model.Feed) error
FindByStatus(statuses ...model.FeedStatus) ([]*model.Feed, error)
}
type VideoRepository interface {
type VideoRelRepository interface {
Save(video *model.Video) error
FindByStatus(statuses ...model.VideoStatus) ([]*model.Video, error)
}
type VideoVecRepository interface {
Save(ctx context.Context, video *model.Video) error
}