mirror of
https://github.com/gabehf/Koito.git
synced 2026-04-22 12:01:52 -07:00
Add bulk import optimization: track_lookup cache, batch inserts, BulkSubmitter
Adopts ListenBrainz-inspired patterns to speed up imports from ~24h to under 30 minutes for 49k scrobbles. Phase 1 - track_lookup cache table: - New migration (000006) adds persistent entity lookup cache - Maps normalized (artist, track, album) → (artist_id, album_id, track_id) - SubmitListen fast path: cache hit skips 18 DB queries → 2 queries - Cache populated after entity resolution, invalidated on merge/delete - Benefits both live scrobbles and imports Phase 2 - SaveListensBatch: - New batch listen insert using pgx CopyFrom → temp table → INSERT ON CONFLICT - Thousands of inserts per second vs one-at-a-time Phase 3 - BulkSubmitter: - Reusable import accelerator for all importers - Pre-deduplicates scrobbles by (artist, track, album) in memory - Worker pool (4 goroutines) for parallel entity creation on cache miss - Batch listen insertion via SaveListensBatch Phase 4 - Migrate importers: - Maloja, Spotify, LastFM, ListenBrainz importers use BulkSubmitter - Koito importer left as-is (already fast with pre-resolved IDs) Phase 5 - Skip image lookups during import: - GetArtistImage/GetAlbumImage calls fully skipped when SkipCacheImage=true - Background tasks (FetchMissingArtistImages/FetchMissingAlbumImages) backfill Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
c92e93484e
commit
8ce6ec494d
21 changed files with 1294 additions and 129 deletions
15
db/migrations/000006_track_lookup.sql
Normal file
15
db/migrations/000006_track_lookup.sql
Normal file
|
|
@ -0,0 +1,15 @@
|
||||||
|
-- +goose Up
|
||||||
|
CREATE TABLE track_lookup (
|
||||||
|
lookup_key TEXT NOT NULL PRIMARY KEY,
|
||||||
|
artist_id INT NOT NULL REFERENCES artists(id) ON DELETE CASCADE,
|
||||||
|
album_id INT NOT NULL REFERENCES releases(id) ON DELETE CASCADE,
|
||||||
|
track_id INT NOT NULL REFERENCES tracks(id) ON DELETE CASCADE,
|
||||||
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX idx_track_lookup_track_id ON track_lookup(track_id);
|
||||||
|
CREATE INDEX idx_track_lookup_artist_id ON track_lookup(artist_id);
|
||||||
|
CREATE INDEX idx_track_lookup_album_id ON track_lookup(album_id);
|
||||||
|
|
||||||
|
-- +goose Down
|
||||||
|
DROP TABLE IF EXISTS track_lookup;
|
||||||
21
db/queries/track_lookup.sql
Normal file
21
db/queries/track_lookup.sql
Normal file
|
|
@ -0,0 +1,21 @@
|
||||||
|
-- name: GetTrackLookup :one
|
||||||
|
SELECT artist_id, album_id, track_id
|
||||||
|
FROM track_lookup
|
||||||
|
WHERE lookup_key = $1;
|
||||||
|
|
||||||
|
-- name: InsertTrackLookup :exec
|
||||||
|
INSERT INTO track_lookup (lookup_key, artist_id, album_id, track_id)
|
||||||
|
VALUES ($1, $2, $3, $4)
|
||||||
|
ON CONFLICT (lookup_key) DO UPDATE SET
|
||||||
|
artist_id = EXCLUDED.artist_id,
|
||||||
|
album_id = EXCLUDED.album_id,
|
||||||
|
track_id = EXCLUDED.track_id;
|
||||||
|
|
||||||
|
-- name: DeleteTrackLookupByArtist :exec
|
||||||
|
DELETE FROM track_lookup WHERE artist_id = $1;
|
||||||
|
|
||||||
|
-- name: DeleteTrackLookupByAlbum :exec
|
||||||
|
DELETE FROM track_lookup WHERE album_id = $1;
|
||||||
|
|
||||||
|
-- name: DeleteTrackLookupByTrack :exec
|
||||||
|
DELETE FROM track_lookup WHERE track_id = $1;
|
||||||
|
|
@ -122,9 +122,11 @@ func createOrUpdateAlbumWithMbzReleaseID(ctx context.Context, d db.DB, opts Asso
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
l.Debug().Msg("Searching for album images...")
|
|
||||||
var imgid uuid.UUID
|
var imgid uuid.UUID
|
||||||
imgUrl, err := images.GetAlbumImage(ctx, images.AlbumImageOpts{
|
var imgUrl string
|
||||||
|
if !opts.SkipCacheImage {
|
||||||
|
l.Debug().Msg("Searching for album images...")
|
||||||
|
imgUrl, err = images.GetAlbumImage(ctx, images.AlbumImageOpts{
|
||||||
Artists: utils.UniqueIgnoringCase(slices.Concat(utils.FlattenMbzArtistCreditNames(release.ArtistCredit), utils.FlattenArtistNames(opts.Artists))),
|
Artists: utils.UniqueIgnoringCase(slices.Concat(utils.FlattenMbzArtistCreditNames(release.ArtistCredit), utils.FlattenArtistNames(opts.Artists))),
|
||||||
Album: release.Title,
|
Album: release.Title,
|
||||||
ReleaseMbzID: &opts.ReleaseMbzID,
|
ReleaseMbzID: &opts.ReleaseMbzID,
|
||||||
|
|
@ -132,7 +134,6 @@ func createOrUpdateAlbumWithMbzReleaseID(ctx context.Context, d db.DB, opts Asso
|
||||||
|
|
||||||
if err == nil && imgUrl != "" {
|
if err == nil && imgUrl != "" {
|
||||||
imgid = uuid.New()
|
imgid = uuid.New()
|
||||||
if !opts.SkipCacheImage {
|
|
||||||
var size ImageSize
|
var size ImageSize
|
||||||
if cfg.FullImageCacheEnabled() {
|
if cfg.FullImageCacheEnabled() {
|
||||||
size = ImageSizeFull
|
size = ImageSizeFull
|
||||||
|
|
@ -144,12 +145,10 @@ func createOrUpdateAlbumWithMbzReleaseID(ctx context.Context, d db.DB, opts Asso
|
||||||
if err != nil {
|
if err != nil {
|
||||||
l.Err(err).Msg("createOrUpdateAlbumWithMbzReleaseID: failed to cache image")
|
l.Err(err).Msg("createOrUpdateAlbumWithMbzReleaseID: failed to cache image")
|
||||||
}
|
}
|
||||||
}
|
} else if err != nil {
|
||||||
}
|
|
||||||
|
|
||||||
if err != nil {
|
|
||||||
l.Debug().Msgf("createOrUpdateAlbumWithMbzReleaseID: failed to get album images for %s: %s", release.Title, err.Error())
|
l.Debug().Msgf("createOrUpdateAlbumWithMbzReleaseID: failed to get album images for %s: %s", release.Title, err.Error())
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
album, err = d.SaveAlbum(ctx, db.SaveAlbumOpts{
|
album, err = d.SaveAlbum(ctx, db.SaveAlbumOpts{
|
||||||
Title: release.Title,
|
Title: release.Title,
|
||||||
|
|
@ -217,14 +216,15 @@ func matchAlbumByTitle(ctx context.Context, d db.DB, opts AssociateAlbumOpts) (*
|
||||||
return nil, fmt.Errorf("matchAlbumByTitle: %w", err)
|
return nil, fmt.Errorf("matchAlbumByTitle: %w", err)
|
||||||
} else {
|
} else {
|
||||||
var imgid uuid.UUID
|
var imgid uuid.UUID
|
||||||
imgUrl, err := images.GetAlbumImage(ctx, images.AlbumImageOpts{
|
var imgUrl string
|
||||||
|
if !opts.SkipCacheImage {
|
||||||
|
imgUrl, err = images.GetAlbumImage(ctx, images.AlbumImageOpts{
|
||||||
Artists: utils.FlattenArtistNames(opts.Artists),
|
Artists: utils.FlattenArtistNames(opts.Artists),
|
||||||
Album: opts.ReleaseName,
|
Album: opts.ReleaseName,
|
||||||
ReleaseMbzID: &opts.ReleaseMbzID,
|
ReleaseMbzID: &opts.ReleaseMbzID,
|
||||||
})
|
})
|
||||||
if err == nil && imgUrl != "" {
|
if err == nil && imgUrl != "" {
|
||||||
imgid = uuid.New()
|
imgid = uuid.New()
|
||||||
if !opts.SkipCacheImage {
|
|
||||||
var size ImageSize
|
var size ImageSize
|
||||||
if cfg.FullImageCacheEnabled() {
|
if cfg.FullImageCacheEnabled() {
|
||||||
size = ImageSizeFull
|
size = ImageSizeFull
|
||||||
|
|
@ -234,13 +234,12 @@ func matchAlbumByTitle(ctx context.Context, d db.DB, opts AssociateAlbumOpts) (*
|
||||||
l.Debug().Msg("Downloading album image from source...")
|
l.Debug().Msg("Downloading album image from source...")
|
||||||
err = DownloadAndCacheImage(ctx, imgid, imgUrl, size)
|
err = DownloadAndCacheImage(ctx, imgid, imgUrl, size)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
l.Err(err).Msg("createOrUpdateAlbumWithMbzReleaseID: failed to cache image")
|
l.Err(err).Msg("matchAlbumByTitle: failed to cache image")
|
||||||
}
|
}
|
||||||
}
|
} else if err != nil {
|
||||||
}
|
|
||||||
if err != nil {
|
|
||||||
l.Debug().AnErr("error", err).Msgf("matchAlbumByTitle: failed to get album images for %s", opts.ReleaseName)
|
l.Debug().AnErr("error", err).Msgf("matchAlbumByTitle: failed to get album images for %s", opts.ReleaseName)
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
a, err = d.SaveAlbum(ctx, db.SaveAlbumOpts{
|
a, err = d.SaveAlbum(ctx, db.SaveAlbumOpts{
|
||||||
Title: releaseName,
|
Title: releaseName,
|
||||||
|
|
|
||||||
|
|
@ -127,12 +127,14 @@ func matchArtistsByMBIDMappings(ctx context.Context, d db.DB, opts AssociateArti
|
||||||
l.Warn().AnErr("error", err).Msg("matchArtistsByMBIDMappings: MusicBrainz unreachable, creating new artist with provided MusicBrainz ID mapping")
|
l.Warn().AnErr("error", err).Msg("matchArtistsByMBIDMappings: MusicBrainz unreachable, creating new artist with provided MusicBrainz ID mapping")
|
||||||
|
|
||||||
var imgid uuid.UUID
|
var imgid uuid.UUID
|
||||||
imgUrl, imgErr := images.GetArtistImage(ctx, images.ArtistImageOpts{
|
var imgUrl string
|
||||||
|
if !opts.SkipCacheImage {
|
||||||
|
var imgErr error
|
||||||
|
imgUrl, imgErr = images.GetArtistImage(ctx, images.ArtistImageOpts{
|
||||||
Aliases: []string{a.Artist},
|
Aliases: []string{a.Artist},
|
||||||
})
|
})
|
||||||
if imgErr == nil && imgUrl != "" {
|
if imgErr == nil && imgUrl != "" {
|
||||||
imgid = uuid.New()
|
imgid = uuid.New()
|
||||||
if !opts.SkipCacheImage {
|
|
||||||
var size ImageSize
|
var size ImageSize
|
||||||
if cfg.FullImageCacheEnabled() {
|
if cfg.FullImageCacheEnabled() {
|
||||||
size = ImageSizeFull
|
size = ImageSizeFull
|
||||||
|
|
@ -144,10 +146,10 @@ func matchArtistsByMBIDMappings(ctx context.Context, d db.DB, opts AssociateArti
|
||||||
if err != nil {
|
if err != nil {
|
||||||
l.Err(err).Msg("Failed to cache image")
|
l.Err(err).Msg("Failed to cache image")
|
||||||
}
|
}
|
||||||
}
|
} else if imgErr != nil {
|
||||||
} else {
|
|
||||||
l.Err(imgErr).Msgf("matchArtistsByMBIDMappings: Failed to get artist image for artist '%s'", a.Artist)
|
l.Err(imgErr).Msgf("matchArtistsByMBIDMappings: Failed to get artist image for artist '%s'", a.Artist)
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
artist, err = d.SaveArtist(ctx, db.SaveArtistOpts{
|
artist, err = d.SaveArtist(ctx, db.SaveArtistOpts{
|
||||||
Name: a.Artist,
|
Name: a.Artist,
|
||||||
|
|
@ -246,12 +248,13 @@ func resolveAliasOrCreateArtist(ctx context.Context, mbzID uuid.UUID, names []st
|
||||||
}
|
}
|
||||||
|
|
||||||
var imgid uuid.UUID
|
var imgid uuid.UUID
|
||||||
imgUrl, err := images.GetArtistImage(ctx, images.ArtistImageOpts{
|
var imgUrl string
|
||||||
|
if !opts.SkipCacheImage {
|
||||||
|
imgUrl, err = images.GetArtistImage(ctx, images.ArtistImageOpts{
|
||||||
Aliases: aliases,
|
Aliases: aliases,
|
||||||
})
|
})
|
||||||
if err == nil && imgUrl != "" {
|
if err == nil && imgUrl != "" {
|
||||||
imgid = uuid.New()
|
imgid = uuid.New()
|
||||||
if !opts.SkipCacheImage {
|
|
||||||
var size ImageSize
|
var size ImageSize
|
||||||
if cfg.FullImageCacheEnabled() {
|
if cfg.FullImageCacheEnabled() {
|
||||||
size = ImageSizeFull
|
size = ImageSizeFull
|
||||||
|
|
@ -263,10 +266,10 @@ func resolveAliasOrCreateArtist(ctx context.Context, mbzID uuid.UUID, names []st
|
||||||
if err != nil {
|
if err != nil {
|
||||||
l.Err(err).Msg("Failed to cache image")
|
l.Err(err).Msg("Failed to cache image")
|
||||||
}
|
}
|
||||||
}
|
|
||||||
} else if err != nil {
|
} else if err != nil {
|
||||||
l.Warn().AnErr("error", err).Msg("Failed to get artist image from ImageSrc")
|
l.Warn().AnErr("error", err).Msg("Failed to get artist image from ImageSrc")
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
u, err := d.SaveArtist(ctx, db.SaveArtistOpts{
|
u, err := d.SaveArtist(ctx, db.SaveArtistOpts{
|
||||||
MusicBrainzID: mbzID,
|
MusicBrainzID: mbzID,
|
||||||
|
|
@ -301,12 +304,13 @@ func matchArtistsByNames(ctx context.Context, names []string, existing []*models
|
||||||
}
|
}
|
||||||
if errors.Is(err, pgx.ErrNoRows) {
|
if errors.Is(err, pgx.ErrNoRows) {
|
||||||
var imgid uuid.UUID
|
var imgid uuid.UUID
|
||||||
imgUrl, err := images.GetArtistImage(ctx, images.ArtistImageOpts{
|
var imgUrl string
|
||||||
|
if !opts.SkipCacheImage {
|
||||||
|
imgUrl, err = images.GetArtistImage(ctx, images.ArtistImageOpts{
|
||||||
Aliases: []string{name},
|
Aliases: []string{name},
|
||||||
})
|
})
|
||||||
if err == nil && imgUrl != "" {
|
if err == nil && imgUrl != "" {
|
||||||
imgid = uuid.New()
|
imgid = uuid.New()
|
||||||
if !opts.SkipCacheImage {
|
|
||||||
var size ImageSize
|
var size ImageSize
|
||||||
if cfg.FullImageCacheEnabled() {
|
if cfg.FullImageCacheEnabled() {
|
||||||
size = ImageSizeFull
|
size = ImageSizeFull
|
||||||
|
|
@ -318,10 +322,10 @@ func matchArtistsByNames(ctx context.Context, names []string, existing []*models
|
||||||
if err != nil {
|
if err != nil {
|
||||||
l.Err(err).Msg("Failed to cache image")
|
l.Err(err).Msg("Failed to cache image")
|
||||||
}
|
}
|
||||||
}
|
|
||||||
} else if err != nil {
|
} else if err != nil {
|
||||||
l.Debug().AnErr("error", err).Msgf("Failed to get artist images for %s", name)
|
l.Debug().AnErr("error", err).Msgf("Failed to get artist images for %s", name)
|
||||||
}
|
}
|
||||||
|
}
|
||||||
a, err = d.SaveArtist(ctx, db.SaveArtistOpts{Name: name, Image: imgid, ImageSrc: imgUrl})
|
a, err = d.SaveArtist(ctx, db.SaveArtistOpts{Name: name, Image: imgid, ImageSrc: imgUrl})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("matchArtistsByNames: %w", err)
|
return nil, fmt.Errorf("matchArtistsByNames: %w", err)
|
||||||
|
|
|
||||||
|
|
@ -77,6 +77,21 @@ func SubmitListen(ctx context.Context, store db.DB, opts SubmitListenOpts) error
|
||||||
// bandaid to ensure new activity does not have sub-second precision
|
// bandaid to ensure new activity does not have sub-second precision
|
||||||
opts.Time = opts.Time.Truncate(time.Second)
|
opts.Time = opts.Time.Truncate(time.Second)
|
||||||
|
|
||||||
|
// Fast path: check lookup cache for known entity combo
|
||||||
|
if !opts.SkipSaveListen {
|
||||||
|
key := TrackLookupKey(opts.Artist, opts.TrackTitle, opts.ReleaseTitle)
|
||||||
|
cached, err := store.GetTrackLookup(ctx, key)
|
||||||
|
if err == nil && cached != nil {
|
||||||
|
l.Debug().Msg("Track lookup cache hit — skipping entity resolution")
|
||||||
|
return store.SaveListen(ctx, db.SaveListenOpts{
|
||||||
|
TrackID: cached.TrackID,
|
||||||
|
Time: opts.Time,
|
||||||
|
UserID: opts.UserID,
|
||||||
|
Client: opts.Client,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
artists, err := AssociateArtists(
|
artists, err := AssociateArtists(
|
||||||
ctx,
|
ctx,
|
||||||
store,
|
store,
|
||||||
|
|
@ -183,6 +198,16 @@ func SubmitListen(ctx context.Context, store db.DB, opts SubmitListenOpts) error
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Populate lookup cache for future fast-path hits
|
||||||
|
if len(artists) > 0 {
|
||||||
|
store.SaveTrackLookup(ctx, db.SaveTrackLookupOpts{
|
||||||
|
Key: TrackLookupKey(opts.Artist, opts.TrackTitle, opts.ReleaseTitle),
|
||||||
|
ArtistID: artists[0].ID,
|
||||||
|
AlbumID: rg.ID,
|
||||||
|
TrackID: track.ID,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
if opts.IsNowPlaying {
|
if opts.IsNowPlaying {
|
||||||
if track.Duration == 0 {
|
if track.Duration == 0 {
|
||||||
memkv.Store.Set(strconv.Itoa(int(opts.UserID)), track.ID)
|
memkv.Store.Set(strconv.Itoa(int(opts.UserID)), track.ID)
|
||||||
|
|
|
||||||
9
internal/catalog/lookup_key.go
Normal file
9
internal/catalog/lookup_key.go
Normal file
|
|
@ -0,0 +1,9 @@
|
||||||
|
package catalog
|
||||||
|
|
||||||
|
import "strings"
|
||||||
|
|
||||||
|
// TrackLookupKey builds a normalized cache key for entity resolution.
|
||||||
|
// Uses null-byte separators to avoid collisions between field values.
|
||||||
|
func TrackLookupKey(artist, track, album string) string {
|
||||||
|
return strings.ToLower(artist) + "\x00" + strings.ToLower(track) + "\x00" + strings.ToLower(album)
|
||||||
|
}
|
||||||
|
|
@ -101,6 +101,16 @@ type DB interface {
|
||||||
MergeAlbums(ctx context.Context, fromId, toId int32, replaceImage bool) error
|
MergeAlbums(ctx context.Context, fromId, toId int32, replaceImage bool) error
|
||||||
MergeArtists(ctx context.Context, fromId, toId int32, replaceImage bool) error
|
MergeArtists(ctx context.Context, fromId, toId int32, replaceImage bool) error
|
||||||
|
|
||||||
|
// Track Lookup Cache
|
||||||
|
|
||||||
|
GetTrackLookup(ctx context.Context, key string) (*TrackLookupResult, error)
|
||||||
|
SaveTrackLookup(ctx context.Context, opts SaveTrackLookupOpts) error
|
||||||
|
InvalidateTrackLookup(ctx context.Context, opts InvalidateTrackLookupOpts) error
|
||||||
|
|
||||||
|
// Batch
|
||||||
|
|
||||||
|
SaveListensBatch(ctx context.Context, opts []SaveListenOpts) (int64, error)
|
||||||
|
|
||||||
// Etc
|
// Etc
|
||||||
|
|
||||||
ImageHasAssociation(ctx context.Context, image uuid.UUID) (bool, error)
|
ImageHasAssociation(ctx context.Context, image uuid.UUID) (bool, error)
|
||||||
|
|
|
||||||
|
|
@ -160,3 +160,22 @@ type GetInterestOpts struct {
|
||||||
ArtistID int32
|
ArtistID int32
|
||||||
TrackID int32
|
TrackID int32
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type TrackLookupResult struct {
|
||||||
|
ArtistID int32
|
||||||
|
AlbumID int32
|
||||||
|
TrackID int32
|
||||||
|
}
|
||||||
|
|
||||||
|
type SaveTrackLookupOpts struct {
|
||||||
|
Key string
|
||||||
|
ArtistID int32
|
||||||
|
AlbumID int32
|
||||||
|
TrackID int32
|
||||||
|
}
|
||||||
|
|
||||||
|
type InvalidateTrackLookupOpts struct {
|
||||||
|
ArtistID int32
|
||||||
|
AlbumID int32
|
||||||
|
TrackID int32
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -338,6 +338,7 @@ func (d *Psql) SaveAlbumAliases(ctx context.Context, id int32, aliases []string,
|
||||||
}
|
}
|
||||||
|
|
||||||
func (d *Psql) DeleteAlbum(ctx context.Context, id int32) error {
|
func (d *Psql) DeleteAlbum(ctx context.Context, id int32) error {
|
||||||
|
d.q.DeleteTrackLookupByAlbum(ctx, id)
|
||||||
return d.q.DeleteRelease(ctx, id)
|
return d.q.DeleteRelease(ctx, id)
|
||||||
}
|
}
|
||||||
func (d *Psql) DeleteAlbumAlias(ctx context.Context, id int32, alias string) error {
|
func (d *Psql) DeleteAlbumAlias(ctx context.Context, id int32, alias string) error {
|
||||||
|
|
|
||||||
|
|
@ -119,6 +119,7 @@ func (d *Psql) SaveArtistAliases(ctx context.Context, id int32, aliases []string
|
||||||
}
|
}
|
||||||
|
|
||||||
func (d *Psql) DeleteArtist(ctx context.Context, id int32) error {
|
func (d *Psql) DeleteArtist(ctx context.Context, id int32) error {
|
||||||
|
d.q.DeleteTrackLookupByArtist(ctx, id)
|
||||||
return d.q.DeleteArtist(ctx, id)
|
return d.q.DeleteArtist(ctx, id)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,7 @@ import (
|
||||||
"github.com/gabehf/koito/internal/logger"
|
"github.com/gabehf/koito/internal/logger"
|
||||||
"github.com/gabehf/koito/internal/models"
|
"github.com/gabehf/koito/internal/models"
|
||||||
"github.com/gabehf/koito/internal/repository"
|
"github.com/gabehf/koito/internal/repository"
|
||||||
|
"github.com/jackc/pgx/v5"
|
||||||
)
|
)
|
||||||
|
|
||||||
func (d *Psql) GetListensPaginated(ctx context.Context, opts db.GetItemsOpts) (*db.PaginatedResponse[*models.Listen], error) {
|
func (d *Psql) GetListensPaginated(ctx context.Context, opts db.GetItemsOpts) (*db.PaginatedResponse[*models.Listen], error) {
|
||||||
|
|
@ -197,6 +198,67 @@ func (d *Psql) SaveListen(ctx context.Context, opts db.SaveListenOpts) error {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (d *Psql) SaveListensBatch(ctx context.Context, opts []db.SaveListenOpts) (int64, error) {
|
||||||
|
if len(opts) == 0 {
|
||||||
|
return 0, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
tx, err := d.conn.BeginTx(ctx, pgx.TxOptions{})
|
||||||
|
if err != nil {
|
||||||
|
return 0, fmt.Errorf("SaveListensBatch: BeginTx: %w", err)
|
||||||
|
}
|
||||||
|
defer tx.Rollback(ctx)
|
||||||
|
|
||||||
|
_, err = tx.Exec(ctx, `
|
||||||
|
CREATE TEMP TABLE tmp_listens (
|
||||||
|
track_id INT,
|
||||||
|
listened_at TIMESTAMPTZ,
|
||||||
|
user_id INT,
|
||||||
|
client TEXT
|
||||||
|
) ON COMMIT DROP
|
||||||
|
`)
|
||||||
|
if err != nil {
|
||||||
|
return 0, fmt.Errorf("SaveListensBatch: create temp table: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
rows := make([][]interface{}, len(opts))
|
||||||
|
for i, o := range opts {
|
||||||
|
var client interface{}
|
||||||
|
if o.Client != "" {
|
||||||
|
client = o.Client
|
||||||
|
}
|
||||||
|
t := o.Time
|
||||||
|
if t.IsZero() {
|
||||||
|
t = time.Now()
|
||||||
|
}
|
||||||
|
rows[i] = []interface{}{o.TrackID, t, o.UserID, client}
|
||||||
|
}
|
||||||
|
|
||||||
|
_, err = tx.CopyFrom(ctx,
|
||||||
|
pgx.Identifier{"tmp_listens"},
|
||||||
|
[]string{"track_id", "listened_at", "user_id", "client"},
|
||||||
|
pgx.CopyFromRows(rows),
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return 0, fmt.Errorf("SaveListensBatch: CopyFrom: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
tag, err := tx.Exec(ctx, `
|
||||||
|
INSERT INTO listens (track_id, listened_at, user_id, client)
|
||||||
|
SELECT track_id, listened_at, user_id, client FROM tmp_listens
|
||||||
|
ON CONFLICT DO NOTHING
|
||||||
|
`)
|
||||||
|
if err != nil {
|
||||||
|
return 0, fmt.Errorf("SaveListensBatch: insert: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := tx.Commit(ctx); err != nil {
|
||||||
|
return 0, fmt.Errorf("SaveListensBatch: Commit: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return tag.RowsAffected(), nil
|
||||||
|
}
|
||||||
|
|
||||||
func (d *Psql) DeleteListen(ctx context.Context, trackId int32, listenedAt time.Time) error {
|
func (d *Psql) DeleteListen(ctx context.Context, trackId int32, listenedAt time.Time) error {
|
||||||
l := logger.FromContext(ctx)
|
l := logger.FromContext(ctx)
|
||||||
if trackId == 0 {
|
if trackId == 0 {
|
||||||
|
|
|
||||||
|
|
@ -12,6 +12,8 @@ import (
|
||||||
func (d *Psql) MergeTracks(ctx context.Context, fromId, toId int32) error {
|
func (d *Psql) MergeTracks(ctx context.Context, fromId, toId int32) error {
|
||||||
l := logger.FromContext(ctx)
|
l := logger.FromContext(ctx)
|
||||||
l.Info().Msgf("Merging track %d into track %d", fromId, toId)
|
l.Info().Msgf("Merging track %d into track %d", fromId, toId)
|
||||||
|
d.q.DeleteTrackLookupByTrack(ctx, fromId)
|
||||||
|
d.q.DeleteTrackLookupByTrack(ctx, toId)
|
||||||
tx, err := d.conn.BeginTx(ctx, pgx.TxOptions{})
|
tx, err := d.conn.BeginTx(ctx, pgx.TxOptions{})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
l.Err(err).Msg("Failed to begin transaction")
|
l.Err(err).Msg("Failed to begin transaction")
|
||||||
|
|
@ -61,6 +63,8 @@ func (d *Psql) MergeTracks(ctx context.Context, fromId, toId int32) error {
|
||||||
func (d *Psql) MergeAlbums(ctx context.Context, fromId, toId int32, replaceImage bool) error {
|
func (d *Psql) MergeAlbums(ctx context.Context, fromId, toId int32, replaceImage bool) error {
|
||||||
l := logger.FromContext(ctx)
|
l := logger.FromContext(ctx)
|
||||||
l.Info().Msgf("Merging album %d into album %d", fromId, toId)
|
l.Info().Msgf("Merging album %d into album %d", fromId, toId)
|
||||||
|
d.q.DeleteTrackLookupByAlbum(ctx, fromId)
|
||||||
|
d.q.DeleteTrackLookupByAlbum(ctx, toId)
|
||||||
tx, err := d.conn.BeginTx(ctx, pgx.TxOptions{})
|
tx, err := d.conn.BeginTx(ctx, pgx.TxOptions{})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
l.Err(err).Msg("Failed to begin transaction")
|
l.Err(err).Msg("Failed to begin transaction")
|
||||||
|
|
@ -117,6 +121,8 @@ func (d *Psql) MergeAlbums(ctx context.Context, fromId, toId int32, replaceImage
|
||||||
func (d *Psql) MergeArtists(ctx context.Context, fromId, toId int32, replaceImage bool) error {
|
func (d *Psql) MergeArtists(ctx context.Context, fromId, toId int32, replaceImage bool) error {
|
||||||
l := logger.FromContext(ctx)
|
l := logger.FromContext(ctx)
|
||||||
l.Info().Msgf("Merging artist %d into artist %d", fromId, toId)
|
l.Info().Msgf("Merging artist %d into artist %d", fromId, toId)
|
||||||
|
d.q.DeleteTrackLookupByArtist(ctx, fromId)
|
||||||
|
d.q.DeleteTrackLookupByArtist(ctx, toId)
|
||||||
tx, err := d.conn.BeginTx(ctx, pgx.TxOptions{})
|
tx, err := d.conn.BeginTx(ctx, pgx.TxOptions{})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
l.Err(err).Msg("Failed to begin transaction")
|
l.Err(err).Msg("Failed to begin transaction")
|
||||||
|
|
|
||||||
|
|
@ -241,6 +241,9 @@ func (d *Psql) SaveTrackAliases(ctx context.Context, id int32, aliases []string,
|
||||||
|
|
||||||
func (d *Psql) DeleteTrack(ctx context.Context, id int32) error {
|
func (d *Psql) DeleteTrack(ctx context.Context, id int32) error {
|
||||||
l := logger.FromContext(ctx)
|
l := logger.FromContext(ctx)
|
||||||
|
|
||||||
|
d.q.DeleteTrackLookupByTrack(ctx, id)
|
||||||
|
|
||||||
tx, err := d.conn.BeginTx(ctx, pgx.TxOptions{})
|
tx, err := d.conn.BeginTx(ctx, pgx.TxOptions{})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
l.Err(err).Msg("Failed to begin transaction")
|
l.Err(err).Msg("Failed to begin transaction")
|
||||||
|
|
|
||||||
52
internal/db/psql/track_lookup.go
Normal file
52
internal/db/psql/track_lookup.go
Normal file
|
|
@ -0,0 +1,52 @@
|
||||||
|
package psql
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
|
||||||
|
"github.com/gabehf/koito/internal/db"
|
||||||
|
"github.com/gabehf/koito/internal/repository"
|
||||||
|
"github.com/jackc/pgx/v5"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (d *Psql) GetTrackLookup(ctx context.Context, key string) (*db.TrackLookupResult, error) {
|
||||||
|
row, err := d.q.GetTrackLookup(ctx, key)
|
||||||
|
if err != nil {
|
||||||
|
if err == pgx.ErrNoRows {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return &db.TrackLookupResult{
|
||||||
|
ArtistID: row.ArtistID,
|
||||||
|
AlbumID: row.AlbumID,
|
||||||
|
TrackID: row.TrackID,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (d *Psql) SaveTrackLookup(ctx context.Context, opts db.SaveTrackLookupOpts) error {
|
||||||
|
return d.q.InsertTrackLookup(ctx, repository.InsertTrackLookupParams{
|
||||||
|
LookupKey: opts.Key,
|
||||||
|
ArtistID: opts.ArtistID,
|
||||||
|
AlbumID: opts.AlbumID,
|
||||||
|
TrackID: opts.TrackID,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (d *Psql) InvalidateTrackLookup(ctx context.Context, opts db.InvalidateTrackLookupOpts) error {
|
||||||
|
if opts.ArtistID != 0 {
|
||||||
|
if err := d.q.DeleteTrackLookupByArtist(ctx, opts.ArtistID); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if opts.AlbumID != 0 {
|
||||||
|
if err := d.q.DeleteTrackLookupByAlbum(ctx, opts.AlbumID); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if opts.TrackID != 0 {
|
||||||
|
if err := d.q.DeleteTrackLookupByTrack(ctx, opts.TrackID); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
157
internal/importer/bulk.go
Normal file
157
internal/importer/bulk.go
Normal file
|
|
@ -0,0 +1,157 @@
|
||||||
|
package importer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/gabehf/koito/internal/catalog"
|
||||||
|
"github.com/gabehf/koito/internal/db"
|
||||||
|
"github.com/gabehf/koito/internal/logger"
|
||||||
|
"github.com/gabehf/koito/internal/mbz"
|
||||||
|
)
|
||||||
|
|
||||||
|
// BulkSubmitter is a reusable import accelerator. It pre-deduplicates scrobbles
|
||||||
|
// in memory, resolves entities via the track_lookup cache (falling back to
|
||||||
|
// SubmitListen on cache miss with a worker pool for parallelism), and batch-inserts
|
||||||
|
// listens via SaveListensBatch.
|
||||||
|
type BulkSubmitter struct {
|
||||||
|
store db.DB
|
||||||
|
mbzc mbz.MusicBrainzCaller
|
||||||
|
ctx context.Context
|
||||||
|
buffer []catalog.SubmitListenOpts
|
||||||
|
workers int
|
||||||
|
}
|
||||||
|
|
||||||
|
type BulkSubmitterOpts struct {
|
||||||
|
Store db.DB
|
||||||
|
Mbzc mbz.MusicBrainzCaller
|
||||||
|
Workers int // default 4
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewBulkSubmitter(ctx context.Context, opts BulkSubmitterOpts) *BulkSubmitter {
|
||||||
|
workers := opts.Workers
|
||||||
|
if workers <= 0 {
|
||||||
|
workers = 4
|
||||||
|
}
|
||||||
|
return &BulkSubmitter{
|
||||||
|
store: opts.Store,
|
||||||
|
mbzc: opts.Mbzc,
|
||||||
|
ctx: ctx,
|
||||||
|
workers: workers,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Accept buffers a scrobble for later batch processing.
|
||||||
|
func (bs *BulkSubmitter) Accept(opts catalog.SubmitListenOpts) {
|
||||||
|
bs.buffer = append(bs.buffer, opts)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Flush processes all buffered scrobbles: deduplicates, resolves entities, and batch-inserts listens.
|
||||||
|
// Returns the number of listens successfully inserted.
|
||||||
|
func (bs *BulkSubmitter) Flush() (int, error) {
|
||||||
|
l := logger.FromContext(bs.ctx)
|
||||||
|
if len(bs.buffer) == 0 {
|
||||||
|
return 0, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
l.Info().Msgf("BulkSubmitter: Processing %d scrobbles", len(bs.buffer))
|
||||||
|
|
||||||
|
// Phase A: Deduplicate — find unique (artist, track, album) tuples
|
||||||
|
unique := make(map[string]catalog.SubmitListenOpts)
|
||||||
|
for _, opts := range bs.buffer {
|
||||||
|
key := catalog.TrackLookupKey(opts.Artist, opts.TrackTitle, opts.ReleaseTitle)
|
||||||
|
if _, exists := unique[key]; !exists {
|
||||||
|
unique[key] = opts
|
||||||
|
}
|
||||||
|
}
|
||||||
|
l.Info().Msgf("BulkSubmitter: %d unique entity combos from %d scrobbles", len(unique), len(bs.buffer))
|
||||||
|
|
||||||
|
// Phase B: Resolve entities — check cache, create on miss
|
||||||
|
resolved := make(map[string]int32) // key → trackID
|
||||||
|
var mu sync.Mutex
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
sem := make(chan struct{}, bs.workers)
|
||||||
|
cacheHits := 0
|
||||||
|
|
||||||
|
for key, opts := range unique {
|
||||||
|
// Check track_lookup cache first
|
||||||
|
cached, err := bs.store.GetTrackLookup(bs.ctx, key)
|
||||||
|
if err == nil && cached != nil {
|
||||||
|
mu.Lock()
|
||||||
|
resolved[key] = cached.TrackID
|
||||||
|
cacheHits++
|
||||||
|
mu.Unlock()
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cache miss — create entities via SubmitListen (with worker pool)
|
||||||
|
wg.Add(1)
|
||||||
|
sem <- struct{}{} // acquire worker slot
|
||||||
|
go func(k string, o catalog.SubmitListenOpts) {
|
||||||
|
defer wg.Done()
|
||||||
|
defer func() { <-sem }() // release worker slot
|
||||||
|
|
||||||
|
o.SkipSaveListen = true
|
||||||
|
o.SkipCacheImage = true
|
||||||
|
err := catalog.SubmitListen(bs.ctx, bs.store, o)
|
||||||
|
if err != nil {
|
||||||
|
l.Err(err).Msgf("BulkSubmitter: Failed to create entities for '%s' by '%s'", o.TrackTitle, o.Artist)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Re-check cache (SubmitListen populates it via Phase 1's integration)
|
||||||
|
cached, err := bs.store.GetTrackLookup(bs.ctx, k)
|
||||||
|
if err == nil && cached != nil {
|
||||||
|
mu.Lock()
|
||||||
|
resolved[k] = cached.TrackID
|
||||||
|
mu.Unlock()
|
||||||
|
}
|
||||||
|
}(key, opts)
|
||||||
|
}
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
|
l.Info().Msgf("BulkSubmitter: Resolved %d/%d entity combos (%d cache hits)",
|
||||||
|
len(resolved), len(unique), cacheHits)
|
||||||
|
|
||||||
|
// Phase C: Build listen batch
|
||||||
|
batch := make([]db.SaveListenOpts, 0, len(bs.buffer))
|
||||||
|
skipped := 0
|
||||||
|
for _, opts := range bs.buffer {
|
||||||
|
key := catalog.TrackLookupKey(opts.Artist, opts.TrackTitle, opts.ReleaseTitle)
|
||||||
|
trackID, ok := resolved[key]
|
||||||
|
if !ok {
|
||||||
|
skipped++
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
batch = append(batch, db.SaveListenOpts{
|
||||||
|
TrackID: trackID,
|
||||||
|
Time: opts.Time.Truncate(time.Second),
|
||||||
|
UserID: opts.UserID,
|
||||||
|
Client: opts.Client,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
if skipped > 0 {
|
||||||
|
l.Warn().Msgf("BulkSubmitter: Skipped %d scrobbles with unresolved entities", skipped)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Phase D: Batch insert listens (in chunks to avoid huge transactions)
|
||||||
|
const chunkSize = 5000
|
||||||
|
var totalInserted int64
|
||||||
|
for i := 0; i < len(batch); i += chunkSize {
|
||||||
|
end := i + chunkSize
|
||||||
|
if end > len(batch) {
|
||||||
|
end = len(batch)
|
||||||
|
}
|
||||||
|
inserted, err := bs.store.SaveListensBatch(bs.ctx, batch[i:end])
|
||||||
|
if err != nil {
|
||||||
|
return int(totalInserted), fmt.Errorf("BulkSubmitter: SaveListensBatch: %w", err)
|
||||||
|
}
|
||||||
|
totalInserted += inserted
|
||||||
|
}
|
||||||
|
|
||||||
|
l.Info().Msgf("BulkSubmitter: Inserted %d listens (%d duplicates skipped)",
|
||||||
|
totalInserted, int64(len(batch))-totalInserted)
|
||||||
|
return int(totalInserted), nil
|
||||||
|
}
|
||||||
|
|
@ -50,18 +50,17 @@ func ImportLastFMFile(ctx context.Context, store db.DB, mbzc mbz.MusicBrainzCall
|
||||||
return fmt.Errorf("ImportLastFMFile: %w", err)
|
return fmt.Errorf("ImportLastFMFile: %w", err)
|
||||||
}
|
}
|
||||||
defer file.Close()
|
defer file.Close()
|
||||||
var throttleFunc = func() {}
|
|
||||||
if ms := cfg.ThrottleImportMs(); ms > 0 {
|
|
||||||
throttleFunc = func() {
|
|
||||||
time.Sleep(time.Duration(ms) * time.Millisecond)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
export := make([]LastFMExportPage, 0)
|
export := make([]LastFMExportPage, 0)
|
||||||
err = json.NewDecoder(file).Decode(&export)
|
err = json.NewDecoder(file).Decode(&export)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("ImportLastFMFile: %w", err)
|
return fmt.Errorf("ImportLastFMFile: %w", err)
|
||||||
}
|
}
|
||||||
count := 0
|
|
||||||
|
bs := NewBulkSubmitter(ctx, BulkSubmitterOpts{
|
||||||
|
Store: store,
|
||||||
|
Mbzc: mbzc,
|
||||||
|
})
|
||||||
|
|
||||||
for _, item := range export {
|
for _, item := range export {
|
||||||
for _, track := range item.Track {
|
for _, track := range item.Track {
|
||||||
album := track.Album.Text
|
album := track.Album.Text
|
||||||
|
|
@ -96,7 +95,6 @@ func ImportLastFMFile(ctx context.Context, store db.DB, mbzc mbz.MusicBrainzCall
|
||||||
ts = time.Unix(unix, 0).UTC()
|
ts = time.Unix(unix, 0).UTC()
|
||||||
}
|
}
|
||||||
if !inImportTimeWindow(ts) {
|
if !inImportTimeWindow(ts) {
|
||||||
l.Debug().Msgf("Skipping import due to import time rules")
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -105,7 +103,7 @@ func ImportLastFMFile(ctx context.Context, store db.DB, mbzc mbz.MusicBrainzCall
|
||||||
artistMbidMap = append(artistMbidMap, catalog.ArtistMbidMap{Artist: track.Artist.Text, Mbid: artistMbzID})
|
artistMbidMap = append(artistMbidMap, catalog.ArtistMbidMap{Artist: track.Artist.Text, Mbid: artistMbzID})
|
||||||
}
|
}
|
||||||
|
|
||||||
opts := catalog.SubmitListenOpts{
|
bs.Accept(catalog.SubmitListenOpts{
|
||||||
MbzCaller: mbzc,
|
MbzCaller: mbzc,
|
||||||
Artist: track.Artist.Text,
|
Artist: track.Artist.Text,
|
||||||
ArtistNames: []string{track.Artist.Text},
|
ArtistNames: []string{track.Artist.Text},
|
||||||
|
|
@ -118,16 +116,14 @@ func ImportLastFMFile(ctx context.Context, store db.DB, mbzc mbz.MusicBrainzCall
|
||||||
Client: "lastfm",
|
Client: "lastfm",
|
||||||
Time: ts,
|
Time: ts,
|
||||||
UserID: 1,
|
UserID: 1,
|
||||||
SkipCacheImage: !cfg.FetchImagesDuringImport(),
|
SkipCacheImage: true,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
err = catalog.SubmitListen(ctx, store, opts)
|
}
|
||||||
|
|
||||||
|
count, err := bs.Flush()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
l.Err(err).Msg("Failed to import LastFM playback item")
|
|
||||||
return fmt.Errorf("ImportLastFMFile: %w", err)
|
return fmt.Errorf("ImportLastFMFile: %w", err)
|
||||||
}
|
}
|
||||||
count++
|
|
||||||
throttleFunc()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return finishImport(ctx, filename, count)
|
return finishImport(ctx, filename, count)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -63,13 +63,11 @@ func ImportListenBrainzFile(ctx context.Context, store db.DB, mbzc mbz.MusicBrai
|
||||||
|
|
||||||
scanner := bufio.NewScanner(r)
|
scanner := bufio.NewScanner(r)
|
||||||
|
|
||||||
var throttleFunc = func() {}
|
bs := NewBulkSubmitter(ctx, BulkSubmitterOpts{
|
||||||
if ms := cfg.ThrottleImportMs(); ms > 0 {
|
Store: store,
|
||||||
throttleFunc = func() {
|
Mbzc: mbzc,
|
||||||
time.Sleep(time.Duration(ms) * time.Millisecond)
|
})
|
||||||
}
|
|
||||||
}
|
|
||||||
count := 0
|
|
||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
line := scanner.Bytes()
|
line := scanner.Bytes()
|
||||||
payload := new(handlers.LbzSubmitListenPayload)
|
payload := new(handlers.LbzSubmitListenPayload)
|
||||||
|
|
@ -80,7 +78,6 @@ func ImportListenBrainzFile(ctx context.Context, store db.DB, mbzc mbz.MusicBrai
|
||||||
}
|
}
|
||||||
ts := time.Unix(payload.ListenedAt, 0)
|
ts := time.Unix(payload.ListenedAt, 0)
|
||||||
if !inImportTimeWindow(ts) {
|
if !inImportTimeWindow(ts) {
|
||||||
l.Debug().Msgf("Skipping import due to import time rules")
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
artistMbzIDs, err := utils.ParseUUIDSlice(payload.TrackMeta.AdditionalInfo.ArtistMBIDs)
|
artistMbzIDs, err := utils.ParseUUIDSlice(payload.TrackMeta.AdditionalInfo.ArtistMBIDs)
|
||||||
|
|
@ -139,7 +136,7 @@ func ImportListenBrainzFile(ctx context.Context, store db.DB, mbzc mbz.MusicBrai
|
||||||
artistMbidMap = append(artistMbidMap, catalog.ArtistMbidMap{Artist: a.ArtistName, Mbid: mbid})
|
artistMbidMap = append(artistMbidMap, catalog.ArtistMbidMap{Artist: a.ArtistName, Mbid: mbid})
|
||||||
}
|
}
|
||||||
|
|
||||||
opts := catalog.SubmitListenOpts{
|
bs.Accept(catalog.SubmitListenOpts{
|
||||||
MbzCaller: mbzc,
|
MbzCaller: mbzc,
|
||||||
ArtistNames: payload.TrackMeta.AdditionalInfo.ArtistNames,
|
ArtistNames: payload.TrackMeta.AdditionalInfo.ArtistNames,
|
||||||
Artist: payload.TrackMeta.ArtistName,
|
Artist: payload.TrackMeta.ArtistName,
|
||||||
|
|
@ -154,16 +151,14 @@ func ImportListenBrainzFile(ctx context.Context, store db.DB, mbzc mbz.MusicBrai
|
||||||
Time: ts,
|
Time: ts,
|
||||||
UserID: 1,
|
UserID: 1,
|
||||||
Client: client,
|
Client: client,
|
||||||
SkipCacheImage: !cfg.FetchImagesDuringImport(),
|
SkipCacheImage: true,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
err = catalog.SubmitListen(ctx, store, opts)
|
|
||||||
|
count, err := bs.Flush()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
l.Err(err).Msg("Failed to import LastFM playback item")
|
|
||||||
return fmt.Errorf("ImportListenBrainzFile: %w", err)
|
return fmt.Errorf("ImportListenBrainzFile: %w", err)
|
||||||
}
|
}
|
||||||
count++
|
|
||||||
throttleFunc()
|
|
||||||
}
|
|
||||||
l.Info().Msgf("Finished importing %s; imported %d items", filename, count)
|
l.Info().Msgf("Finished importing %s; imported %d items", filename, count)
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -44,12 +44,6 @@ func ImportMalojaFile(ctx context.Context, store db.DB, mbzc mbz.MusicBrainzCall
|
||||||
return fmt.Errorf("ImportMalojaFile: %w", err)
|
return fmt.Errorf("ImportMalojaFile: %w", err)
|
||||||
}
|
}
|
||||||
defer file.Close()
|
defer file.Close()
|
||||||
var throttleFunc = func() {}
|
|
||||||
if ms := cfg.ThrottleImportMs(); ms > 0 {
|
|
||||||
throttleFunc = func() {
|
|
||||||
time.Sleep(time.Duration(ms) * time.Millisecond)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
export := new(MalojaFile)
|
export := new(MalojaFile)
|
||||||
err = json.NewDecoder(file).Decode(&export)
|
err = json.NewDecoder(file).Decode(&export)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|
@ -59,12 +53,14 @@ func ImportMalojaFile(ctx context.Context, store db.DB, mbzc mbz.MusicBrainzCall
|
||||||
if len(items) == 0 {
|
if len(items) == 0 {
|
||||||
items = export.List
|
items = export.List
|
||||||
}
|
}
|
||||||
count := 0
|
|
||||||
total := len(items)
|
bs := NewBulkSubmitter(ctx, BulkSubmitterOpts{
|
||||||
for i, item := range items {
|
Store: store,
|
||||||
|
Mbzc: mbzc,
|
||||||
|
})
|
||||||
|
|
||||||
|
for _, item := range items {
|
||||||
martists := make([]string, 0)
|
martists := make([]string, 0)
|
||||||
// Maloja has a tendency to have the the artist order ['feature', 'main ● feature'], so
|
|
||||||
// here we try to turn that artist array into ['main', 'feature']
|
|
||||||
item.Track.Artists = utils.MoveFirstMatchToFront(item.Track.Artists, " \u2022 ")
|
item.Track.Artists = utils.MoveFirstMatchToFront(item.Track.Artists, " \u2022 ")
|
||||||
for _, an := range item.Track.Artists {
|
for _, an := range item.Track.Artists {
|
||||||
ans := strings.Split(an, " \u2022 ")
|
ans := strings.Split(an, " \u2022 ")
|
||||||
|
|
@ -77,14 +73,13 @@ func ImportMalojaFile(ctx context.Context, store db.DB, mbzc mbz.MusicBrainzCall
|
||||||
}
|
}
|
||||||
ts := time.Unix(item.Time, 0)
|
ts := time.Unix(item.Time, 0)
|
||||||
if !inImportTimeWindow(ts) {
|
if !inImportTimeWindow(ts) {
|
||||||
l.Debug().Msgf("Skipping import due to import time rules")
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
releaseTitle := ""
|
releaseTitle := ""
|
||||||
if item.Track.Album != nil {
|
if item.Track.Album != nil {
|
||||||
releaseTitle = item.Track.Album.Title
|
releaseTitle = item.Track.Album.Title
|
||||||
}
|
}
|
||||||
opts := catalog.SubmitListenOpts{
|
bs.Accept(catalog.SubmitListenOpts{
|
||||||
MbzCaller: mbzc,
|
MbzCaller: mbzc,
|
||||||
Artist: item.Track.Artists[0],
|
Artist: item.Track.Artists[0],
|
||||||
ArtistNames: artists,
|
ArtistNames: artists,
|
||||||
|
|
@ -93,18 +88,13 @@ func ImportMalojaFile(ctx context.Context, store db.DB, mbzc mbz.MusicBrainzCall
|
||||||
Time: ts.Local(),
|
Time: ts.Local(),
|
||||||
Client: "maloja",
|
Client: "maloja",
|
||||||
UserID: 1,
|
UserID: 1,
|
||||||
SkipCacheImage: !cfg.FetchImagesDuringImport(),
|
SkipCacheImage: true,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
err = catalog.SubmitListen(ctx, store, opts)
|
|
||||||
|
count, err := bs.Flush()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
l.Err(err).Msgf("Failed to import maloja item %d/%d", i+1, total)
|
return fmt.Errorf("ImportMalojaFile: %w", err)
|
||||||
continue
|
|
||||||
}
|
|
||||||
count++
|
|
||||||
if count%500 == 0 {
|
|
||||||
l.Info().Msgf("Maloja import progress: %d/%d", count, total)
|
|
||||||
}
|
|
||||||
throttleFunc()
|
|
||||||
}
|
}
|
||||||
return finishImport(ctx, filename, count)
|
return finishImport(ctx, filename, count)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -33,48 +33,44 @@ func ImportSpotifyFile(ctx context.Context, store db.DB, mbzc mbz.MusicBrainzCal
|
||||||
return fmt.Errorf("ImportSpotifyFile: %w", err)
|
return fmt.Errorf("ImportSpotifyFile: %w", err)
|
||||||
}
|
}
|
||||||
defer file.Close()
|
defer file.Close()
|
||||||
var throttleFunc = func() {}
|
|
||||||
if ms := cfg.ThrottleImportMs(); ms > 0 {
|
|
||||||
throttleFunc = func() {
|
|
||||||
time.Sleep(time.Duration(ms) * time.Millisecond)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
export := make([]SpotifyExportItem, 0)
|
export := make([]SpotifyExportItem, 0)
|
||||||
err = json.NewDecoder(file).Decode(&export)
|
err = json.NewDecoder(file).Decode(&export)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("ImportSpotifyFile: %w", err)
|
return fmt.Errorf("ImportSpotifyFile: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bs := NewBulkSubmitter(ctx, BulkSubmitterOpts{
|
||||||
|
Store: store,
|
||||||
|
Mbzc: mbzc,
|
||||||
|
})
|
||||||
|
|
||||||
for _, item := range export {
|
for _, item := range export {
|
||||||
if item.ReasonEnd != "trackdone" {
|
if item.ReasonEnd != "trackdone" {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if !inImportTimeWindow(item.Timestamp) {
|
if !inImportTimeWindow(item.Timestamp) {
|
||||||
l.Debug().Msgf("Skipping import due to import time rules")
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
dur := item.MsPlayed
|
|
||||||
if item.TrackName == "" || item.ArtistName == "" {
|
if item.TrackName == "" || item.ArtistName == "" {
|
||||||
l.Debug().Msg("Skipping non-track item")
|
l.Debug().Msg("Skipping non-track item")
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
opts := catalog.SubmitListenOpts{
|
bs.Accept(catalog.SubmitListenOpts{
|
||||||
MbzCaller: mbzc,
|
MbzCaller: mbzc,
|
||||||
Artist: item.ArtistName,
|
Artist: item.ArtistName,
|
||||||
TrackTitle: item.TrackName,
|
TrackTitle: item.TrackName,
|
||||||
ReleaseTitle: item.AlbumName,
|
ReleaseTitle: item.AlbumName,
|
||||||
Duration: dur / 1000,
|
Duration: item.MsPlayed / 1000,
|
||||||
Time: item.Timestamp,
|
Time: item.Timestamp,
|
||||||
Client: "spotify",
|
Client: "spotify",
|
||||||
UserID: 1,
|
UserID: 1,
|
||||||
SkipCacheImage: !cfg.FetchImagesDuringImport(),
|
SkipCacheImage: true,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
err = catalog.SubmitListen(ctx, store, opts)
|
|
||||||
|
count, err := bs.Flush()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
l.Err(err).Msg("Failed to import spotify playback item")
|
|
||||||
return fmt.Errorf("ImportSpotifyFile: %w", err)
|
return fmt.Errorf("ImportSpotifyFile: %w", err)
|
||||||
}
|
}
|
||||||
throttleFunc()
|
return finishImport(ctx, filename, count)
|
||||||
}
|
|
||||||
return finishImport(ctx, filename, len(export))
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
82
internal/repository/track_lookup.sql.go
Normal file
82
internal/repository/track_lookup.sql.go
Normal file
|
|
@ -0,0 +1,82 @@
|
||||||
|
// Code generated by sqlc. DO NOT EDIT.
|
||||||
|
// versions:
|
||||||
|
// sqlc v1.28.0
|
||||||
|
// source: track_lookup.sql
|
||||||
|
|
||||||
|
package repository
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
)
|
||||||
|
|
||||||
|
const deleteTrackLookupByAlbum = `-- name: DeleteTrackLookupByAlbum :exec
|
||||||
|
DELETE FROM track_lookup WHERE album_id = $1
|
||||||
|
`
|
||||||
|
|
||||||
|
func (q *Queries) DeleteTrackLookupByAlbum(ctx context.Context, albumID int32) error {
|
||||||
|
_, err := q.db.Exec(ctx, deleteTrackLookupByAlbum, albumID)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
const deleteTrackLookupByArtist = `-- name: DeleteTrackLookupByArtist :exec
|
||||||
|
DELETE FROM track_lookup WHERE artist_id = $1
|
||||||
|
`
|
||||||
|
|
||||||
|
func (q *Queries) DeleteTrackLookupByArtist(ctx context.Context, artistID int32) error {
|
||||||
|
_, err := q.db.Exec(ctx, deleteTrackLookupByArtist, artistID)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
const deleteTrackLookupByTrack = `-- name: DeleteTrackLookupByTrack :exec
|
||||||
|
DELETE FROM track_lookup WHERE track_id = $1
|
||||||
|
`
|
||||||
|
|
||||||
|
func (q *Queries) DeleteTrackLookupByTrack(ctx context.Context, trackID int32) error {
|
||||||
|
_, err := q.db.Exec(ctx, deleteTrackLookupByTrack, trackID)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
const getTrackLookup = `-- name: GetTrackLookup :one
|
||||||
|
SELECT artist_id, album_id, track_id
|
||||||
|
FROM track_lookup
|
||||||
|
WHERE lookup_key = $1
|
||||||
|
`
|
||||||
|
|
||||||
|
type GetTrackLookupRow struct {
|
||||||
|
ArtistID int32
|
||||||
|
AlbumID int32
|
||||||
|
TrackID int32
|
||||||
|
}
|
||||||
|
|
||||||
|
func (q *Queries) GetTrackLookup(ctx context.Context, lookupKey string) (GetTrackLookupRow, error) {
|
||||||
|
row := q.db.QueryRow(ctx, getTrackLookup, lookupKey)
|
||||||
|
var i GetTrackLookupRow
|
||||||
|
err := row.Scan(&i.ArtistID, &i.AlbumID, &i.TrackID)
|
||||||
|
return i, err
|
||||||
|
}
|
||||||
|
|
||||||
|
const insertTrackLookup = `-- name: InsertTrackLookup :exec
|
||||||
|
INSERT INTO track_lookup (lookup_key, artist_id, album_id, track_id)
|
||||||
|
VALUES ($1, $2, $3, $4)
|
||||||
|
ON CONFLICT (lookup_key) DO UPDATE SET
|
||||||
|
artist_id = EXCLUDED.artist_id,
|
||||||
|
album_id = EXCLUDED.album_id,
|
||||||
|
track_id = EXCLUDED.track_id
|
||||||
|
`
|
||||||
|
|
||||||
|
type InsertTrackLookupParams struct {
|
||||||
|
LookupKey string
|
||||||
|
ArtistID int32
|
||||||
|
AlbumID int32
|
||||||
|
TrackID int32
|
||||||
|
}
|
||||||
|
|
||||||
|
func (q *Queries) InsertTrackLookup(ctx context.Context, arg InsertTrackLookupParams) error {
|
||||||
|
_, err := q.db.Exec(ctx, insertTrackLookup,
|
||||||
|
arg.LookupKey,
|
||||||
|
arg.ArtistID,
|
||||||
|
arg.AlbumID,
|
||||||
|
arg.TrackID,
|
||||||
|
)
|
||||||
|
return err
|
||||||
|
}
|
||||||
722
thoughts/plans/20260325_bulk_import_optimization.md
Normal file
722
thoughts/plans/20260325_bulk_import_optimization.md
Normal file
|
|
@ -0,0 +1,722 @@
|
||||||
|
# Bulk Import Optimization — Implementation Plan
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Optimize Koito's import pipeline from ~20 listens/min to thousands/min by adopting ListenBrainz-inspired patterns: a persistent entity lookup cache, batch DB writes, pre-deduplication, and deferred enrichment. The core insight from the ecosystem research is **write raw first, enrich async** — and the persistent lookup table benefits all scrobbles (live + import), not just bulk imports.
|
||||||
|
|
||||||
|
## Current State Analysis
|
||||||
|
|
||||||
|
### The Problem
|
||||||
|
|
||||||
|
Importing 49,050 Maloja scrobbles takes ~24 hours. The import is stable (our PR 1 fixes eliminated panics) but each scrobble runs through the full `SubmitListen` path:
|
||||||
|
|
||||||
|
- **`GetArtist`** issues 6 DB queries per lookup (including rank computation via window functions)
|
||||||
|
- **`GetAlbum`** issues 6 DB queries per lookup
|
||||||
|
- **`GetTrack`** issues 5-6 DB queries per lookup
|
||||||
|
- **`GetArtistImage` / `GetAlbumImage`** makes HTTP calls even when all image providers are disabled
|
||||||
|
- **`SaveListen`** is a single INSERT — the only fast part
|
||||||
|
|
||||||
|
Per unique scrobble: ~18 DB round-trips + 2 image lookups. Per repeated scrobble: ~18 DB round-trips (no caching). With 5,589 unique artists, 2,628 unique albums, and 49,050 total scrobbles, this is massively redundant.
|
||||||
|
|
||||||
|
### Key Discoveries
|
||||||
|
|
||||||
|
- `SubmitListenOpts.SkipSaveListen` (`catalog.go:43`) can be used to create entities without recording a listen — useful for entity pre-creation
|
||||||
|
- `SubmitListenOpts.SkipCacheImage` (`catalog.go:46`) controls image download but NOT image URL resolution — the HTTP calls still happen
|
||||||
|
- The Koito native importer (`importer/koito.go`) already bypasses `SubmitListen` and does direct DB calls — a precedent for a faster import path
|
||||||
|
- `pgxpool.Pool` is goroutine-safe — concurrent DB operations are safe at the pool level
|
||||||
|
- `SaveListen` SQL uses `ON CONFLICT DO NOTHING` — re-importing is idempotent
|
||||||
|
- No batch insert methods exist anywhere in the codebase
|
||||||
|
- `GetArtist`/`GetAlbum`/`GetTrack` compute full stats (listen count, time listened, rank) on every call — unnecessary during import
|
||||||
|
|
||||||
|
### Ecosystem Patterns (from research)
|
||||||
|
|
||||||
|
- **ListenBrainz**: Stores raw scrobbles immediately, resolves MBIDs asynchronously via background worker + Typesense index. Uses MessyBrainz as a stable `(artist, track, release) → ID` mapping.
|
||||||
|
- **Maloja**: Runs every import through the full normalize → dedup → cache-invalidate cycle. Works for live scrobbles, kills bulk import. **This is exactly Koito's current problem.**
|
||||||
|
- **Last.fm**: Resolves metadata at write time (corrections), batches up to 50 scrobbles per request.
|
||||||
|
- **General**: DB-level dedup via unique constraint + `ON CONFLICT` is the industry standard.
|
||||||
|
|
||||||
|
## Desired End State
|
||||||
|
|
||||||
|
1. A `track_lookup` table provides O(1) entity resolution for any `(artist, track, album)` tuple — both live and import scrobbles benefit
|
||||||
|
2. All 5 importers use a shared `BulkSubmitter` that pre-deduplicates, creates entities in parallel, and batch-inserts listens
|
||||||
|
3. Image/MBZ enrichment is fully deferred to existing background tasks during import
|
||||||
|
4. 49k Maloja import completes in **under 30 minutes** (vs 24 hours currently)
|
||||||
|
5. Live scrobbles are faster too — cache hit skips 18 DB queries, goes straight to 1 SELECT + 1 INSERT
|
||||||
|
|
||||||
|
### Verification
|
||||||
|
|
||||||
|
- `go build ./...` compiles
|
||||||
|
- `go test ./...` passes (existing + new tests)
|
||||||
|
- Manual: import 49k Maloja scrobbles in under 30 minutes on vo-pc
|
||||||
|
- Manual: verify live scrobbles from multi-scrobbler still work correctly
|
||||||
|
- Manual: verify album art appears after background image backfill runs
|
||||||
|
|
||||||
|
## What We're NOT Doing
|
||||||
|
|
||||||
|
- **Replacing the DB engine** (no TimescaleDB, no Redis) — Postgres is fine for self-hosted scale
|
||||||
|
- **Local MusicBrainz mirror or Typesense index** — overkill for single-user; the live API + background enrichment is sufficient
|
||||||
|
- **Changing the live `SubmitListen` API path** — the lookup cache makes it faster, but the logic stays the same
|
||||||
|
- **Parallelizing live scrobbles** — only imports use the worker pool; live scrobbles remain single-threaded through `SubmitListen`
|
||||||
|
- **Changing the ListenBrainz/Last.fm relay** — multi-scrobbler handles that independently
|
||||||
|
|
||||||
|
## Implementation Approach
|
||||||
|
|
||||||
|
Adopt ListenBrainz's "MessyBrainz" pattern as a persistent Postgres table: a normalized `(artist, track, album)` tuple maps to resolved `(artist_id, album_id, track_id)`. This is the foundational optimization — everything else builds on it.
|
||||||
|
|
||||||
|
```
|
||||||
|
Before (per scrobble):
|
||||||
|
GetArtist (6 queries) → GetAlbum (6 queries) → GetTrack (6 queries) → SaveListen (1 query)
|
||||||
|
= 19 queries minimum
|
||||||
|
|
||||||
|
After (cache hit):
|
||||||
|
SELECT FROM track_lookup (1 query) → SaveListen (1 query)
|
||||||
|
= 2 queries
|
||||||
|
|
||||||
|
After (bulk import, cache hit):
|
||||||
|
In-memory map lookup (0 queries) → batched SaveListen
|
||||||
|
= amortized ~0.01 queries per scrobble
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 1: `track_lookup` Cache Table
|
||||||
|
|
||||||
|
### Overview
|
||||||
|
|
||||||
|
Add a persistent lookup table that maps normalized `(artist_name, track_title, release_title)` to resolved entity IDs. Integrate into `SubmitListen` so both live and import scrobbles benefit.
|
||||||
|
|
||||||
|
### Changes Required
|
||||||
|
|
||||||
|
#### 1. New Migration
|
||||||
|
|
||||||
|
**File**: `db/migrations/000006_track_lookup.sql`
|
||||||
|
|
||||||
|
```sql
|
||||||
|
-- +goose Up
|
||||||
|
CREATE TABLE track_lookup (
|
||||||
|
lookup_key TEXT NOT NULL PRIMARY KEY,
|
||||||
|
artist_id INT NOT NULL REFERENCES artists(id) ON DELETE CASCADE,
|
||||||
|
album_id INT NOT NULL REFERENCES releases(id) ON DELETE CASCADE,
|
||||||
|
track_id INT NOT NULL REFERENCES tracks(id) ON DELETE CASCADE,
|
||||||
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX idx_track_lookup_track_id ON track_lookup(track_id);
|
||||||
|
CREATE INDEX idx_track_lookup_artist_id ON track_lookup(artist_id);
|
||||||
|
CREATE INDEX idx_track_lookup_album_id ON track_lookup(album_id);
|
||||||
|
|
||||||
|
-- +goose Down
|
||||||
|
DROP TABLE IF EXISTS track_lookup;
|
||||||
|
```
|
||||||
|
|
||||||
|
The `lookup_key` is a normalized string: `lower(artist) || '\x00' || lower(track) || '\x00' || lower(album)`. Using a single TEXT key with a null-byte separator is simpler and faster than a multi-column composite key with `citext`.
|
||||||
|
|
||||||
|
#### 2. New SQL Queries
|
||||||
|
|
||||||
|
**File**: `db/queries/track_lookup.sql`
|
||||||
|
|
||||||
|
```sql
|
||||||
|
-- name: GetTrackLookup :one
|
||||||
|
SELECT artist_id, album_id, track_id
|
||||||
|
FROM track_lookup
|
||||||
|
WHERE lookup_key = $1;
|
||||||
|
|
||||||
|
-- name: InsertTrackLookup :exec
|
||||||
|
INSERT INTO track_lookup (lookup_key, artist_id, album_id, track_id)
|
||||||
|
VALUES ($1, $2, $3, $4)
|
||||||
|
ON CONFLICT (lookup_key) DO UPDATE SET
|
||||||
|
artist_id = EXCLUDED.artist_id,
|
||||||
|
album_id = EXCLUDED.album_id,
|
||||||
|
track_id = EXCLUDED.track_id;
|
||||||
|
|
||||||
|
-- name: DeleteTrackLookupByArtist :exec
|
||||||
|
DELETE FROM track_lookup WHERE artist_id = $1;
|
||||||
|
|
||||||
|
-- name: DeleteTrackLookupByAlbum :exec
|
||||||
|
DELETE FROM track_lookup WHERE album_id = $1;
|
||||||
|
|
||||||
|
-- name: DeleteTrackLookupByTrack :exec
|
||||||
|
DELETE FROM track_lookup WHERE track_id = $1;
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 3. Regenerate sqlc
|
||||||
|
|
||||||
|
Run `sqlc generate` to create the Go bindings in `internal/repository/`.
|
||||||
|
|
||||||
|
#### 4. DB Interface + Psql Implementation
|
||||||
|
|
||||||
|
**File**: `internal/db/db.go` — Add to interface:
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Track Lookup Cache
|
||||||
|
GetTrackLookup(ctx context.Context, key string) (*TrackLookupResult, error)
|
||||||
|
SaveTrackLookup(ctx context.Context, opts SaveTrackLookupOpts) error
|
||||||
|
InvalidateTrackLookup(ctx context.Context, opts InvalidateTrackLookupOpts) error
|
||||||
|
```
|
||||||
|
|
||||||
|
**File**: `internal/db/opts.go` — Add types:
|
||||||
|
|
||||||
|
```go
|
||||||
|
type TrackLookupResult struct {
|
||||||
|
ArtistID int32
|
||||||
|
AlbumID int32
|
||||||
|
TrackID int32
|
||||||
|
}
|
||||||
|
|
||||||
|
type SaveTrackLookupOpts struct {
|
||||||
|
Key string
|
||||||
|
ArtistID int32
|
||||||
|
AlbumID int32
|
||||||
|
TrackID int32
|
||||||
|
}
|
||||||
|
|
||||||
|
type InvalidateTrackLookupOpts struct {
|
||||||
|
ArtistID int32
|
||||||
|
AlbumID int32
|
||||||
|
TrackID int32
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**File**: `internal/db/psql/track_lookup.go` — New file implementing the three methods.
|
||||||
|
|
||||||
|
#### 5. Lookup Key Helper
|
||||||
|
|
||||||
|
**File**: `internal/catalog/lookup_key.go` — New file:
|
||||||
|
|
||||||
|
```go
|
||||||
|
package catalog
|
||||||
|
|
||||||
|
import "strings"
|
||||||
|
|
||||||
|
// TrackLookupKey builds a normalized cache key for entity resolution.
|
||||||
|
func TrackLookupKey(artist, track, album string) string {
|
||||||
|
return strings.ToLower(artist) + "\x00" + strings.ToLower(track) + "\x00" + strings.ToLower(album)
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 6. Integrate into SubmitListen
|
||||||
|
|
||||||
|
**File**: `internal/catalog/catalog.go` — Add fast path at the top of `SubmitListen`:
|
||||||
|
|
||||||
|
```go
|
||||||
|
func SubmitListen(ctx context.Context, store db.DB, opts SubmitListenOpts) error {
|
||||||
|
l := logger.FromContext(ctx)
|
||||||
|
|
||||||
|
if opts.Artist == "" || opts.TrackTitle == "" {
|
||||||
|
return errors.New("track name and artist are required")
|
||||||
|
}
|
||||||
|
|
||||||
|
opts.Time = opts.Time.Truncate(time.Second)
|
||||||
|
|
||||||
|
// Fast path: check lookup cache for known entity combo
|
||||||
|
if !opts.SkipSaveListen {
|
||||||
|
key := TrackLookupKey(opts.Artist, opts.TrackTitle, opts.ReleaseTitle)
|
||||||
|
cached, err := store.GetTrackLookup(ctx, key)
|
||||||
|
if err == nil && cached != nil {
|
||||||
|
l.Debug().Msg("Track lookup cache hit — skipping entity resolution")
|
||||||
|
return store.SaveListen(ctx, db.SaveListenOpts{
|
||||||
|
TrackID: cached.TrackID,
|
||||||
|
Time: opts.Time,
|
||||||
|
UserID: opts.UserID,
|
||||||
|
Client: opts.Client,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ... existing SubmitListen logic (unchanged) ...
|
||||||
|
|
||||||
|
// After successful entity resolution, populate the cache
|
||||||
|
store.SaveTrackLookup(ctx, db.SaveTrackLookupOpts{
|
||||||
|
Key: TrackLookupKey(opts.Artist, opts.TrackTitle, opts.ReleaseTitle),
|
||||||
|
ArtistID: artists[0].ID,
|
||||||
|
AlbumID: rg.ID,
|
||||||
|
TrackID: track.ID,
|
||||||
|
})
|
||||||
|
|
||||||
|
// ... rest of existing logic ...
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Note: The cache only applies when we have a direct artist+track+album text match. Scrobbles with MBZ IDs that resolve to different text representations will still go through the full path on first encounter, then be cached.
|
||||||
|
|
||||||
|
#### 7. Invalidation on Merge/Delete
|
||||||
|
|
||||||
|
**File**: `internal/db/psql/artist.go` — In `DeleteArtist` and `MergeArtists`, add:
|
||||||
|
```go
|
||||||
|
d.q.DeleteTrackLookupByArtist(ctx, id)
|
||||||
|
```
|
||||||
|
|
||||||
|
**File**: `internal/db/psql/album.go` — In `DeleteAlbum` and `MergeAlbums`, add:
|
||||||
|
```go
|
||||||
|
d.q.DeleteTrackLookupByAlbum(ctx, id)
|
||||||
|
```
|
||||||
|
|
||||||
|
**File**: `internal/db/psql/track.go` — In `DeleteTrack` and `MergeTracks`, add:
|
||||||
|
```go
|
||||||
|
d.q.DeleteTrackLookupByTrack(ctx, id)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Success Criteria
|
||||||
|
|
||||||
|
- [ ] `go build ./...` compiles
|
||||||
|
- [ ] `go test ./...` passes
|
||||||
|
- [ ] New test: `TestSubmitListen_LookupCacheHit` — second identical scrobble uses cache
|
||||||
|
- [ ] New test: `TestSubmitListen_LookupCacheInvalidateOnDelete` — deleting entity clears cache
|
||||||
|
- [ ] Migration applies cleanly on fresh and existing databases
|
||||||
|
- [ ] Live scrobbles from multi-scrobbler populate the cache on first hit, use it on second
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 2: `SaveListensBatch` DB Method
|
||||||
|
|
||||||
|
### Overview
|
||||||
|
|
||||||
|
Add a batch listen insert method using pgx's `CopyFrom` for high-throughput listen insertion. This is the DB foundation for the BulkSubmitter.
|
||||||
|
|
||||||
|
### Changes Required
|
||||||
|
|
||||||
|
#### 1. New SQL + DB Interface
|
||||||
|
|
||||||
|
**File**: `internal/db/db.go` — Add to interface:
|
||||||
|
|
||||||
|
```go
|
||||||
|
SaveListensBatch(ctx context.Context, opts []SaveListenOpts) (int64, error)
|
||||||
|
```
|
||||||
|
|
||||||
|
Returns the number of rows actually inserted (excluding `ON CONFLICT` duplicates).
|
||||||
|
|
||||||
|
#### 2. Psql Implementation
|
||||||
|
|
||||||
|
**File**: `internal/db/psql/listen.go` — New method:
|
||||||
|
|
||||||
|
```go
|
||||||
|
func (d *Psql) SaveListensBatch(ctx context.Context, opts []db.SaveListenOpts) (int64, error) {
|
||||||
|
if len(opts) == 0 {
|
||||||
|
return 0, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use a transaction with a temp table + INSERT ... ON CONFLICT pattern
|
||||||
|
// since CopyFrom doesn't support ON CONFLICT directly
|
||||||
|
tx, err := d.conn.BeginTx(ctx, pgx.TxOptions{})
|
||||||
|
if err != nil {
|
||||||
|
return 0, fmt.Errorf("SaveListensBatch: BeginTx: %w", err)
|
||||||
|
}
|
||||||
|
defer tx.Rollback(ctx)
|
||||||
|
|
||||||
|
// Create temp table
|
||||||
|
_, err = tx.Exec(ctx, `
|
||||||
|
CREATE TEMP TABLE tmp_listens (
|
||||||
|
track_id INT,
|
||||||
|
listened_at TIMESTAMPTZ,
|
||||||
|
user_id INT,
|
||||||
|
client TEXT
|
||||||
|
) ON COMMIT DROP
|
||||||
|
`)
|
||||||
|
if err != nil {
|
||||||
|
return 0, fmt.Errorf("SaveListensBatch: create temp table: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// CopyFrom into temp table
|
||||||
|
rows := make([][]interface{}, len(opts))
|
||||||
|
for i, o := range opts {
|
||||||
|
var client interface{}
|
||||||
|
if o.Client != "" {
|
||||||
|
client = o.Client
|
||||||
|
}
|
||||||
|
rows[i] = []interface{}{o.TrackID, o.Time, o.UserID, client}
|
||||||
|
}
|
||||||
|
|
||||||
|
_, err = tx.CopyFrom(ctx,
|
||||||
|
pgx.Identifier{"tmp_listens"},
|
||||||
|
[]string{"track_id", "listened_at", "user_id", "client"},
|
||||||
|
pgx.CopyFromRows(rows),
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return 0, fmt.Errorf("SaveListensBatch: CopyFrom: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Insert from temp table with dedup
|
||||||
|
tag, err := tx.Exec(ctx, `
|
||||||
|
INSERT INTO listens (track_id, listened_at, user_id, client)
|
||||||
|
SELECT track_id, listened_at, user_id, client FROM tmp_listens
|
||||||
|
ON CONFLICT DO NOTHING
|
||||||
|
`)
|
||||||
|
if err != nil {
|
||||||
|
return 0, fmt.Errorf("SaveListensBatch: insert: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := tx.Commit(ctx); err != nil {
|
||||||
|
return 0, fmt.Errorf("SaveListensBatch: Commit: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return tag.RowsAffected(), nil
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
This uses the standard `CopyFrom → temp table → INSERT ON CONFLICT` pattern, which is the fastest bulk insert approach with pgx while still supporting deduplication.
|
||||||
|
|
||||||
|
### Success Criteria
|
||||||
|
|
||||||
|
- [ ] `go build ./...` compiles
|
||||||
|
- [ ] `go test ./...` passes
|
||||||
|
- [ ] New test: `TestSaveListensBatch` — insert 1000 listens, verify count
|
||||||
|
- [ ] New test: `TestSaveListensBatch_Dedup` — insert duplicates, verify no double-counting
|
||||||
|
- [ ] New test: `TestSaveListensBatch_Empty` — empty input returns 0, no error
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 3: `BulkSubmitter` Helper
|
||||||
|
|
||||||
|
### Overview
|
||||||
|
|
||||||
|
A reusable import accelerator that all importers can use. Pre-deduplicates scrobbles in memory, resolves entities via the `track_lookup` cache (falling back to `SubmitListen` on cache miss), and batch-inserts listens.
|
||||||
|
|
||||||
|
### Design
|
||||||
|
|
||||||
|
```
|
||||||
|
BulkSubmitter
|
||||||
|
├── Accept(SubmitListenOpts) — buffer a scrobble
|
||||||
|
├── Flush() (int, error) — process all buffered scrobbles
|
||||||
|
│ ├── Phase A: Deduplicate by (artist, track, album) key
|
||||||
|
│ ├── Phase B: Resolve entities
|
||||||
|
│ │ ├── Check track_lookup cache (single SELECT)
|
||||||
|
│ │ ├── On miss: call SubmitListen(SkipSaveListen=true) to create entities
|
||||||
|
│ │ ├── Worker pool: N goroutines for parallel entity creation
|
||||||
|
│ │ └── Populate track_lookup cache after creation
|
||||||
|
│ ├── Phase C: Map all scrobbles to track_ids via resolved cache
|
||||||
|
│ └── Phase D: SaveListensBatch
|
||||||
|
└── Progress callback for logging
|
||||||
|
```
|
||||||
|
|
||||||
|
### Changes Required
|
||||||
|
|
||||||
|
#### 1. New Package
|
||||||
|
|
||||||
|
**File**: `internal/importer/bulk.go`
|
||||||
|
|
||||||
|
```go
|
||||||
|
package importer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"sync"
|
||||||
|
|
||||||
|
"github.com/gabehf/koito/internal/catalog"
|
||||||
|
"github.com/gabehf/koito/internal/db"
|
||||||
|
"github.com/gabehf/koito/internal/logger"
|
||||||
|
"github.com/gabehf/koito/internal/mbz"
|
||||||
|
)
|
||||||
|
|
||||||
|
type BulkSubmitter struct {
|
||||||
|
store db.DB
|
||||||
|
mbzc mbz.MusicBrainzCaller
|
||||||
|
ctx context.Context
|
||||||
|
buffer []catalog.SubmitListenOpts
|
||||||
|
workers int
|
||||||
|
onProgress func(imported, total int)
|
||||||
|
}
|
||||||
|
|
||||||
|
type BulkSubmitterOpts struct {
|
||||||
|
Store db.DB
|
||||||
|
Mbzc mbz.MusicBrainzCaller
|
||||||
|
Workers int // default 4
|
||||||
|
OnProgress func(imported, total int) // called every 500 items
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewBulkSubmitter(ctx context.Context, opts BulkSubmitterOpts) *BulkSubmitter {
|
||||||
|
workers := opts.Workers
|
||||||
|
if workers <= 0 {
|
||||||
|
workers = 4
|
||||||
|
}
|
||||||
|
return &BulkSubmitter{
|
||||||
|
store: opts.Store,
|
||||||
|
mbzc: opts.Mbzc,
|
||||||
|
ctx: ctx,
|
||||||
|
workers: workers,
|
||||||
|
onProgress: opts.OnProgress,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (bs *BulkSubmitter) Accept(opts catalog.SubmitListenOpts) {
|
||||||
|
bs.buffer = append(bs.buffer, opts)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (bs *BulkSubmitter) Flush() (int, error) {
|
||||||
|
l := logger.FromContext(bs.ctx)
|
||||||
|
if len(bs.buffer) == 0 {
|
||||||
|
return 0, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
l.Info().Msgf("BulkSubmitter: Processing %d scrobbles", len(bs.buffer))
|
||||||
|
|
||||||
|
// Phase A: Deduplicate — find unique (artist, track, album) tuples
|
||||||
|
type entityKey = string
|
||||||
|
unique := make(map[entityKey]catalog.SubmitListenOpts)
|
||||||
|
for _, opts := range bs.buffer {
|
||||||
|
key := catalog.TrackLookupKey(opts.Artist, opts.TrackTitle, opts.ReleaseTitle)
|
||||||
|
if _, exists := unique[key]; !exists {
|
||||||
|
unique[key] = opts
|
||||||
|
}
|
||||||
|
}
|
||||||
|
l.Info().Msgf("BulkSubmitter: %d unique entity combos from %d scrobbles", len(unique), len(bs.buffer))
|
||||||
|
|
||||||
|
// Phase B: Resolve entities — check cache, create on miss
|
||||||
|
resolved := make(map[entityKey]int32) // key → trackID
|
||||||
|
var mu sync.Mutex
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
sem := make(chan struct{}, bs.workers)
|
||||||
|
resolveCount := 0
|
||||||
|
|
||||||
|
for key, opts := range unique {
|
||||||
|
// Check track_lookup cache first
|
||||||
|
cached, err := bs.store.GetTrackLookup(bs.ctx, key)
|
||||||
|
if err == nil && cached != nil {
|
||||||
|
mu.Lock()
|
||||||
|
resolved[key] = cached.TrackID
|
||||||
|
resolveCount++
|
||||||
|
mu.Unlock()
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cache miss — create entities via SubmitListen (with worker pool)
|
||||||
|
wg.Add(1)
|
||||||
|
sem <- struct{}{} // acquire worker slot
|
||||||
|
go func(k entityKey, o catalog.SubmitListenOpts) {
|
||||||
|
defer wg.Done()
|
||||||
|
defer func() { <-sem }() // release worker slot
|
||||||
|
|
||||||
|
o.SkipSaveListen = true
|
||||||
|
o.SkipCacheImage = true
|
||||||
|
err := catalog.SubmitListen(bs.ctx, bs.store, o)
|
||||||
|
if err != nil {
|
||||||
|
l.Err(err).Msgf("BulkSubmitter: Failed to create entities for '%s' by '%s'", o.TrackTitle, o.Artist)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Re-check cache (SubmitListen populates it in Phase 1's integration)
|
||||||
|
cached, err := bs.store.GetTrackLookup(bs.ctx, k)
|
||||||
|
if err == nil && cached != nil {
|
||||||
|
mu.Lock()
|
||||||
|
resolved[k] = cached.TrackID
|
||||||
|
mu.Unlock()
|
||||||
|
}
|
||||||
|
}(key, opts)
|
||||||
|
}
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
|
l.Info().Msgf("BulkSubmitter: Resolved %d/%d entity combos", len(resolved), len(unique))
|
||||||
|
|
||||||
|
// Phase C: Build listen batch
|
||||||
|
batch := make([]db.SaveListenOpts, 0, len(bs.buffer))
|
||||||
|
skipped := 0
|
||||||
|
for _, opts := range bs.buffer {
|
||||||
|
key := catalog.TrackLookupKey(opts.Artist, opts.TrackTitle, opts.ReleaseTitle)
|
||||||
|
trackID, ok := resolved[key]
|
||||||
|
if !ok {
|
||||||
|
skipped++
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
batch = append(batch, db.SaveListenOpts{
|
||||||
|
TrackID: trackID,
|
||||||
|
Time: opts.Time.Truncate(time.Second),
|
||||||
|
UserID: opts.UserID,
|
||||||
|
Client: opts.Client,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
if skipped > 0 {
|
||||||
|
l.Warn().Msgf("BulkSubmitter: Skipped %d scrobbles with unresolved entities", skipped)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Phase D: Batch insert listens
|
||||||
|
inserted, err := bs.store.SaveListensBatch(bs.ctx, batch)
|
||||||
|
if err != nil {
|
||||||
|
return 0, fmt.Errorf("BulkSubmitter: SaveListensBatch: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
l.Info().Msgf("BulkSubmitter: Inserted %d listens (%d duplicates skipped)", inserted, int64(len(batch))-inserted)
|
||||||
|
return int(inserted), nil
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 2. TOCTOU Safety for Parallel Entity Creation
|
||||||
|
|
||||||
|
The worker pool creates entities via `SubmitListen(SkipSaveListen=true)`. Two workers could race on the same artist name. The existing code uses a get-then-save pattern. Mitigations:
|
||||||
|
|
||||||
|
- Pre-dedup in Phase A ensures each unique tuple is processed by exactly one goroutine — **no TOCTOU within the worker pool**
|
||||||
|
- The only remaining race is between the import workers and live scrobbles from multi-scrobbler hitting the same `SubmitListen` path. This is already handled by the DB's unique constraints + `ON CONFLICT` clauses on join tables.
|
||||||
|
|
||||||
|
### Success Criteria
|
||||||
|
|
||||||
|
- [ ] `go build ./...` compiles
|
||||||
|
- [ ] `go test ./...` passes
|
||||||
|
- [ ] New test: `TestBulkSubmitter_BasicImport` — buffer 100 scrobbles, flush, verify all imported
|
||||||
|
- [ ] New test: `TestBulkSubmitter_Dedup` — buffer 100 scrobbles with 10 unique combos, verify 10 entity creations
|
||||||
|
- [ ] New test: `TestBulkSubmitter_CacheHit` — pre-populate track_lookup, verify no SubmitListen calls
|
||||||
|
- [ ] New test: `TestBulkSubmitter_PartialFailure` — one entity creation fails, rest still imported
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 4: Migrate All Importers
|
||||||
|
|
||||||
|
### Overview
|
||||||
|
|
||||||
|
Wire all 5 importers to use BulkSubmitter instead of calling `SubmitListen` directly in a loop.
|
||||||
|
|
||||||
|
### Changes Required
|
||||||
|
|
||||||
|
#### 1. Maloja Importer
|
||||||
|
|
||||||
|
**File**: `internal/importer/maloja.go`
|
||||||
|
|
||||||
|
Replace the per-item `catalog.SubmitListen` loop with:
|
||||||
|
|
||||||
|
```go
|
||||||
|
func ImportMalojaFile(ctx context.Context, store db.DB, mbzc mbz.MusicBrainzCaller, filename string) error {
|
||||||
|
l := logger.FromContext(ctx)
|
||||||
|
// ... file reading and JSON parsing (unchanged) ...
|
||||||
|
|
||||||
|
bs := NewBulkSubmitter(ctx, BulkSubmitterOpts{
|
||||||
|
Store: store,
|
||||||
|
Mbzc: mbzc,
|
||||||
|
OnProgress: func(imported, total int) {
|
||||||
|
l.Info().Msgf("Maloja import progress: %d/%d", imported, total)
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
|
for _, item := range items {
|
||||||
|
// ... existing artist parsing, time window check (unchanged) ...
|
||||||
|
|
||||||
|
bs.Accept(catalog.SubmitListenOpts{
|
||||||
|
MbzCaller: mbzc,
|
||||||
|
Artist: item.Track.Artists[0],
|
||||||
|
ArtistNames: artists,
|
||||||
|
TrackTitle: item.Track.Title,
|
||||||
|
ReleaseTitle: releaseTitle,
|
||||||
|
Time: ts.Local(),
|
||||||
|
Client: "maloja",
|
||||||
|
UserID: 1,
|
||||||
|
SkipCacheImage: true,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
count, err := bs.Flush()
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("ImportMalojaFile: %w", err)
|
||||||
|
}
|
||||||
|
return finishImport(ctx, filename, count)
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 2. Spotify Importer
|
||||||
|
|
||||||
|
**File**: `internal/importer/spotify.go` — Same pattern: Accept into BulkSubmitter, Flush at end.
|
||||||
|
|
||||||
|
#### 3. LastFM Importer
|
||||||
|
|
||||||
|
**File**: `internal/importer/lastfm.go` — Same pattern. Note: LastFM scrobbles include MBZ IDs, which will pass through to `SubmitListen` on cache miss for proper entity resolution.
|
||||||
|
|
||||||
|
#### 4. ListenBrainz Importer
|
||||||
|
|
||||||
|
**File**: `internal/importer/listenbrainz.go` — Same pattern. ListenBrainz data is the richest (full MBZ IDs, MBID mappings) — cache hits will be common after first import.
|
||||||
|
|
||||||
|
#### 5. Koito Importer
|
||||||
|
|
||||||
|
**File**: `internal/importer/koito.go` — This one currently bypasses `SubmitListen` with direct DB calls. Two options:
|
||||||
|
- **Option A**: Migrate to BulkSubmitter (consistent, benefits from cache)
|
||||||
|
- **Option B**: Leave as-is (it's already fast, Koito exports have pre-resolved IDs)
|
||||||
|
|
||||||
|
Recommend **Option A** for consistency, with the Koito importer becoming the simplest BulkSubmitter user since its data is pre-resolved.
|
||||||
|
|
||||||
|
### Success Criteria
|
||||||
|
|
||||||
|
- [ ] `go build ./...` compiles
|
||||||
|
- [ ] `go test ./...` passes — all existing import tests still pass
|
||||||
|
- [ ] `TestImportMaloja` — 38 listens imported correctly
|
||||||
|
- [ ] `TestImportMaloja_NullAlbum` — null album handled
|
||||||
|
- [ ] `TestImportMaloja_ApiFormat` — list format works
|
||||||
|
- [ ] `TestImportSpotify` — duration data preserved
|
||||||
|
- [ ] `TestImportLastFM` — MBZ IDs resolved
|
||||||
|
- [ ] `TestImportListenBrainz` — MBID mappings applied
|
||||||
|
- [ ] `TestImportKoito` — aliases preserved
|
||||||
|
- [ ] Manual: 49k Maloja import on vo-pc completes in under 30 minutes
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 5: Skip Image Lookups During Import
|
||||||
|
|
||||||
|
### Overview
|
||||||
|
|
||||||
|
Short-circuit `GetArtistImage` and `GetAlbumImage` calls when `SkipCacheImage` is true. Currently these functions still make HTTP calls (or call providers that return "no providers enabled") even when the result won't be used. The existing background tasks (`FetchMissingArtistImages`, `FetchMissingAlbumImages`) will backfill images after import.
|
||||||
|
|
||||||
|
### Changes Required
|
||||||
|
|
||||||
|
#### 1. Early Return in Associate Functions
|
||||||
|
|
||||||
|
**File**: `internal/catalog/associate_artists.go`
|
||||||
|
|
||||||
|
In `resolveAliasOrCreateArtist` (line ~248) and `matchArtistsByNames` (line ~304):
|
||||||
|
|
||||||
|
```go
|
||||||
|
// Before:
|
||||||
|
imgUrl, err := images.GetArtistImage(ctx, images.ArtistImageOpts{...})
|
||||||
|
if err == nil && imgUrl != "" {
|
||||||
|
imgid = uuid.New()
|
||||||
|
if !opts.SkipCacheImage {
|
||||||
|
// download image
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// After:
|
||||||
|
var imgUrl string
|
||||||
|
if !opts.SkipCacheImage {
|
||||||
|
imgUrl, err = images.GetArtistImage(ctx, images.ArtistImageOpts{...})
|
||||||
|
if err == nil && imgUrl != "" {
|
||||||
|
imgid = uuid.New()
|
||||||
|
// download image
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**File**: `internal/catalog/associate_album.go`
|
||||||
|
|
||||||
|
Same pattern in `createOrUpdateAlbumWithMbzReleaseID` (line ~125) and `matchAlbumByTitle` (line ~220).
|
||||||
|
|
||||||
|
### Success Criteria
|
||||||
|
|
||||||
|
- [ ] `go build ./...` compiles
|
||||||
|
- [ ] `go test ./...` passes
|
||||||
|
- [ ] No `GetArtistImage`/`GetAlbumImage` calls during import (verify via log: no "No image providers" warnings)
|
||||||
|
- [ ] Background tasks still fetch images after import completes
|
||||||
|
- [ ] Live scrobbles (SkipCacheImage=false) still fetch images normally
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Performance Estimates
|
||||||
|
|
||||||
|
| Scenario | Current | After Phase 1 | After All Phases |
|
||||||
|
|---|---|---|---|
|
||||||
|
| Repeated live scrobble | ~19 queries | 2 queries (cache hit) | 2 queries |
|
||||||
|
| New live scrobble | ~19 queries | ~19 queries + 1 cache write | ~19 queries + 1 cache write |
|
||||||
|
| 49k Maloja import | ~24 hours | ~12 hours (cache helps repeats) | ~15-30 minutes |
|
||||||
|
| 49k import (second run) | ~24 hours | ~20 minutes (all cache hits) | ~5 minutes |
|
||||||
|
|
||||||
|
## Implementation Order
|
||||||
|
|
||||||
|
1. **Phase 1** (track_lookup) — standalone, immediate benefit for all scrobbles
|
||||||
|
2. **Phase 5** (skip image lookups) — standalone, no dependencies, quick win
|
||||||
|
3. **Phase 2** (SaveListensBatch) — DB layer, needed by Phase 3
|
||||||
|
4. **Phase 3** (BulkSubmitter) — the core, depends on Phase 1 + 2
|
||||||
|
5. **Phase 4** (migrate importers) — depends on Phase 3
|
||||||
|
|
||||||
|
Phases 1 and 5 can be done first as independent PRs. Phases 2-4 are one PR.
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
- ListenBrainz architecture: https://listenbrainz.readthedocs.io/en/latest/developers/architecture.html
|
||||||
|
- ListenBrainz MBID mapping: https://listenbrainz.readthedocs.io/en/latest/developers/mapping.html
|
||||||
|
- MusicBrainz rate limiting: https://musicbrainz.org/doc/MusicBrainz_API/Rate_Limiting
|
||||||
|
- PR 1 (importer fixes): https://github.com/gabehf/Koito/pull/228
|
||||||
|
- PR 2 (MBZ search): https://github.com/gabehf/Koito/pull/229
|
||||||
|
- Koito native importer (bypass pattern): `internal/importer/koito.go`
|
||||||
|
- Current SubmitListen: `internal/catalog/catalog.go:70`
|
||||||
|
- pgx CopyFrom docs: https://pkg.go.dev/github.com/jackc/pgx/v5#Conn.CopyFrom
|
||||||
Loading…
Add table
Add a link
Reference in a new issue