feat: custom artist separator regex

feat/custom-artist-sep
Gabe Farrell 3 weeks ago
parent 164a9dc56f
commit acb362e6ad

@ -40,6 +40,9 @@ If the environment variable is defined without **and** with the suffix at the sa
##### KOITO_LOG_LEVEL ##### KOITO_LOG_LEVEL
- Default: `info` - Default: `info`
- Description: One of `debug | info | warn | error | fatal` - Description: One of `debug | info | warn | error | fatal`
##### KOITO_ARTIST_SEPARATORS_REGEX
- Default: `\s+·\s+`
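- Description: A list of regex patterns Koito uses to split an artist string into individual artists. Separate multiple patterns with two semicolons (`;;`), e.g. `\s+·\s+;;\s*//\s*`. Setting this variable replaces the default pattern rather than adding to it.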
##### KOITO_MUSICBRAINZ_URL ##### KOITO_MUSICBRAINZ_URL
- Default: `https://musicbrainz.org` - Default: `https://musicbrainz.org`
- Description: The URL Koito will use to contact MusicBrainz. Replace this value if you have your own MusicBrainz mirror. - Description: The URL Koito will use to contact MusicBrainz. Replace this value if you have your own MusicBrainz mirror.

@ -62,7 +62,7 @@ func AssociateArtists(ctx context.Context, d db.DB, opts AssociateArtistsOpts) (
} }
if len(result) < 1 { if len(result) < 1 {
allArtists := slices.Concat(opts.ArtistNames, ParseArtists(opts.ArtistName, opts.TrackTitle)) allArtists := slices.Concat(opts.ArtistNames, ParseArtists(opts.ArtistName, opts.TrackTitle, cfg.ArtistSeparators()))
l.Debug().Msgf("Associating artists by artist name(s) %v and track title '%s'", allArtists, opts.TrackTitle) l.Debug().Msgf("Associating artists by artist name(s) %v and track title '%s'", allArtists, opts.TrackTitle)
fallbackMatches, err := matchArtistsByNames(ctx, allArtists, nil, d, opts) fallbackMatches, err := matchArtistsByNames(ctx, allArtists, nil, d, opts)
if err != nil { if err != nil {
@ -180,7 +180,7 @@ func matchArtistsByMBID(ctx context.Context, d db.DB, opts AssociateArtistsOpts,
} }
if len(opts.ArtistNames) < 1 { if len(opts.ArtistNames) < 1 {
opts.ArtistNames = slices.Concat(opts.ArtistNames, ParseArtists(opts.ArtistName, opts.TrackTitle)) opts.ArtistNames = slices.Concat(opts.ArtistNames, ParseArtists(opts.ArtistName, opts.TrackTitle, cfg.ArtistSeparators()))
} }
a, err = resolveAliasOrCreateArtist(ctx, id, opts.ArtistNames, d, opts) a, err = resolveAliasOrCreateArtist(ctx, id, opts.ArtistNames, d, opts)

@ -201,21 +201,18 @@ func buildArtistStr(artists []*models.Artist) string {
var ( var (
// Bracketed feat patterns // Bracketed feat patterns
bracketFeatPatterns = []*regexp.Regexp{ bracketFeatPatterns = []*regexp.Regexp{
regexp.MustCompile(`(?i)\(feat\. ([^)]*)\)`), regexp.MustCompile(`(?i)\([fF]eat\. ([^)]*)\)`),
regexp.MustCompile(`(?i)\[feat\. ([^\]]*)\]`), regexp.MustCompile(`(?i)\[[fF]eat\. ([^\]]*)\]`),
} }
// Inline feat (not in brackets) // Inline feat (not in brackets)
inlineFeatPattern = regexp.MustCompile(`(?i)feat\. ([^()\[\]]+)$`) inlineFeatPattern = regexp.MustCompile(`(?i)[fF]eat\. ([^()\[\]]+)$`)
// Delimiters only used inside feat. sections // Delimiters only used inside feat. sections
featSplitDelimiters = regexp.MustCompile(`(?i)\s*(?:,|&|and|·)\s*`) featSplitDelimiters = regexp.MustCompile(`(?i)\s*(?:,|&|and|·)\s*`)
// Delimiter for separating artists in main string (rare but real usage)
mainArtistDotSplitter = regexp.MustCompile(`\s+·\s+`)
) )
// ParseArtists extracts all contributing artist names from the artist and title strings // ParseArtists extracts all contributing artist names from the artist and title strings
func ParseArtists(artist string, title string) []string { func ParseArtists(artist string, title string, addlSeparators []*regexp.Regexp) []string {
seen := make(map[string]struct{}) seen := make(map[string]struct{})
var out []string var out []string
@ -230,12 +227,9 @@ func ParseArtists(artist string, title string) []string {
} }
} }
foundFeat := false
// Extract bracketed features from artist // Extract bracketed features from artist
for _, re := range bracketFeatPatterns { for _, re := range bracketFeatPatterns {
if matches := re.FindStringSubmatch(artist); matches != nil { if matches := re.FindStringSubmatch(artist); matches != nil {
foundFeat = true
artist = strings.Replace(artist, matches[0], "", 1) artist = strings.Replace(artist, matches[0], "", 1)
for _, name := range featSplitDelimiters.Split(matches[1], -1) { for _, name := range featSplitDelimiters.Split(matches[1], -1) {
add(name) add(name)
@ -244,7 +238,6 @@ func ParseArtists(artist string, title string) []string {
} }
// Extract inline feat. from artist // Extract inline feat. from artist
if matches := inlineFeatPattern.FindStringSubmatch(artist); matches != nil { if matches := inlineFeatPattern.FindStringSubmatch(artist); matches != nil {
foundFeat = true
artist = strings.Replace(artist, matches[0], "", 1) artist = strings.Replace(artist, matches[0], "", 1)
for _, name := range featSplitDelimiters.Split(matches[1], -1) { for _, name := range featSplitDelimiters.Split(matches[1], -1) {
add(name) add(name)
@ -252,14 +245,19 @@ func ParseArtists(artist string, title string) []string {
} }
// Add base artist(s) // Add base artist(s)
if foundFeat { l1 := len(out)
add(strings.TrimSpace(artist)) for _, re := range addlSeparators {
} else { for _, name := range re.Split(artist, -1) {
// Only split on " · " in base artist string if name == artist {
for _, name := range mainArtistDotSplitter.Split(artist, -1) { continue
}
add(name) add(name)
} }
} }
// Only add the full artist string if no splitters were matched
if l1 == len(out) {
add(artist)
}
// Extract features from title // Extract features from title
for _, re := range bracketFeatPatterns { for _, re := range bracketFeatPatterns {

@ -5,6 +5,7 @@ import (
"fmt" "fmt"
"log" "log"
"os" "os"
"regexp"
"testing" "testing"
"time" "time"
@ -167,15 +168,15 @@ func getTestGetenv(resource *dockertest.Resource) func(string) string {
func truncateTestData(t *testing.T) { func truncateTestData(t *testing.T) {
err := store.Exec(context.Background(), err := store.Exec(context.Background(),
`TRUNCATE `TRUNCATE
artists, artists,
artist_aliases, artist_aliases,
tracks, tracks,
artist_tracks, artist_tracks,
releases, releases,
artist_releases, artist_releases,
release_aliases, release_aliases,
listens listens
RESTART IDENTITY CASCADE`) RESTART IDENTITY CASCADE`)
require.NoError(t, err) require.NoError(t, err)
} }
@ -184,23 +185,23 @@ func setupTestDataWithMbzIDs(t *testing.T) {
truncateTestData(t) truncateTestData(t)
err := store.Exec(context.Background(), err := store.Exec(context.Background(),
`INSERT INTO artists (musicbrainz_id) `INSERT INTO artists (musicbrainz_id)
VALUES ('00000000-0000-0000-0000-000000000001')`) VALUES ('00000000-0000-0000-0000-000000000001')`)
require.NoError(t, err) require.NoError(t, err)
err = store.Exec(context.Background(), err = store.Exec(context.Background(),
`INSERT INTO artist_aliases (artist_id, alias, source, is_primary) `INSERT INTO artist_aliases (artist_id, alias, source, is_primary)
VALUES (1, 'ATARASHII GAKKO!', 'Testing', true)`) VALUES (1, 'ATARASHII GAKKO!', 'Testing', true)`)
require.NoError(t, err) require.NoError(t, err)
err = store.Exec(context.Background(), err = store.Exec(context.Background(),
`INSERT INTO releases (musicbrainz_id) `INSERT INTO releases (musicbrainz_id)
VALUES ('00000000-0000-0000-0000-000000000101')`) VALUES ('00000000-0000-0000-0000-000000000101')`)
require.NoError(t, err) require.NoError(t, err)
err = store.Exec(context.Background(), err = store.Exec(context.Background(),
`INSERT INTO release_aliases (release_id, alias, source, is_primary) `INSERT INTO release_aliases (release_id, alias, source, is_primary)
VALUES (1, 'AG! Calling', 'Testing', true)`) VALUES (1, 'AG! Calling', 'Testing', true)`)
require.NoError(t, err) require.NoError(t, err)
err = store.Exec(context.Background(), err = store.Exec(context.Background(),
`INSERT INTO artist_releases (artist_id, release_id) `INSERT INTO artist_releases (artist_id, release_id)
VALUES (1, 1)`) VALUES (1, 1)`)
require.NoError(t, err) require.NoError(t, err)
err = store.Exec(context.Background(), err = store.Exec(context.Background(),
@ -221,23 +222,23 @@ func setupTestDataSansMbzIDs(t *testing.T) {
truncateTestData(t) truncateTestData(t)
err := store.Exec(context.Background(), err := store.Exec(context.Background(),
`INSERT INTO artists (musicbrainz_id) `INSERT INTO artists (musicbrainz_id)
VALUES (NULL)`) VALUES (NULL)`)
require.NoError(t, err) require.NoError(t, err)
err = store.Exec(context.Background(), err = store.Exec(context.Background(),
`INSERT INTO artist_aliases (artist_id, alias, source, is_primary) `INSERT INTO artist_aliases (artist_id, alias, source, is_primary)
VALUES (1, 'ATARASHII GAKKO!', 'Testing', true)`) VALUES (1, 'ATARASHII GAKKO!', 'Testing', true)`)
require.NoError(t, err) require.NoError(t, err)
err = store.Exec(context.Background(), err = store.Exec(context.Background(),
`INSERT INTO releases (musicbrainz_id) `INSERT INTO releases (musicbrainz_id)
VALUES (NULL)`) VALUES (NULL)`)
require.NoError(t, err) require.NoError(t, err)
err = store.Exec(context.Background(), err = store.Exec(context.Background(),
`INSERT INTO release_aliases (release_id, alias, source, is_primary) `INSERT INTO release_aliases (release_id, alias, source, is_primary)
VALUES (1, 'AG! Calling', 'Testing', true)`) VALUES (1, 'AG! Calling', 'Testing', true)`)
require.NoError(t, err) require.NoError(t, err)
err = store.Exec(context.Background(), err = store.Exec(context.Background(),
`INSERT INTO artist_releases (artist_id, release_id) `INSERT INTO artist_releases (artist_id, release_id)
VALUES (1, 1)`) VALUES (1, 1)`)
require.NoError(t, err) require.NoError(t, err)
err = store.Exec(context.Background(), err = store.Exec(context.Background(),
@ -358,10 +359,16 @@ func TestArtistStringParse(t *testing.T) {
// artists in both // artists in both
{"Daft Punk feat. Julian Casablancas", "Instant Crush (feat. Julian Casablancas)"}: {"Daft Punk", "Julian Casablancas"}, {"Daft Punk feat. Julian Casablancas", "Instant Crush (feat. Julian Casablancas)"}: {"Daft Punk", "Julian Casablancas"},
{"Paramore (feat. Joy Williams)", "Hate to See Your Heart Break feat. Joy Williams"}: {"Paramore", "Joy Williams"}, {"Paramore (feat. Joy Williams)", "Hate to See Your Heart Break feat. Joy Williams"}: {"Paramore", "Joy Williams"},
{"MINSU", "오해 금지 (Feat. BIG Naughty)"}: {"MINSU", "BIG Naughty"},
{"MINSU", "오해 금지 [Feat. BIG Naughty]"}: {"MINSU", "BIG Naughty"},
{"MINSU", "오해 금지 Feat. BIG Naughty"}: {"MINSU", "BIG Naughty"},
// custom separator
{"MIMiNARI//楠木ともり", "眠れない"}: {"MIMiNARI", "楠木ともり"},
} }
for in, out := range cases { for in, out := range cases {
artists := catalog.ParseArtists(in.Name, in.Title) artists := catalog.ParseArtists(in.Name, in.Title, []*regexp.Regexp{regexp.MustCompile(`\s*//\s*`), regexp.MustCompile(`\s+·\s+`)})
assert.ElementsMatch(t, out, artists) assert.ElementsMatch(t, out, artists)
} }
} }

@ -3,6 +3,7 @@ package cfg
import ( import (
"errors" "errors"
"fmt" "fmt"
"regexp"
"strconv" "strconv"
"strings" "strings"
"sync" "sync"
@ -45,6 +46,7 @@ const (
IMPORT_BEFORE_UNIX_ENV = "KOITO_IMPORT_BEFORE_UNIX" IMPORT_BEFORE_UNIX_ENV = "KOITO_IMPORT_BEFORE_UNIX"
IMPORT_AFTER_UNIX_ENV = "KOITO_IMPORT_AFTER_UNIX" IMPORT_AFTER_UNIX_ENV = "KOITO_IMPORT_AFTER_UNIX"
FETCH_IMAGES_DURING_IMPORT_ENV = "KOITO_FETCH_IMAGES_DURING_IMPORT" FETCH_IMAGES_DURING_IMPORT_ENV = "KOITO_FETCH_IMAGES_DURING_IMPORT"
ARTIST_SEPARATORS_ENV = "KOITO_ARTIST_SEPARATORS_REGEX"
) )
type config struct { type config struct {
@ -80,6 +82,7 @@ type config struct {
userAgent string userAgent string
importBefore time.Time importBefore time.Time
importAfter time.Time importAfter time.Time
artistSeparators []*regexp.Regexp
} }
var ( var (
@ -189,6 +192,18 @@ func loadConfig(getenv func(string) string, version string) (*config, error) {
rawCors := getenv(CORS_ORIGINS_ENV) rawCors := getenv(CORS_ORIGINS_ENV)
cfg.allowedOrigins = strings.Split(rawCors, ",") cfg.allowedOrigins = strings.Split(rawCors, ",")
if getenv(ARTIST_SEPARATORS_ENV) != "" {
for pattern := range strings.SplitSeq(getenv(ARTIST_SEPARATORS_ENV), ";;") {
regex, err := regexp.Compile(pattern)
if err != nil {
return nil, fmt.Errorf("failed to compile regex pattern %s", pattern)
}
cfg.artistSeparators = append(cfg.artistSeparators, regex)
}
} else {
cfg.artistSeparators = []*regexp.Regexp{regexp.MustCompile(`\s+·\s+`)}
}
switch strings.ToLower(getenv(LOG_LEVEL_ENV)) { switch strings.ToLower(getenv(LOG_LEVEL_ENV)) {
case "debug": case "debug":
cfg.logLevel = 0 cfg.logLevel = 0
@ -388,3 +403,9 @@ func FetchImagesDuringImport() bool {
defer lock.RUnlock() defer lock.RUnlock()
return globalConfig.fetchImageDuringImport return globalConfig.fetchImageDuringImport
} }
// ArtistSeparators returns the compiled regular expressions stored in the
// global config's artistSeparators field. The field is read while holding
// the package read lock.
//
// The returned slice is the config's own slice, not a copy, so callers
// should treat it as read-only.
func ArtistSeparators() []*regexp.Regexp {
lock.RLock()
defer lock.RUnlock()
return globalConfig.artistSeparators
}

Loading…
Cancel
Save