feat: custom artist separator regex

feat/custom-artist-sep
Gabe Farrell 3 weeks ago
parent 164a9dc56f
commit acb362e6ad

@ -40,6 +40,9 @@ If the environment variable is defined without **and** with the suffix at the sa
##### KOITO_LOG_LEVEL
- Default: `info`
- Description: One of `debug | info | warn | error | fatal`
##### KOITO_ARTIST_SEPARATORS_REGEX
- Default: `\s+·\s+`
- Description: The list of regex patterns Koito will use to separate artist strings, separated by two semicolons (`;;`).
##### KOITO_MUSICBRAINZ_URL
- Default: `https://musicbrainz.org`
- Description: The URL Koito will use to contact MusicBrainz. Replace this value if you have your own MusicBrainz mirror.

@ -62,7 +62,7 @@ func AssociateArtists(ctx context.Context, d db.DB, opts AssociateArtistsOpts) (
}
if len(result) < 1 {
allArtists := slices.Concat(opts.ArtistNames, ParseArtists(opts.ArtistName, opts.TrackTitle))
allArtists := slices.Concat(opts.ArtistNames, ParseArtists(opts.ArtistName, opts.TrackTitle, cfg.ArtistSeparators()))
l.Debug().Msgf("Associating artists by artist name(s) %v and track title '%s'", allArtists, opts.TrackTitle)
fallbackMatches, err := matchArtistsByNames(ctx, allArtists, nil, d, opts)
if err != nil {
@ -180,7 +180,7 @@ func matchArtistsByMBID(ctx context.Context, d db.DB, opts AssociateArtistsOpts,
}
if len(opts.ArtistNames) < 1 {
opts.ArtistNames = slices.Concat(opts.ArtistNames, ParseArtists(opts.ArtistName, opts.TrackTitle))
opts.ArtistNames = slices.Concat(opts.ArtistNames, ParseArtists(opts.ArtistName, opts.TrackTitle, cfg.ArtistSeparators()))
}
a, err = resolveAliasOrCreateArtist(ctx, id, opts.ArtistNames, d, opts)

@ -201,21 +201,18 @@ func buildArtistStr(artists []*models.Artist) string {
var (
// Bracketed feat patterns
bracketFeatPatterns = []*regexp.Regexp{
regexp.MustCompile(`(?i)\(feat\. ([^)]*)\)`),
regexp.MustCompile(`(?i)\[feat\. ([^\]]*)\]`),
regexp.MustCompile(`(?i)\([fF]eat\. ([^)]*)\)`),
regexp.MustCompile(`(?i)\[[fF]eat\. ([^\]]*)\]`),
}
// Inline feat (not in brackets)
inlineFeatPattern = regexp.MustCompile(`(?i)feat\. ([^()\[\]]+)$`)
inlineFeatPattern = regexp.MustCompile(`(?i)[fF]eat\. ([^()\[\]]+)$`)
// Delimiters only used inside feat. sections
featSplitDelimiters = regexp.MustCompile(`(?i)\s*(?:,|&|and|·)\s*`)
// Delimiter for separating artists in main string (rare but real usage)
mainArtistDotSplitter = regexp.MustCompile(`\s+·\s+`)
)
// ParseArtists extracts all contributing artist names from the artist and title strings
func ParseArtists(artist string, title string) []string {
func ParseArtists(artist string, title string, addlSeparators []*regexp.Regexp) []string {
seen := make(map[string]struct{})
var out []string
@ -230,12 +227,9 @@ func ParseArtists(artist string, title string) []string {
}
}
foundFeat := false
// Extract bracketed features from artist
for _, re := range bracketFeatPatterns {
if matches := re.FindStringSubmatch(artist); matches != nil {
foundFeat = true
artist = strings.Replace(artist, matches[0], "", 1)
for _, name := range featSplitDelimiters.Split(matches[1], -1) {
add(name)
@ -244,7 +238,6 @@ func ParseArtists(artist string, title string) []string {
}
// Extract inline feat. from artist
if matches := inlineFeatPattern.FindStringSubmatch(artist); matches != nil {
foundFeat = true
artist = strings.Replace(artist, matches[0], "", 1)
for _, name := range featSplitDelimiters.Split(matches[1], -1) {
add(name)
@ -252,14 +245,19 @@ func ParseArtists(artist string, title string) []string {
}
// Add base artist(s)
if foundFeat {
add(strings.TrimSpace(artist))
} else {
// Only split on " · " in base artist string
for _, name := range mainArtistDotSplitter.Split(artist, -1) {
l1 := len(out)
for _, re := range addlSeparators {
for _, name := range re.Split(artist, -1) {
if name == artist {
continue
}
add(name)
}
}
// Only add the full artist string if no splitters were matched
if l1 == len(out) {
add(artist)
}
// Extract features from title
for _, re := range bracketFeatPatterns {

@ -5,6 +5,7 @@ import (
"fmt"
"log"
"os"
"regexp"
"testing"
"time"
@ -167,15 +168,15 @@ func getTestGetenv(resource *dockertest.Resource) func(string) string {
func truncateTestData(t *testing.T) {
err := store.Exec(context.Background(),
`TRUNCATE
artists,
`TRUNCATE
artists,
artist_aliases,
tracks,
artist_tracks,
releases,
artist_releases,
tracks,
artist_tracks,
releases,
artist_releases,
release_aliases,
listens
listens
RESTART IDENTITY CASCADE`)
require.NoError(t, err)
}
@ -184,23 +185,23 @@ func setupTestDataWithMbzIDs(t *testing.T) {
truncateTestData(t)
err := store.Exec(context.Background(),
`INSERT INTO artists (musicbrainz_id)
`INSERT INTO artists (musicbrainz_id)
VALUES ('00000000-0000-0000-0000-000000000001')`)
require.NoError(t, err)
err = store.Exec(context.Background(),
`INSERT INTO artist_aliases (artist_id, alias, source, is_primary)
`INSERT INTO artist_aliases (artist_id, alias, source, is_primary)
VALUES (1, 'ATARASHII GAKKO!', 'Testing', true)`)
require.NoError(t, err)
err = store.Exec(context.Background(),
`INSERT INTO releases (musicbrainz_id)
`INSERT INTO releases (musicbrainz_id)
VALUES ('00000000-0000-0000-0000-000000000101')`)
require.NoError(t, err)
err = store.Exec(context.Background(),
`INSERT INTO release_aliases (release_id, alias, source, is_primary)
`INSERT INTO release_aliases (release_id, alias, source, is_primary)
VALUES (1, 'AG! Calling', 'Testing', true)`)
require.NoError(t, err)
err = store.Exec(context.Background(),
`INSERT INTO artist_releases (artist_id, release_id)
`INSERT INTO artist_releases (artist_id, release_id)
VALUES (1, 1)`)
require.NoError(t, err)
err = store.Exec(context.Background(),
@ -221,23 +222,23 @@ func setupTestDataSansMbzIDs(t *testing.T) {
truncateTestData(t)
err := store.Exec(context.Background(),
`INSERT INTO artists (musicbrainz_id)
`INSERT INTO artists (musicbrainz_id)
VALUES (NULL)`)
require.NoError(t, err)
err = store.Exec(context.Background(),
`INSERT INTO artist_aliases (artist_id, alias, source, is_primary)
`INSERT INTO artist_aliases (artist_id, alias, source, is_primary)
VALUES (1, 'ATARASHII GAKKO!', 'Testing', true)`)
require.NoError(t, err)
err = store.Exec(context.Background(),
`INSERT INTO releases (musicbrainz_id)
`INSERT INTO releases (musicbrainz_id)
VALUES (NULL)`)
require.NoError(t, err)
err = store.Exec(context.Background(),
`INSERT INTO release_aliases (release_id, alias, source, is_primary)
`INSERT INTO release_aliases (release_id, alias, source, is_primary)
VALUES (1, 'AG! Calling', 'Testing', true)`)
require.NoError(t, err)
err = store.Exec(context.Background(),
`INSERT INTO artist_releases (artist_id, release_id)
`INSERT INTO artist_releases (artist_id, release_id)
VALUES (1, 1)`)
require.NoError(t, err)
err = store.Exec(context.Background(),
@ -358,10 +359,16 @@ func TestArtistStringParse(t *testing.T) {
// artists in both
{"Daft Punk feat. Julian Casablancas", "Instant Crush (feat. Julian Casablancas)"}: {"Daft Punk", "Julian Casablancas"},
{"Paramore (feat. Joy Williams)", "Hate to See Your Heart Break feat. Joy Williams"}: {"Paramore", "Joy Williams"},
{"MINSU", "오해 금지 (Feat. BIG Naughty)"}: {"MINSU", "BIG Naughty"},
{"MINSU", "오해 금지 [Feat. BIG Naughty]"}: {"MINSU", "BIG Naughty"},
{"MINSU", "오해 금지 Feat. BIG Naughty"}: {"MINSU", "BIG Naughty"},
// custom separator
{"MIMiNARI//楠木ともり", "眠れない"}: {"MIMiNARI", "楠木ともり"},
}
for in, out := range cases {
artists := catalog.ParseArtists(in.Name, in.Title)
artists := catalog.ParseArtists(in.Name, in.Title, []*regexp.Regexp{regexp.MustCompile(`\s*//\s*`), regexp.MustCompile(`\s+·\s+`)})
assert.ElementsMatch(t, out, artists)
}
}

@ -3,6 +3,7 @@ package cfg
import (
"errors"
"fmt"
"regexp"
"strconv"
"strings"
"sync"
@ -45,6 +46,7 @@ const (
IMPORT_BEFORE_UNIX_ENV = "KOITO_IMPORT_BEFORE_UNIX"
IMPORT_AFTER_UNIX_ENV = "KOITO_IMPORT_AFTER_UNIX"
FETCH_IMAGES_DURING_IMPORT_ENV = "KOITO_FETCH_IMAGES_DURING_IMPORT"
ARTIST_SEPARATORS_ENV = "KOITO_ARTIST_SEPARATORS_REGEX"
)
type config struct {
@ -80,6 +82,7 @@ type config struct {
userAgent string
importBefore time.Time
importAfter time.Time
artistSeparators []*regexp.Regexp
}
var (
@ -189,6 +192,18 @@ func loadConfig(getenv func(string) string, version string) (*config, error) {
rawCors := getenv(CORS_ORIGINS_ENV)
cfg.allowedOrigins = strings.Split(rawCors, ",")
if getenv(ARTIST_SEPARATORS_ENV) != "" {
for pattern := range strings.SplitSeq(getenv(ARTIST_SEPARATORS_ENV), ";;") {
regex, err := regexp.Compile(pattern)
if err != nil {
return nil, fmt.Errorf("failed to compile regex pattern %s", pattern)
}
cfg.artistSeparators = append(cfg.artistSeparators, regex)
}
} else {
cfg.artistSeparators = []*regexp.Regexp{regexp.MustCompile(`\s+·\s+`)}
}
switch strings.ToLower(getenv(LOG_LEVEL_ENV)) {
case "debug":
cfg.logLevel = 0
@ -388,3 +403,9 @@ func FetchImagesDuringImport() bool {
defer lock.RUnlock()
return globalConfig.fetchImageDuringImport
}
func ArtistSeparators() []*regexp.Regexp {
lock.RLock()
defer lock.RUnlock()
return globalConfig.artistSeparators
}

Loading…
Cancel
Save