From c6e28a4f75baac93c931333a1e21a1a21cbbd179 Mon Sep 17 00:00:00 2001 From: zebra Date: Wed, 10 Jun 2026 18:53:27 -0700 Subject: [PATCH] fix: harden --repair against music videos; first-artist folder for single URLs --repair was clobbering good tags and erroring on real libraries: - Validate the parsed id per source (YouTube 11-char, SoundCloud numeric) so junk ids from bracketed descriptors ([Official Video]) are skipped, not queried. - Skip files whose source returns no real music metadata (no album/year, e.g. music videos) instead of overwriting clean tags with channel/decorated titles. - Year from release info only (sane 1000-2100), never upload_date (which gave wrong years for old songs and bogus values like 6577). - album/year are authoritative; artist/title are fill-missing-only (no clobber). Also: download_single now uses the first artist for the folder (matching the search/playlist paths) so single-URL downloads stop creating multi-artist dirs. Co-Authored-By: Claude Opus 4.8 --- README.md | 14 ++++-- musicfetch | 67 +++++++++++++++++++------ tests/test_repair.py | 117 +++++++++++++++++++++++++++++-------------- 3 files changed, 140 insertions(+), 58 deletions(-) diff --git a/README.md b/README.md index 810a67b..7a7920e 100644 --- a/README.md +++ b/README.md @@ -125,10 +125,16 @@ export LIDARR_API_KEY="your-lidarr-api-key" `--repair` walks `///` (the `youtube`/`soundcloud`/… download folders — Lidarr album folders are skipped), re-fetches authoritative metadata for each -file using the `[id]` in its filename, and fixes tags (album, year, artist, title). Useful -when downloads landed with missing album or wrong year. It re-queries the source over the -network, so run it occasionally, not constantly. Requires `mutagen` (a yt-dlp dependency, -usually already present). CLI-only — not exposed via the REST API. +file using the `[id]` in its filename, and fixes tags. Useful when downloads landed with +missing album or wrong year. + +It is deliberately **conservative**: it overwrites **album** and **year** (the usual +breakage), but only *fills in* a missing **artist**/**title** — it never overwrites existing +artist/title with channel names or decorated video titles. Files whose source returns no real +music metadata (no album and no release year — e.g. plain music videos) are left untouched. + +It re-queries the source over the network, so run it occasionally, not constantly. Requires +`mutagen` (a yt-dlp dependency, usually already present). CLI-only — not exposed via the REST API. ```bash # Preview what would change (writes nothing) diff --git a/musicfetch b/musicfetch index 354f65f..083a5be 100755 --- a/musicfetch +++ b/musicfetch @@ -760,7 +760,9 @@ def download_single(url: str, root: str, quality: str, dry_run: bool) -> dict: artist = get_artist_from_metadata(meta) if meta else "Unknown Artist" title = (meta or {}).get("title", "") source = _sanitize_source((meta or {}).get("extractor", "")) if meta else "downloads" - target = os.path.join(root, artist, source) + # First artist only for the folder (matches the search/playlist paths). + artist_dir = artist.split(",")[0].strip() or "Unknown Artist" + target = os.path.join(root, artist_dir, source) ok = yt_download(url, target, quality, dry_run) return {"title": title, "artist": artist, "ok": ok} @@ -827,18 +829,24 @@ def _repair_probe_url(source: str, vid: str): return None -def _desired_tags(meta: dict) -> dict: - """Authoritative tags from yt-dlp metadata (omit empties).""" - year = (str(meta.get("release_year") or "") - or (meta.get("release_date") or "")[:4] - or (meta.get("upload_date") or "")[:4]) - fields = { - "artist": get_artist_from_metadata(meta), - "title": meta.get("title", ""), - "album": meta.get("album", ""), - "date": year, - } - return {k: v for k, v in fields.items() if v and v != "Unknown Artist"} +def _repair_id_ok(source: str, vid: str) -> bool: + """True if the parsed id matches the source's id format (avoids querying + junk ids pulled from bracketed descriptors like '[Official Video]').""" + if source == "youtube": + return bool(re.fullmatch(r"[A-Za-z0-9_-]{11}", vid)) + if source == "soundcloud": + return vid.isdigit() + return False + + +def _valid_year(meta: dict) -> str: + """A plausible release year from metadata, or '' . Uses release info only — + NOT upload_date, which is the upload year, not the song's year.""" + for v in (meta.get("release_year"), (meta.get("release_date") or "")[:4]): + s = str(v or "") + if s.isdigit() and 1000 <= int(s) <= 2100: + return s + return "" def _open_audio(path: str): @@ -868,12 +876,19 @@ def _read_tag(audio, key_map, field: str) -> str: def repair_file(path: str, source: str, dry_run: bool) -> list[str]: - """Re-tag one file from source metadata. Returns the list of changed fields.""" + """Re-tag one file from source metadata. Conservative: fixes album/year + (the common breakage) and only fills MISSING artist/title — never clobbers + existing tags with channel names or decorated music-video titles. Files whose + source has no real music metadata (no album/year, e.g. music videos) are left + untouched. Returns the list of changed fields.""" parsed = _parse_track_file(os.path.basename(path)) if not parsed: dbg(f"skip (no id): {path}") return [] _, vid = parsed + if not _repair_id_ok(source, vid): + dbg(f"skip (bad {source} id '{vid}'): {path}") + return [] url = _repair_probe_url(source, vid) if not url: dbg(f"skip (source '{source}' not re-queryable): {path}") @@ -882,7 +897,13 @@ def repair_file(path: str, source: str, dry_run: bool) -> list[str]: if not meta: dbg(f"skip (no metadata): {path}") return [] - desired = _desired_tags(meta) + + album = (meta.get("album") or "").strip() + year = _valid_year(meta) + if not album and not year: + dbg(f"skip (no music metadata, likely a video): {path}") + return [] + try: audio, key_map = _open_audio(path) except Exception as e: # noqa: BLE001 @@ -890,8 +911,22 @@ def repair_file(path: str, source: str, dry_run: bool) -> list[str]: return [] if audio is None: return [] + + # album/year are authoritative (overwrite); artist/title fill-missing only. + updates = {} + if album: + updates["album"] = album + if year: + updates["date"] = year + artist = get_artist_from_metadata(meta) + if artist and artist != "Unknown Artist" and not _read_tag(audio, key_map, "artist"): + updates["artist"] = artist + title = meta.get("title", "") + if title and not _read_tag(audio, key_map, "title"): + updates["title"] = title + changed = [] - for field, value in desired.items(): + for field, value in updates.items(): if _read_tag(audio, key_map, field) != value: changed.append(f"{field}={value}") if not dry_run: diff --git a/tests/test_repair.py b/tests/test_repair.py index 0baf8d6..8c333f2 100644 --- a/tests/test_repair.py +++ b/tests/test_repair.py @@ -1,13 +1,15 @@ import server.mf # noqa: F401 — loads musicfetch, registers musicfetch_core import musicfetch_core as mf +YT_ID = "dQw4w9WgXcQ" # valid 11-char YouTube id + # ---- _is_source_dir ---- def test_is_source_dir(): assert mf._is_source_dir("youtube") is True assert mf._is_source_dir("soundcloud") is True assert mf._is_source_dir("downloads") is True - assert mf._is_source_dir("Discovery") is False # Lidarr album folder + assert mf._is_source_dir("Discovery") is False # Lidarr album folder assert mf._is_source_dir("Random Access Memories") is False assert mf._is_source_dir("") is False @@ -16,38 +18,39 @@ def test_is_source_dir(): def test_parse_track_file(): assert mf._parse_track_file("Under My Skin [nGSNF2l44Zc].opus") == ("Under My Skin", "nGSNF2l44Zc") assert mf._parse_track_file("Ignomon [2202690443].m4a") == ("Ignomon", "2202690443") + # greedy title: real id is the LAST bracket + assert mf._parse_track_file("WHO GON' SLIDE [Official Music Video] [AxjP9s6J3uY].opus") \ + == ("WHO GON' SLIDE [Official Music Video]", "AxjP9s6J3uY") assert mf._parse_track_file("no-id-here.opus") is None assert mf._parse_track_file("cover.jpg") is None +# ---- _repair_id_ok ---- +def test_repair_id_ok(): + assert mf._repair_id_ok("youtube", YT_ID) is True + assert mf._repair_id_ok("youtube", "Official Video") is False # space, wrong length + assert mf._repair_id_ok("youtube", "Cover") is False + assert mf._repair_id_ok("soundcloud", "2202690443") is True + assert mf._repair_id_ok("soundcloud", "abc") is False + assert mf._repair_id_ok("bandcamp", "x") is False + + +# ---- _valid_year ---- +def test_valid_year(): + assert mf._valid_year({"release_year": 2001}) == "2001" + assert mf._valid_year({"release_date": "1976-09-10"}) == "1976" + assert mf._valid_year({"upload_date": "20110101"}) == "" # upload date ignored + assert mf._valid_year({"release_year": 6577}) == "" # out of range + assert mf._valid_year({}) == "" + + # ---- _repair_probe_url ---- def test_repair_probe_url(): - assert mf._repair_probe_url("youtube", "vid") == "https://music.youtube.com/watch?v=vid" + assert mf._repair_probe_url("youtube", YT_ID) == f"https://music.youtube.com/watch?v={YT_ID}" assert mf._repair_probe_url("soundcloud", "123") == "https://api.soundcloud.com/tracks/123" assert mf._repair_probe_url("bandcamp", "x") is None -# ---- _desired_tags ---- -def test_desired_tags_full(): - meta = {"artist": "Daft Punk", "title": "Aerodynamic", "album": "Discovery", "release_year": 2001} - assert mf._desired_tags(meta) == {"artist": "Daft Punk", "title": "Aerodynamic", - "album": "Discovery", "date": "2001"} - - -def test_desired_tags_year_fallbacks_and_omits_empty(): - meta = {"title": "T", "uploader": "Chan", "upload_date": "20230102"} # no album, no release_year - out = mf._desired_tags(meta) - assert out["date"] == "2023" - assert out["title"] == "T" - assert out["artist"] == "Chan" - assert "album" not in out # omitted when empty - - -def test_desired_tags_drops_unknown_artist(): - meta = {"title": "T"} # get_artist_from_metadata -> "Unknown Artist" - assert "artist" not in mf._desired_tags(meta) - - # ---- repair_file (fake audio + mocked metadata) ---- class _FakeAudio(dict): def __init__(self, initial): @@ -58,13 +61,13 @@ class _FakeAudio(dict): self.saved = True -def test_repair_file_writes_changed_fields(monkeypatch): +def test_repair_file_fixes_album_and_year(monkeypatch): monkeypatch.setattr(mf, "run_yt_dlp_get_metadata", lambda url: {"artist": "Daft Punk", "title": "Aerodynamic", "album": "Discovery", "release_year": 2001}) audio = _FakeAudio({"artist": ["Daft Punk"], "title": ["Aerodynamic"]}) # album/date missing monkeypatch.setattr(mf, "_open_audio", lambda path: (audio, None)) - changed = mf.repair_file("X/youtube/Aerodynamic [vid].opus", "youtube", dry_run=False) + changed = mf.repair_file(f"X/youtube/Aerodynamic [{YT_ID}].opus", "youtube", dry_run=False) assert set(changed) == {"album=Discovery", "date=2001"} assert audio["album"] == ["Discovery"] assert audio["date"] == ["2001"] @@ -76,30 +79,69 @@ def test_repair_file_dry_run_writes_nothing(monkeypatch): lambda url: {"artist": "A", "title": "T", "album": "Alb", "release_year": 2020}) audio = _FakeAudio({}) monkeypatch.setattr(mf, "_open_audio", lambda path: (audio, None)) - changed = mf.repair_file("X/youtube/T [vid].opus", "youtube", dry_run=True) - assert changed # reports would-change - assert audio == {} # nothing written + changed = mf.repair_file(f"X/youtube/T [{YT_ID}].opus", "youtube", dry_run=True) + assert changed + assert audio == {} assert audio.saved is False +def test_repair_file_skips_music_video(monkeypatch): + # No album AND no valid release year -> treat as a video, leave tags alone. + monkeypatch.setattr(mf, "run_yt_dlp_get_metadata", + lambda url: {"title": "Artist - Song (Official Music Video)", + "uploader": "SomeVEVO", "upload_date": "20110101"}) + audio = _FakeAudio({"artist": ["Real Artist"], "title": ["Song"]}) + monkeypatch.setattr(mf, "_open_audio", lambda path: (audio, None)) + changed = mf.repair_file(f"X/youtube/Song [{YT_ID}].opus", "youtube", dry_run=False) + assert changed == [] + assert audio == {"artist": ["Real Artist"], "title": ["Song"]} # untouched + + +def test_repair_file_fills_missing_but_never_clobbers(monkeypatch): + # Source artist is a channel name; existing artist must be kept. + monkeypatch.setattr(mf, "run_yt_dlp_get_metadata", + lambda url: {"artist": "SomeChannelVEVO", "title": "Channel Decorated Title", + "album": "Real Album", "release_year": 2019}) + audio = _FakeAudio({"artist": ["Correct Artist"], "title": ["Clean Title"]}) + monkeypatch.setattr(mf, "_open_audio", lambda path: (audio, None)) + changed = mf.repair_file(f"X/youtube/x [{YT_ID}].opus", "youtube", dry_run=False) + assert set(changed) == {"album=Real Album", "date=2019"} + assert audio["artist"] == ["Correct Artist"] # NOT overwritten with channel + assert audio["title"] == ["Clean Title"] # NOT overwritten with decorated title + + +def test_repair_file_fills_missing_artist_when_absent(monkeypatch): + monkeypatch.setattr(mf, "run_yt_dlp_get_metadata", + lambda url: {"artist": "Real Artist", "title": "T", + "album": "Alb", "release_year": 2020}) + audio = _FakeAudio({}) # nothing present -> fill artist + title too + monkeypatch.setattr(mf, "_open_audio", lambda path: (audio, None)) + changed = mf.repair_file(f"X/youtube/T [{YT_ID}].opus", "youtube", dry_run=False) + assert set(changed) == {"album=Alb", "date=2020", "artist=Real Artist", "title=T"} + + +def test_repair_file_skips_bad_id(monkeypatch): + called = {"meta": False} + monkeypatch.setattr(mf, "run_yt_dlp_get_metadata", + lambda url: called.update(meta=True) or {}) + # last bracket is a descriptor, not a real id + assert mf.repair_file("X/youtube/Song [Official Video].opus", "youtube", dry_run=False) == [] + assert called["meta"] is False # never hit the network + + def test_repair_file_skips_unparseable(monkeypatch): called = {"meta": False} monkeypatch.setattr(mf, "run_yt_dlp_get_metadata", lambda url: called.update(meta=True) or {}) assert mf.repair_file("X/youtube/no-id.opus", "youtube", dry_run=False) == [] - assert called["meta"] is False # never hit the network - - -def test_repair_file_skips_unqueryable_source(monkeypatch): - monkeypatch.setattr(mf, "run_yt_dlp_get_metadata", lambda url: {"title": "x"}) - assert mf.repair_file("X/bandcamp/T [id].m4a", "bandcamp", dry_run=False) == [] + assert called["meta"] is False # ---- repair_library (real temp tree, repair_file mocked) ---- def test_repair_library_scans_only_source_dirs(tmp_path, monkeypatch): root = tmp_path (root / "Daft Punk" / "youtube").mkdir(parents=True) - (root / "Daft Punk" / "youtube" / "Aerodynamic [vid].opus").write_text("x") + (root / "Daft Punk" / "youtube" / f"Aerodynamic [{YT_ID}].opus").write_text("x") (root / "Daft Punk" / "Discovery").mkdir(parents=True) # Lidarr album -> skip (root / "Daft Punk" / "Discovery" / "Aerodynamic.flac").write_text("x") (root / "Ephixa" / "soundcloud").mkdir(parents=True) @@ -110,9 +152,8 @@ def test_repair_library_scans_only_source_dirs(tmp_path, monkeypatch): lambda path, source, dry_run: visited.append((source, path)) or ["album=X"]) scanned, changed = mf.repair_library(str(root), dry_run=False) assert scanned == 2 and changed == 2 - sources = sorted(s for s, _ in visited) - assert sources == ["soundcloud", "youtube"] # Discovery album folder skipped + assert sorted(s for s, _ in visited) == ["soundcloud", "youtube"] # album folder skipped -def test_repair_library_missing_root(monkeypatch): +def test_repair_library_missing_root(): assert mf.repair_library("/no/such/dir", dry_run=False) == (0, 0)