From 92742b9ad6d9c3ce12264c7e443a47cc569303d7 Mon Sep 17 00:00:00 2001 From: zebra Date: Sat, 13 Jun 2026 11:13:52 -0700 Subject: [PATCH] perf: parallelize --repair with a thread pool (--workers, default 8) Each repaired file is an independent yt-dlp metadata round-trip, so repair is network-bound; run them concurrently via ThreadPoolExecutor. Adds --workers (default 8) to cap concurrency and a progress line every 100 files. At ~50k tracks this turns a ~day-long sequential run into hours. Lower --workers if YouTube rate-limits (429/403). Co-Authored-By: Claude Opus 4.8 --- README.md | 8 ++++++-- musicfetch | 31 +++++++++++++++++++++++-------- tests/test_repair.py | 31 +++++++++++++++++++++++++++++++ 3 files changed, 60 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 523b3f6..a3a0da9 100644 --- a/README.md +++ b/README.md @@ -95,6 +95,7 @@ export LIDARR_API_KEY="your-lidarr-api-key" | `-o`, `--root PATH` | Output root folder (default `/media/music`). | | `--search-all` | Search all albums when adding an artist to Lidarr. | | `--repair` | Re-tag existing downloads under `--root` from source metadata (see below). | +| `--workers N` | Parallel metadata fetches during `--repair` (default 8). | | `--retag-from-path` | Offline: re-tag artist/title from folder + filename (see below). | | `-x`, `--exclude NAME` | Folder under `--root` to skip during `--repair`/`--retag-from-path` (repeatable). | | `--debug` | Verbose output. | @@ -137,8 +138,11 @@ but it never overwrites a genuine existing artist/title with a channel name or d title. A bogus `NA [].` filename is renamed to the recovered title, and a literal `NA` album with no source album is normalised to `Unknown Album`. -It re-queries the source over the network, so run it occasionally, not constantly. Requires -`mutagen` (a yt-dlp dependency, usually already present). CLI-only — not exposed via the REST API. +Each file is its own yt-dlp network round-trip, so repair runs them in a thread pool; +`--workers N` (default 8) caps concurrency — lower it if YouTube starts rate-limiting +(HTTP 429/403), raise it to go faster on a large library. Progress prints every 100 files. +Requires `mutagen` (a yt-dlp dependency, usually already present). CLI-only — not exposed via +the REST API. ```bash # Preview what would change (writes nothing) diff --git a/musicfetch b/musicfetch index 2438c7d..710d9a1 100755 --- a/musicfetch +++ b/musicfetch @@ -1020,19 +1020,31 @@ def repair_file(path: str, source: str, dry_run: bool) -> list[str]: return changed -def repair_library(root: str, dry_run: bool, exclude=()) -> tuple[int, int]: - """Walk /// and re-tag audio files. Returns (scanned, changed).""" +def repair_library(root: str, dry_run: bool, exclude=(), workers: int = 8) -> tuple[int, int]: + """Walk /// and re-tag audio files. Returns (scanned, changed). + Each file is an independent yt-dlp network round-trip, so they run in a + thread pool (network-bound); `workers` caps concurrency. Each thread owns + its own file + request, so no shared state needs locking beyond the counts. + Lower `workers` if YouTube starts rate-limiting (HTTP 429/403).""" if not os.path.isdir(root): err(f"Root folder not found: {root}") return 0, 0 - scanned = changed = 0 - for path, source, _artist in _iter_source_files(root, exclude): - scanned += 1 + + def _one(path, source): try: - if repair_file(path, source, dry_run): - changed += 1 + return bool(repair_file(path, source, dry_run)) except Exception as e: # noqa: BLE001 — one bad file shouldn't abort err(f"repair failed ({os.path.basename(path)}): {e}") + return False + + scanned = changed = 0 + files = ((p, s) for p, s, _a in _iter_source_files(root, exclude)) + with ThreadPoolExecutor(max_workers=max(1, workers)) as ex: + for ok in ex.map(lambda ps: _one(*ps), files): + scanned += 1 + changed += int(ok) + if scanned % 100 == 0: + print(f"… {scanned} scanned, {changed} changed", flush=True) verb = "Would repair" if dry_run else "Repaired" print(f"{verb} {changed}/{scanned} files") return scanned, changed @@ -1171,6 +1183,9 @@ def parse_args(): help="Search all albums when adding an artist to Lidarr.") p.add_argument("--repair", action="store_true", help="Re-tag existing downloads under --root from source metadata.") + p.add_argument("--workers", type=int, default=8, + help="Parallel yt-dlp metadata fetches during --repair (default 8; " + "lower if YouTube rate-limits).") p.add_argument("--retag-from-path", action="store_true", help="Offline: re-tag artist/title from folder + filename " "(fixes tags damaged by a prior --repair).") @@ -1192,7 +1207,7 @@ def main(): return if args.repair: - repair_library(args.root, args.dry_run, args.exclude) + repair_library(args.root, args.dry_run, args.exclude, args.workers) return if not query: diff --git a/tests/test_repair.py b/tests/test_repair.py index a8d115a..41384f1 100644 --- a/tests/test_repair.py +++ b/tests/test_repair.py @@ -336,3 +336,34 @@ def test_repair_file_dry_run_does_not_rename(tmp_path, monkeypatch): def test_fs_safe_replaces_slash(): assert "/" not in mf._fs_safe("AC/DC Live") + + +# ---- parallel repair ---- +def test_repair_library_parallel_visits_all(tmp_path, monkeypatch): + root = tmp_path + n = 50 + for i in range(n): + d = root / f"Artist{i}" / "youtube" + d.mkdir(parents=True) + (d / f"T{i} [{YT_ID}].opus").write_text("x") + + import threading + seen = set() + lock = threading.Lock() + + def fake(path, source, dry_run): + with lock: + seen.add(path) + return ["album=X"] + monkeypatch.setattr(mf, "repair_file", fake) + scanned, changed = mf.repair_library(str(root), dry_run=False, workers=8) + assert scanned == n and changed == n + assert len(seen) == n + + +def test_repair_library_default_workers_still_works(tmp_path, monkeypatch): + root = tmp_path + (root / "A" / "youtube").mkdir(parents=True) + (root / "A" / "youtube" / f"T [{YT_ID}].opus").write_text("x") + monkeypatch.setattr(mf, "repair_file", lambda p, s, d: ["x"]) + assert mf.repair_library(str(root), dry_run=False) == (1, 1)