perf: parallelize --repair with a thread pool (--workers, default 8)

Each repaired file is an independent yt-dlp metadata round-trip, so repair is
network-bound; run them concurrently via ThreadPoolExecutor. Adds --workers
(default 8) to cap concurrency and a progress line every 100 files. At ~50k
tracks this turns a ~day-long sequential run into hours. Lower --workers if
YouTube rate-limits (429/403).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-13 11:13:52 -07:00
parent 0347a638cf
commit 92742b9ad6
3 changed files with 60 additions and 10 deletions

View File

@@ -95,6 +95,7 @@ export LIDARR_API_KEY="your-lidarr-api-key"
| `-o`, `--root PATH` | Output root folder (default `/media/music`). |
| `--search-all` | Search all albums when adding an artist to Lidarr. |
| `--repair` | Re-tag existing downloads under `--root` from source metadata (see below). |
| `--workers N` | Parallel metadata fetches during `--repair` (default 8). |
| `--retag-from-path` | Offline: re-tag artist/title from folder + filename (see below). |
| `-x`, `--exclude NAME` | Folder under `--root` to skip during `--repair`/`--retag-from-path` (repeatable). |
| `--debug` | Verbose output. |
@@ -137,8 +138,11 @@ but it never overwrites a genuine existing artist/title with a channel name or d
title. A bogus `NA [<id>].<ext>` filename is renamed to the recovered title, and a literal
`NA` album with no source album is normalised to `Unknown Album`.
It re-queries the source over the network, so run it occasionally, not constantly. Requires
`mutagen` (a yt-dlp dependency, usually already present). CLI-only — not exposed via the REST API.
Each file needs its own yt-dlp network round-trip, so repair processes files concurrently in a
thread pool; `--workers N` (default 8) caps concurrency — lower it if YouTube starts rate-limiting
(HTTP 429/403), or raise it to go faster on a large library. A progress line prints every 100 files.
Requires `mutagen` (a yt-dlp dependency, usually already present). CLI-only — not exposed via
the REST API.
```bash
# Preview what would change (writes nothing)

View File

@@ -1020,19 +1020,31 @@ def repair_file(path: str, source: str, dry_run: bool) -> list[str]:
return changed
def repair_library(root: str, dry_run: bool, exclude=(), workers: int = 8) -> tuple[int, int]:
    """Re-tag audio files found under <root>/<artist>/<source>/.

    Every file yielded by ``_iter_source_files(root, exclude)`` is handed to
    ``repair_file`` on a thread pool; ``workers`` caps the pool size and any
    value below 1 is treated as 1. Lower it if YouTube starts rate-limiting
    (HTTP 429/403).

    A file whose repair raises is reported through ``err`` and counted as
    unchanged, so one bad file does not abort the run. Both counters are
    updated only in the calling thread while results are consumed, so they
    need no locking. A progress line is printed every 100 results.

    Returns:
        ``(scanned, changed)``; ``(0, 0)`` when ``root`` is not a directory.
    """
    if not os.path.isdir(root):
        err(f"Root folder not found: {root}")
        return 0, 0

    def _attempt(job):
        # Runs on a worker thread: repair one file and report whether it changed.
        file_path, src = job
        try:
            outcome = bool(repair_file(file_path, src, dry_run))
        except Exception as exc:  # noqa: BLE001 — one bad file shouldn't abort
            err(f"repair failed ({os.path.basename(file_path)}): {exc}")
            return False
        return outcome

    jobs = ((file_path, src) for file_path, src, _ in _iter_source_files(root, exclude))
    pool_size = max(1, workers)

    scanned = 0
    changed = 0
    with ThreadPoolExecutor(max_workers=pool_size) as pool:
        # Executor.map yields results in submission order, so `scanned` is
        # simply the 1-based position of the result being consumed.
        for scanned, was_changed in enumerate(pool.map(_attempt, jobs), start=1):
            if was_changed:
                changed += 1
            if scanned % 100 == 0:
                print(f"… {scanned} scanned, {changed} changed", flush=True)

    verb = "Would repair" if dry_run else "Repaired"
    print(f"{verb} {changed}/{scanned} files")
    return scanned, changed
@@ -1171,6 +1183,9 @@ def parse_args():
help="Search all albums when adding an artist to Lidarr.")
p.add_argument("--repair", action="store_true",
help="Re-tag existing downloads under --root from source metadata.")
p.add_argument("--workers", type=int, default=8,
help="Parallel yt-dlp metadata fetches during --repair (default 8; "
"lower if YouTube rate-limits).")
p.add_argument("--retag-from-path", action="store_true",
help="Offline: re-tag artist/title from folder + filename "
"(fixes tags damaged by a prior --repair).")
@@ -1192,7 +1207,7 @@ def main():
return
if args.repair:
repair_library(args.root, args.dry_run, args.exclude)
repair_library(args.root, args.dry_run, args.exclude, args.workers)
return
if not query:

View File

@@ -336,3 +336,34 @@ def test_repair_file_dry_run_does_not_rename(tmp_path, monkeypatch):
def test_fs_safe_replaces_slash():
assert "/" not in mf._fs_safe("AC/DC Live")
# ---- parallel repair ----
def test_repair_library_parallel_visits_all(tmp_path, monkeypatch):
    """With 8 workers, every one of 50 files is repaired and counted once."""
    import threading

    total = 50
    for idx in range(total):
        source_dir = tmp_path / f"Artist{idx}" / "youtube"
        source_dir.mkdir(parents=True)
        (source_dir / f"T{idx} [{YT_ID}].opus").write_text("x")

    visited = set()
    guard = threading.Lock()

    def fake_repair(path, source, dry_run):
        # Called from pool threads, so guard the shared set.
        with guard:
            visited.add(path)
        return ["album=X"]

    monkeypatch.setattr(mf, "repair_file", fake_repair)

    scanned, changed = mf.repair_library(str(tmp_path), dry_run=False, workers=8)

    assert (scanned, changed) == (total, total)
    assert len(visited) == total
def test_repair_library_default_workers_still_works(tmp_path, monkeypatch):
    """Omitting ``workers`` still scans and repairs a single-file library."""
    source_dir = tmp_path / "A" / "youtube"
    source_dir.mkdir(parents=True)
    (source_dir / f"T [{YT_ID}].opus").write_text("x")

    def always_changed(path, source, dry_run):
        # Stand-in for repair_file that reports one changed field.
        return ["x"]

    monkeypatch.setattr(mf, "repair_file", always_changed)

    result = mf.repair_library(str(tmp_path), dry_run=False)
    assert result == (1, 1)