Merge pull request #682 from stonerl/faster-chapter-extraction

faster chapter extraction
2026-07-14 11:22:04 +00:00 · 2024-05-23 11:34:07 +02:00
parent 4fd3a37705 c9fb41c8e8
commit 03d24fbc42
1 changed files with 74 additions and 38 deletions
--- a/Model/Applications/VideosAPI.swift
+++ b/Model/Applications/VideosAPI.swift
@@ -152,58 +152,94 @@ extension VideosAPI {
        /*
         The following chapter patterns are covered:
-         start - end - title / start - end: Title / start - end title
+         1) "start - end - title" / "start - end: Title" / "start - end title"
-         start - title / start: title / start title / [start] - title / [start]: title / [start] title
+         2) "start - title" / "start: title" / "start title" / "[start] - title" / "[start]: title" / "[start] title"
-         index. title - start / index. title start
+         3) "index. title - start" / "index. title start"
-         title: (start)
+         4) "title: (start)"
         5) "(start) title"
-         The order is important!
+         In the patterns above:
         -  "start" and "end" are timestamps, defining the start and end of the individual chapter
         -  "title" is the name of the chapter
         -  "index" is the chapter's position in a list
         The order of these patterns determines their priority: patterns listed first take precedence.
         If several patterns match, the result of the highest-priority pattern (the lowest number) is used.
         */
        let patterns = [
            "(?<=\\n|^)\\s*(?:►\\s*)?\\[?(?<start>(?:[0-9]+:){1,2}[0-9]+)\\]?(?:\\s*-\\s*)?(?<end>(?:[0-9]+:){1,2}[0-9]+)?(?:\\s*-\\s*|\\s*[:]\\s*)?(?<title>.*)(?=\\n|$)",
            "(?<=\\n|^)\\s*(?:►\\s*)?\\[?(?<start>(?:[0-9]+:){1,2}[0-9]+)\\]?\\s*[-:]?\\s*(?<title>.+)(?=\\n|$)",
            "(?<=\\n|^)(?<index>[0-9]+\\.\\s)(?<title>.+?)(?:\\s*-\\s*)?(?<start>(?:[0-9]+:){1,2}[0-9]+)(?=\\n|$)",
-            "(?<=\\n|^)(?<title>.+?):\\s*\\((?<start>(?:[0-9]+:){1,2}[0-9]+)\\)(?=\\n|$)"
+            "(?<=\\n|^)(?<title>.+?):\\s*\\((?<start>(?:[0-9]+:){1,2}[0-9]+)\\)(?=\\n|$)",
            "(?<=^|\\n)\\((?<start>(?:[0-9]+:){1,2}[0-9]+)\\)\\s*(?<title>.+?)(?=\\n|$)"
        ]
-        for pattern in patterns {
+        let extractChaptersGroup = DispatchGroup()
-            guard let chaptersRegularExpression = try? NSRegularExpression(pattern: pattern, options: .caseInsensitive) else { continue }
+        var capturedChapters: [Int: [Chapter]] = [:]
-            let chapterLines = chaptersRegularExpression.matches(in: description, range: NSRange(description.startIndex..., in: description))
+        let lock = NSLock()
-            if !chapterLines.isEmpty {
+        for (index, pattern) in patterns.enumerated() {
-                return chapterLines.compactMap { line in
+            extractChaptersGroup.enter()
-                    let titleRange = line.range(withName: "title")
+            DispatchQueue.global().async {
-                    let startRange = line.range(withName: "start")
+                if let chaptersRegularExpression = try? NSRegularExpression(pattern: pattern, options: .caseInsensitive) {
-                    guard let titleSubstringRange = Range(titleRange, in: description),
+                    let chapterLines = chaptersRegularExpression.matches(in: description, range: NSRange(description.startIndex..., in: description))
-                          let startSubstringRange = Range(startRange, in: description)
+                    let extractedChapters = chapterLines.compactMap { line -> Chapter? in
-                    else {
+                        let titleRange = line.range(withName: "title")
-                        return nil
+                        let startRange = line.range(withName: "start")
                    }
                    let titleCapture = String(description[titleSubstringRange]).trimmingCharacters(in: .whitespaces)
                    let startCapture = String(description[startSubstringRange])
                    let startComponents = startCapture.components(separatedBy: ":")
                    guard startComponents.count <= 3 else { return nil }
-                    var hours: Double?
+                        guard let titleSubstringRange = Range(titleRange, in: description),
-                    var minutes: Double?
+                              let startSubstringRange = Range(startRange, in: description)
-                    var seconds: Double?
+                        else {
                            return nil
                        }
-                    if startComponents.count == 3 {
+                        let titleCapture = String(description[titleSubstringRange]).trimmingCharacters(in: .whitespaces)
-                        hours = Double(startComponents[0])
+                        let startCapture = String(description[startSubstringRange])
-                        minutes = Double(startComponents[1])
+                        let startComponents = startCapture.components(separatedBy: ":")
-                        seconds = Double(startComponents[2])
+                        guard startComponents.count <= 3 else { return nil }
-                    } else if startComponents.count == 2 {
+
-                        minutes = Double(startComponents[0])
+                        var hours: Double?
-                        seconds = Double(startComponents[1])
+                        var minutes: Double?
                        var seconds: Double?
                        if startComponents.count == 3 {
                            hours = Double(startComponents[0])
                            minutes = Double(startComponents[1])
                            seconds = Double(startComponents[2])
                        } else if startComponents.count == 2 {
                            minutes = Double(startComponents[0])
                            seconds = Double(startComponents[1])
                        }
                        guard var startSeconds = seconds else { return nil }
                        startSeconds += (minutes ?? 0) * 60
                        startSeconds += (hours ?? 0) * 60 * 60
                        return Chapter(title: titleCapture, start: startSeconds)
                    }
-                    guard var startSeconds = seconds else { return nil }
+                    if !extractedChapters.isEmpty {
-
+                        lock.lock()
-                    startSeconds += (minutes ?? 0) * 60
+                        capturedChapters[index] = extractedChapters
-                    startSeconds += (hours ?? 0) * 60 * 60
+                        lock.unlock()
-
+                    }
                    return .init(title: titleCapture, start: startSeconds)
                }
                extractChaptersGroup.leave()
            }
        }
        extractChaptersGroup.wait()
        // Now we sort the keys of the capturedChapters dictionary.
        // These keys correspond to the priority of each pattern.
        let sortedKeys = Array(capturedChapters.keys).sorted(by: <)
        // Return first non-empty result in the order of patterns
        for key in sortedKeys {
            if let chapters = capturedChapters[key], !chapters.isEmpty {
                return chapters
            }
        }
        return []