From c9fb41c8e85bec148b27c456b65ee5968de8e115 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toni=20F=C3=B6rster?= Date: Sun, 19 May 2024 17:43:35 +0200 Subject: [PATCH] faster chapter extraction The extraction of chapters is now faster since it is run in parallel for each pattern. Also a new pattern has been added: "(start) title" --- Model/Applications/VideosAPI.swift | 112 +++++++++++++++++++---------- 1 file changed, 74 insertions(+), 38 deletions(-) diff --git a/Model/Applications/VideosAPI.swift b/Model/Applications/VideosAPI.swift index 31f8705c..a7614956 100644 --- a/Model/Applications/VideosAPI.swift +++ b/Model/Applications/VideosAPI.swift @@ -152,58 +152,94 @@ extension VideosAPI { /* The following chapter patterns are covered: - start - end - title / start - end: Title / start - end title - start - title / start: title / start title / [start] - title / [start]: title / [start] title - index. title - start / index. title start - title: (start) + 1) "start - end - title" / "start - end: Title" / "start - end title" + 2) "start - title" / "start: title" / "start title" / "[start] - title" / "[start]: title" / "[start] title" + 3) "index. title - start" / "index. title start" + 4) "title: (start)" + 5) "(start) title" - The order is important! + These represent: + + - "start" and "end" are timestamps, defining the start and end of the individual chapter + - "title" is the name of the chapter + - "index" is the chapter's position in a list + + The order of these patterns is important as it determines the priority. The patterns listed first have a higher priority. + In the case of multiple matches, the pattern with the highest priority will be chosen - lower number means higher priority. 
*/ let patterns = [ "(?<=\\n|^)\\s*(?:►\\s*)?\\[?(?(?:[0-9]+:){1,2}[0-9]+)\\]?(?:\\s*-\\s*)?(?(?:[0-9]+:){1,2}[0-9]+)?(?:\\s*-\\s*|\\s*[:]\\s*)?(?.*)(?=\\n|$)", "(?<=\\n|^)\\s*(?:►\\s*)?\\[?(?<start>(?:[0-9]+:){1,2}[0-9]+)\\]?\\s*[-:]?\\s*(?<title>.+)(?=\\n|$)", "(?<=\\n|^)(?<index>[0-9]+\\.\\s)(?<title>.+?)(?:\\s*-\\s*)?(?<start>(?:[0-9]+:){1,2}[0-9]+)(?=\\n|$)", - "(?<=\\n|^)(?<title>.+?):\\s*\\((?<start>(?:[0-9]+:){1,2}[0-9]+)\\)(?=\\n|$)" + "(?<=\\n|^)(?<title>.+?):\\s*\\((?<start>(?:[0-9]+:){1,2}[0-9]+)\\)(?=\\n|$)", + "(?<=^|\\n)\\((?<start>(?:[0-9]+:){1,2}[0-9]+)\\)\\s*(?<title>.+?)(?=\\n|$)" ] - for pattern in patterns { - guard let chaptersRegularExpression = try? NSRegularExpression(pattern: pattern, options: .caseInsensitive) else { continue } - let chapterLines = chaptersRegularExpression.matches(in: description, range: NSRange(description.startIndex..., in: description)) + let extractChaptersGroup = DispatchGroup() + var capturedChapters: [Int: [Chapter]] = [:] + let lock = NSLock() - if !chapterLines.isEmpty { - return chapterLines.compactMap { line in - let titleRange = line.range(withName: "title") - let startRange = line.range(withName: "start") - guard let titleSubstringRange = Range(titleRange, in: description), - let startSubstringRange = Range(startRange, in: description) - else { - return nil - } - let titleCapture = String(description[titleSubstringRange]).trimmingCharacters(in: .whitespaces) - let startCapture = String(description[startSubstringRange]) - let startComponents = startCapture.components(separatedBy: ":") - guard startComponents.count <= 3 else { return nil } + for (index, pattern) in patterns.enumerated() { + extractChaptersGroup.enter() + DispatchQueue.global().async { + if let chaptersRegularExpression = try? 
NSRegularExpression(pattern: pattern, options: .caseInsensitive) { + let chapterLines = chaptersRegularExpression.matches(in: description, range: NSRange(description.startIndex..., in: description)) + let extractedChapters = chapterLines.compactMap { line -> Chapter? in + let titleRange = line.range(withName: "title") + let startRange = line.range(withName: "start") - var hours: Double? - var minutes: Double? - var seconds: Double? + guard let titleSubstringRange = Range(titleRange, in: description), + let startSubstringRange = Range(startRange, in: description) + else { + return nil + } - if startComponents.count == 3 { - hours = Double(startComponents[0]) - minutes = Double(startComponents[1]) - seconds = Double(startComponents[2]) - } else if startComponents.count == 2 { - minutes = Double(startComponents[0]) - seconds = Double(startComponents[1]) + let titleCapture = String(description[titleSubstringRange]).trimmingCharacters(in: .whitespaces) + let startCapture = String(description[startSubstringRange]) + let startComponents = startCapture.components(separatedBy: ":") + guard startComponents.count <= 3 else { return nil } + + var hours: Double? + var minutes: Double? + var seconds: Double? + + if startComponents.count == 3 { + hours = Double(startComponents[0]) + minutes = Double(startComponents[1]) + seconds = Double(startComponents[2]) + } else if startComponents.count == 2 { + minutes = Double(startComponents[0]) + seconds = Double(startComponents[1]) + } + + guard var startSeconds = seconds else { return nil } + + startSeconds += (minutes ?? 0) * 60 + startSeconds += (hours ?? 0) * 60 * 60 + + return Chapter(title: titleCapture, start: startSeconds) } - guard var startSeconds = seconds else { return nil } - - startSeconds += (minutes ?? 0) * 60 - startSeconds += (hours ?? 
0) * 60 * 60 - - return .init(title: titleCapture, start: startSeconds) + if !extractedChapters.isEmpty { + lock.lock() + capturedChapters[index] = extractedChapters + lock.unlock() + } } + extractChaptersGroup.leave() + } + } + + extractChaptersGroup.wait() + + // Now we sort the keys of the capturedChapters dictionary. + // These keys correspond to the priority of each pattern. + let sortedKeys = Array(capturedChapters.keys).sorted(by: <) + + // Return first non-empty result in the order of patterns + for key in sortedKeys { + if let chapters = capturedChapters[key], !chapters.isEmpty { + return chapters } } return []