Merge pull request #11 from dudil/split-by-punctuation
New algorithm for splitting by punctuation. Thanks @dudil.
peterk authored Aug 3, 2024
2 parents b54a6d2 + 61bf5d0 commit 631cac1
Showing 5 changed files with 121 additions and 32 deletions.
13 changes: 13 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,16 @@
## [0.1.10] - 2024-08-03

### Added
- New method to split long srt lines taking punctuation into account.

### Acknowledgments
- Thanks to @dudil for the new splitting method.

## [0.1.9] - 2024-03-02

### Changed
- Fix fractional timestamps from Whisper output.

## [0.1.8] - 2023-12-08

### Added
7 changes: 5 additions & 2 deletions README.md
@@ -62,14 +62,17 @@ the most profound of our time.
```

### Algorithms
By default, this script uses greedy algorithm which splits the text at the rightmost possible space.
By default, this script uses the `greedy` algorithm, which splits the text at the rightmost possible space.

An alternative splitting algorithm can be used that will split longer lines at half instead of always trying to use maximum line length. This prevents producing lines with isolated word remainders.
An alternative splitting algorithm is `halving`, which splits longer lines more evenly instead of always trying to use the maximum line length. This prevents producing lines with isolated word remainders.

Another alternative is the `punctuation` algorithm, which takes punctuation (commas, periods, etc.) into account when choosing split points.

```python

from srt_equalizer import srt_equalizer

# use "greedy", "halving" or "punctuation" for the method parameter
srt_equalizer.equalize_srt_file("test.srt", "shortened.srt", 42, method='halving')
```
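
As a rough illustration of the `punctuation` method, the sketch below mirrors the test case added in this commit. It calls the module-level helper `split_by_punctuation` directly, which may not be part of the documented API:

```python
from srt_equalizer import srt_equalizer

# Illustrative only: split a single long line with the punctuation-aware helper.
chunks = srt_equalizer.split_by_punctuation(
    "A string with more than 40 characters! This should be split into several, smaller ones.", 42)
print(chunks)
# ['A string with more than 40 characters!',
#  'This should be split into several,',
#  'smaller ones.']
```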

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "srt_equalizer"
version = "0.1.9"
version = "0.1.10"
description = "Transform subtitle line lengths, splitting into multiple subtitle fragments if necessary. "
authors = ["Peter Krantz"]
license = "MIT"
82 changes: 65 additions & 17 deletions src/srt_equalizer/srt_equalizer.py
@@ -1,3 +1,4 @@
import re
from datetime import timedelta
from typing import List

@@ -42,7 +43,7 @@ def split_subtitle(sub: srt.Subtitle, target_chars: int = 42, start_from_index:
sub: A srt.Subtitle object.
target_chars: The max number of characters for a subtitle line.
start_from_index: The start index of the subtitle item.
method: algorithm for splitting - either "greedy" or "halving".
method: algorithm for splitting - either "greedy" (default), "halving" or "punctuation".
Returns:
An array of one or more subtitle items.
@@ -53,21 +54,13 @@
sub.index = start_from_index + 1
return [sub]

if method == "greedy":
text_chunks = []
current_chunk = ""
words = sub.content.split()
for word in words:
if len(current_chunk) + len(word) + 1 > target_chars:
text_chunks.append(current_chunk.strip())
current_chunk = word + ' '
else:
current_chunk += word + ' '
if current_chunk:
text_chunks.append(current_chunk.strip())
else:
assert method == "halving"
elif method == "greedy":
text_chunks = split_greedy(sub.content, target_chars)
elif method == "halving":
text_chunks = split_at_half(sub.content, target_chars)
else:
assert method == "punctuation"
text_chunks = split_by_punctuation(sub.content, target_chars)

# Create a new subtitle item for each text chunk, proportional to its length.
split_subs = []
@@ -100,7 +93,7 @@ def split_subtitle(sub: srt.Subtitle, target_chars: int = 42, start_from_index:
def equalize_srt_file(srt_path: str, output_srt_path: str, target_chars: int, method='greedy'):
"""Load subs from an SRT file and output equalized subtitles to a new SRT file.
"""
assert method in {'greedy', 'halving'}, method
assert method in {'greedy', 'halving', 'punctuation'}, method
subs = load_srt(srt_path)

adjusted_subs = []
@@ -118,7 +111,28 @@ def equalize_srt_file(srt_path: str, output_srt_path: str, target_chars: int, me
write_srt(filepath=output_srt_path, subs=adjusted_subs)


def split_greedy(sentence: str, target_chars: int) -> List[str]:
    """Greedily split a sentence into chunks of roughly target_chars length,
    packing as many words as possible into each chunk.
    """

    text_chunks = []
    current_chunk = ''
    words = sentence.split()
    for word in words:
        if len(current_chunk) + len(word) + 1 > target_chars:
            text_chunks.append(current_chunk.strip())
            current_chunk = word + ' '
        else:
            current_chunk += word + ' '
    if current_chunk:
        text_chunks.append(current_chunk.strip())

    return text_chunks


def split_at_half(sentence, target_chars):
    """Try to split subtitles into similar line lengths taking commas into account."""

    if len(sentence) <= target_chars or ' ' not in sentence:
        return [sentence]

@@ -139,4 +153,38 @@ def split_at_half(sentence, target_chars):
    # recursively call this function until the length is below the limit
    left = sentence[:closest_space_to_center]
    right = sentence[closest_space_to_center+1:]
    return split_at_half(left, target_chars) + split_at_half(right, target_chars)


def split_by_punctuation(sentence: str, target_chars: int) -> List[str]:
    """Split a sentence into chunks of roughly target_chars length, breaking at punctuation where possible."""

    if len(sentence) <= target_chars:
        return [sentence]

    # use regex to split the sentence by punctuation
    chunks = re.split(r'([.,!?])', sentence)
    normalized_chunks = []
    for chunk in chunks:
        # strip whitespace
        chunk = chunk.strip()

        # if this chunk is an empty one, skip it
        if not chunk:
            continue

        if len(chunk) > target_chars:
            normalized_chunks.extend(split_greedy(chunk, target_chars))
            continue

        if normalized_chunks:
            if chunk in '.,!?':
                # re-attach the punctuation mark to the previous chunk
                chunk = normalized_chunks.pop() + chunk
            elif len(chunk) + len(normalized_chunks[-1]) <= target_chars:
                # merge this chunk into the previous one since together they are still under the limit
                chunk = normalized_chunks.pop() + ' ' + chunk

        normalized_chunks.append(chunk)

    return normalized_chunks
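
A side note on the `re.split` call above: because the pattern wraps the punctuation characters in a capturing group, the matched marks come back as their own list elements, which is what lets the loop glue each mark back onto the preceding chunk. A minimal standalone illustration (not part of the diff):

```python
import re

# The capturing group keeps the delimiters in the result list.
print(re.split(r'([.,!?])', "Hello, world! Bye."))
# ['Hello', ',', ' world', '!', ' Bye', '.', '']
```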
49 changes: 37 additions & 12 deletions tests/test_srt_equalizer.py
@@ -20,10 +20,12 @@ def test_load_srt_file_not_found():

def test_split_subtitle():
"""Test split subtitle."""
sub = srt.Subtitle(index=1,
start=datetime.timedelta(seconds=0, milliseconds=0),
end=datetime.timedelta(seconds=1, milliseconds=0),
content="A string with more than 40 characters that should be split into several smaller ones.")
sub = srt.Subtitle(
index=1,
start=datetime.timedelta(seconds=0, milliseconds=0),
end=datetime.timedelta(seconds=1, milliseconds=0),
content="A string with more than 40 characters that should be split into several smaller ones.",
)
s = split_subtitle(sub, 42)

# check that the line is split after "characters"
@@ -41,13 +43,15 @@

def test_split_subtitle_halving():
"""Test split subtitle."""
sub = srt.Subtitle(index=1,
start=datetime.timedelta(seconds=0, milliseconds=0),
end=datetime.timedelta(seconds=1, milliseconds=0),
content="A string with more than 40 characters that should be split into several smaller ones.")
s = split_subtitle(sub, 42, method='halving')

reconstructed = ' '.join([x.content for x in s])
sub = srt.Subtitle(
index=1,
start=datetime.timedelta(seconds=0, milliseconds=0),
end=datetime.timedelta(seconds=1, milliseconds=0),
content="A string with more than 40 characters that should be split into several smaller ones.",
)
s = split_subtitle(sub, 42, method="halving")

reconstructed = " ".join([x.content for x in s])
assert sub.content == reconstructed

assert s[0].content == "A string with more than 40 characters that"
@@ -63,7 +67,7 @@ def test_whisper_result_to_srt():
# Load example whisper result from pickle
whisper_result = dict()

with open("tests/whisper_result_example.pkl", 'rb') as file:
with open("tests/whisper_result_example.pkl", "rb") as file:
whisper_result = pickle.load(file)

# check that fractional seconds are converted correctly
@@ -72,3 +76,24 @@

assert subs[0].start == datetime.timedelta(microseconds=123000)
assert subs[0].end == datetime.timedelta(seconds=10, microseconds=789000)


def test_split_subtitle_punctuation():
    """Test split subtitle with the punctuation method."""
    sub = srt.Subtitle(
        index=1,
        start=datetime.timedelta(seconds=0, milliseconds=0),
        end=datetime.timedelta(seconds=1, milliseconds=0),
        content="A string with more than 40 characters! This should be split into several, smaller ones.",
    )
    s = split_subtitle(sub, 42, method="punctuation")

    reconstructed = " ".join([x.content for x in s])
    assert sub.content == reconstructed

    assert s[0].content == "A string with more than 40 characters!"
    assert s[1].content == "This should be split into several,"
    assert s[2].content == "smaller ones."

    # check fragment timing
    assert s[2].end == sub.end
