From ab2171c9178c51b8247ed0939cda3a54a0eadf48 Mon Sep 17 00:00:00 2001 From: Oscar Kuzniar Date: Tue, 21 Dec 2021 18:10:22 +0100 Subject: [PATCH] Extend generated audio to fit captions gaps: extend voice breaks after commas --- .gitignore | 5 ++- .idea/.gitignore | 3 -- .../inspectionProfiles/profiles_settings.xml | 6 --- .idea/karpik-poc-py.iml | 10 ----- .idea/misc.xml | 4 -- .idea/modules.xml | 8 ---- .idea/vcs.xml | 6 --- generate-audio.py | 39 ++++++++++++++++++- 8 files changed, 41 insertions(+), 40 deletions(-) delete mode 100644 .idea/.gitignore delete mode 100644 .idea/inspectionProfiles/profiles_settings.xml delete mode 100644 .idea/karpik-poc-py.iml delete mode 100644 .idea/misc.xml delete mode 100644 .idea/modules.xml delete mode 100644 .idea/vcs.xml diff --git a/.gitignore b/.gitignore index f88488b..ad598f8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ # ignore generated output files -output/ \ No newline at end of file +output/ +.idea/ +.idea/misc.xml +.DS_Store \ No newline at end of file diff --git a/.idea/.gitignore b/.idea/.gitignore deleted file mode 100644 index 26d3352..0000000 --- a/.idea/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -# Default ignored files -/shelf/ -/workspace.xml diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml deleted file mode 100644 index 105ce2d..0000000 --- a/.idea/inspectionProfiles/profiles_settings.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/karpik-poc-py.iml b/.idea/karpik-poc-py.iml deleted file mode 100644 index 74d515a..0000000 --- a/.idea/karpik-poc-py.iml +++ /dev/null @@ -1,10 +0,0 @@ - - - - - - - - - - \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml deleted file mode 100644 index 8cdbed9..0000000 --- a/.idea/misc.xml +++ /dev/null @@ -1,4 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml deleted file mode 100644 index 
ad7b78b..0000000
--- a/.idea/modules.xml
+++ /dev/null
@@ -1,8 +0,0 @@
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
deleted file mode 100644
index 94a25f7..0000000
--- a/.idea/vcs.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-
-
-
-
-
-
\ No newline at end of file
diff --git a/generate-audio.py b/generate-audio.py
index 1dfd201..2075c14 100755
--- a/generate-audio.py
+++ b/generate-audio.py
@@ -33,7 +33,7 @@ def synthesize(text, config):
 
     response = polly_client.synthesize_speech(
         Engine='neural', # standard|neural - neural nie obsługuje max-duration
-        VoiceId= config.voice,
+        VoiceId=config.voice,
         LanguageCode='en-US',
         OutputFormat='mp3',
         TextType='ssml', # or text
@@ -52,6 +52,12 @@ def caption_start(caption):
     return seconds
 
 
+def caption_end(caption):
+    """Return the caption's end time (hours:minutes:seconds) in seconds."""
+    nums = [float(n) for n in caption.end.split(':')]
+    return nums[0] * 3600 + nums[1] * 60 + nums[2]
+
+
 def load_captions(config):
     if config.captions_format == 'vtt':
         return webvtt.read(f'input/{config.captions_file_name}')
@@ -61,6 +67,33 @@ def load_captions(config):
         raise Exception('Unsupported subtitles format')
 
 
+# TODO figure out a better way of defining break length
+def define_break(diff_length, num_of_pauses):
+    """Return the pause (ms) to insert, given spare seconds per comma."""
+    length_of_pause = diff_length / num_of_pauses
+    if length_of_pause > 2:
+        return 1000
+    if length_of_pause > 1:  # includes exactly 2.0, which must not drop to 500
+        return 800
+    return 500
+
+
+def extend_sentence_audio(sentence_audio, caption):
+    """Re-synthesize the caption with SSML pauses in place of its commas."""
+    clauses = caption.text.split(',')
+    slot = caption_end(caption) - caption_start(caption)
+    diff = round(slot - sentence_audio.duration_seconds, 3)
+    # Keep the first take when there is no comma to pause at, or when the
+    # audio already fills the slot (a pause would only make it overrun more).
+    if len(clauses) == 1 or diff <= 0:
+        return sentence_audio
+    # join() puts the pause between clauses only, never after the last one;
+    # the tag must be a real SSML <break>, an empty literal discards the value.
+    pause = '<break time="{}ms"/>'.format(define_break(diff, len(clauses) - 1))
+    # NOTE: `config` is the module-level object created under __main__.
+    return synthesize(pause.join(clauses), config)
+ if __name__ == '__main__': config = InlineClass({ 'captions_file_name': 'udemy_sample_01.vtt', @@ -80,6 +113,8 @@ def load_captions(config): print(f'Processing {caption}') sentence_audio = synthesize(caption.text, config) + sentence_audio = extend_sentence_audio(sentence_audio, caption) + start = caption_start(caption) if audio.duration_seconds < start: break_length = (start - audio.duration_seconds) * 1000 @@ -92,4 +127,4 @@ def load_captions(config): new_audio = mpe.AudioFileClip(f'output/{config.audio_file_name}') # new_audio = mpe.CompositeAudioClip([input_clip.audio, new_audio]) final_clip = input_clip.set_audio(new_audio) - final_clip.write_videofile(f'output/{config.movie_file_name}') + final_clip.write_videofile(f'output/output_{config.movie_file_name}')