ba whisper for JPN

TalkBank · Jan 26, 2024 · bf42acb · bf42acb
1 parent caa0edc
commit bf42acb
Show file tree

Hide file tree

Showing 3 changed files with 31 additions and 31 deletions.
diff --git a/batchalign/pipelines/asr/whisper.py b/batchalign/pipelines/asr/whisper.py
@@ -19,7 +19,7 @@ class WhisperEngine(BatchalignEngine):
 
     def __init__(self, model=None, lang="eng"):
 
-        if model == None and lang == "eng":
+        if model is None and lang in ("eng", "jpn"):  # default only; never override an explicitly passed model
             model = "talkbank/CHATWhisper-en-large-v1"
         elif model == None:
             model = "openai/whisper-large-v2"

diff --git a/batchalign/version b/batchalign/version
@@ -1,3 +1,3 @@
-0.4.0-post.4
+0.4.1
 Jan 26th, 2024
-Disable Serbian MWT models
+Using Batchalign Whisper for JPN
diff --git a/scratchpad.py b/scratchpad.py
@@ -19,27 +19,27 @@
 ########### The Batchalign Core Test Harness ###########
 # from batchalign.formats.chat.parser import chat_parse_utterance
 
-# text = "i'm going to read some random crap as i see on the screen . 2530_4940"
+# text = "電車 は 通ります よ ."
 
 # function = "morphosyntax"
-# lang = "eng"
+# lang = "jpn"
 # num_speakers = 1
 
-# forms, delim = chat_parse_utterance(text, none, none, none, none)
-# utterance = utterance(content=forms, delim=delim)
+# forms, delim = chat_parse_utterance(text, None, None, None, None)
+# utterance = Utterance(content=forms, delim=delim)
 
 # # utterance = utterance(content=text)
 
-# ut = document(content=[utterance], langs=[lang])
+# ut = Document(content=[utterance], langs=[lang])
 
-# pipeline = batchalignpipeline.new(function, lang=lang, num_speakers=num_speakers)
+# pipeline = BatchalignPipeline.new(function, lang=lang, num_speakers=num_speakers)
 # doc = pipeline(ut)
 # doc[0][-1]
 # doc[0][-2].model_dump()
 
 # doc[0].content[-2]
 
-# print(str(chatfile(doc=doc)))
+# print(str(CHATFile(doc=doc)))
 
 ########### The Batchalign String Test Harness ###########
 # from batchalign.formats.chat.parser import chat_parse_utterance
@@ -93,34 +93,34 @@
 # doc[-382][1]
 
 ########### The Batchalign CLI Harness ###########
-from batchalign.cli.dispatch import _dispatch
+# from batchalign.cli.dispatch import _dispatch
 
-in_dir = "../talkbank-alignment/test_harness/input/"
-out_dir = "../talkbank-alignment/test_harness/output/"
-in_format = "cha"
+# in_dir = "../talkbank-alignment/test_harness/input/"
+# out_dir = "../talkbank-alignment/test_harness/output/"
+# in_format = "cha"
 
-function = "morphotag"
-lang = "hrv"
-num_speakers = 1
+# function = "morphotag"
+# lang = "hrv"
+# num_speakers = 1
 
-class Context:
-    obj = {"verbose": 3}
+# class Context:
+#     obj = {"verbose": 3}
 
-def loader(file):
-    return CHATFile(path=os.path.abspath(file)).doc
+# def loader(file):
+#     return CHATFile(path=os.path.abspath(file)).doc
 
-    # return file
+#     # return file
 
-def writer(doc, output):
-    CHATFile(doc=doc).write(output)
-    # CHATFile(doc=doc).write(output
-    #                         .replace(".wav", ".cha")
-    #                         .replace(".mp4", ".cha")
-    #                         .replace(".mp3", ".cha"))
+# def writer(doc, output):
+#     CHATFile(doc=doc).write(output)
+#     # CHATFile(doc=doc).write(output
+#     #                         .replace(".wav", ".cha")
+#     #                         .replace(".mp4", ".cha")
+#     #                         .replace(".mp3", ".cha"))
 
-_dispatch(function, lang, num_speakers, [in_format], Context(),
-            in_dir, out_dir,
-            loader, writer, Console())
+# _dispatch(function, lang, num_speakers, [in_format], Context(),
+#             in_dir, out_dir,
+#             loader, writer, Console())
 
 ########## The Batchalign CHAT Test Harness ##########