Skip to content

Commit

Permalink
ba whisper for JPN
Browse files Browse the repository at this point in the history
  • Loading branch information
Jemoka committed Jan 26, 2024
1 parent caa0edc commit bf42acb
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 31 deletions.
2 changes: 1 addition & 1 deletion batchalign/pipelines/asr/whisper.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class WhisperEngine(BatchalignEngine):

def __init__(self, model=None, lang="eng"):
    # Only the model-selection part of the constructor is visible in this
    # hunk; the remainder of the body lies outside the diff context.
    #
    # model: checkpoint name supplied by the caller, or None to pick a default.
    # lang:  language code used to choose that default.
    #
    # A default is chosen ONLY when the caller did not name a model. The
    # language test is grouped explicitly: written as
    # `model == None and lang == "eng" or lang == "jpn"`, Python parses it as
    # `(model == None and lang == "eng") or lang == "jpn"`, which would
    # overwrite an explicitly requested model whenever lang is "jpn".
    if model is None and lang in ("eng", "jpn"):
        model = "talkbank/CHATWhisper-en-large-v1"
    elif model is None:
        model = "openai/whisper-large-v2"
Expand Down
4 changes: 2 additions & 2 deletions batchalign/version
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
0.4.0-post.4
0.4.1
Jan 26th, 2024
Disable Serbian MWT models
Using Batchalign Whisper for JPN
56 changes: 28 additions & 28 deletions scratchpad.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,27 +19,27 @@
########### The Batchalign Core Test Harness ###########
# from batchalign.formats.chat.parser import chat_parse_utterance

# text = "i'm going to read some random crap as i see on the screen . 2530_4940"
# text = "電車 は 通ります よ ."

# function = "morphosyntax"
# lang = "eng"
# lang = "jpn"
# num_speakers = 1

# forms, delim = chat_parse_utterance(text, none, none, none, none)
# utterance = utterance(content=forms, delim=delim)
# forms, delim = chat_parse_utterance(text, None, None, None, None)
# utterance = Utterance(content=forms, delim=delim)

# # utterance = utterance(content=text)

# ut = document(content=[utterance], langs=[lang])
# ut = Document(content=[utterance], langs=[lang])

# pipeline = batchalignpipeline.new(function, lang=lang, num_speakers=num_speakers)
# pipeline = BatchalignPipeline.new(function, lang=lang, num_speakers=num_speakers)
# doc = pipeline(ut)
# doc[0][-1]
# doc[0][-2].model_dump()

# doc[0].content[-2]

# print(str(chatfile(doc=doc)))
# print(str(CHATFile(doc=doc)))

########### The Batchalign String Test Harness ###########
# from batchalign.formats.chat.parser import chat_parse_utterance
Expand Down Expand Up @@ -93,34 +93,34 @@
# doc[-382][1]

########### The Batchalign CLI Harness ###########
from batchalign.cli.dispatch import _dispatch
# from batchalign.cli.dispatch import _dispatch

in_dir = "../talkbank-alignment/test_harness/input/"
out_dir = "../talkbank-alignment/test_harness/output/"
in_format = "cha"
# in_dir = "../talkbank-alignment/test_harness/input/"
# out_dir = "../talkbank-alignment/test_harness/output/"
# in_format = "cha"

function = "morphotag"
lang = "hrv"
num_speakers = 1
# function = "morphotag"
# lang = "hrv"
# num_speakers = 1

class Context:
obj = {"verbose": 3}
# class Context:
# obj = {"verbose": 3}

def loader(file):
return CHATFile(path=os.path.abspath(file)).doc
# def loader(file):
# return CHATFile(path=os.path.abspath(file)).doc

# return file
# # return file

def writer(doc, output):
CHATFile(doc=doc).write(output)
# CHATFile(doc=doc).write(output
# .replace(".wav", ".cha")
# .replace(".mp4", ".cha")
# .replace(".mp3", ".cha"))
# def writer(doc, output):
# CHATFile(doc=doc).write(output)
# # CHATFile(doc=doc).write(output
# # .replace(".wav", ".cha")
# # .replace(".mp4", ".cha")
# # .replace(".mp3", ".cha"))

_dispatch(function, lang, num_speakers, [in_format], Context(),
in_dir, out_dir,
loader, writer, Console())
# _dispatch(function, lang, num_speakers, [in_format], Context(),
# in_dir, out_dir,
# loader, writer, Console())

########## The Batchalign CHAT Test Harness ##########

Expand Down

0 comments on commit bf42acb

Please sign in to comment.