Skip to content

Commit

Permalink
集成百度翻译接口,细节待优化;
Browse files Browse the repository at this point in the history
优化编译参数,增加-s;
  • Loading branch information
speauty committed Mar 1, 2023
1 parent 2d2c2b6 commit 0004b4f
Show file tree
Hide file tree
Showing 7 changed files with 496 additions and 81 deletions.
2 changes: 1 addition & 1 deletion release.subtitle.bat
Original file line number Diff line number Diff line change
@@ -1 +1 @@
go build "-ldflags=-w -H=windowsgui" -o .\bin\subtitle.exe gui.subtitle
go build "-ldflags=-w -s -H=windowsgui" -o .\bin\subtitle.exe gui.subtitle
222 changes: 175 additions & 47 deletions src/logic/translate/translate.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,28 @@ package translate
import (
"bufio"
"bytes"
"context"
"encoding/json"
"fmt"
"github.com/golang-module/carbon"
mt2 "gui.subtitle/src/srv/mt"
aliyun2 "gui.subtitle/src/srv/mt/aliyun"
"gui.subtitle/src/srv/mt/bd"
"gui.subtitle/src/util/lang"
"io"
"regexp"
"runtime"
"strings"
"sync"
"time"
"unicode"
)

type Block struct {
Idx string
TimeStr string
TextCH string
TextZH string
TextZHCnt int
TextEN string
TextENCnt int
}
Expand Down Expand Up @@ -53,7 +59,8 @@ func Reader(ir io.Reader) ([]*Block, error) {
tmpST.TimeStr = line
} else {
if hasZH(line) {
tmpST.TextCH = line
tmpST.TextZH = line
tmpST.TextZHCnt = lineLen
} else {
tmpST.TextEN = line
tmpST.TextENCnt = lineLen
Expand All @@ -73,8 +80,8 @@ func Writer(iw io.Writer, contents []*Block) (int, error) {
cnt := 0
for _, content := range contents {
buffer := bytes.NewBufferString(fmt.Sprintf("%s\n%s\n", content.Idx, content.TimeStr))
if content.TextCH != "" {
buffer.WriteString(content.TextCH)
if content.TextZH != "" {
buffer.WriteString(content.TextZH)
buffer.WriteByte('\n')
}
if content.TextEN != "" {
Expand All @@ -91,60 +98,181 @@ func Writer(iw io.Writer, contents []*Block) (int, error) {
return cnt, nil
}

func Translate(mt interface{}, contents []*Block) ([]string, int) {
// translateReport gathers the log lines and failure statistics produced by the
// translation goroutines. Its methods are safe for concurrent use.
type translateReport struct {
	mu        sync.Mutex
	results   []string
	cntError  int
	lastError error
}

// addSuccess appends an informational log line.
func (r *translateReport) addSuccess(msg string) {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.results = append(r.results, msg)
}

// addFailure logs err, bumps the failure counter and remembers err as the most
// recent failure.
func (r *translateReport) addFailure(err error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.results = append(r.results, err.Error())
	r.cntError++
	r.lastError = err
}

// storeTranslation writes text into the field of block that matches toLanguage.
// Target languages other than ZH and EN leave the block untouched.
func storeTranslation(block *Block, toLanguage lang.StrLang, text string) {
	switch toLanguage {
	case lang.ZH:
		block.TextZH = text
	case lang.EN:
		block.TextEN = text
	}
}

// translateWithRetry calls engine.TextTranslate up to three times, pausing one
// second between attempts. It stops retrying as soon as ctx is done. A nil
// response without an error is treated as a failure so callers never
// dereference a nil result.
func translateWithRetry(ctx context.Context, engine mt2.MT, args interface{}) (*mt2.TextTranslateResp, error) {
	const maxAttempts = 3
	var lastErr error
	for attempt := 1; attempt <= maxAttempts; attempt++ {
		resp, err := engine.TextTranslate(ctx, args)
		if err == nil && resp != nil {
			return resp, nil
		}
		if err == nil {
			err = fmt.Errorf("翻译结果为空")
		}
		lastErr = err
		if attempt == maxAttempts {
			break
		}
		select {
		case <-ctx.Done():
			return nil, lastErr
		case <-time.After(time.Second):
		}
	}
	return nil, lastErr
}

// Translate translates the subtitle blocks in contents from fromLanguage to
// toLanguage using the engine passed as mt, which must implement mt2.MT.
// The translated text is written back into contents in place.
//
// For the ALi engine the blocks are packed into batches by chunkBlocksForALi and
// every batch is translated in its own goroutine. For the Baidu engine up to
// maxCoroutine workers translate the blocks one at a time, worker i handling
// blocks i, i+maxCoroutine, i+2*maxCoroutine, ... Any other engine id is a no-op.
//
// It returns the collected log lines (in no particular order), the number of
// failures, and the most recently recorded failure (nil when nothing failed).
// A non-nil error with nil log lines is returned when the pre-check, the engine
// type check, or the ALi chunking fails before any translation starts.
func Translate(ctx context.Context, mt interface{}, contents []*Block, fromLanguage lang.StrLang, toLanguage lang.StrLang) ([]string, int, error) {
	if err := preCheckBlocks(contents, fromLanguage); err != nil {
		return nil, 0, fmt.Errorf("预检字幕失败, 错误: %s", err.Error())
	}
	// Assert the engine type once instead of panicking deep inside a goroutine.
	engine, ok := mt.(mt2.MT)
	if !ok {
		return nil, 0, fmt.Errorf("翻译引擎类型不匹配: %T", mt)
	}

	wg := &sync.WaitGroup{}
	report := &translateReport{}
	cntBlock := len(contents)

	maxCoroutine := 10

	switch engine.GetId() {
	case mt2.ALI:
		contentsChunked, err := chunkBlocksForALi(contents, toLanguage)
		if err != nil {
			return nil, 0, fmt.Errorf("阿里翻译字幕分包异常, 错误: %s", err.Error())
		}
		// Index the blocks by subtitle index once so every batch can locate its
		// blocks without rescanning contents. The map is read-only once the
		// goroutines start.
		positions := make(map[string][]int, cntBlock)
		for pos, content := range contents {
			positions[content.Idx] = append(positions[content.Idx], pos)
		}
		for idx, m := range contentsChunked {
			wg.Add(1)
			go func(chunkNo int, chunk map[string]string) {
				defer wg.Done()
				timeStartedAt := carbon.Now()
				payload, err := json.Marshal(chunk)
				if err != nil {
					report.addFailure(fmt.Errorf("[%s]%s失败, 错误: %w", carbon.Now(), engine.GetName(), err))
					return
				}
				args := new(aliyun2.TextBatchTranslateArg).New(string(payload))
				args.FromLanguage = fromLanguage.ToString()
				args.ToLanguage = toLanguage.ToString()
				translates, err := engine.TextBatchTranslate(ctx, args)
				if err != nil {
					report.addFailure(fmt.Errorf("[%s]%s失败, 错误: %w", carbon.Now(), engine.GetName(), err))
					return
				}
				lineMatchedCnt := 0
				for _, blockTranslated := range translates {
					for _, pos := range positions[blockTranslated.Idx] {
						// Write to the field of the requested target language,
						// not unconditionally to TextZH.
						storeTranslation(contents[pos], toLanguage, blockTranslated.StrTranslated)
						lineMatchedCnt++
					}
				}
				report.addSuccess(fmt.Sprintf(
					"[%s]%s成功, 序号: %d, 字幕行数: %d, 耗时(s): %d",
					carbon.Now(), engine.GetName(), chunkNo+1, lineMatchedCnt, carbon.Now().DiffAbsInSeconds(timeStartedAt),
				))
			}(idx, m)
		}
	case mt2.BAIDU:
		// NOTE(review): presumably Baidu API error codes after which further
		// requests are pointless - confirm against the Baidu translate API docs.
		breakFlags := []string{
			"52002", "52003", "54001", "54004", "58000", "58001", "58002", "90107",
		}
		for coroutineIdx := 0; coroutineIdx < maxCoroutine && coroutineIdx < cntBlock; coroutineIdx++ {
			wg.Add(1)
			go func(workerIdx int) {
				defer wg.Done()
				timeStart := carbon.Now()
				cntBlockTranslated := 0
				// Stride by maxCoroutine so the split follows the worker count
				// instead of a hard-coded 10.
				for blockIdx := workerIdx; blockIdx < cntBlock; blockIdx += maxCoroutine {
					currentBlock := contents[blockIdx]
					sourceText := currentBlock.TextZH
					if fromLanguage == lang.EN {
						sourceText = currentBlock.TextEN
					}

					args := new(bd.TextTranslateArg).New(sourceText)
					args.FromLanguage = fromLanguage.ToString()
					args.ToLanguage = toLanguage.ToString()

					translateResp, err := translateWithRetry(ctx, engine, args)
					if err != nil {
						err = fmt.Errorf("[%s]%s失败, 协程序号: %d, 字幕序号: %s, 错误: %w", carbon.Now(), engine.GetName(), workerIdx, currentBlock.Idx, err)
						report.addFailure(err)
						// Stop this worker once the caller cancelled the context.
						if ctx.Err() != nil {
							runtime.Goexit()
						}
						for _, flag := range breakFlags {
							if strings.Contains(err.Error(), flag) {
								// Terminates only this worker; the deferred
								// wg.Done still runs.
								runtime.Goexit()
							}
						}
						continue
					}

					storeTranslation(currentBlock, toLanguage, translateResp.StrTranslated)
					cntBlockTranslated++
				}
				if cntBlockTranslated > 0 {
					report.addSuccess(fmt.Sprintf(
						"[%s]%s成功, 协程序号: %d, 字幕行数: %d, 耗时(s): %d",
						carbon.Now(), engine.GetName(), workerIdx, cntBlockTranslated, carbon.Now().DiffAbsInSeconds(timeStart),
					))
				} else {
					report.addSuccess(fmt.Sprintf(
						"[%s]%s, 协程序号: %d, 错误: 空转, 耗时(s): %d",
						carbon.Now(), engine.GetName(), workerIdx, carbon.Now().DiffAbsInSeconds(timeStart),
					))
				}
			}(coroutineIdx)
		}
	}
	wg.Wait()
	// All goroutines have finished, so the report can be read without locking.
	return report.results, report.cntError, report.lastError
}

// preCheckBlocks verifies that every subtitle block carries text in the source
// language before translation starts. Only EN and ZH are accepted as source
// languages; any other value yields an error. The first block whose
// source-language text is empty is reported with its index and time string.
func preCheckBlocks(contents []*Block, fromLanguage lang.StrLang) error {
	// Pick the accessor for the source-language text once, up front.
	var sourceOf func(*Block) string
	switch fromLanguage {
	case lang.EN:
		sourceOf = func(b *Block) string { return b.TextEN }
	case lang.ZH:
		sourceOf = func(b *Block) string { return b.TextZH }
	default:
		return fmt.Errorf("暂未实现来源语言[%s]的字幕链接", fromLanguage.GetCH())
	}

	for i := range contents {
		if block := contents[i]; sourceOf(block) == "" {
			return fmt.Errorf("当前字幕缺失, 索引: %s, 时间: %s", block.Idx, block.TimeStr)
		}
	}
	return nil
}

// chunkBlocksForALi 阿里云翻译的专属分包函数
func chunkBlocksForALi(contents []*Block, toLanguage lang.StrLang) ([]map[string]string, error) {
var contentsChunked []map[string]string
tmpMap := map[string]string{}
tmpLen := 0
for _, content := range contents {
if content.TextCH != "" { // 略过已经翻译了的
continue
}
if tmpLen+content.TextENCnt >= 5000 { // 单次批量翻译最大值
contentsChunked = append(contentsChunked, tmpMap)
tmpLen = 0
tmpMap = map[string]string{}
if toLanguage == lang.ZH {
if content.TextZH != "" { // 略过已经翻译了的
continue
}
if tmpLen+content.TextENCnt >= 5000 { // 单次批量翻译最大值
contentsChunked = append(contentsChunked, tmpMap)
tmpLen = 0
tmpMap = map[string]string{}
}
} else if toLanguage == lang.EN {
if content.TextEN != "" { // 略过已经翻译了的
continue
}
if tmpLen+content.TextZHCnt >= 5000 { // 单次批量翻译最大值
contentsChunked = append(contentsChunked, tmpMap)
tmpLen = 0
tmpMap = map[string]string{}
}
} else {
return nil, fmt.Errorf("暂未实现目标语言[%s]的分包操作", toLanguage.GetCH())
}

tmpLen += content.TextENCnt
tmpMap[content.Idx] = content.TextEN
}
if tmpLen != 0 {
contentsChunked = append(contentsChunked, tmpMap)
}
wg := sync.WaitGroup{}
var results []string
cntError := 0
for idx, m := range contentsChunked {
wg.Add(1)
localIdx := idx
mChunked := m
go func() {
defer wg.Done()
timeStartedAt := carbon.Now()
marshal, _ := json.Marshal(mChunked)
translates, err := mt.(mt2.MT).TextBatchTranslate(new(aliyun2.TextBatchTranslateArg).New(string(marshal)))
if err != nil {
results = append(results, fmt.Sprintf("[%s]%s翻译失败, 错误: %s", carbon.Now(), mt.(mt2.MT).GetName(), err))
cntError++
return
}
lineMatchedCnt := 0
for _, blockTranslated := range translates {
for contentIdx, content := range contents {
if blockTranslated.Idx == content.Idx {
contents[contentIdx].TextCH = blockTranslated.StrTranslated
lineMatchedCnt++
}
}
}
results = append(results, fmt.Sprintf(
"[%s]%s翻译成功, 序号: %d, 字幕行数: %d, 耗时(s): %d",
carbon.Now(), mt.(mt2.MT).GetName(), localIdx+1, lineMatchedCnt, carbon.Now().DiffAbsInSeconds(timeStartedAt),
))
}()

}
wg.Wait()
return results, cntError
return contentsChunked, nil
}

func hasZH(str string) bool {
Expand Down
37 changes: 23 additions & 14 deletions src/srv/mt/aliyun/ali_mt.go
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
package aliyun

import (
"context"
"fmt"
alimt20181012 "github.com/alibabacloud-go/alimt-20181012/v2/client"
openapi "github.com/alibabacloud-go/darabonba-openapi/v2/client"
util "github.com/alibabacloud-go/tea-utils/v2/service"
"github.com/alibabacloud-go/tea/tea"
"github.com/golang-module/carbon"
"gui.subtitle/src/srv/mt"
"gui.subtitle/src/util/lang"
)

type Cfg struct {
Expand All @@ -23,7 +25,7 @@ type ALiMT struct {
mtClient *alimt20181012.Client
}

func (m *ALiMT) Init(cfg interface{}) error {
func (m *ALiMT) Init(_ context.Context, cfg interface{}) error {
if _, ok := cfg.(*Cfg); !ok {
return fmt.Errorf("the cfg's mismatched")
}
Expand All @@ -38,31 +40,35 @@ func (m *ALiMT) Init(cfg interface{}) error {
}

type TextBatchTranslateArg struct {
Scene string
ApiType string
SourceText string
TargetLang string
SourceLang string
Scene string
ApiType string
SourceText string
ToLanguage string
FromLanguage string
}

func (arg *TextBatchTranslateArg) New(text string) *TextBatchTranslateArg {
arg.Scene = "general"
arg.ApiType = "translate_standard"
arg.SourceText = text
arg.TargetLang = "zh"
arg.SourceLang = "en"
arg.ToLanguage = lang.ZH.ToString()
arg.FromLanguage = lang.EN.ToString()
return arg
}

func (m *ALiMT) TextBatchTranslate(args interface{}) ([]mt.TextTranslateResp, error) {
// TextTranslate is a placeholder for single-text translation on the ALi engine.
// It is not implemented and always returns an error, so a caller cannot mistake
// a nil response for a successful translation. Use TextBatchTranslate instead.
func (m *ALiMT) TextTranslate(context.Context, interface{}) (*mt.TextTranslateResp, error) {
	return nil, fmt.Errorf("ALiMT.TextTranslate is not implemented, use TextBatchTranslate instead")
}

func (m *ALiMT) TextBatchTranslate(_ context.Context, args interface{}) ([]mt.TextTranslateResp, error) {
if _, ok := args.(*TextBatchTranslateArg); !ok {
return nil, fmt.Errorf("the args for ALiMT.TextBatchTranslate mismatched")
}
getBatchTranslateRequest := &alimt20181012.GetBatchTranslateRequest{
FormatType: tea.String("text"), Scene: tea.String(args.(*TextBatchTranslateArg).Scene),
ApiType: tea.String(args.(*TextBatchTranslateArg).ApiType),
SourceText: tea.String(args.(*TextBatchTranslateArg).SourceText),
TargetLanguage: tea.String(args.(*TextBatchTranslateArg).TargetLang), SourceLanguage: tea.String(args.(*TextBatchTranslateArg).SourceLang),
TargetLanguage: tea.String(args.(*TextBatchTranslateArg).ToLanguage), SourceLanguage: tea.String(args.(*TextBatchTranslateArg).FromLanguage),
}
runtime := &util.RuntimeOptions{}
resp, err := m.mtClient.GetBatchTranslateWithOptions(getBatchTranslateRequest, runtime)
Expand All @@ -75,11 +81,10 @@ func (m *ALiMT) TextBatchTranslate(args interface{}) ([]mt.TextTranslateResp, er
var funcResp []mt.TextTranslateResp
for blockIdx, blockTranslated := range resp.Body.TranslatedList {
if blockTranslated["code"].(string) != "200" && m.cfg.IsDebug {
fmt.Println(fmt.Errorf(
return nil, fmt.Errorf(
"[%s]%s, 错判翻译异常[%d], 索引: %s",
carbon.Now(), m.GetName(), blockIdx, blockTranslated["index"],
))
continue
)
}
funcResp = append(funcResp, mt.TextTranslateResp{
Idx: blockTranslated["index"].(string),
Expand All @@ -89,8 +94,12 @@ func (m *ALiMT) TextBatchTranslate(args interface{}) ([]mt.TextTranslateResp, er
return funcResp, nil
}

// GetId returns the engine identifier of this translator, mt.ALI.
func (m *ALiMT) GetId() mt.Id {
return mt.ALI
}

func (m *ALiMT) GetName() string {
return "阿里云-机器翻译"
return "阿里云机器翻译"
}

func (m *ALiMT) initClient() error {
Expand Down
Loading

0 comments on commit 0004b4f

Please sign in to comment.