From 26caaefaf50b68b2f9c19cfe449999a11a3590ef Mon Sep 17 00:00:00 2001
From: "devin.yf"
Date: Wed, 29 May 2024 14:48:02 +0800
Subject: [PATCH] paraformer: voice file to text

---
 README.md                                        |   4 +-
 .../paraformer/{ => realtime}/speech2text.go     |   0
 .../paraformer/voice_file/recordfile2text.go     |  56 +++++++++
 httpclient/http_client.go                        |   4 +-
 paraformer/dtypes.go                             |  89 ++++++++++++--
 .../{paraformer.go => paraformer_ws.go}          |   2 +
 paraformer/paraformercli.go                      | 111 ++++++++++++++++++
 paraformer/params.go                             |  15 +++
 tongyiclient.go                                  |  40 +++++--
 9 files changed, 297 insertions(+), 24 deletions(-)
 rename example/paraformer/{ => realtime}/speech2text.go (100%)
 create mode 100644 example/paraformer/voice_file/recordfile2text.go
 rename paraformer/{paraformer.go => paraformer_ws.go} (98%)
 create mode 100644 paraformer/paraformercli.go

diff --git a/README.md b/README.md
index 1077eef..4e5c667 100644
--- a/README.md
+++ b/README.md
@@ -29,8 +29,8 @@ go get -u github.com/devinyf/dashscopego
 - [ ] 人像风格重绘
 - [ ] 图像背景生成
 #### Paraformer(语音识别文字)
-- [x] [实时语音识别](./example/paraformer/speech2text.go)
-- [ ] 录音文件识别
+- [x] [实时语音识别](./example/paraformer/realtime/speech2text.go)
+- [x] [录音文件识别](./example/paraformer/voice_file/recordfile2text.go)
 #### 语音合成
 - [ ] 文本至语音的实时流式合成
 #### 通用文本向量 Embedding
diff --git a/example/paraformer/speech2text.go b/example/paraformer/realtime/speech2text.go
similarity index 100%
rename from example/paraformer/speech2text.go
rename to example/paraformer/realtime/speech2text.go
diff --git a/example/paraformer/voice_file/recordfile2text.go b/example/paraformer/voice_file/recordfile2text.go
new file mode 100644
index 0000000..e791ed6
--- /dev/null
+++ b/example/paraformer/voice_file/recordfile2text.go
@@ -0,0 +1,56 @@
+package main
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"os/user"
+	"path/filepath"
+
+	"github.com/devinyf/dashscopego"
+	"github.com/devinyf/dashscopego/paraformer"
+)
+
+func main() {
+	model := paraformer.ParaformerV1
+	token := os.Getenv("DASHSCOPE_API_KEY")
+	if token == "" {
+		panic("token is empty")
+	}
+
+	cli := dashscopego.NewTongyiClient(model, token)
+
+	usr, err := user.Current()
+	if err != nil {
+		panic(err)
+	}
+
+	voiceFile := filepath.Join(usr.HomeDir, "Desktop", "hello_world_female2.wav")
+	filePath := "file://" + voiceFile
+
+	req := &paraformer.AsyncTaskRequest{
+		Model: paraformer.ParaformerV1,
+		Input: paraformer.AsyncInput{
+			// Remote file used in the official example:
+			// FileURLs: []string{"https://dashscope.oss-cn-beijing.aliyuncs.com/samples/audio/paraformer/hello_world_female2.wav"},
+			// Local file:
+			FileURLs: []string{filePath},
+		},
+		Download: true, // whether to download the async task result.
+	}
+
+	resp, err := cli.CreateVoiceFileToTextGeneration(context.TODO(), req)
+	if err != nil {
+		panic(err)
+	}
+
+	// If you do not need the result downloaded, just take the async task_id here and poll for it yourself.
+	fmt.Println("taskInfo: ", resp.AsyncTaskResp) //nolint:all
+	// When Download is set to true in the request, wait for the recognition output.
+	fmt.Println("waiting for the speech recognition result...") //nolint:all
+	for _, v := range resp.FileResults {
+		for _, v2 := range v.Transcripts {
+			fmt.Println(v2.Text) //nolint:all
+		}
+	}
+}
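
As the comments in the example note, Download can also be left false: the call then returns only the async task info, and the caller polls for the result later. A minimal sketch of that submit-only variant; it uses only types and methods introduced in this patch, and the audio URL is the remote sample file from the comment above (remote URLs are passed through without an OSS upload):

    package main

    import (
    	"context"
    	"fmt"
    	"os"

    	"github.com/devinyf/dashscopego"
    	"github.com/devinyf/dashscopego/paraformer"
    )

    func main() {
    	token := os.Getenv("DASHSCOPE_API_KEY")
    	cli := dashscopego.NewTongyiClient(paraformer.ParaformerV1, token)

    	req := &paraformer.AsyncTaskRequest{
    		Model: paraformer.ParaformerV1,
    		Input: paraformer.AsyncInput{
    			// Remote sample file from the official docs.
    			FileURLs: []string{"https://dashscope.oss-cn-beijing.aliyuncs.com/samples/audio/paraformer/hello_world_female2.wav"},
    		},
    		// Download is left false: only the async task info is returned.
    	}

    	resp, err := cli.CreateVoiceFileToTextGeneration(context.TODO(), req)
    	if err != nil {
    		panic(err)
    	}
    	// Keep the task_id and query the task endpoint later (see the polling sketch after params.go below).
    	fmt.Println(resp.AsyncTaskResp.Output.TaskID, resp.AsyncTaskResp.Output.TaskStatus)
    }
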
diff --git a/httpclient/http_client.go b/httpclient/http_client.go
index 1941d4c..68e167d 100644
--- a/httpclient/http_client.go
+++ b/httpclient/http_client.go
@@ -65,6 +65,8 @@ func (c *HTTPCli) Get(ctx context.Context, urll string, params map[string]string
 		return err
 	}
 
+	// fmt.Println("result: ", string(result))
+
 	err = json.Unmarshal(result, &respbody)
 	if err != nil {
 		return &WrapMessageError{Message: "Unmarshal Json failed", Cause: err}
@@ -197,7 +199,7 @@ func (c *HTTPCli) httpInner(ctx context.Context, method, url string, body interf
 	if err != nil {
 		return nil, err
 	}
-	// fmt.Printf("debug... body: %+v\n", bodyBuffer.String())
+	// fmt.Printf("debug... req-body: %+v\n", bodyBuffer.String())
 
 	c.req, err = http.NewRequestWithContext(ctx, method, url, bodyBuffer)
 	if err != nil {
diff --git a/paraformer/dtypes.go b/paraformer/dtypes.go
index 98e065d..b3991ed 100644
--- a/paraformer/dtypes.go
+++ b/paraformer/dtypes.go
@@ -2,8 +2,6 @@ package paraformer
 
 import "context"
 
-const ParaformerWSURL = "wss://dashscope.aliyuncs.com/api-ws/v1/inference"
-
 type Parameters struct {
 	SampleRate int    `json:"sample_rate"`
 	Format     string `json:"format"`
@@ -40,13 +38,6 @@ type PayloadIn struct {
 //	Punctuation string `json:"punctuation"`
 // }
 
-type Sentence struct {
-	BeginTime int    `json:"begin_time"`
-	EndTime   int    `json:"end_time"`
-	Text      string `json:"text"` // full text
-	// Words []Word `json:"words"`
-}
-
 type Output struct {
 	Sentence Sentence `json:"sentence"`
 }
@@ -72,3 +63,83 @@ type RecognitionResult struct {
 	Header  Header     `json:"header"`
 	Payload PayloadOut `json:"payload"`
 }
+
+// ===========
+// Create the async transcription task (returns a task_id).
+type AsyncTaskRequest struct {
+	Model        string     `json:"model"`
+	Input        AsyncInput `json:"input"`
+	HasUploadOss bool       `json:"-"`
+	Download     bool       `json:"-"`
+}
+
+type AsyncInput struct {
+	FileURLs []string `json:"file_urls"`
+}
+
+type AsyncTaskResponse struct {
+	RequestID string             `json:"request_id"`
+	Output    TaskResultResponse `json:"output"`
+}
+
+// Query the task result by task_id.
+type TaskResultRequest struct {
+	TaskID string `json:"task_id"`
+}
+
+type TaskResultResponse struct {
+	TaskID        string      `json:"task_id,omitempty"`
+	TaskStatus    string      `json:"task_status,omitempty"`
+	SubmitTime    string      `json:"submit_time,omitempty"`
+	ScheduledTime string      `json:"scheduled_time,omitempty"`
+	EndTime       string      `json:"end_time,omitempty"`
+	Results       []Result    `json:"results,omitempty"`
+	TaskMetrics   TaskMetrics `json:"task_metrics,omitempty"`
+}
+
+type Result struct {
+	FileURL          string `json:"file_url,omitempty"`
+	TranscriptionURL string `json:"transcription_url,omitempty"`
+	SubtaskStatus    string `json:"subtask_status,omitempty"`
+}
+
+type TaskMetrics struct {
+	Total     int `json:"TOTAL,omitempty"`
+	Succeeded int `json:"SUCCEEDED,omitempty"`
+	Failed    int `json:"FAILED,omitempty"`
+}
+
+// =========== Final transcription result ===========.
+type FileResult struct {
+	FileURL     string       `json:"file_url"`
+	Properties  Properties   `json:"properties"`
+	Transcripts []Transcript `json:"transcripts"`
+}
+
+type Properties struct {
+	Channels                       []interface{} `json:"channels"`
+	OriginalSamplingRate           int           `json:"original_sampling_rate"`
+	OriginalDurationInMilliseconds int           `json:"original_duration_in_milliseconds"`
+}
+
+type Transcript struct {
+	ChannelID                     int        `json:"channel_id"`
+	ContentDurationInMilliseconds int        `json:"content_duration_in_milliseconds"`
+	Text                          string     `json:"text"`
+	Sentences                     []Sentence `json:"sentences"`
+}
+
+type Sentence struct {
+	BeginTime  int    `json:"begin_time"`
+	EndTime    int    `json:"end_time"`
+	SentenceID int    `json:"sentence_id"`
+	Text       string `json:"text"`
+	Words      []Word `json:"words"`
+}
+
+type Word struct {
+	BeginTime   int    `json:"begin_time"`
+	EndTime     int    `json:"end_time"`
+	Text        string `json:"text"`
+	Punctuation string `json:"punctuation"`
+}
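
The FileResult / Properties / Transcript / Sentence / Word types above mirror the JSON document that a finished task exposes through transcription_url. A minimal, self-contained sketch of decoding such a document with these types; the field values below are illustrative only, not output from a real task:

    package main

    import (
    	"encoding/json"
    	"fmt"

    	"github.com/devinyf/dashscopego/paraformer"
    )

    func main() {
    	// Illustrative payload shaped after the struct tags above.
    	raw := []byte(`{
    		"file_url": "https://example.com/hello_world_female2.wav",
    		"properties": {"original_sampling_rate": 16000, "original_duration_in_milliseconds": 3874},
    		"transcripts": [{
    			"channel_id": 0,
    			"content_duration_in_milliseconds": 3570,
    			"text": "Hello world.",
    			"sentences": [{"begin_time": 100, "end_time": 3670, "sentence_id": 1, "text": "Hello world."}]
    		}]
    	}`)

    	var result paraformer.FileResult
    	if err := json.Unmarshal(raw, &result); err != nil {
    		panic(err)
    	}
    	for _, t := range result.Transcripts {
    		fmt.Println(t.ChannelID, t.Text)
    	}
    }
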
diff --git a/paraformer/paraformer.go b/paraformer/paraformer_ws.go
similarity index 98%
rename from paraformer/paraformer.go
rename to paraformer/paraformer_ws.go
index 6f73e69..661806d 100644
--- a/paraformer/paraformer.go
+++ b/paraformer/paraformer_ws.go
@@ -10,6 +10,8 @@ import (
 	"github.com/google/uuid"
 )
 
+// real-time voice recognition
+
 func ConnRecognitionClient(request *Request, token string) (*httpclient.WsClient, error) {
 	// Initialize the client with the necessary parameters.
 	header := http.Header{}
diff --git a/paraformer/paraformercli.go b/paraformer/paraformercli.go
new file mode 100644
index 0000000..d6d8c10
--- /dev/null
+++ b/paraformer/paraformercli.go
@@ -0,0 +1,111 @@
+package paraformer
+
+import (
+	"context"
+	"log"
+	"time"
+
+	httpclient "github.com/devinyf/dashscopego/httpclient"
+)
+
+//
+
+func AsyncVoiceFileRecognitionTask(ctx context.Context, request *AsyncTaskRequest, cli httpclient.IHttpClient, token string) (*AsyncTaskResponse, error) {
+	tokenHeader := httpclient.WithTokenHeaderOption(token)
+	header := httpclient.HeaderMap{
+		"X-DashScope-Async": "enable",
+		"Content-Type":      "application/json",
+	}
+
+	if request.HasUploadOss {
+		header["X-DashScope-OssResourceResolve"] = "enable"
+	}
+
+	contentHeader := httpclient.WithHeader(header)
+	resp := AsyncTaskResponse{}
+	err := cli.Post(ctx, ParaformerAsyncURL, request, &resp, tokenHeader, contentHeader)
+	if err != nil {
+		return &resp, err
+	}
+
+	return &resp, nil
+}
+
+type VoiceFileResponse struct {
+	AsyncTaskResp *AsyncTaskResponse
+	FileResults   []*FileResult
+}
+
+func VoiceFileToTextGeneration(ctx context.Context, req *AsyncTaskRequest, cli httpclient.IHttpClient, token string) (*VoiceFileResponse, error) {
+	resp := &VoiceFileResponse{}
+
+	var resultList []*FileResult
+
+	tokenHeader := httpclient.WithTokenHeaderOption(token)
+	contentHeader := httpclient.WithHeader(httpclient.HeaderMap{
+		"Accept": "application/json",
+	})
+
+	taskResp, err := AsyncVoiceFileRecognitionTask(ctx, req, cli, token)
+	if err != nil {
+		return nil, err
+	}
+	resp.AsyncTaskResp = taskResp
+
+	if req.Download {
+		taskReq := TaskResultRequest{TaskID: taskResp.Output.TaskID}
+
+		taskStatusReap := &AsyncTaskResponse{}
+		firstQuery := true
+		for firstQuery ||
+			taskStatusReap.Output.TaskStatus == "PENDING" ||
+			taskStatusReap.Output.TaskStatus == "RUNNING" {
+			firstQuery = false
+			log.Println("TaskStatus: ", taskStatusReap.Output.TaskStatus)
+			taskStatusReap, err = CheckTaskStatus(ctx, &taskReq, cli, tokenHeader, contentHeader)
+			if err != nil {
+				return nil, err
+			}
+
+			time.Sleep(1 * time.Second)
+		}
+
+		// use the transcription URL of each result to download and parse the JSON content.
+		for _, resultInfo := range taskStatusReap.Output.Results {
+			result, err := downloadJsonfile(ctx, resultInfo.TranscriptionURL, cli)
+			if err != nil {
+				return nil, err
+			}
+
+			resultList = append(resultList, result)
+			resp.FileResults = resultList
+		}
+	}
+
+	return resp, nil
+}
+
+//nolint:lll
+func CheckTaskStatus(ctx context.Context, req *TaskResultRequest, httpcli httpclient.IHttpClient, options ...httpclient.HTTPOption) (*AsyncTaskResponse, error) {
+	resp := AsyncTaskResponse{}
+	err := httpcli.Get(ctx, TaskURL(req.TaskID), nil, &resp, options...)
+	if err != nil {
+		return nil, err
+	}
+
+	return &resp, nil
+}
+
+func downloadJsonfile(ctx context.Context, url string, httpcli httpclient.IHttpClient) (*FileResult, error) {
+	contentHeader := httpclient.WithHeader(httpclient.HeaderMap{
+		"Accept": "application/json",
+	})
+
+	resp := FileResult{}
+	err := httpcli.Get(ctx, url, nil, &resp, contentHeader)
+	if err != nil {
+		return nil, err
+	}
+
+	return &resp, nil
+}
diff --git a/paraformer/params.go b/paraformer/params.go
index db5296e..140c761 100644
--- a/paraformer/params.go
+++ b/paraformer/params.go
@@ -1,5 +1,7 @@
 package paraformer
 
+import "fmt"
+
 type ModelParaformer = string
 
 const (
@@ -11,3 +13,16 @@ const (
 	ParaformerRealTimeV1   ModelParaformer = "paraformer-realtime-v1"
 	ParaformerRealTime8KV1 ModelParaformer = "paraformer-realtime-8k-v1"
 )
+
+const (
+	// real-time voice recognition.
+	ParaformerWSURL = "wss://dashscope.aliyuncs.com/api-ws/v1/inference"
+	// audio file to text.
+	ParaformerAsyncURL = "https://dashscope.aliyuncs.com/api/v1/services/audio/asr/transcription"
+	// audio file to text async-task-result query.
+	ParaformerTaskURL = "https://dashscope.aliyuncs.com/api/v1/tasks/%s"
+)
+
+func TaskURL(taskID string) string {
+	return fmt.Sprintf(ParaformerTaskURL, taskID)
+}
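
TaskURL composes the task-query endpoint that CheckTaskStatus above polls. For callers that submit with Download set to false, a hedged sketch of querying that endpoint directly with net/http; it assumes DashScope's usual bearer-token Authorization header and uses a placeholder task ID:

    package main

    import (
    	"encoding/json"
    	"fmt"
    	"net/http"
    	"os"

    	"github.com/devinyf/dashscopego/paraformer"
    )

    func main() {
    	// Hypothetical task ID obtained from an earlier submission.
    	taskID := "your-task-id"

    	req, err := http.NewRequest(http.MethodGet, paraformer.TaskURL(taskID), nil)
    	if err != nil {
    		panic(err)
    	}
    	// Assumes the standard DashScope bearer-token header.
    	req.Header.Set("Authorization", "Bearer "+os.Getenv("DASHSCOPE_API_KEY"))

    	httpResp, err := http.DefaultClient.Do(req)
    	if err != nil {
    		panic(err)
    	}
    	defer httpResp.Body.Close()

    	var taskResp paraformer.AsyncTaskResponse
    	if err := json.NewDecoder(httpResp.Body).Decode(&taskResp); err != nil {
    		panic(err)
    	}
    	fmt.Println("status:", taskResp.Output.TaskStatus)
    }
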
diff --git a/tongyiclient.go b/tongyiclient.go
index bbeef91..54333dd 100644
--- a/tongyiclient.go
+++ b/tongyiclient.go
@@ -45,13 +45,13 @@ func (q *TongyiClient) SetUploadCache(uploadCache qwen.UploadCacher) *TongyiClie
 //
 // nolint:lll
 func (q *TongyiClient) CreateCompletion(ctx context.Context, payload *qwen.Request[*qwen.TextContent]) (*TextQwenResponse, error) {
-	payload = paylosdPreCheck(q, payload)
+	payload = payloadPreCheck(q, payload)
 	return genericCompletion[*qwen.TextContent, *qwen.TextContent](ctx, payload, q.httpCli, qwen.URLQwen(), q.token)
 }
 
 //nolint:lll
 func (q *TongyiClient) CreateVLCompletion(ctx context.Context, payload *qwen.Request[*qwen.VLContentList]) (*VLQwenResponse, error) {
-	payload = paylosdPreCheck(q, payload)
+	payload = payloadPreCheck(q, payload)
 
 	for _, vMsg := range payload.Input.Messages {
 		tmpImageContent, hasImg := vMsg.Content.PopImageContent()
@@ -74,7 +74,7 @@ func (q *TongyiClient) CreateVLCompletion(ctx context.Context, payload *qwen.Req
 
 //nolint:lll
 func (q *TongyiClient) CreateAudioCompletion(ctx context.Context, payload *qwen.Request[*qwen.AudioContentList]) (*AudioQwenResponse, error) {
-	payload = paylosdPreCheck(q, payload)
+	payload = payloadPreCheck(q, payload)
 
 	for _, acMsg := range payload.Input.Messages {
 		tmpAudioContent, hasAudio := acMsg.Content.PopAudioContent()
@@ -100,7 +100,7 @@ func (q *TongyiClient) CreateAudioCompletion(ctx context.Context, payload *qwen.
 //
 //nolint:lll
 func (q *TongyiClient) CreateFileCompletion(ctx context.Context, payload *qwen.Request[*qwen.FileContentList]) (*FileQwenResponse, error) {
-	payload = paylosdPreCheck(q, payload)
+	payload = payloadPreCheck(q, payload)
 
 	for _, vMsg := range payload.Input.Messages {
 		tmpImageContent, hasImg := vMsg.Content.PopFileContent()
@@ -174,20 +174,36 @@ func (q *TongyiClient) CreateImageGeneration(ctx context.Context, payload *wanx.
 	return wanx.CreateImageGeneration(ctx, payload, q.httpCli, q.token)
 }
 
-/*
-func (q *TongyiClient) CreateVoiceFileToTextGeneration(ctx context.Context, request *paraformer.Request) (any, error) {
-	if request.Payload.Model == "" {
+// voice file to text.
+func (q *TongyiClient) CreateVoiceFileToTextGeneration(ctx context.Context, request *paraformer.AsyncTaskRequest) (*paraformer.VoiceFileResponse, error) {
+	if request.Model == "" {
 		if q.Model == "" {
 			return nil, ErrModelNotSet
 		}
-		request.Payload.Model = q.Model
+		request.Model = q.Model
 	}
 
-	return
-	panic("not implemented")
+	var requestURLs []string
+	for _, fileURL := range request.Input.FileURLs {
+		ossURL, hasUploadOss, err := checkIfNeedUploadFile(ctx, fileURL, request.Model, q.token, q.uploadCache)
+		if err != nil {
+			return nil, err
+		}
+		if hasUploadOss {
+			// the local file was uploaded to OSS; use the returned OSS URL.
+			requestURLs = append(requestURLs, ossURL)
+			request.HasUploadOss = true
+		} else {
+			requestURLs = append(requestURLs, fileURL)
+		}
+	}
+
+	request.Input.FileURLs = requestURLs
+
+	return paraformer.VoiceFileToTextGeneration(ctx, request, q.httpCli, q.token)
 }
-*/
 
+// realtime speech to text.
 func (q *TongyiClient) CreateSpeechToTextGeneration(ctx context.Context, request *paraformer.Request, reader *bufio.Reader) error {
 	if request.Payload.Model == "" {
 		if q.Model == "" {
@@ -242,7 +258,7 @@ func (q *TongyiClient) CreateEmbedding(ctx context.Context, r *embedding.Request
 	return embeddings, totslTokens, nil
 }
 
-func paylosdPreCheck[T qwen.IQwenContent](q *TongyiClient, payload *qwen.Request[T]) *qwen.Request[T] {
+func payloadPreCheck[T qwen.IQwenContent](q *TongyiClient, payload *qwen.Request[T]) *qwen.Request[T] {
 	if payload.Model == "" {
 		payload.Model = q.Model
 	}
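
The polling loop in VoiceFileToTextGeneration exits on any terminal status, so a failed task surfaces as an empty FileResults rather than an error. A small caller-side guard one might add after CreateVoiceFileToTextGeneration returns, assuming resp is the *paraformer.VoiceFileResponse from the example earlier; the "SUCCEEDED" literal matches the status keys used by TaskMetrics in this patch:

    // Caller-side guard, assuming resp came from cli.CreateVoiceFileToTextGeneration
    // as in the example above.
    out := resp.AsyncTaskResp.Output
    if out.TaskStatus != "SUCCEEDED" {
    	panic(fmt.Sprintf("task %s finished with status %s (failed subtasks: %d)",
    		out.TaskID, out.TaskStatus, out.TaskMetrics.Failed))
    }
    for _, r := range out.Results {
    	if r.SubtaskStatus != "SUCCEEDED" {
    		fmt.Println("no transcription for:", r.FileURL)
    	}
    }
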