Skip to content

Commit

Permalink
paraformer: voice file to text
Browse files Browse the repository at this point in the history
  • Loading branch information
devinyf committed May 29, 2024
1 parent 989b083 commit 26caaef
Show file tree
Hide file tree
Showing 9 changed files with 297 additions and 24 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ go get -u github.com/devinyf/dashscopego
- [ ] 人像风格重绘
- [ ] 图像背景生成
#### Paraformer(语音识别文字)
- [x] [实时语音识别](./example/paraformer/speech2text.go)
- [ ] 录音文件识别
- [x] [实时语音识别](./example/paraformer/realtime/speech2text.go)
- [x] [录音文件识别](./example/paraformer/voice_file/recordfile2text.go)
#### 语音合成
- [ ] 文本至语音的实时流式合成
#### 通用文本向量 Embedding
Expand Down
File renamed without changes.
56 changes: 56 additions & 0 deletions example/paraformer/voice_file/recordfile2text.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
package main

import (
"context"
"fmt"
"os"
"os/user"
"path/filepath"

"github.com/devinyf/dashscopego"
"github.com/devinyf/dashscopego/paraformer"
)

// main demonstrates recorded-file speech recognition: it submits a local WAV
// file as an asynchronous task and prints the text of every transcript that
// comes back.
func main() {
	model := paraformer.ParaformerV1

	apiKey := os.Getenv("DASHSCOPE_API_KEY")
	if apiKey == "" {
		panic("token is empty")
	}

	client := dashscopego.NewTongyiClient(model, apiKey)

	currentUser, err := user.Current()
	if err != nil {
		panic(err)
	}

	// The sample audio is looked up on the current user's Desktop.
	localFileURL := "file://" + filepath.Join(currentUser.HomeDir, "Desktop", "hello_world_female2.wav")

	request := &paraformer.AsyncTaskRequest{
		Model: model,
		Input: paraformer.AsyncInput{
			// Remote file used by the official example.
			// FileURLs: []string{"https://dashscope.oss-cn-beijing.aliyuncs.com/samples/audio/paraformer/hello_world_female2.wav"},
			// Local file.
			FileURLs: []string{localFileURL},
		},
		Download: true, // Whether to download the result of the async task.
	}

	response, err := client.CreateVoiceFileToTextGeneration(context.TODO(), request)
	if err != nil {
		panic(err)
	}

	// If the result does not need to be downloaded, take only the task_id of
	// the async task from here and poll for the result yourself.
	fmt.Println("taskInfo: ", response.AsyncTaskResp) //nolint:all
	// With Download = true in the request, the recognition results are printed below.
	fmt.Println("等待语音识别结果输出...") //nolint:all
	for _, fileResult := range response.FileResults {
		for _, transcript := range fileResult.Transcripts {
			fmt.Println(transcript.Text) //nolint:all
		}
	}
}
4 changes: 3 additions & 1 deletion httpclient/http_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ func (c *HTTPCli) Get(ctx context.Context, urll string, params map[string]string
return err
}

// fmt.Println("result: ", string(result))

err = json.Unmarshal(result, &respbody)
if err != nil {
return &WrapMessageError{Message: "Unmarshal Json failed", Cause: err}
Expand Down Expand Up @@ -197,7 +199,7 @@ func (c *HTTPCli) httpInner(ctx context.Context, method, url string, body interf
if err != nil {
return nil, err
}
// fmt.Printf("debug... body: %+v\n", bodyBuffer.String())
// fmt.Printf("debug... req-body: %+v\n", bodyBuffer.String())

c.req, err = http.NewRequestWithContext(ctx, method, url, bodyBuffer)
if err != nil {
Expand Down
89 changes: 80 additions & 9 deletions paraformer/dtypes.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@ package paraformer

import "context"

const ParaformerWSURL = "wss://dashscope.aliyuncs.com/api-ws/v1/inference"

type Parameters struct {
SampleRate int `json:"sample_rate"`
Format string `json:"format"`
Expand Down Expand Up @@ -40,13 +38,6 @@ type PayloadIn struct {
// Punctuation string `json:"punctuation"`
// }

// Sentence is one recognized sentence: its begin/end time and its text.
// NOTE(review): the time unit is not visible here (presumably milliseconds) — confirm.
type Sentence struct {
BeginTime int `json:"begin_time"`
EndTime int `json:"end_time"`
Text string `json:"text"` // full text
// Words []Word `json:"words"`
}

// Output wraps a single Sentence under the JSON key "sentence".
type Output struct {
Sentence Sentence `json:"sentence"`
}
Expand All @@ -72,3 +63,83 @@ type RecognitionResult struct {
Header Header `json:"header"`
Payload PayloadOut `json:"payload"`
}

// ===========
// AsyncTaskRequest is the request that creates an asynchronous task (yielding a task_id).
// Only Model and Input are serialized to JSON; HasUploadOss and Download are
// client-side switches (tagged `json:"-"`).
type AsyncTaskRequest struct {
Model string `json:"model"`
Input AsyncInput `json:"input"`
// HasUploadOss makes AsyncVoiceFileRecognitionTask enable the
// X-DashScope-OssResourceResolve header.
HasUploadOss bool `json:"-"`
// Download makes VoiceFileToTextGeneration wait for the task and download
// its transcription results.
Download bool `json:"-"`
}

// AsyncInput is the "input" part of AsyncTaskRequest: the list of file URLs
// to process, serialized as "file_urls".
type AsyncInput struct {
FileURLs []string `json:"file_urls"`
}

// AsyncTaskResponse is the decoded response of both the task-creation call
// and the task-status query; the task details are carried in Output.
type AsyncTaskResponse struct {
RequestID string `json:"request_id"`
Output TaskResultResponse `json:"output"`
}

// TaskResultRequest identifies the task whose result is fetched by task_id.
type TaskResultRequest struct {
TaskID string `json:"task_id"`
}

// TaskResultResponse is the "output" object of an AsyncTaskResponse.
// VoiceFileToTextGeneration keeps polling while TaskStatus is "PENDING" or
// "RUNNING" and then downloads the TranscriptionURL of every entry in Results.
type TaskResultResponse struct {
TaskID string `json:"task_id,omitempty"`
TaskStatus string `json:"task_status,omitempty"`
// Timestamps are kept as the raw strings from the response.
SubmitTime string `json:"submit_time,omitempty"`
ScheduledTime string `json:"scheduled_time,omitempty"`
EndTime string `json:"end_time,omitempty"`
Results []Result `json:"results,omitempty"`
TaskMetrics TaskMetrics `json:"task_metrics,omitempty"`
}

// Result is one entry of TaskResultResponse.Results.
// TranscriptionURL is the address from which the transcription JSON
// (decoded into FileResult) is downloaded.
type Result struct {
FileURL string `json:"file_url,omitempty"`
TranscriptionURL string `json:"transcription_url,omitempty"`
SubtaskStatus string `json:"subtask_status,omitempty"`
}

// TaskMetrics holds the "task_metrics" counters of a task. Note the
// upper-case JSON keys.
// NOTE(review): presumably these count subtasks per outcome — confirm against the API docs.
type TaskMetrics struct {
Total int `json:"TOTAL,omitempty"`
Succeeded int `json:"SUCCEEDED,omitempty"`
Failed int `json:"FAILED,omitempty"`
}

// =========== Final result ===========.
// FileResult is the transcription document downloaded from a
// Result.TranscriptionURL: the source file URL, its audio properties, and the
// transcripts.
type FileResult struct {
FileURL string `json:"file_url"`
Properties Properties `json:"properties"`
Transcripts []Transcript `json:"transcripts"`
}

// Properties is the "properties" object of a FileResult.
type Properties struct {
// Channels is left untyped; the element type is not fixed by this package.
Channels []interface{} `json:"channels"`
OriginalSamplingRate int `json:"original_sampling_rate"`
OriginalDurationInMilliseconds int `json:"original_duration_in_milliseconds"`
}

// Transcript is one entry of FileResult.Transcripts: the text for one
// channel, plus its breakdown into sentences.
type Transcript struct {
ChannelID int `json:"channel_id"`
ContentDurationInMilliseconds int `json:"content_duration_in_milliseconds"`
Text string `json:"text"`
Sentences []Sentence `json:"sentences"`
}

// Sentence is one recognized sentence with its ID, begin/end time, text, and
// per-word breakdown.
// NOTE(review): the time unit is not visible here (presumably milliseconds) — confirm.
type Sentence struct {
BeginTime int `json:"begin_time"`
EndTime int `json:"end_time"`
SentenceID int `json:"sentence_id"`
Text string `json:"text"`
Words []Word `json:"words"`
}

// Word is one entry of Sentence.Words: its begin/end time, text, and the
// punctuation attached to it.
// NOTE(review): the time unit is not visible here (presumably milliseconds) — confirm.
type Word struct {
BeginTime int `json:"begin_time"`
EndTime int `json:"end_time"`
Text string `json:"text"`
Punctuation string `json:"punctuation"`
}
2 changes: 2 additions & 0 deletions paraformer/paraformer.go → paraformer/paraformer_ws.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ import (
"github.com/google/uuid"
)

// real-time voice recognition

func ConnRecognitionClient(request *Request, token string) (*httpclient.WsClient, error) {
// Initialize the client with the necessary parameters.
header := http.Header{}
Expand Down
111 changes: 111 additions & 0 deletions paraformer/paraformercli.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
package paraformer

import (
"context"
"log"
"time"

httpclient "github.com/devinyf/dashscopego/httpclient"
)

// recorded audio file recognition (asynchronous task API)

// AsyncVoiceFileRecognitionTask posts request to ParaformerAsyncURL with the
// X-DashScope-Async header enabled and returns the decoded response.
//
// When request.HasUploadOss is set, the X-DashScope-OssResourceResolve header
// is enabled as well. The returned response pointer is non-nil even when the
// returned error is non-nil.
func AsyncVoiceFileRecognitionTask(ctx context.Context, request *AsyncTaskRequest, cli httpclient.IHttpClient, token string) (*AsyncTaskResponse, error) {
	authOption := httpclient.WithTokenHeaderOption(token)

	headers := httpclient.HeaderMap{
		"X-DashScope-Async": "enable",
		"Content-Type":      "application/json",
	}
	if request.HasUploadOss {
		headers["X-DashScope-OssResourceResolve"] = "enable"
	}

	result := &AsyncTaskResponse{}
	err := cli.Post(ctx, ParaformerAsyncURL, request, result, authOption, httpclient.WithHeader(headers))

	// The response is handed back in both the success and the failure case.
	return result, err
}

// VoiceFileResponse is the result of VoiceFileToTextGeneration.
type VoiceFileResponse struct {
// AsyncTaskResp is the response of the task-creation call.
AsyncTaskResp *AsyncTaskResponse
// FileResults holds the downloaded transcriptions; it is only filled when
// the request had Download set.
FileResults []*FileResult
}

// VoiceFileToTextGeneration submits req as an asynchronous recognition task
// and, when req.Download is true, polls the task once per second until it
// leaves the PENDING/RUNNING states and then downloads the transcription of
// every entry in the task's results.
//
// The task-creation response is stored in AsyncTaskResp. FileResults is only
// filled when req.Download is true. While waiting between polls the function
// honors ctx: if ctx is cancelled or its deadline passes, ctx.Err() is
// returned. Any error from the submission, a status query, or a download
// aborts the call with a nil response.
func VoiceFileToTextGeneration(ctx context.Context, req *AsyncTaskRequest, cli httpclient.IHttpClient, token string) (*VoiceFileResponse, error) {
	taskResp, err := AsyncVoiceFileRecognitionTask(ctx, req, cli, token)
	if err != nil {
		return nil, err
	}

	resp := &VoiceFileResponse{AsyncTaskResp: taskResp}
	if !req.Download {
		return resp, nil
	}

	tokenHeader := httpclient.WithTokenHeaderOption(token)
	contentHeader := httpclient.WithHeader(httpclient.HeaderMap{
		"Accept": "application/json",
	})
	taskReq := TaskResultRequest{TaskID: taskResp.Output.TaskID}

	const pollInterval = 1 * time.Second

	var taskStatus *AsyncTaskResponse
	for {
		taskStatus, err = CheckTaskStatus(ctx, &taskReq, cli, tokenHeader, contentHeader)
		if err != nil {
			return nil, err
		}

		status := taskStatus.Output.TaskStatus
		log.Println("TaskStatus: ", status)

		// Any state other than PENDING/RUNNING ends the wait immediately,
		// without an extra sleep.
		if status != "PENDING" && status != "RUNNING" {
			break
		}

		// Wait before the next poll, but give up as soon as ctx is done.
		timer := time.NewTimer(pollInterval)
		select {
		case <-ctx.Done():
			timer.Stop()

			return nil, ctx.Err()
		case <-timer.C:
		}
	}

	// Download the transcription JSON of every result and decode it.
	results := taskStatus.Output.Results
	if n := len(results); n > 0 {
		resp.FileResults = make([]*FileResult, 0, n)
	}
	for _, resultInfo := range results {
		fileResult, err := downloadJsonfile(ctx, resultInfo.TranscriptionURL, cli)
		if err != nil {
			return nil, err
		}

		resp.FileResults = append(resp.FileResults, fileResult)
	}

	return resp, nil
}

// CheckTaskStatus issues a GET to the task URL built from req.TaskID and
// returns the decoded response. The given options are passed through to the
// HTTP client. On failure the response is nil.
//
//nolint:lll
func CheckTaskStatus(ctx context.Context, req *TaskResultRequest, httpcli httpclient.IHttpClient, options ...httpclient.HTTPOption) (*AsyncTaskResponse, error) {
	status := new(AsyncTaskResponse)
	if err := httpcli.Get(ctx, TaskURL(req.TaskID), nil, status, options...); err != nil {
		return nil, err
	}

	return status, nil
}

// downloadJsonfile issues a GET to url with an "Accept: application/json"
// header and decodes the response into a FileResult. On failure the result
// is nil.
func downloadJsonfile(ctx context.Context, url string, httpcli httpclient.IHttpClient) (*FileResult, error) {
	acceptJSON := httpclient.WithHeader(httpclient.HeaderMap{
		"Accept": "application/json",
	})

	fileResult := new(FileResult)
	if err := httpcli.Get(ctx, url, nil, fileResult, acceptJSON); err != nil {
		return nil, err
	}

	return fileResult, nil
}
15 changes: 15 additions & 0 deletions paraformer/params.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package paraformer

import "fmt"

// ModelParaformer is an alias of string used to type the Paraformer model
// name constants declared in this file.
type ModelParaformer = string

const (
Expand All @@ -11,3 +13,16 @@ const (
ParaformerRealTimeV1 ModelParaformer = "paraformer-realtime-v1"
ParaformerRealTime8KV1 ModelParaformer = "paraformer-realtime-8k-v1"
)

// Service endpoints used by the paraformer package.
const (
	// ParaformerWSURL is the WebSocket endpoint for real-time voice recognition.
	ParaformerWSURL = "wss://dashscope.aliyuncs.com/api-ws/v1/inference"
	// ParaformerAsyncURL is the endpoint that turns audio files into text.
	ParaformerAsyncURL = "https://dashscope.aliyuncs.com/api/v1/services/audio/asr/transcription"
	// ParaformerTaskURL is the format string for the async task-result query;
	// its single %s verb is replaced by the task ID.
	ParaformerTaskURL = "https://dashscope.aliyuncs.com/api/v1/tasks/%s"
)

// TaskURL returns the task-result query URL for taskID by substituting it
// into ParaformerTaskURL. The ID is inserted as-is, without escaping.
func TaskURL(taskID string) string {
	endpoint := fmt.Sprintf(ParaformerTaskURL, taskID)

	return endpoint
}
Loading

0 comments on commit 26caaef

Please sign in to comment.