// parseFileKey extracts the "file_key" field from a JSON-encoded message
// content payload.
//
// It returns the empty string when content is not valid JSON (the decode
// error is printed to stdout), when the key is absent or null, or when its
// value is not a JSON string. The type assertion is checked so that a
// malformed payload (e.g. a numeric file_key) cannot panic the handler.
func parseFileKey(content string) string {
	var contentMap map[string]interface{}
	if err := json.Unmarshal([]byte(content), &contentMap); err != nil {
		fmt.Println(err)
		return ""
	}

	// Comma-ok covers the missing key, an explicit null and non-string values
	// in one check; indexing a nil map (content == "null") is also safe.
	fileKey, ok := contentMap["file_key"].(string)
	if !ok {
		return ""
	}
	return fileKey
}
//fmt.Println("output: ", output) + + text, err := a.handler.gpt.AudioToText(output) + if err != nil { + fmt.Println(err) + sendMsg(*a.ctx, "🤖️:语音转换失败,请稍后再试~", a.info.msgId) + return false + } + //删除文件 + //fmt.Println("text: ", text) + a.info.qParsed = text + return true + } + + return true + +} diff --git a/code/handlers/handler.go b/code/handlers/handler.go index 95b13d23..5833d231 100644 --- a/code/handlers/handler.go +++ b/code/handlers/handler.go @@ -107,10 +107,11 @@ func (m MessageHandler) msgReceivedHandler(ctx context.Context, event *larkim.P2 return nil } msgType := judgeMsgType(event) - if msgType != "text" { + if msgType != "text" && msgType != "audio" { fmt.Println("unknown msg type") return nil } + //fmt.Println(larkcore.Prettify(event.Event.Message)) content := event.Event.Message.Content msgId := event.Event.Message.MessageId @@ -128,6 +129,7 @@ func (m MessageHandler) msgReceivedHandler(ctx context.Context, event *larkim.P2 msgId: msgId, chatId: chatId, qParsed: strings.Trim(parseContent(*content), " "), + fileKey: parseFileKey(*content), sessionId: sessionId, mention: mention, } @@ -139,6 +141,7 @@ func (m MessageHandler) msgReceivedHandler(ctx context.Context, event *larkim.P2 actions := []Action{ &ProcessedUniqueAction{}, //避免重复处理 &ProcessMentionAction{}, //判断机器人是否应该被调用 + &AudioAction{}, //语音处理 &EmptyAction{}, //空消息处理 &ClearAction{}, //清除消息处理 &HelpAction{}, //帮助处理 diff --git a/code/handlers/init.go b/code/handlers/init.go index 8b729949..333e43f1 100644 --- a/code/handlers/init.go +++ b/code/handlers/init.go @@ -2,7 +2,6 @@ package handlers import ( "context" - "fmt" "start-feishubot/initialization" "start-feishubot/services" @@ -62,7 +61,6 @@ func judgeCardType(cardAction *larkcard.CardAction) HandlerType { func judgeChatType(event *larkim.P2MessageReceiveV1) HandlerType { chatType := event.Event.Message.ChatType - fmt.Printf("chatType: %v", *chatType) if *chatType == "group" { return GroupHandler } @@ -77,5 +75,12 @@ func judgeMsgType(event 
*larkim.P2MessageReceiveV1) string { if *msgType == "text" { return "text" } + if *msgType == "image" { + return "image" + } + if *msgType == "audio" { + return "audio" + } + return "" } diff --git a/code/handlers/msg.go b/code/handlers/msg.go index 40f4f504..747296cf 100644 --- a/code/handlers/msg.go +++ b/code/handlers/msg.go @@ -428,7 +428,7 @@ func replayImageByBase64(ctx context.Context, base64Str string, } //example := "img_v2_041b28e3-5680-48c2-9af2-497ace79333g" //imageKey := &example - fmt.Println("imageKey", *imageKey) + //fmt.Println("imageKey", *imageKey) err = sendImageCard(ctx, *imageKey, msgId, sessionId, question) if err != nil { return err diff --git a/code/services/gpt3.go b/code/services/gpt3.go index cccca2ce..ee0c2908 100644 --- a/code/services/gpt3.go +++ b/code/services/gpt3.go @@ -5,9 +5,12 @@ import ( "encoding/json" "errors" "fmt" + "io" "io/ioutil" + "mime/multipart" "net/http" "net/url" + "os" "start-feishubot/initialization" "start-feishubot/services/loadbalancer" "strings" @@ -71,37 +74,91 @@ type ImageGenerationResponseBody struct { } `json:"data"` } -func (gpt ChatGPT) doRequest(url, method string, - requestBody interface{}, responseBody interface{}, - client *http.Client) error { - api := gpt.Lb.GetAPI() - if api == nil { - return errors.New("no available API") +type AudioToTextRequestBody struct { + File string `json:"file"` + Model string `json:"model"` + ResponseFormat string `json:"response_format"` +} + +type AudioToTextResponseBody struct { + Text string `json:"text"` +} + +type requestBodyType int + +const ( + jsonBody requestBodyType = iota + formDataBody +) + +func (gpt ChatGPT) doAPIRequestWithRetry(url, method string, bodyType requestBodyType, + requestBody interface{}, responseBody interface{}, client *http.Client, maxRetries int) error { + var api *loadbalancer.API + var requestBodyData []byte + var err error + var writer *multipart.Writer + + switch bodyType { + case jsonBody: + api = gpt.Lb.GetAPI() + requestBodyData, 
err = json.Marshal(requestBody) + if err != nil { + return err + } + case formDataBody: + api = gpt.Lb.GetAPI() + formBody := &bytes.Buffer{} + writer = multipart.NewWriter(formBody) + err = audioMultipartForm(requestBody.(AudioToTextRequestBody), writer) + if err != nil { + return err + } + err = writer.Close() + if err != nil { + return err + } + requestBodyData = formBody.Bytes() + default: + return errors.New("unknown request body type") } - requestData, err := json.Marshal(requestBody) - if err != nil { - return err + if api == nil { + return errors.New("no available API") } - req, err := http.NewRequest(method, url, bytes.NewBuffer(requestData)) + req, err := http.NewRequest(method, url, bytes.NewReader(requestBodyData)) if err != nil { return err } req.Header.Set("Content-Type", "application/json") + if bodyType == formDataBody { + req.Header.Set("Content-Type", writer.FormDataContentType()) + } req.Header.Set("Authorization", "Bearer "+api.Key) - response, err := client.Do(req) - if err != nil { - gpt.Lb.SetAvailability(api.Key, false) - return err + var response *http.Response + var retry int + for retry = 0; retry <= maxRetries; retry++ { + response, err = client.Do(req) + //fmt.Println("req", req) + //fmt.Println("response", response, "err", err) + if err != nil || response.StatusCode < 200 || response.StatusCode >= 300 { + gpt.Lb.SetAvailability(api.Key, false) + if retry == maxRetries { + break + } + time.Sleep(time.Duration(retry+1) * time.Second) + } else { + break + } + } + if response != nil { + defer response.Body.Close() } - defer response.Body.Close() - if response.StatusCode/2 != 100 { - gpt.Lb.SetAvailability(api.Key, false) - return fmt.Errorf("%s api %s", strings.ToUpper(method), response.Status) + if response == nil || response.StatusCode < 200 || response.StatusCode >= 300 { + return fmt.Errorf("%s api failed after %d retries", strings.ToUpper(method), retry) } body, err := ioutil.ReadAll(response.Body) @@ -118,19 +175,18 @@ func (gpt 
ChatGPT) doRequest(url, method string, return nil } -func (gpt ChatGPT) sendRequest(link, method string, +func (gpt ChatGPT) sendRequestWithBodyType(link, method string, bodyType requestBodyType, requestBody interface{}, responseBody interface{}) error { var err error client := &http.Client{Timeout: 110 * time.Second} if gpt.HttpProxy == "" { - err = gpt.doRequest(link, method, requestBody, responseBody, client) + err = gpt.doAPIRequestWithRetry(link, method, bodyType, + requestBody, responseBody, client, 3) } else { - //fmt.Println("using proxy: " + gpt.HttpProxy) proxyUrl, err := url.Parse(gpt.HttpProxy) if err != nil { return err } - transport := &http.Transport{ Proxy: http.ProxyURL(proxyUrl), } @@ -138,8 +194,8 @@ func (gpt ChatGPT) sendRequest(link, method string, Transport: transport, Timeout: 110 * time.Second, } - - err = gpt.doRequest(link, method, requestBody, responseBody, proxyClient) + err = gpt.doAPIRequestWithRetry(link, method, bodyType, + requestBody, responseBody, proxyClient, 3) } return err @@ -156,7 +212,8 @@ func (gpt ChatGPT) Completions(msg []Messages) (resp Messages, err error) { PresencePenalty: 0, } gptResponseBody := &ChatGPTResponseBody{} - err = gpt.sendRequest(gpt.ApiUrl+"/v1/chat/completions", "POST", + err = gpt.sendRequestWithBodyType(gpt.ApiUrl+"/v1/chat/completions", "POST", + jsonBody, requestBody, gptResponseBody) if err == nil && len(gptResponseBody.Choices) > 0 { @@ -168,6 +225,37 @@ func (gpt ChatGPT) Completions(msg []Messages) (resp Messages, err error) { return resp, err } +// audioMultipartForm creates a form with audio file contents and the name of the model to use for +// audio processing. 
+func audioMultipartForm(request AudioToTextRequestBody, w *multipart.Writer) error { + f, err := os.Open(request.File) + if err != nil { + return fmt.Errorf("opening audio file: %w", err) + } + + fw, err := w.CreateFormFile("file", f.Name()) + if err != nil { + return fmt.Errorf("creating form file: %w", err) + } + + if _, err = io.Copy(fw, f); err != nil { + return fmt.Errorf("reading from opened audio file: %w", err) + } + + fw, err = w.CreateFormField("model") + if err != nil { + return fmt.Errorf("creating form field: %w", err) + } + + modelName := bytes.NewReader([]byte(request.Model)) + if _, err = io.Copy(fw, modelName); err != nil { + return fmt.Errorf("writing model name: %w", err) + } + w.Close() + + return nil +} + func (gpt ChatGPT) GenerateImage(prompt string, size string, n int) ([]string, error) { requestBody := ImageGenerationRequestBody{ Prompt: prompt, @@ -177,8 +265,8 @@ func (gpt ChatGPT) GenerateImage(prompt string, size string, n int) ([]string, e } imageResponseBody := &ImageGenerationResponseBody{} - err := gpt.sendRequest(gpt.ApiUrl+"/v1/images/generations", - "POST", requestBody, imageResponseBody) + err := gpt.sendRequestWithBodyType(gpt.ApiUrl+"/v1/images/generations", + "POST", jsonBody, requestBody, imageResponseBody) if err != nil { return nil, err @@ -199,6 +287,23 @@ func (gpt ChatGPT) GenerateOneImage(prompt string, size string) (string, error) return b64s[0], nil } +func (gpt ChatGPT) AudioToText(audio string) (string, error) { + requestBody := AudioToTextRequestBody{ + File: audio, + Model: "whisper-1", + ResponseFormat: "text", + } + audioToTextResponseBody := &AudioToTextResponseBody{} + err := gpt.sendRequestWithBodyType(gpt.ApiUrl+"/v1/audio/transcriptions", + "POST", formDataBody, requestBody, audioToTextResponseBody) + //fmt.Println(audioToTextResponseBody) + if err != nil { + //fmt.Println(err) + return "", err + } + + return audioToTextResponseBody.Text, nil +} func NewChatGPT(config initialization.Config) *ChatGPT { 
// Layout constants of the canonical 44-byte PCM WAVE header produced by
// Encoder.WriteHeader.
const (
	wavHeaderSize     = 44 // total header length in bytes
	wavRIFFSizeOffset = 4  // offset of the RIFF chunk size field
	wavDataSizeOffset = 40 // offset of the "data" chunk size field
)

// Encoder frames raw PCM bytes written to Output as a mono, uncompressed
// RIFF/WAVE stream. The header is emitted lazily on the first Write (or by
// Close if nothing was written) with zeroed size fields, which Close then
// patches in place; that is why Output must support seeking.
type Encoder struct {
	Output          io.WriteSeeker
	SampleRate      int
	BitDepth        int
	totalBytes      uint32 // number of sample bytes written after the header
	isHeaderWritten bool
}

// WriteHeader writes the 44-byte WAVE header (PCM format tag, one channel,
// SampleRate and BitDepth from the encoder) with both size fields set to
// zero. It is idempotent: once the header has been written successfully,
// further calls do nothing, so the header can never be duplicated in the
// stream.
func (e *Encoder) WriteHeader() error {
	if e.isHeaderWritten {
		return nil
	}

	le := binary.LittleEndian
	var hdr [wavHeaderSize]byte
	copy(hdr[0:4], "RIFF")
	// hdr[4:8]: RIFF chunk size, patched by Close.
	copy(hdr[8:12], "WAVE")
	copy(hdr[12:16], "fmt ")
	le.PutUint32(hdr[16:20], 16)                                // size of the fmt chunk body
	le.PutUint16(hdr[20:22], 1)                                 // audio format: PCM
	le.PutUint16(hdr[22:24], 1)                                 // number of channels: 1 (mono)
	le.PutUint32(hdr[24:28], uint32(e.SampleRate))              // sample rate
	le.PutUint32(hdr[28:32], uint32(e.SampleRate*e.BitDepth/8)) // byte rate
	le.PutUint16(hdr[32:34], uint16(e.BitDepth/8))              // block align (bytes per sample frame)
	le.PutUint16(hdr[34:36], uint16(e.BitDepth))                // bits per sample
	copy(hdr[36:40], "data")
	// hdr[40:44]: data chunk size, patched by Close.

	if err := writeLe(e.Output, hdr[:]); err != nil {
		return err
	}
	e.isHeaderWritten = true
	return nil
}

// writeLe writes data to w in little-endian byte order.
func writeLe[T []byte | uint32 | uint16 | uint8](w io.Writer, data T) error {
	return binary.Write(w, binary.LittleEndian, data)
}

// Write appends raw sample bytes to the stream, emitting the header first if
// it has not been written yet. A header failure is returned instead of being
// ignored, and bytes accepted by Output are counted even when Output reports
// an error so the sizes recorded by Close match what was actually written.
func (e *Encoder) Write(data []byte) error {
	if err := e.WriteHeader(); err != nil {
		return err
	}
	n, err := e.Output.Write(data)
	e.totalBytes += uint32(n)
	return err
}

// Close finalizes the stream by patching the RIFF and data chunk sizes into
// the header. If no data was ever written the header is emitted first, so the
// result is a valid empty WAVE file rather than two stray size fields. Close
// does not close Output and leaves its offset just past the data-size field.
func (e *Encoder) Close() error {
	if err := e.WriteHeader(); err != nil {
		return err
	}
	// RIFF size covers everything after the first 8 bytes: 36 header bytes + data.
	if err := e.patchUint32(wavRIFFSizeOffset, 36+e.totalBytes); err != nil {
		return err
	}
	return e.patchUint32(wavDataSizeOffset, e.totalBytes)
}

// patchUint32 overwrites the little-endian uint32 at the given absolute
// offset of Output.
func (e *Encoder) patchUint32(offset int64, v uint32) error {
	if _, err := e.Output.Seek(offset, io.SeekStart); err != nil {
		return err
	}
	return writeLe(e.Output, v)
}

// NewEncoder returns an Encoder that writes a mono PCM WAVE stream with the
// given sample rate (Hz) and bit depth (bits per sample) to w.
func NewEncoder(w io.WriteSeeker, sampleRate int, bitDepth int) *Encoder {
	return &Encoder{
		SampleRate:      sampleRate,
		Output:          w,
		BitDepth:        bitDepth,
		isHeaderWritten: false,
	}
}