Skip to content

Commit

Permalink
移植windows所有功能
Browse files Browse the repository at this point in the history
  • Loading branch information
liuyue committed Aug 20, 2024
1 parent cfbba1c commit 715b431
Show file tree
Hide file tree
Showing 74 changed files with 7,721 additions and 6,064 deletions.
2 changes: 0 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,6 @@ tensorboard
compile_commands.json

# train/inference files
*.wav
*.pt
pretrained_models/*
*_pb2_grpc.py
*_pb2.py
360 changes: 360 additions & 0 deletions api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,360 @@

import time
import io, os, sys
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append('{}/third_party/AcademiCodec'.format(ROOT_DIR))
sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))

import numpy as np
from flask import Flask, request, Response
import torch
import torchaudio

from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav
import torchaudio
import ffmpeg

from flask_cors import CORS
from flask import make_response

import json

# ---------------------------------------------------------------------------
# Application bootstrap: load the TTS model, discover custom voices, and
# create the Flask app.
# ---------------------------------------------------------------------------

cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M')

# Custom voices are stored as `<name>.pt` tensors under voices/ (the streaming
# endpoints load f'{ROOT_DIR}/voices/{speaker}.pt').  The original code
# stripped ".py" instead of ".pt", so listed names kept their extension and
# could never resolve back to a file; splitext yields just the stem.
spk_new = []
for voice_file in os.listdir(f"{ROOT_DIR}/voices/"):
    spk_name = os.path.splitext(voice_file)[0]
    print(spk_name)
    spk_new.append(spk_name)

print("默认音色", cosyvoice.list_avaliable_spks())
print("自定义音色", spk_new)

app = Flask(__name__)

# One CORS registration is enough.  `cors_allowed_origins` is not a
# flask-cors parameter (it belongs to flask-socketio), so the original first
# call contributed nothing; keep supports_credentials, which was effective.
CORS(app, supports_credentials=True)


def speed_change(input_audio: np.ndarray, speed: float, sr: int):
    """Time-stretch 16-bit mono PCM samples via ffmpeg's ``atempo`` filter.

    Args:
        input_audio: int16 PCM samples (mono).
        speed: tempo factor handed to ffmpeg's ``atempo``.
        sr: sample rate of the input, in Hz.

    Returns:
        np.ndarray: tempo-adjusted int16 samples decoded from ffmpeg's output.

    Raises:
        ValueError: if ``input_audio`` is not of dtype np.int16.
    """
    # 检查输入数据类型 — only int16 PCM is accepted.
    if input_audio.dtype != np.int16:
        raise ValueError("输入音频数据类型必须为 np.int16")

    # Serialize the samples to raw little-endian PCM bytes for the pipe.
    pcm_bytes = input_audio.astype(np.int16).tobytes()

    # Build the ffmpeg pipeline: raw PCM in -> atempo -> raw PCM out.
    stream = ffmpeg.input('pipe:', format='s16le', acodec='pcm_s16le',
                          ar=str(sr), ac=1)
    stream = stream.filter('atempo', speed)
    stdout, _ = (
        stream.output('pipe:', format='s16le', acodec='pcm_s16le')
        .run(input=pcm_bytes, capture_stdout=True, capture_stderr=True)
    )

    # Decode ffmpeg's stdout back into an int16 sample array.
    return np.frombuffer(stdout, np.int16)

@app.route("/", methods=['POST'])
def sft_post():
    """Synthesize speech from a JSON body ``{text, speaker, new, streaming}``.

    Non-streaming requests return one complete WAV response; streaming
    requests yield OGG chunks, one per normalized text segment.  ``speed``
    is read from the query string (kept for backward compatibility).

    Returns:
        A Flask Response with audio bytes, or a (dict, 400) error tuple.
    """
    question_data = request.get_json()

    text = question_data.get('text')
    speaker = question_data.get('speaker')
    # Coerce flags to int so JSON strings like "0"/"1" behave correctly
    # ("0" is truthy as a string, which silently flipped both switches).
    new = int(question_data.get('new', 0) or 0)
    streaming = int(question_data.get('streaming', 0) or 0)

    speed = float(request.args.get('speed', 1.0))

    if not text:
        return {"error": "文本不能为空"}, 400

    if not speaker:
        return {"error": "角色名不能为空"}, 400

    # 非流式 — render the whole utterance, return a single WAV body.
    if streaming == 0:
        start = time.process_time()
        if not new:
            output = cosyvoice.inference_sft(text, speaker, "无")
        else:
            output = cosyvoice.inference_sft(text, speaker, speaker)
        end = time.process_time()
        print("infer time:", end - start)
        buffer = io.BytesIO()

        # Default to the raw synthesis so a failed speed change falls back
        # cleanly (the original left audio_data undefined -> NameError).
        audio_data = output['tts_speech']
        if speed != 1.0:
            try:
                numpy_array = audio_data.numpy()
                audio = (numpy_array * 32768).astype(np.int16)
                changed = speed_change(audio, speed=speed, sr=22050)
                audio_data = torch.from_numpy(changed).reshape(1, -1)
            except Exception as e:
                print(f"Failed to change speed of audio: \n{e}")

        torchaudio.save(buffer, audio_data, 22050, format="wav")
        buffer.seek(0)
        return Response(buffer.read(), mimetype="audio/wav")

    # 流式模式 — yield one OGG chunk per normalized segment.
    else:
        spk_id = speaker
        if new:
            # Custom voices run the frontend with a stock speaker; the
            # embeddings loaded from the .pt file below override identity.
            spk_id = "中文女"

        joblist = cosyvoice.frontend.text_normalize_stream(text, split=True)

        def generate():
            for segment in joblist:
                print(segment)
                print("流式0")
                tts_speeches = []
                model_input = cosyvoice.frontend.frontend_sft(segment, spk_id)
                if new:
                    # 加载数据 — map to CPU for parity with the GET endpoint
                    # (the original POST path omitted map_location and would
                    # fail on CPU-only hosts for GPU-saved voices).
                    newspk = torch.load(
                        f'{ROOT_DIR}/voices/{speaker}.pt',
                        map_location=torch.device('cpu'))

                    for key in ("flow_embedding", "llm_embedding",
                                "llm_prompt_speech_token",
                                "llm_prompt_speech_token_len",
                                "flow_prompt_speech_token",
                                "flow_prompt_speech_token_len",
                                "prompt_speech_feat_len",
                                "prompt_speech_feat",
                                "prompt_text",
                                "prompt_text_len"):
                        model_input[key] = newspk[key]

                model_output = next(cosyvoice.model.inference_stream(**model_input))
                tts_speeches.append(model_output['tts_speech'])
                output = torch.concat(tts_speeches, dim=1)

                buffer = io.BytesIO()
                audio_data = output
                if speed != 1.0:
                    try:
                        numpy_array = output.numpy()
                        audio = (numpy_array * 32768).astype(np.int16)
                        changed = speed_change(audio, speed=speed, sr=22050)
                        audio_data = torch.from_numpy(changed).reshape(1, -1)
                    except Exception as e:
                        print(f"Failed to change speed of audio: \n{e}")

                torchaudio.save(buffer, audio_data, 22050, format="ogg")
                buffer.seek(0)
                yield buffer.read()

        response = make_response(generate())
        response.headers['Content-Type'] = 'audio/ogg'
        response.headers['Content-Disposition'] = 'attachment; filename=sound.ogg'
        return response


@app.route("/", methods=['GET'])
def sft_get():
    """Synthesize speech from query parameters ``text, speaker, new, streaming, speed``.

    Non-streaming requests return one complete WAV response; streaming
    requests yield OGG chunks, one per normalized text segment.

    Returns:
        A Flask Response with audio bytes, or a (dict, 400) error tuple.
    """
    text = request.args.get('text')
    speaker = request.args.get('speaker')
    # request.args values are ALWAYS strings, so the original
    # `streaming == 0` was False for ?streaming=0 and `not new` was False
    # for ?new=0 — both flags were effectively stuck on.  Coerce to int.
    new = int(request.args.get('new', 0) or 0)
    streaming = int(request.args.get('streaming', 0) or 0)
    speed = float(request.args.get('speed', 1.0))

    if not text:
        return {"error": "文本不能为空"}, 400

    if not speaker:
        return {"error": "角色名不能为空"}, 400

    # 非流式 — render the whole utterance, return a single WAV body.
    if streaming == 0:
        start = time.process_time()
        if not new:
            output = cosyvoice.inference_sft(text, speaker, "无")
        else:
            output = cosyvoice.inference_sft(text, speaker, speaker)
        end = time.process_time()
        print("infer time:", end - start)
        buffer = io.BytesIO()

        # Default to the raw synthesis so a failed speed change falls back
        # cleanly (the original left audio_data undefined -> NameError).
        audio_data = output['tts_speech']
        if speed != 1.0:
            try:
                numpy_array = audio_data.numpy()
                audio = (numpy_array * 32768).astype(np.int16)
                changed = speed_change(audio, speed=speed, sr=22050)
                audio_data = torch.from_numpy(changed).reshape(1, -1)
            except Exception as e:
                print(f"Failed to change speed of audio: \n{e}")

        torchaudio.save(buffer, audio_data, 22050, format="wav")
        buffer.seek(0)
        return Response(buffer.read(), mimetype="audio/wav")

    # 流式模式 — yield one OGG chunk per normalized segment.
    else:
        spk_id = speaker
        if new:
            # Custom voices run the frontend with a stock speaker; the
            # embeddings loaded from the .pt file below override identity.
            spk_id = "中文女"

        joblist = cosyvoice.frontend.text_normalize_stream(text, split=True)

        def generate():
            for segment in joblist:
                print(segment)
                print("流式0")
                tts_speeches = []
                model_input = cosyvoice.frontend.frontend_sft(segment, spk_id)
                if new:
                    # 加载数据 — map to CPU so GPU-saved voices load on
                    # CPU-only hosts.
                    newspk = torch.load(
                        f'{ROOT_DIR}/voices/{speaker}.pt',
                        map_location=torch.device('cpu'))

                    for key in ("flow_embedding", "llm_embedding",
                                "llm_prompt_speech_token",
                                "llm_prompt_speech_token_len",
                                "flow_prompt_speech_token",
                                "flow_prompt_speech_token_len",
                                "prompt_speech_feat_len",
                                "prompt_speech_feat",
                                "prompt_text",
                                "prompt_text_len"):
                        model_input[key] = newspk[key]

                model_output = next(cosyvoice.model.inference_stream(**model_input))
                tts_speeches.append(model_output['tts_speech'])
                output = torch.concat(tts_speeches, dim=1)

                buffer = io.BytesIO()
                audio_data = output
                if speed != 1.0:
                    try:
                        numpy_array = output.numpy()
                        audio = (numpy_array * 32768).astype(np.int16)
                        changed = speed_change(audio, speed=speed, sr=22050)
                        audio_data = torch.from_numpy(changed).reshape(1, -1)
                    except Exception as e:
                        print(f"Failed to change speed of audio: \n{e}")

                torchaudio.save(buffer, audio_data, 22050, format="ogg")
                buffer.seek(0)
                yield buffer.read()

        response = make_response(generate())
        response.headers['Content-Type'] = 'audio/ogg'
        response.headers['Content-Disposition'] = 'attachment; filename=sound.ogg'
        return response

# return Response(generate(), mimetype='audio/x-wav')







@app.route("/tts_to_audio/", methods=['POST'])
def tts_to_audio():
    """XTTS-compatible endpoint: synthesize ``text`` with a preconfigured speaker.

    Speaker, ``new`` flag, and ``speed`` come from the local
    ``speaker_config`` module (imported lazily so the module can be edited
    and re-read per request if the server reloads it).

    Returns:
        A WAV Response, or a (dict, 400) error tuple for empty input.
    """
    import speaker_config

    question_data = request.get_json()

    text = question_data.get('text')
    speaker = speaker_config.speaker
    new = speaker_config.new
    speed = speaker_config.speed

    if not text:
        return {"error": "文本不能为空"}, 400

    if not speaker:
        return {"error": "角色名不能为空"}, 400

    start = time.process_time()
    if not new:
        output = cosyvoice.inference_sft(text, speaker, "无")
    else:
        output = cosyvoice.inference_sft(text, speaker, speaker)
    end = time.process_time()
    print("infer time:", end - start)
    buffer = io.BytesIO()

    # Default to the raw synthesis so a failed speed change falls back
    # cleanly (the original left audio_data undefined -> NameError).
    audio_data = output['tts_speech']
    if speed != 1.0:
        try:
            numpy_array = audio_data.numpy()
            audio = (numpy_array * 32768).astype(np.int16)
            changed = speed_change(audio, speed=speed, sr=22050)
            audio_data = torch.from_numpy(changed).reshape(1, -1)
        except Exception as e:
            print(f"Failed to change speed of audio: \n{e}")

    torchaudio.save(buffer, audio_data, 22050, format="wav")
    buffer.seek(0)
    return Response(buffer.read(), mimetype="audio/wav")



@app.route("/speakers", methods=['GET'])
def speakers():
    """Return a fixed single-entry speaker list as JSON (compat endpoint)."""
    payload = json.dumps([{"name": "default", "vid": 1}])
    return app.response_class(
        response=payload,
        status=200,
        mimetype='application/json',
    )


@app.route("/speakers_list", methods=['GET'])
def speakers_list():
    """Return the fixed preset-voice name list as JSON (compat endpoint)."""
    payload = json.dumps(["female_calm", "female", "male"])
    return app.response_class(
        response=payload,
        status=200,
        mimetype='application/json',
    )


if __name__ == "__main__":
    # Serve on all interfaces, port 9880 (no debug/reloader).
    app.run(host='0.0.0.0', port=9880)
Binary file not shown.
Binary file not shown.
Binary file added audios/一支穿云箭,千军万马来相见.wav
Binary file not shown.
Binary file not shown.
Binary file added audios/光动嘴不如亲自做给你看.wav
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added audios/我当然知道了.wav
Binary file not shown.
Binary file added audios/是的,全灭的可能性相当的高.wav
Binary file not shown.
Binary file not shown.
15 changes: 2 additions & 13 deletions cosyvoice/bin/inference.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,5 @@
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!/usr/bin/env python
#coding=utf-8

from __future__ import print_function

Expand Down
Loading

0 comments on commit 715b431

Please sign in to comment.