diff --git a/app/api/common.ts b/app/api/common.ts
index 24453dd96356..09cafd41bc07 100644
--- a/app/api/common.ts
+++ b/app/api/common.ts
@@ -96,11 +96,12 @@ export async function requestOpenai(req: NextRequest) {
     }
   }
 
+  const contentType = req.headers.get("Content-Type") ?? "application/json";
   const fetchUrl = cloudflareAIGatewayUrl(`${baseUrl}/${path}`);
   console.log("fetchUrl", fetchUrl);
   const fetchOptions: RequestInit = {
     headers: {
-      "Content-Type": "application/json",
+      "Content-Type": contentType,
       "Cache-Control": "no-store",
       [authHeaderName]: authValue,
       ...(serverConfig.openaiOrgId && {
@@ -117,7 +118,7 @@ export async function requestOpenai(req: NextRequest) {
   };
 
   // #1815 try to refuse gpt4 request
-  if (serverConfig.customModels && req.body) {
+  if (serverConfig.customModels && req.body && contentType.includes("json")) {
     try {
       const clonedBody = await req.text();
       fetchOptions.body = clonedBody;
diff --git a/app/client/api.ts b/app/client/api.ts
index 7e1d0135ed62..06c2aa2b5b1f 100644
--- a/app/client/api.ts
+++ b/app/client/api.ts
@@ -261,6 +261,18 @@ export function getHeaders(ignoreHeaders?: boolean) {
       ? accessStore.iflytekApiKey + ":" + accessStore.iflytekApiSecret
       : ""
     : accessStore.openaiApiKey;
+  if (ignoreHeaders) {
+    return {
+      isGoogle: false,
+      isAzure: false,
+      isAnthropic: false,
+      isBaidu: false,
+      isByteDance: false,
+      isAlibaba: false,
+      apiKey: accessStore.openaiApiKey,
+      isEnabledAccessControl,
+    };
+  }
   return {
     isGoogle,
     isAzure,
diff --git a/app/client/platforms/openai.ts b/app/client/platforms/openai.ts
index 02115140b72b..d2929ee3fe78 100644
--- a/app/client/platforms/openai.ts
+++ b/app/client/platforms/openai.ts
@@ -205,7 +205,7 @@ export class ChatGPTApi implements LLMApi {
         signal: controller.signal,
         headers: headers,
       };
-
+      console.log("[payload]: ", payload);
       // make a fetch request
       const requestTimeoutId = setTimeout(
         () => controller.abort(),
diff --git a/app/utils/speech.ts b/app/utils/speech.ts
index dc8102879fb3..ded3437e1c79 100644
--- a/app/utils/speech.ts
+++ b/app/utils/speech.ts
@@ -32,6 +32,11 @@ export class OpenAITranscriptionApi extends SpeechApi {
   }
 
   async start(): Promise<void> {
+    // If we are already listening, stop the current session first
+    if (this.listeningStatus) {
+      await this.stop();
+    }
+
     // @ts-ignore
     navigator.getUserMedia =
       // @ts-ignore
@@ -42,28 +47,30 @@ export class OpenAITranscriptionApi extends SpeechApi {
       navigator.mozGetUserMedia ||
       // @ts-ignore
       navigator.msGetUserMedia;
-    if (navigator.mediaDevices) {
-      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
-      this.mediaRecorder = new MediaRecorder(stream);
-      this.mediaRecorder.ondataavailable = (e) => {
-        if (e.data && e.data.size > 0) {
-          this.audioChunks.push(e.data);
-        }
-      };
-      this.stream = stream;
+    if (navigator.mediaDevices) {
+      try {
+        const stream = await navigator.mediaDevices.getUserMedia({
+          audio: true,
+        });
+        this.stream = stream;
+        this.mediaRecorder = new MediaRecorder(stream);
+        this.mediaRecorder.ondataavailable = (e) => {
+          if (e.data && e.data.size > 0) {
+            this.audioChunks.push(e.data);
+          }
+        };
+      } catch (error) {
+        console.error("Error accessing media devices:", error);
+        return;
+      }
     } else {
-      console.warn("Media Decives will work only with SSL");
+      console.warn("Media Devices will work only with SSL");
       return;
     }
 
     this.audioChunks = [];
-
-    // this.recorder.addEventListener("dataavailable", (event) => {
-    //   this.audioChunks.push(event.data);
-    // });
-
-    this.mediaRecorder.start(1000);
+    this.mediaRecorder!.start(1000);
     this.listeningStatus = true;
   }
 
@@ -79,6 +86,13 @@ export class OpenAITranscriptionApi extends SpeechApi {
         const transcription = await llm.transcription({ file: audioBlob });
         this.onTranscription(transcription);
         this.listeningStatus = false;
+
+        // Stop all audio tracks
+        if (this.stream) {
+          this.stream.getTracks().forEach((track) => track.stop());
+          this.stream = null;
+        }
+
         resolve();
       });
 
@@ -90,37 +104,117 @@ export class OpenAITranscriptionApi extends SpeechApi {
 export class WebTranscriptionApi extends SpeechApi {
   private listeningStatus = false;
   private recognitionInstance: any | null = null;
+  private shouldContinueListening = false;
 
   isListening = () => this.listeningStatus;
 
   constructor(transcriptionCallback?: TranscriptionCallback) {
     super();
-    if (isFirefox()) return;
+    this.initRecognition();
+    if (transcriptionCallback) {
+      this.onTranscriptionReceived(transcriptionCallback);
+    }
+  }
+
+  private initRecognition(): void {
     const SpeechRecognition =
       (window as any).SpeechRecognition ||
-      (window as any).webkitSpeechRecognition;
+      (window as any).webkitSpeechRecognition ||
+      (window as any).msSpeechRecognition;
+
+    if (!SpeechRecognition) {
+      console.error("SpeechRecognition is not supported in this browser");
+      return;
+    }
+
     this.recognitionInstance = new SpeechRecognition();
     this.recognitionInstance.continuous = true;
     this.recognitionInstance.interimResults = true;
     this.recognitionInstance.lang = getSTTLang();
-    if (transcriptionCallback) {
-      this.onTranscriptionReceived(transcriptionCallback);
-    }
+
     this.recognitionInstance.onresult = (event: any) => {
       const result = event.results[event.results.length - 1];
       if (result.isFinal) {
        this.onTranscription(result[0].transcript);
       }
     };
+
+    this.recognitionInstance.onerror = (event: any) => {
+      console.error("Speech recognition error:", event.error);
+      if (event.error !== "no-speech") {
+        this.listeningStatus = false;
+        this.shouldContinueListening = false;
+      }
+    };
+
+    this.recognitionInstance.onend = () => {
+      console.log("Speech recognition ended");
+      this.listeningStatus = false;
+      if (this.shouldContinueListening) {
+        console.log("Restarting speech recognition");
+        this.start();
+      }
+    };
   }
 
   async start(): Promise<void> {
-    this.listeningStatus = true;
-    await this.recognitionInstance.start();
+    if (this.listeningStatus) {
+      console.warn("Speech recognition is already active.");
+      return;
+    }
+
+    if (!this.recognitionInstance) {
+      this.initRecognition();
+    }
+
+    if (!this.recognitionInstance) {
+      throw new Error("Failed to initialize speech recognition");
+    }
+
+    this.shouldContinueListening = true;
+
+    return new Promise((resolve, reject) => {
+      const startRecognition = () => {
+        try {
+          this.recognitionInstance.start();
+          this.listeningStatus = true;
+          console.log("Speech recognition started");
+          resolve();
+        } catch (error) {
+          console.error("Error starting speech recognition:", error);
+          this.listeningStatus = false;
+          this.shouldContinueListening = false;
+          reject(error);
+        }
+      };
+
+      startRecognition();
+    });
   }
 
   async stop(): Promise<void> {
-    this.listeningStatus = false;
-    await this.recognitionInstance.stop();
+    this.shouldContinueListening = false;
+
+    if (!this.listeningStatus || !this.recognitionInstance) {
+      return;
+    }
+
+    return new Promise((resolve) => {
+      const onStop = () => {
+        this.listeningStatus = false;
+        this.recognitionInstance.removeEventListener("end", onStop);
+        console.log("Speech recognition stopped");
+        resolve();
+      };
+
+      this.recognitionInstance.addEventListener("end", onStop);
+
+      try {
+        this.recognitionInstance.stop();
+      } catch (error) {
+        console.error("Error stopping speech recognition:", error);
+        onStop();
+      }
+    });
   }
 }