From cd8cc038d1808129fd9cb04bf65f61a0d4f82c3d Mon Sep 17 00:00:00 2001
From: mkagenius
Date: Wed, 8 Jan 2025 23:01:35 +0530
Subject: [PATCH] add audio length selection feature - 5 min or 10 min

---
 backend/app/prompts.py             |  11 ++++
 backend/app/routers/generate.py    | 100 +++++++++++++++++++++--------
 src/app/[username]/[repo]/page.tsx |  16 ++++-
 src/app/layout.tsx                 |  23 +++----
 src/app/page.tsx                   |   5 +-
 src/app/providers.tsx              |  39 +++++++++++
 src/components/main-card.tsx       |  42 ++++++++++--
 src/hooks/useDiagram.ts            |   6 +-
 src/lib/fetch-backend.ts           |  12 ++--
 9 files changed, 201 insertions(+), 53 deletions(-)

diff --git a/backend/app/prompts.py b/backend/app/prompts.py
index 57d94cb..92aa7b8 100644
--- a/backend/app/prompts.py
+++ b/backend/app/prompts.py
@@ -19,8 +19,19 @@
 Discuss in the podcast the main components of the system (e.g., frontend, backend, database, build, external services).
 Discuss the relationships and interactions between these components.
 Sometimes the answers can also be a single word or very short so that it seems natural. Long answers all the time make it monotonous.
+Make it a 20-minute-long or longer podcast if possible. Give at least 200 voice tags for the host + the same number of voice tags for the guest. Slowly count them, re-write the SSML if it is falling short, and then return the SSML."""
+
+
+PODCAST_SSML_PROMPT_BEFORE_BREAK = """First of all, don't use break tags outside the voice tags - that is illegal. Don't waste too much time on the intro. Can you convert it into a podcast so that someone could listen to it and understand what's going on? Also discuss the project structure or go into detail on some files; a long 8-10 minute podcast is fine by me. Make it SSML similar to this: \n\nWelcome to Next Gen Innovators! (no need to open links) ..
+Also make it a conversation between the host and a guest of a podcast, in a question-and-answer style. \n\n\nI'm your host, Ava, and today we're diving into an exciting topic: how students can embark on their entrepreneurial journey right from college.\n\nJoining us is Arun Sharma, a seasoned entrepreneur with over two decades of experience and a passion for mentoring young innovators.\n\nArun, it's a pleasure to have you here.\n\n\n\n Thank you, Ava.\n \n It's great to be here. I'm excited to talk about how students can channel their creativity and energy into building impactful ventures.\n ..\n", Use the "en-US-DustinMultilingualNeural" voice as the guest (and must always use the "en-US-AvaMultilingualNeural" voice as the host, though her actual name can be something else). Add a little bit of fillers like umm or uh so that it feels natural (don't overdo it).
+Also discuss some technically intriguing part that is unique to this project.
+Discuss any important architectural patterns or design principles used in the project.
+Discuss in the podcast the main components of the system (e.g., frontend, backend, database, build, external services).
+Discuss the relationships and interactions between these components.
+Sometimes the answers can also be a single word or very short so that it seems natural. Long answers all the time make it monotonous. Make it a 20-minute-long or longer podcast if possible. Give at least 200 voice tags for the host + the same number of voice tags for the guest. Slowly count them, re-write the SSML if it is falling short, and then return the SSML. This is the first part of the podcast, so tell the listeners you will be back after the break."""
+
 PODCAST_SSML_PROMPT_AFTER_BREAK = """First of all, don't use break tags outside the voice tags.
 Don't waste time on introducing the guest too much. Can you convert it into a podcast so that someone could listen to it and understand what's going on? Also discuss the project structure or go into detail on some files; a long 8-10 minute podcast is fine by me. Make it SSML similar to this: \n\nWelcome to Next Gen Innovators! (no need to open links) ..
 Also make it a conversation between the host and a guest of a podcast, in a question-and-answer style. \n\n\nI'm your host, Ava, and today we're diving into an exciting topic: how students can embark on their entrepreneurial journey right from college.\n\nJoining us is Arun Sharma, a seasoned entrepreneur with over two decades of experience and a passion for mentoring young innovators.\n\nArun, it's a pleasure to have you here.\n\n\n\n Thank you, Ava.\n \n It's great to be here. I'm excited to talk about how students can channel their creativity and energy into building impactful ventures.\n ..\n", Use the "en-US-DustinMultilingualNeural" voice as the guest (and must always use the "en-US-AvaMultilingualNeural" voice as the host, though her actual name can be something else). Add a little bit of fillers like umm or uh so that it feels natural (don't overdo it).
 Also discuss some technically intriguing part that is unique to this project
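Note: the SSML example embedded in these prompts lost its literal markup in rendering; only the \n escapes survive. For orientation, below is a minimal skeleton of the host/guest structure the prompts describe. The two voice names are the ones the prompts mandate; the <speak> wrapper attributes are standard Azure SSML and are an assumption here, not something this patch shows.

# Illustrative only: the shape of SSML these prompts ask the model to emit.
# Voice names come from the prompts above; wrapper attributes are assumed.
EXAMPLE_SSML = """\
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
  <voice name="en-US-AvaMultilingualNeural">
    Welcome to Next Gen Innovators! I'm your host, Ava.
    <break time="300ms"/>
  </voice>
  <voice name="en-US-DustinMultilingualNeural">
    Umm, thank you, Ava. It's great to be here.
  </voice>
</speak>
"""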
diff --git a/backend/app/routers/generate.py b/backend/app/routers/generate.py
index 95a26c0..545bb04 100644
--- a/backend/app/routers/generate.py
+++ b/backend/app/routers/generate.py
@@ -6,7 +6,7 @@
 from app.services.openai_service import OpenAIService
 from app.core.limiter import limiter
 import os
-from app.prompts import PODCAST_SSML_PROMPT_AFTER_BREAK, PODCAST_SSML_PROMPT
+from app.prompts import PODCAST_SSML_PROMPT_AFTER_BREAK, PODCAST_SSML_PROMPT, PODCAST_SSML_PROMPT_BEFORE_BREAK
 from anthropic._exceptions import RateLimitError
 from pydantic import BaseModel
 from functools import lru_cache
@@ -15,6 +15,7 @@
 import base64
 from pydub import AudioSegment
 import io
+import concurrent.futures
 
 load_dotenv()
 
@@ -79,12 +80,66 @@ def process_github_content(content, speech_prompt, max_length, max_tokens=None):
     return ssml_response
 
 
+def generate_ssml_concurrently(file_tree, readme, file_content, audio_length) -> str | dict:
+    # Prepare the content
+    if audio_length == 'short':
+        combined_content = f"FILE TREE: {file_tree}\nREADME: {readme} IMPORTANT FILES: {file_content}"
+        ssml_response = process_github_content(combined_content, PODCAST_SSML_PROMPT, 250000, 100000)
+        return ssml_response
+    else:
+        combined_content_tree_readme = f"FILE TREE: {file_tree}\nREADME: {readme}"
+        combined_content_file_content = f"IMPORTANT FILES: {file_content}"
+
+        # Helper that detects the error-dictionary convention used by
+        # process_github_content
+        def check_response(response):
+            if isinstance(response, dict):  # an error dictionary was returned
+                return response
+            return None
+
+        # Use a ThreadPoolExecutor to run both LLM calls concurrently
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            future_tree_readme = executor.submit(
+                process_github_content,
+                combined_content_tree_readme,
+                PODCAST_SSML_PROMPT_BEFORE_BREAK,
+                250000,
+                100000
+            )
+            future_file_content = executor.submit(
+                process_github_content,
+                combined_content_file_content,
+                PODCAST_SSML_PROMPT_AFTER_BREAK,
+                250000,
+                100000
+            )
+
+            ssml_response_tree_readme = future_tree_readme.result()
+            ssml_response_file_content = future_file_content.result()
+
+        # Check for errors from either half
+        error_response = check_response(ssml_response_tree_readme) or check_response(ssml_response_file_content)
+        if error_response:
+            return error_response
+
+        # Remove the first occurrence of the <speak> tags from both responses
+        ssml_response_tree_readme_content = speech_service.remove_first_speak_tag(ssml_response_tree_readme)
+        ssml_response_file_content_content = speech_service.remove_first_speak_tag(ssml_response_file_content)
+        # Combine the contents
+        combined_ssml_content = f"{ssml_response_tree_readme_content}\n{ssml_response_file_content_content}"
+
+        # Wrap the combined content in a single <speak> tag
+        full_ssml_response = f'<speak>{combined_ssml_content}</speak>'
+
+        return full_ssml_response
+
+
 class ApiRequest(BaseModel):
     username: str
     repo: str
     instructions: str
     api_key: str | None = None
     audio: bool = False  # new param
+    audio_length: str = 'long'
 
 
 # @limiter.limit("1/minute;5/day") # TEMP: disable rate limit for growth??
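The new generate_ssml_concurrently helper fans the two prompt calls out to worker threads and joins them before stitching the halves together. A minimal, self-contained sketch of that fan-out/fan-in pattern, with the LLM call stubbed out so it runs without API keys (every name here is illustrative, not part of the actual service):

import concurrent.futures

def fake_llm_call(content: str, prompt: str) -> str | dict:
    # Stand-in for process_github_content: a real call can also return an
    # error dictionary instead of SSML, which the caller must detect.
    return f"<voice>summary of {len(content)} chars via {prompt!r}</voice>"

def generate_two_halves(tree_readme: str, files: str) -> str | dict:
    with concurrent.futures.ThreadPoolExecutor() as executor:
        first = executor.submit(fake_llm_call, tree_readme, "before-break prompt")
        second = executor.submit(fake_llm_call, files, "after-break prompt")
        # result() blocks until the worker finishes and re-raises any
        # exception raised inside the worker thread.
        part_a, part_b = first.result(), second.result()
    for part in (part_a, part_b):
        if isinstance(part, dict):  # surface the first error dictionary
            return part
    return f"<speak>{part_a}\n{part_b}</speak>"

print(generate_two_halves("FILE TREE: ...", "IMPORTANT FILES: ..."))

One property worth noting in the real helper: because Future.result() re-raises worker exceptions, a failure in either prompt call propagates to the endpoint rather than being silently swallowed.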
@@ -99,54 +154,43 @@
         file_tree = github_data["file_tree"]
         readme = github_data["readme"]
         file_content = github_data["file_content"]
+        audio_length = body.audio_length
+        result = generate_ssml_concurrently(file_tree, readme, file_content, audio_length)
+        # Check whether an error response was returned
+        if isinstance(result, dict):  # there was an error
+            print("Error in processing:")
+            for error in result.get("errors", []):
+                print(error)
+            return {"error": "Some error in generating audio: E001"}
+        else:
+            # Successful processing
+            full_ssml_response = result
 
-        # Use the extracted function for both content types
-        combined_content_tree_readme = f"FILE TREE: {file_tree}\nREADME: {readme}"
-        ssml_response_tree_readme = process_github_content(combined_content_tree_readme, PODCAST_SSML_PROMPT, 250000, 100000)
-        if isinstance(ssml_response_tree_readme, dict):  # Check if it returns an error dictionary
-            return ssml_response_tree_readme
-
-        combined_content_file_content = f"IMPORTANT FILES: {file_content} \s so far ssml before the break: {ssml_response_tree_readme}"
-        ssml_response_file_content = process_github_content(combined_content_file_content, PODCAST_SSML_PROMPT_AFTER_BREAK, 250000, 100000)
-        if isinstance(ssml_response_file_content, dict):  # Check if it returns an error dictionary
-            return ssml_response_file_content
-
-        # Apply the function to remove the first occurrence of the tags from responses
-        ssml_response_tree_readme_content = speech_service.remove_first_speak_tag(ssml_response_tree_readme)
-        ssml_response_file_content_content = speech_service.remove_first_speak_tag(ssml_response_file_content)
-        # Combine the contents
-        combined_ssml_content = f"{ssml_response_tree_readme_content}\n{ssml_response_file_content_content}"
-
-        # Wrap the combined content in a single <speak> tag
-        full_ssml_response = f'<speak>{combined_ssml_content}</speak>'
-
-        print(full_ssml_response)  # or return this full_ssml_response
+        print(full_ssml_response)
         ssml_response = full_ssml_response
         if not body.audio:
             return {"diagram": "flowchart TB\n subgraph Input\n CLI[CLI Interface]:::input\n API[API Interface]:::input\n end\n\n subgraph Orchestration\n TM[Task Manager]:::core\n PR[Platform Router]:::core\n end\n\n subgraph \"Planning Layer\"\n TP[Task Planning]:::core\n subgraph Planners\n OP[OpenAI Planner]:::planner\n GP[Gemini Planner]:::planner\n LP[Local Ollama Planner]:::planner\n end\n end\n\n subgraph \"Finding Layer\"\n subgraph Finders\n OF[OpenAI Finder]:::finder\n GF[Gemini Finder]:::finder\n LF[Local Ollama Finder]:::finder\n MF[MLX Finder]:::finder\n end\n end\n\n subgraph \"Execution Layer\"\n AE[Android Executor]:::executor\n OE[OSX Executor]:::executor\n end\n\n subgraph \"External Services\"\n direction TB\n OAPI[OpenAI API]:::external\n GAPI[Google Gemini API]:::external\n LAPI[Local Ollama Instance]:::external\n end\n\n subgraph \"Platform Tools\"\n direction TB\n ADB[Android Debug Bridge]:::platform\n OSX[OSX System Tools]:::platform\n end\n\n subgraph \"Configuration\"\n direction TB\n MS[Model Settings]:::config\n FD[Function Declarations]:::config\n SP[System Prompts]:::config\n end\n\n %% Connections\n CLI --> TM\n API --> TM\n TM --> PR\n PR --> TP\n TP --> Planners\n Planners --> Finders\n Finders --> AE & OE\n \n %% External Service Connections\n OP & OF -.-> OAPI\n GP & GF -.-> GAPI\n LP & LF -.-> LAPI\n \n %% Platform Tool Connections\n AE --> ADB\n OE --> OSX\n \n %% Configuration Connections\n MS -.-> TM\n FD -.-> PR\n SP -.-> TP\n\n %% Click Events\n click CLI \"https://github.com/BandarLabs/clickclickclick/blob/main/main.py\"\n click API \"https://github.com/BandarLabs/clickclickclick/blob/main/api.py\"\n click MS \"https://github.com/BandarLabs/clickclickclick/blob/main/clickclickclick/config/models.yaml\"\n click FD \"https://github.com/BandarLabs/clickclickclick/tree/main/clickclickclick/config/function_declarations\"\n click SP \"https://github.com/BandarLabs/clickclickclick/blob/main/clickclickclick/config/prompts.yaml\"\n click OP \"https://github.com/BandarLabs/clickclickclick/blob/main/clickclickclick/planner/openai.py\"\n click GP \"https://github.com/BandarLabs/clickclickclick/blob/main/clickclickclick/planner/gemini.py\"\n click LP \"https://github.com/BandarLabs/clickclickclick/blob/main/clickclickclick/planner/local_ollama.py\"\n click TP \"https://github.com/BandarLabs/clickclickclick/blob/main/clickclickclick/planner/task.py\"\n click OF \"https://github.com/BandarLabs/clickclickclick/blob/main/clickclickclick/finder/openai.py\"\n click GF \"https://github.com/BandarLabs/clickclickclick/blob/main/clickclickclick/finder/gemini.py\"\n click LF \"https://github.com/BandarLabs/clickclickclick/blob/main/clickclickclick/finder/local_ollama.py\"\n click MF \"https://github.com/BandarLabs/clickclickclick/blob/main/clickclickclick/finder/mlx.py\"\n click AE \"https://github.com/BandarLabs/clickclickclick/blob/main/clickclickclick/executor/android.py\"\n click OE \"https://github.com/BandarLabs/clickclickclick/blob/main/clickclickclick/executor/osx.py\"\n\n %% Styles\n classDef input fill:#87CEEB,stroke:#333,stroke-width:2px\n classDef core fill:#4169E1,stroke:#333,stroke-width:2px\n classDef planner fill:#6495ED,stroke:#333,stroke-width:2px\n classDef finder fill:#4682B4,stroke:#333,stroke-width:2px\n classDef executor fill:#1E90FF,stroke:#333,stroke-width:2px\n classDef external fill:#98FB98,stroke:#333,stroke-width:2px\n classDef platform fill:#FFA500,stroke:#333,stroke-width:2px\n classDef config fill:#D3D3D3,stroke:#333,stroke-width:2px", "explanation": 'EXPLANATION'}
         else:
-            # ssml_string = f"Hi The test was successfully completed, now use this place to insert actual data"
-            # Assuming ssml_response is a string with multiple lines
             audio_bytes = speech_service.text_to_mp3(ssml_response)
-            # mp3_bytes = convert_wav_to_mp3(audio_bytes)
+
             if audio_bytes:
                 response = Response(content=audio_bytes, media_type="audio/mpeg", headers={"Content-Disposition": "attachment; filename=explanation.mp3"})
-                # Assuming audio_bytes contains the audio data
+
                 audio = AudioSegment.from_file(io.BytesIO(audio_bytes), format="mp3")
                 duration_in_seconds = len(audio) / 1000.0
                 print("duration in sec", duration_in_seconds)
                 vtt_content = speech_service.ssml_to_webvtt(ssml_response, duration_in_seconds)
                 encoded_vtt_content = base64.b64encode(vtt_content.encode('utf-8')).decode('utf-8')
                 response.headers["X-VTT-Content"] = encoded_vtt_content
-                # Add CORS headers
+
                 response.headers["Access-Control-Expose-Headers"] = "X-VTT-Content"
                 response.headers["Access-Control-Allow-Origin"] = "*"
                 return response
             else:
-                return {"error": "Text to speech is not available. Please set Azure speech credentials in .env"}
+                return {"error": "Text to speech is not available. Please set Azure speech credentials in .env (E002)"}
     except RateLimitError as e:
         raise HTTPException(
             status_code=429,
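With the new audio_length field on ApiRequest, a client chooses between the single-prompt 'short' podcast and the two-part 'long' one. A hypothetical client call follows; the route path, host, port, and repo values are assumptions for illustration and are not shown in this diff:

import base64
import requests

resp = requests.post(
    "http://localhost:8000/generate",  # assumed route and port
    json={
        "username": "BandarLabs",       # illustrative owner/repo
        "repo": "clickclickclick",
        "instructions": "",
        "audio": True,
        "audio_length": "short",        # any other value takes the two-part 'long' path
    },
)
resp.raise_for_status()
# The MP3 bytes arrive in the response body; the WebVTT subtitles ride along
# base64-encoded in the X-VTT-Content header that the endpoint sets above.
vtt = base64.b64decode(resp.headers["X-VTT-Content"]).decode("utf-8")
print(len(resp.content), "audio bytes;", vtt.splitlines()[0] if vtt else "no subtitles")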
diff --git a/src/app/[username]/[repo]/page.tsx b/src/app/[username]/[repo]/page.tsx
index c98dc3b..4371bd7 100644
--- a/src/app/[username]/[repo]/page.tsx
+++ b/src/app/[username]/[repo]/page.tsx
@@ -15,6 +15,7 @@
 import { ApiKeyButton } from "~/components/api-key-button";
 import React, { useRef, useState, useEffect } from 'react';
 import { parseWebVTT, syncSubtitle } from "~/lib/utils";
+import { useGlobalState } from "~/app/providers";
 
 interface Subtitle {
   start: number;
@@ -22,9 +23,15 @@
   text: string;
 }
 
-export default function Repo() {
+type RepoProps = {
+  audioLength: string | null;
+};
+
+const Repo: React.FC<RepoProps> = () => {
+  const { audioLength, anotherVariable } = useGlobalState();
   const videoRef = useRef(null);
   const params = useParams<{ username: string; repo: string }>();
+
   const {
     diagram,
     error,
@@ -44,9 +51,12 @@
     audioUrl,
     audioRef,
     subtitleUrl
-  } = useDiagram(params.username, params.repo);
+  } = useDiagram(params.username, params.repo, audioLength, anotherVariable);
   const [subtitles, setSubtitles] = useState([]);
   const [currentSubtitle, setCurrentSubtitle] = useState("");
+  // useEffect(() => {
+  //   console.log("anotherVariable has changed:", anotherVariable);
+  // }, [anotherVariable]);
 
   useEffect(() => {
     async function fetchSubtitles() {
@@ -175,3 +185,5 @@
   );
 }
+
+export default Repo;
\ No newline at end of file
diff --git a/src/app/layout.tsx b/src/app/layout.tsx
index 6056ce4..1ea52ce 100644
--- a/src/app/layout.tsx
+++ b/src/app/layout.tsx
@@ -4,7 +4,7 @@
 import { GeistSans } from "geist/font/sans";
 import { type Metadata } from "next";
 import { Header } from "~/components/header";
 import { Footer } from "~/components/footer";
-import { CSPostHogProvider } from "./providers";
+import { CSPostHogProvider, GlobalStateProvider } from "./providers";
 
 export const metadata: Metadata = {
   title: "GitPodcast",
@@ -34,8 +34,7 @@
     "software development",
     "open source",
     "open source software",
-    "ahmedkhaleel2004",
-    "ahmed khaleel",
+    "bandarlabs",
     "gitpodcast",
     "gitpodcast.com",
   ],
@@ -50,7 +49,7 @@
     title: "GitPodcast - Repository to Podcast in Seconds",
     description: "Turn any GitHub repository into an engaging podcast in seconds.",
-    siteName: "GitDiagram",
+    siteName: "GitPodcast",
     images: [
       {
         url: "/og-image.png?v=2", // You'll need to create this image
@@ -76,13 +75,15 @@ export default function RootLayout({
 }: Readonly<{ children: React.ReactNode }>) {
   return (
[remainder of the patch truncated in the source: the JSX markup of this hunk was stripped in extraction (only {children} survives); it rewraps the layout so the new GlobalStateProvider encloses {children} alongside CSPostHogProvider. The hunks for src/app/page.tsx, src/app/providers.tsx, src/components/main-card.tsx, src/hooks/useDiagram.ts, and src/lib/fetch-backend.ts listed in the diffstat are not shown.]