Skip to content

Commit

Permalink
add audio length selection feature - 5min or 10 min
Browse files Browse the repository at this point in the history
  • Loading branch information
mkagenius committed Jan 8, 2025
1 parent 3b27578 commit cd8cc03
Show file tree
Hide file tree
Showing 9 changed files with 201 additions and 53 deletions.
11 changes: 11 additions & 0 deletions backend/app/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,19 @@
Discuss in the podcast, the main components of the system (e.g., frontend, backend, database, building, external services).
Discuss the relationships and interactions between these components.
Sometimes the answers can also be single word or very small so that it seems natural. Long answers all the time makes it monotonous.
Make it a 20 minute long or longer podcast if possible. Give atleast 200 voice tags for the host + Same amount of voice tags for guest. Slowly count them and re-write the ssml if its falling short and then return the ssml."""


# Prompt for the first half of the two-part (non-'short') podcast. In
# generate_ssml_concurrently it is paired with the "FILE TREE ... README ..."
# content, while PODCAST_SSML_PROMPT_AFTER_BREAK handles the important files.
# It mirrors the single-part prompt but ends by asking the model to announce a break.
# NOTE(review): the text asks for both an "8-10 min" and a "20 minute long or
# longer" podcast - confirm which length is actually intended.
PODCAST_SSML_PROMPT_BEFORE_BREAK = """ First of all dont use break tags outside the voice tag - illegal. Dont waste too much time on intro. Can you convert it into a podcast so that someone could listen to it and understand what's going on, also discuss project structure or go in detail for some files, long 8-10 min podcast is fine by me - make it a ssml similar to this: <speak version=\"1.0\" xmlns=\"http://www.w3.org/2001/10/synthesis\" xml:lang=\"en-US\">\n<voice name=\"en-US-AvaMultilingualNeural\">\nWelcome to Next Gen Innovators! (no need to open links) ..
also make it a conversation between host and guest of a podcast, question answer kind. \n\n<break time=\"500ms\" />\nI'm your host, Ava, and today we’re diving into an exciting topic: how students can embark on their entrepreneurial journey right from college.\n<break time=\"700ms\" />\nJoining us is Arun Sharma, a seasoned entrepreneur with over two decades of experience and a passion for mentoring young innovators.\n<break time=\"500ms\" />\nArun, it’s a pleasure to have you here.\n</voice>\n\n<voice name=\""en-US-DustinMultilingualNeural"\">\n Thank you, Ava.\n <break time=\"300ms\" />\n It’s great to be here. I’m excited to talk about how students can channel their creativity and energy into building impactful ventures.\n</voice> ..\n", Use "en-US-DustinMultilingualNeural" voice as guest (and must use en-US-AvaMultilingualNeural voice as host always but her actual name can be something else). Add little bit of fillers like umm or uh so that it feels natural (dont over do it),
Also discuss something technically intriguing part that is something unique to this project
Discuss any important architectural patterns or design principles used in the project.
Discuss in the podcast, the main components of the system (e.g., frontend, backend, database, building, external services).
Discuss the relationships and interactions between these components.
Sometimes the answers can also be single word or very small so that it seems natural. Long answers all the time makes it monotonous.
Make it a 20 minute long or longer podcast if possible. Give atleast 200 voice tags for the host + Same amount of voice tags for guest. Slowly count them and re-write the ssml if its falling short and then return the ssml. This is the first part of the podcast so tell the listeners you will be back after the break."""


PODCAST_SSML_PROMPT_AFTER_BREAK = """ First of all dont use break tags outside the voice tag. Dont waste time on introducing guest too much. Can you convert it into a podcast so that someone could listen to it and understand what's going on, also discuss project structure or go in detail for some files, long 8-10 min podcast is fine by me - make it a ssml similar to this: <speak version=\"1.0\" xmlns=\"http://www.w3.org/2001/10/synthesis\" xml:lang=\"en-US\">\n<voice name=\"en-US-AvaMultilingualNeural\">\nWelcome to Next Gen Innovators! (no need to open links) ..
also make it a conversation between host and guest of a podcast, question answer kind. \n\n<break time=\"500ms\" />\nI'm your host, Ava, and today we’re diving into an exciting topic: how students can embark on their entrepreneurial journey right from college.\n<break time=\"700ms\" />\nJoining us is Arun Sharma, a seasoned entrepreneur with over two decades of experience and a passion for mentoring young innovators.\n<break time=\"500ms\" />\nArun, it’s a pleasure to have you here.\n</voice>\n\n<voice name=\""en-US-DustinMultilingualNeural"\">\n Thank you, Ava.\n <break time=\"300ms\" />\n It’s great to be here. I’m excited to talk about how students can channel their creativity and energy into building impactful ventures.\n</voice> ..\n", Use "en-US-DustinMultilingualNeural" voice as guest (and must use en-US-AvaMultilingualNeural voice as host always but her actual name can be something else). Add little bit of fillers like umm or uh so that it feels natural (dont over do it),
Also discuss something technically intriguing part that is something unique to this project
Expand Down
100 changes: 72 additions & 28 deletions backend/app/routers/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from app.services.openai_service import OpenAIService
from app.core.limiter import limiter
import os
from app.prompts import PODCAST_SSML_PROMPT_AFTER_BREAK, PODCAST_SSML_PROMPT
from app.prompts import PODCAST_SSML_PROMPT_AFTER_BREAK, PODCAST_SSML_PROMPT, PODCAST_SSML_PROMPT_BEFORE_BREAK
from anthropic._exceptions import RateLimitError
from pydantic import BaseModel
from functools import lru_cache
Expand All @@ -15,6 +15,7 @@
import base64
from pydub import AudioSegment
import io
import concurrent.futures

load_dotenv()

Expand Down Expand Up @@ -79,12 +80,66 @@ def process_github_content(content, speech_prompt, max_length, max_tokens=None):
return ssml_response


def generate_ssml_concurrently(file_tree, readme, file_content, audio_length) -> str | dict:
    """Build the podcast SSML for a repository.

    When ``audio_length == 'short'`` a single request covers the file tree,
    the README and the important files. Any other value produces a two-part
    script: the "before the break" part (file tree + README) and the "after
    the break" part (important files) are requested in parallel threads and
    then merged under one ``<speak>`` root element.

    Args:
        file_tree: Repository file tree, interpolated into the prompt content.
        readme: Repository README, interpolated into the prompt content.
        file_content: Important file contents, interpolated into the prompt content.
        audio_length: ``'short'`` for the single-request path; anything else
            selects the two-part path.

    Returns:
        The SSML string on success, or the error ``dict`` handed back by
        ``process_github_content`` when a request failed (the first part's
        error takes precedence over the second's).
    """
    if audio_length == 'short':
        combined_content = f"FILE TREE: {file_tree}\nREADME: {readme} IMPORTANT FILES: {file_content}"
        return process_github_content(combined_content, PODCAST_SSML_PROMPT, 250000, 100000)

    # (content, prompt) pairs in playback order: before the break, then after it.
    jobs = (
        (f"FILE TREE: {file_tree}\nREADME: {readme}", PODCAST_SSML_PROMPT_BEFORE_BREAK),
        (f"IMPORTANT FILES: {file_content}", PODCAST_SSML_PROMPT_AFTER_BREAK),
    )

    # There are exactly two independent requests, so cap the pool at two workers.
    with concurrent.futures.ThreadPoolExecutor(max_workers=len(jobs)) as executor:
        futures = [
            executor.submit(process_github_content, content, prompt, 250000, 100000)
            for content, prompt in jobs
        ]
        # Collect in submission order so the parts stay in playback order.
        responses = [future.result() for future in futures]

    # A dict response signals an error. Test the type directly (not truthiness)
    # so that even an empty error dict is never passed on as if it were SSML.
    for response in responses:
        if isinstance(response, dict):
            return response

    # Each part carries its own <speak> wrapper; strip it, then wrap the
    # concatenation in a single <speak> root.
    parts = [speech_service.remove_first_speak_tag(response) for response in responses]
    combined_ssml_content = "\n".join(parts)
    return f'<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">{combined_ssml_content}</speak>'


# Request body model for the generate endpoint (the handler receives it as `body: ApiRequest`).
class ApiRequest(BaseModel):
username: str
repo: str
instructions: str
api_key: str | None = None
# When False the handler returns a JSON payload; when True it returns the synthesized MP3.
audio: bool = False
# 'short' selects the single-prompt script; any other value selects the two-part script.
audio_length: str = 'long'


# @limiter.limit("1/minute;5/day") # TEMP: disable rate limit for growth??
Expand All @@ -99,54 +154,43 @@ async def generate(request: Request, body: ApiRequest):
file_tree = github_data["file_tree"]
readme = github_data["readme"]
file_content = github_data["file_content"]
audio_length = body.audio_length
result = generate_ssml_concurrently(file_tree, readme, file_content, audio_length)
# Check if there was an error response
if isinstance(result, dict): # There was an error
print("Error in processing:")
for error in result.get("errors", []):
print(error)
return {"error": "Some error in genererating audio: E001"}
else:
# Successful processing
full_ssml_response = result

# Use the extracted function for both content types
combined_content_tree_readme = f"FILE TREE: {file_tree}\nREADME: {readme}"
ssml_response_tree_readme = process_github_content(combined_content_tree_readme, PODCAST_SSML_PROMPT, 250000, 100000)
if isinstance(ssml_response_tree_readme, dict): # Check if it returns an error dictionary
return ssml_response_tree_readme

combined_content_file_content = f"IMPORTANT FILES: {file_content} \s so far ssml before the break: {ssml_response_tree_readme}"
ssml_response_file_content = process_github_content(combined_content_file_content, PODCAST_SSML_PROMPT_AFTER_BREAK, 250000, 100000)
if isinstance(ssml_response_file_content, dict): # Check if it returns an error dictionary
return ssml_response_file_content

# Apply the function to remove the first occurrence of the <speak> tags from responses
ssml_response_tree_readme_content = speech_service.remove_first_speak_tag(ssml_response_tree_readme)
ssml_response_file_content_content = speech_service.remove_first_speak_tag(ssml_response_file_content)
# Combine the contents
combined_ssml_content = f"{ssml_response_tree_readme_content}\n{ssml_response_file_content_content}"

# Wrap the combined content in a single <speak> tag
full_ssml_response = f'<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">{combined_ssml_content}</speak>'

print(full_ssml_response) # or return this full_ssml_response
print(full_ssml_response)
ssml_response = full_ssml_response

if not body.audio:
return {"diagram": "flowchart TB\n subgraph Input\n CLI[CLI Interface]:::input\n API[API Interface]:::input\n end\n\n subgraph Orchestration\n TM[Task Manager]:::core\n PR[Platform Router]:::core\n end\n\n subgraph \"Planning Layer\"\n TP[Task Planning]:::core\n subgraph Planners\n OP[OpenAI Planner]:::planner\n GP[Gemini Planner]:::planner\n LP[Local Ollama Planner]:::planner\n end\n end\n\n subgraph \"Finding Layer\"\n subgraph Finders\n OF[OpenAI Finder]:::finder\n GF[Gemini Finder]:::finder\n LF[Local Ollama Finder]:::finder\n MF[MLX Finder]:::finder\n end\n end\n\n subgraph \"Execution Layer\"\n AE[Android Executor]:::executor\n OE[OSX Executor]:::executor\n end\n\n subgraph \"External Services\"\n direction TB\n OAPI[OpenAI API]:::external\n GAPI[Google Gemini API]:::external\n LAPI[Local Ollama Instance]:::external\n end\n\n subgraph \"Platform Tools\"\n direction TB\n ADB[Android Debug Bridge]:::platform\n OSX[OSX System Tools]:::platform\n end\n\n subgraph \"Configuration\"\n direction TB\n MS[Model Settings]:::config\n FD[Function Declarations]:::config\n SP[System Prompts]:::config\n end\n\n %% Connections\n CLI --> TM\n API --> TM\n TM --> PR\n PR --> TP\n TP --> Planners\n Planners --> Finders\n Finders --> AE & OE\n \n %% External Service Connections\n OP & OF -.-> OAPI\n GP & GF -.-> GAPI\n LP & LF -.-> LAPI\n \n %% Platform Tool Connections\n AE --> ADB\n OE --> OSX\n \n %% Configuration Connections\n MS -.-> TM\n FD -.-> PR\n SP -.-> TP\n\n %% Click Events\n click CLI \"https://github.com/BandarLabs/clickclickclick/blob/main/main.py\"\n click API \"https://github.com/BandarLabs/clickclickclick/blob/main/api.py\"\n click MS \"https://github.com/BandarLabs/clickclickclick/blob/main/clickclickclick/config/models.yaml\"\n click FD \"https://github.com/BandarLabs/clickclickclick/tree/main/clickclickclick/config/function_declarations\"\n click SP \"https://github.com/BandarLabs/clickclickclick/blob/main/clickclickclick/config/prompts.yaml\"\n click OP 
\"https://github.com/BandarLabs/clickclickclick/blob/main/clickclickclick/planner/openai.py\"\n click GP \"https://github.com/BandarLabs/clickclickclick/blob/main/clickclickclick/planner/gemini.py\"\n click LP \"https://github.com/BandarLabs/clickclickclick/blob/main/clickclickclick/planner/local_ollama.py\"\n click TP \"https://github.com/BandarLabs/clickclickclick/blob/main/clickclickclick/planner/task.py\"\n click OF \"https://github.com/BandarLabs/clickclickclick/blob/main/clickclickclick/finder/openai.py\"\n click GF \"https://github.com/BandarLabs/clickclickclick/blob/main/clickclickclick/finder/gemini.py\"\n click LF \"https://github.com/BandarLabs/clickclickclick/blob/main/clickclickclick/finder/local_ollama.py\"\n click MF \"https://github.com/BandarLabs/clickclickclick/blob/main/clickclickclick/finder/mlx.py\"\n click AE \"https://github.com/BandarLabs/clickclickclick/blob/main/clickclickclick/executor/android.py\"\n click OE \"https://github.com/BandarLabs/clickclickclick/blob/main/clickclickclick/executor/osx.py\"\n\n %% Styles\n classDef input fill:#87CEEB,stroke:#333,stroke-width:2px\n classDef core fill:#4169E1,stroke:#333,stroke-width:2px\n classDef planner fill:#6495ED,stroke:#333,stroke-width:2px\n classDef finder fill:#4682B4,stroke:#333,stroke-width:2px\n classDef executor fill:#1E90FF,stroke:#333,stroke-width:2px\n classDef external fill:#98FB98,stroke:#333,stroke-width:2px\n classDef platform fill:#FFA500,stroke:#333,stroke-width:2px\n classDef config fill:#D3D3D3,stroke:#333,stroke-width:2px",
"explanation": 'EXPLANATION'}
else:
# ssml_string = f"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'><voice name='en-US-AvaMultilingualNeural'>Hi The test was successfully completed, now use this place to insert actual data</voice></speak>"
# Assuming ssml_response is a string with multiple lines

audio_bytes = speech_service.text_to_mp3(ssml_response)
# mp3_bytes = convert_wav_to_mp3(audio_bytes)

if audio_bytes:
response = Response(content=audio_bytes, media_type="audio/mpeg", headers={"Content-Disposition": "attachment; filename=explanation.mp3"})
# Assuming audio_bytes contains the audio data

audio = AudioSegment.from_file(io.BytesIO(audio_bytes), format="mp3")
duration_in_seconds = len(audio) / 1000.0
print("duration in sec", duration_in_seconds)
vtt_content = speech_service.ssml_to_webvtt(ssml_response, duration_in_seconds)
encoded_vtt_content = base64.b64encode(vtt_content.encode('utf-8')).decode('utf-8')
response.headers["X-VTT-Content"] = encoded_vtt_content
# Add CORS headers

response.headers["Access-Control-Expose-Headers"] = "X-VTT-Content"
response.headers["Access-Control-Allow-Origin"] = "*"
return response
else:
return {"error": "Text to speech is not available. Please set Azure speech credentials in .env"}
return {"error": "Text to speech is not available. Please set Azure speech credentials in .env E002"}
except RateLimitError as e:
raise HTTPException(
status_code=429,
Expand Down
16 changes: 14 additions & 2 deletions src/app/[username]/[repo]/page.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,23 @@ import { ApiKeyButton } from "~/components/api-key-button";
import React, { useRef, useState, useEffect } from 'react';

import {parseWebVTT, syncSubtitle} from "~/lib/utils";
import { useGlobalState } from "~/app/providers";

/**
 * Element type of the `subtitles` state held by this page (`useState<Subtitle[]>`).
 * NOTE(review): presumably one WebVTT cue as produced by `parseWebVTT`, with
 * `start`/`end` as playback times — confirm the units against `~/lib/utils`.
 */
interface Subtitle {
start: number;
end: number;
text: string;
}

export default function Repo() {
// Props type used to annotate `Repo` as `React.FC<RepoProps>`.
// NOTE(review): the component is declared as `() => ...` and reads `audioLength`
// from `useGlobalState()` rather than from props, so this prop looks unused —
// confirm whether the type is still needed.
type RepoProps = {
audioLength: string | null;
};

const Repo: React.FC<RepoProps> = () => {
const {audioLength, anotherVariable} = useGlobalState();
const videoRef = useRef<HTMLVideoElement | null>(null);
const params = useParams<{ username: string; repo: string }>();

const {
diagram,
error,
Expand All @@ -44,9 +51,12 @@ export default function Repo() {
audioUrl,
audioRef,
subtitleUrl
} = useDiagram(params.username, params.repo);
} = useDiagram(params.username, params.repo, audioLength, anotherVariable);
const [subtitles, setSubtitles] = useState<Subtitle[]>([]);
const [currentSubtitle, setCurrentSubtitle] = useState("");
// useEffect(() => {
// console.log("anotherVariable has changed:", anotherVariable);
// }, [anotherVariable]);

useEffect(() => {
async function fetchSubtitles() {
Expand Down Expand Up @@ -175,3 +185,5 @@ export default function Repo() {
</div>
);
}

export default Repo;
23 changes: 12 additions & 11 deletions src/app/layout.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import { GeistSans } from "geist/font/sans";
import { type Metadata } from "next";
import { Header } from "~/components/header";
import { Footer } from "~/components/footer";
import { CSPostHogProvider } from "./providers";
import { CSPostHogProvider, GlobalStateProvider } from "./providers";

export const metadata: Metadata = {
title: "GitPodcast",
Expand Down Expand Up @@ -34,8 +34,7 @@ export const metadata: Metadata = {
"software development",
"open source",
"open source software",
"ahmedkhaleel2004",
"ahmed khaleel",
"bandarlabs",
"gitpodcast",
"gitpodcast.com",
],
Expand All @@ -50,7 +49,7 @@ export const metadata: Metadata = {
title: "GitPodcast - Repository to Podcast in Seconds",
description:
"Turn any GitHub repository into an engaging podcast in seconds.",
siteName: "GitDiagram",
siteName: "GitPodcast",
images: [
{
url: "/og-image.png?v=2", // You'll need to create this image
Expand All @@ -76,13 +75,15 @@ export default function RootLayout({
}: Readonly<{ children: React.ReactNode }>) {
return (
<html lang="en" className={`${GeistSans.variable}`}>
<CSPostHogProvider>
<body className="flex min-h-screen flex-col">
<Header />
<main className="flex-grow">{children}</main>
<Footer />
</body>
</CSPostHogProvider>
<GlobalStateProvider>
<CSPostHogProvider>
<body className="flex min-h-screen flex-col">
<Header />
<main className="flex-grow">{children}</main>
<Footer />
</body>
</CSPostHogProvider>
</GlobalStateProvider>
</html>
);
}
5 changes: 4 additions & 1 deletion src/app/page.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@ import MainCard from "~/components/main-card";
import Hero from "~/components/hero";
import ProductHuntEmbed from "~/components/producthunt-embed";



export default function HomePage() {

return (
<main className="flex-grow px-8 pb-8 md:p-8">
<div className="mx-auto mb-4 max-w-4xl lg:my-8">
Expand All @@ -20,7 +23,7 @@ export default function HomePage() {
</p>
</div>
<div className="mb-16 flex flex-col items-center lg:mb-0">
<MainCard />
<MainCard/>
<div className="mt-16">
<ProductHuntEmbed />
</div>
Expand Down
Loading

0 comments on commit cd8cc03

Please sign in to comment.