diff --git a/app/animations/animation.py b/app/animations/animation.py
index 882a40f..45e4c99 100644
--- a/app/animations/animation.py
+++ b/app/animations/animation.py
@@ -41,6 +41,7 @@ def talking_head(
     gfpgan: bool = False,
     gfpgan_upscale: int = 1
 ) -> str:
+    print(f"* talking head: {character.name} says {text}")
     if character.voice:
         voice_id = character.voice
     else:
@@ -52,6 +53,9 @@ def talking_head(
     )
     audio_url = s3.upload(audio_bytes, "mp3")
 
+
+    print(f"run wav2lip on {character.image} and {audio_url}")
+
     output_url, thumbnail_url = replicate.wav2lip(
         face_url=character.image,
         speech_url=audio_url,
@@ -60,6 +64,9 @@ def talking_head(
         width=width,
         height=height,
     )
+
+    print(f"output: {output_url}")
+
     return output_url, thumbnail_url
diff --git a/app/animations/dialogue.py b/app/animations/dialogue.py
index 4a8cdf2..5a35d43 100644
--- a/app/animations/dialogue.py
+++ b/app/animations/dialogue.py
@@ -15,8 +15,14 @@
 
 def animated_dialogue(request: DialogueRequest, callback=None):
+    print("===== animated_dialogue =====")
+    print(request)
+
     result = dialogue(request)
+    print("---")
+    print(result)
+
     if callback:
         callback(progress=0.1)
@@ -37,6 +43,7 @@ def animated_dialogue(request: DialogueRequest, callback=None):
     def run_talking_head_segment(message, idx):
         nonlocal progress, progress_increment
         character = characters[message["character_id"]]
+        print(f'run talking head: {message["message"]}')
         output, _ = talking_head(
             character,
             message["message"],
@@ -44,7 +51,9 @@ def run_talking_head_segment(message, idx):
             height,
             gfpgan=request.gfpgan
         )
+        print(f'output: {output}')
         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_file:
+            print("download:", output)
             response = requests.get(output, stream=True)
             response.raise_for_status()
             for chunk in response.iter_content(chunk_size=8192):
@@ -53,15 +62,22 @@ def run_talking_head_segment(message, idx):
         progress += progress_increment
         if callback:
             callback(progress=progress)
+        print("return temp_file.name:", temp_file.name)
         return temp_file.name
 
+    print("--- run video file tasks ----")
+
     video_files = utils.process_in_parallel(
         result.dialogue,
         run_talking_head_segment,
         max_workers=MAX_WORKERS
     )
-    if request.dual_view:
+    print("--- end video file tasks ----")
+    print(video_files)
+
+    if request.dual_view:
+        print(" -> dual view")
         cropped_images = {}
         for character in characters:
             temp_file = tempfile.NamedTemporaryFile(suffix=".webp", delete=False)
@@ -79,15 +95,14 @@ def run_talking_head_segment(message, idx):
             left = idx % 2 == 0
             video_file = utils.stitch_image_video(image, video_file, left)
             dual_video_files.append(video_file)
-
         for character in characters:
             os.remove(cropped_images[character])
         for video_file in video_files:
             os.remove(video_file)
-
         video_files = dual_video_files
 
     if request.intro_screen:
+        print(" -> intro screen")
         character_names = [characters[character_id].name for character_id in request.character_ids]
         character_name_str = " and ".join(character_names)
         paragraphs = [
@@ -101,7 +116,7 @@ def run_talking_head_segment(message, idx):
             duration = 8,
             fade_in = 1.5,
             margin_left = 25,
-            margin_right = width + 25
+            margin_right = 25  # width + 25
         )
         video_files = [intro_screen] + video_files
 
@@ -109,6 +124,7 @@ def run_talking_head_segment(message, idx):
     # concatenate the final video clips
     with tempfile.NamedTemporaryFile(delete=True, suffix=".mp4") as temp_output_file:
+        print("concatenate videos")
         utils.concatenate_videos(video_files, temp_output_file.name)
         with open(temp_output_file.name, 'rb') as f:
             video_bytes = f.read()
@@ -119,8 +135,10 @@ def run_talking_head_segment(message, idx):
         os.remove(video_file)
 
     # generate thumbnail
+    print("make thumbnail")
     thumbnail = utils.create_dialogue_thumbnail(*images, 2*width, height)
     thumbnail_url = s3.upload(thumbnail, "webp")
+    print("finished thumbnail", thumbnail_url)
 
     if callback:
         callback(progress=0.99)
diff --git a/app/animations/monologue.py b/app/animations/monologue.py
index 26a1352..06ef6ab 100644
--- a/app/animations/monologue.py
+++ b/app/animations/monologue.py
@@ -7,6 +7,8 @@
 from ..models import MonologueRequest
 from ..utils import *
 
+MAX_PIXELS = 1024 * 1024
+
 
 def animated_monologue(request: MonologueRequest, callback=None):
     result = monologue(request)
@@ -16,15 +18,19 @@ def animated_monologue(request: MonologueRequest, callback=None):
     character = EdenCharacter(request.character_id)
 
+    width, height = calculate_target_dimensions([character.image], MAX_PIXELS)
+
     output, thumbnail_url = talking_head(
         character,
         result.monologue,
+        width,
+        height,
         gfpgan=request.gfpgan
     )
 
     if request.intro_screen:
-        image = download_image(character.image)
-        width, height = image.size
+        #image = download_image(character.image)
+        #width, height = image.size
         text = [
             f"{character.name}: {request.prompt}"
@@ -47,12 +53,15 @@ def animated_monologue(request: MonologueRequest, callback=None):
             temp_file.flush()
 
         video_files = [intro_screen, temp_file.name]
-
+        print(video_files)
+        print("cat")
         with tempfile.NamedTemporaryFile(delete=True, suffix=".mp4") as temp_output_file:
+            print(video_files, temp_output_file.name)
             concatenate_videos(video_files, temp_output_file.name)
             with open(temp_output_file.name, 'rb') as f:
                 video_bytes = f.read()
             output_url = s3.upload(video_bytes, "mp4")
+            print(output_url)
 
     else:
         output_bytes = requests.get(output).content
diff --git a/app/animations/reel.py b/app/animations/reel.py
index d13e73c..92c2e88 100644
--- a/app/animations/reel.py
+++ b/app/animations/reel.py
@@ -8,6 +8,7 @@
 from ..plugins import replicate, elevenlabs, s3
 from ..character import Character, EdenCharacter
 from ..scenarios import reel
+from ..animations.animation import select_random_voice
 from ..models import ReelRequest
 
@@ -41,11 +42,11 @@ def animated_reel(request: ReelRequest, callback=None):
     if request.aspect_ratio == "portrait":
         width, height = 1280, 1920
     elif request.aspect_ratio == "landscape":
-        width, height = 1920, 1280
+        width, height = 1920, 1088
     else:
         width, height = 1600, 1600
 
-    min_duration = 20
+    min_duration = 25
     speech_audio = None
     duration = min_duration
diff --git a/app/animations/story.py b/app/animations/story.py
index 0dacdb4..3b4b5f1 100644
--- a/app/animations/story.py
+++ b/app/animations/story.py
@@ -1,6 +1,7 @@
 import os
 import requests
 import tempfile
+from pydub import AudioSegment
 
 from .. import utils
 from ..plugins import replicate, elevenlabs, s3
@@ -10,10 +11,15 @@ from .animation import screenplay_clip
 
 MAX_WORKERS = 3
+INTRO_SCREEN_DURATION = 10
 
 
 def animated_story(request: StoryRequest, callback=None):
     screenplay = story(request)
+
+    # screenplay["clips"] = screenplay["clips"][:3]
+    music_prompt = screenplay.get("music_prompt")
+
     if callback:
         callback(progress=0.1)
@@ -66,6 +72,10 @@ def run_story_segment(clip, idx):
         screenplay["clips"],
         run_story_segment,
         max_workers=MAX_WORKERS
     )
+
+    print("TH RESULTS")
+    print(results)
+
     video_files = [video_file for video_file, thumbnail in results]
     thumbnail_url = results[0][1]
 
@@ -80,23 +90,63 @@ def run_story_segment(clip, idx):
             paragraphs,
             width,
             height,
-            duration = 10,
+            duration = INTRO_SCREEN_DURATION,
             fade_in = 2,
             margin_left = 25,
             margin_right = 25
         )
         video_files = [intro_screen] + video_files
+
+    print("The VIDEO FILES")
+    print(video_files)
+
+    audio_file = None
+    if music_prompt:
+        print("get audio")
+        duration = sum([utils.get_video_duration(video_file) for video_file in video_files])
+
+        print("full dur", duration)
+        print("the music prompt", music_prompt)
+        music_url, _ = replicate.audiocraft(
+            prompt=music_prompt,
+            seconds=duration
+        )
+        print(music_url)
+
+        response = requests.get(music_url)
+        response.raise_for_status()
+        audio_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
+        audio_file.write(response.content)
+        audio_file.flush()
+
+        if request.intro_screen:
+            print("intro screen silence")
+            silence = AudioSegment.silent(duration=INTRO_SCREEN_DURATION * 1000)
+            music = AudioSegment.from_mp3(audio_file.name)
+            music = music - 8
+            music_with_silence = silence + music
+            music_with_silence.export(audio_file.name, format="mp3")
 
     with tempfile.NamedTemporaryFile(delete=True, suffix=".mp4") as temp_output_file:
         utils.concatenate_videos(video_files, temp_output_file.name)
-        with open(temp_output_file.name, "rb") as f:
-            video_bytes = f.read()
+        if audio_file:
+            with tempfile.NamedTemporaryFile(delete=True, suffix=".mp4") as temp_output_file2:
+                utils.mix_video_audio(temp_output_file.name, audio_file.name, temp_output_file2.name)
+                with open(temp_output_file2.name, "rb") as f:
+                    video_bytes = f.read()
+        else:
+            with open(temp_output_file.name, "rb") as f:
+                video_bytes = f.read()
         output_url = s3.upload(video_bytes, "mp4")
 
     # clean up clips
     for video_file in video_files:
         os.remove(video_file)
 
+    if audio_file:
+        os.remove(audio_file.name)
+
     if callback:
         callback(progress=0.99)
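
A minimal standalone sketch of the pydub pattern used above: when an intro screen is present, the generated music is prepended with intro-length silence and ducked by 8 dB so narration stays audible. File names are hypothetical.

    from pydub import AudioSegment

    INTRO_SCREEN_DURATION = 10  # seconds, matching the constant added above

    silence = AudioSegment.silent(duration=INTRO_SCREEN_DURATION * 1000)  # pydub works in ms
    music = AudioSegment.from_mp3("music.mp3")
    music = music - 8  # pydub overloads "-" as gain reduction in dB
    (silence + music).export("music.mp3", format="mp3")
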
diff --git a/app/character.py b/app/character.py
index 4fd85b1..d60ce04 100644
--- a/app/character.py
+++ b/app/character.py
@@ -113,12 +113,12 @@ def update(
         self.creation_enabled = creation_enabled
         self.story_creation_enabled = story_creation_enabled
         self.concept = concept
-        self.smart_reply = smart_reply
+        self.smart_reply = False  # smart_reply  # disabled until ready
         self.chat_model = chat_model
         self.image = image
         self.voice = voice
 
         self.function_map = {"1": self._chat_}
-        options = ["Regular conversation, chat, humor, or small talk"]
+        options = ["Regular conversation, chat, humor, small talk, or asking a question or commenting about an attached image"]
 
         if knowledge:
             if not self.knowledge_summary.strip():
@@ -264,11 +264,12 @@ def _chat_(
         session_id=None,
     ) -> dict:
         response = self.chat(
-            prompt=message.message,
+            prompt=message.message,
+            image=message.attachments[0] if message.attachments else None,
             id=session_id,
             save_messages=False,
             model=self.chat_model,
-        )
+        )
         user_message = ChatMessage(role="user", content=message.message)
         assistant_message = ChatMessage(role="assistant", content=response)
         output = {"message": response, "config": None}
@@ -490,6 +491,7 @@ def __call__(
             system=self.chat_prompt,
             params=self.chat_params,
         )
+
         function = None
         if self.router_prompt:
             index = self._route_(message, session_id=session_id)
@@ -497,15 +499,18 @@ def __call__(
         if not function:
             function = self.function_map.get("1")
+
         output, user_message, assistant_message = function(
             message,
             session_id=session_id
         )
+
         self.router.add_messages(user_message, assistant_message, id=session_id)
         self.creator.add_messages(user_message, assistant_message, id=session_id)
         self.story_editor.add_messages(user_message, assistant_message, id=session_id)
         self.story_context.add_messages(user_message, assistant_message, id=session_id)
         self.qa.add_messages(user_message, assistant_message, id=session_id)
         self.chat.add_messages(user_message, assistant_message, id=session_id)
+
         return output
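
The `_chat_` change above forwards only the first chat attachment, if any, to the LLM as an image. A tiny runnable sketch of that selection rule (helper name hypothetical):

    from typing import Optional

    def pick_image(attachments: Optional[list]) -> Optional[str]:
        # Mirrors the _chat_ change: the first attachment is forwarded to the
        # vision model; any further attachments are currently ignored.
        return attachments[0] if attachments else None

    print(pick_image(["https://example.com/cat.png", "extra.png"]))  # first URL wins
    print(pick_image(None))                                          # None -> text-only chat
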
diff --git a/app/creation_interfaces/kojii_untitledxyz.py b/app/creation_interfaces/kojii_untitledxyz.py
index fdaf46a..c1b9a93 100644
--- a/app/creation_interfaces/kojii_untitledxyz.py
+++ b/app/creation_interfaces/kojii_untitledxyz.py
@@ -30,7 +30,8 @@ def kojii_untitledxyz(request: KojiiUntitledxyzRequest, callback=None):
             "close up of a single column fragment, dense wires and thick electrical cables, computer circuits, corrosion, pen and ink, wires drawn with pale yellow, red, blue, green, solid white background, sharpness, noise."
         ]
         text_inputs_to_interpolate_weights = [
-            2 * request.human_machine_nature
+            2 * request.human_machine_nature,
+            2 * (1 - request.human_machine_nature)
         ]
 
     else:
@@ -39,7 +40,8 @@ def kojii_untitledxyz(request: KojiiUntitledxyzRequest, callback=None):
             "close up of a single column fragment, pen and ink, dense vegetation, wrapped in vines emerging from cracks, large leaves, dense lichen, diverse plants drawn with bright green, red, orange, blue, cyan, magenta, yellow, oversaturated, neons, solid white background, sharpness, noise."
         ]
         text_inputs_to_interpolate_weights = [
-            2 * (request.human_machine_nature - 0.5)
+            2 * (request.human_machine_nature - 0.5),
+            2 * (1 - (request.human_machine_nature - 0.5))
         ]
 
     elif request.type == Type.context:
@@ -50,7 +52,8 @@ def kojii_untitledxyz(request: KojiiUntitledxyzRequest, callback=None):
             "an isometric architectural drawing, displaying an ultra close up of a modernist building made of computer parts, dodecahedrons, textural details, emphasizing entangled wires with intense precision, the intricate web of wires are seen up close, accentuating the fusion of modern and ancient, the image depicts wires illustrated with vibrant colors, sharpness, noise."
         ]
         text_inputs_to_interpolate_weights = [
-            2 * request.human_machine_nature
+            2 * request.human_machine_nature,
+            2 * (1 - request.human_machine_nature)
         ]
 
     else:
@@ -59,14 +62,15 @@ def kojii_untitledxyz(request: KojiiUntitledxyzRequest, callback=None):
             "an isometric architectural drawing, displaying an ultra close up of a modern superstructure, geometric stone blocks, emphasis on dense overwhelming vines with intense precision, plants are shot up close, accentuating the fusion of nature and columns, the image depicts giant leaves illustrated with vibrant colors, solid white background, sharpness, noise."
         ]
         text_inputs_to_interpolate_weights = [
-            2 * (request.human_machine_nature - 0.5)
+            2 * (request.human_machine_nature - 0.5),
+            2 * (1 - (request.human_machine_nature - 0.5))
         ]
 
     config = {
         "mode": "create",
         "text_input": " to ".join(text_inputs_to_interpolate),
         "text_inputs_to_interpolate": "|".join(text_inputs_to_interpolate),
-        "text_inputs_to_interpolate_weights": " | ".join([str(t) for t in text_inputs_to_interpolate_weights]),
+        "text_inputs_to_interpolate_weights": "|".join([str(t) for t in text_inputs_to_interpolate_weights]),
         "lora": "https://edenartlab-prod-data.s3.us-east-1.amazonaws.com/d2e6d1f8ccfca428ba42fa56a0384a4261d32bf1ee8b0dc952d99da9011daf39.tar",
         "lora_scale": 0.8,
     }
@@ -76,9 +80,5 @@ def kojii_untitledxyz(request: KojiiUntitledxyzRequest, callback=None):
     print("=======")
 
     image_url, thumbnail_url = replicate.sdxl(config)
-
-    # sudo cog predict -i mode=create -i text_inputs_to_interpolate="prompt1|prompt2" -i text_inputs_to_interpolate_weights="0.3|0.7"
-    # add the loraurl for this concept: https://app.eden.art/creators/untitledxyz?conceptId=65b927bcc69501b06686d68d
-
    return image_url, thumbnail_url
\ No newline at end of file
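
Before this change each branch supplied a single weight, so the second prompt went unweighted; now every branch pairs the weight with its complement, and the `"|".join` fix makes the weights delimiter match the pipe-delimited prompts. A sketch of the column branch for slider values in [0, 0.5] (helper name hypothetical):

    def column_weights(human_machine_nature: float) -> list[float]:
        # hypothetical helper mirroring the Type.column branch for values <= 0.5
        t = 2 * human_machine_nature   # rescale [0, 0.5] -> [0, 1]
        return [t, 2 - t]              # weights for the two interpolated prompts

    print(column_weights(0.25))  # [0.5, 1.5]
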
diff --git a/app/generator.py b/app/generator.py
index 0e9460f..99668a1 100644
--- a/app/generator.py
+++ b/app/generator.py
@@ -62,13 +62,13 @@ def send_progress_update(progress: float):
     if task_type == "monologue":
         character_id = request.config.get("characterId")
         prompt = request.config.get("prompt")
+        init_image = request.config.get("init_image")
         gfpgan = request.config.get("gfpgan")
-        dual_view = request.config.get("dual_view")
         task_req = MonologueRequest(
             character_id=character_id,
             prompt=prompt,
+            init_image=init_image,
             gfpgan=gfpgan,
-            dual_view=dual_view,
         )
         output_url, thumbnail_url = animated_monologue(
             task_req,
@@ -97,10 +97,14 @@ def send_progress_update(progress: float):
         character_ids = request.config.get("characterIds")
         prompt = request.config.get("prompt")
         intro_screen = request.config.get("intro_screen")
+        music_prompt = request.config.get("music_prompt")
+        music = request.config.get("music")
         task_req = StoryRequest(
             character_ids=character_ids,
             prompt=prompt,
             narrator_id=NARRATOR_CHARACTER_ID,
+            music_prompt=music_prompt,
+            music=music,
             intro_screen=intro_screen,
         )
         output_url, thumbnail_url = animated_story(
diff --git a/app/llm/llm.py b/app/llm/llm.py
index d57e2de..1252d5e 100644
--- a/app/llm/llm.py
+++ b/app/llm/llm.py
@@ -145,6 +145,7 @@ def get_messages(
     def __call__(
         self,
         prompt: Union[str, Any],
+        image: Optional[str] = None,
         id: Union[str, UUID] = None,
         system: str = None,
         save_messages: bool = None,
@@ -162,6 +163,7 @@ def __call__(
             return sess.gen_with_tools(
                 model,
                 prompt,
+                image,
                 tools,
                 client=self.client,
                 system=system,
@@ -172,6 +174,7 @@ def __call__(
             return sess.gen(
                 model,
                 prompt,
+                image,
                 client=self.client,
                 system=system,
                 save_messages=save_messages,
@@ -182,7 +185,9 @@ def __call__(
     def stream(
         self,
+        model: str,
         prompt: str,
+        image: Optional[str] = None,
         id: Union[str, UUID] = None,
         system: str = None,
         save_messages: bool = None,
@@ -191,7 +196,9 @@ def stream(
     ) -> str:
         sess = self.get_session(id)
         return sess.stream(
+            model,
             prompt,
+            image,
             client=self.client,
             system=system,
             save_messages=save_messages,
@@ -235,7 +242,10 @@ def print_messages(self, id: Union[str, UUID] = None) -> None:
         session = self.get_session(id) if id else self.default_session
         if session:
             for msg in session.messages:
-                print(f"{msg.role} : {msg.content}")
+                message_str = f"{msg.role} : {msg.content}"
+                if msg.image:
+                    message_str += " : ((image))"
+                print(message_str)
 
     def __repr__(self) -> str:
         return ""
@@ -343,6 +353,7 @@ async def __call__(
         self,
         model: str,
         prompt: str,
+        image: Optional[str] = None,
         id: Union[str, UUID] = None,
         system: str = None,
         save_messages: bool = None,
@@ -362,6 +373,7 @@ async def __call__(
             return await sess.gen_with_tools_async(
                 model,
                 prompt,
+                image,
                 tools,
                 client=self.client,
                 system=system,
@@ -372,6 +384,7 @@ async def __call__(
             return await sess.gen_async(
                 model,
                 prompt,
+                image,
                 client=self.client,
                 system=system,
                 save_messages=save_messages,
@@ -382,7 +395,9 @@ async def __call__(
     async def stream(
         self,
+        model: str,
         prompt: str,
+        image: Optional[str] = None,
         id: Union[str, UUID] = None,
         system: str = None,
         save_messages: bool = None,
@@ -394,7 +409,9 @@ async def stream(
             self.client = AsyncClient(proxies=os.getenv("https_proxy"))
         sess = self.get_session(id)
         return sess.stream_async(
+            model,
             prompt,
+            image,
             client=self.client,
             system=system,
             save_messages=save_messages,
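
With the optional `image` threaded through `LLM.__call__`, a text-only call and a vision call differ by one argument. Hypothetical usage (import path and URL are illustrative; per session.py below, the session swaps the model to gpt-4-vision-preview whenever an image is present):

    from app.llm.llm import LLM  # assumed import path

    llm = LLM(model="gpt-4-1106-preview", system_message="You are a helpful assistant.")
    plain = llm("Describe Bombay Beach in one sentence.")
    vision = llm("What does the image say?", image="https://example.com/sign.jpg")
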
+= f" : ((image))" + print(message_str) def __repr__(self) -> str: return "" @@ -343,6 +353,7 @@ async def __call__( self, model: str, prompt: str, + image: Optional[str] = None, id: Union[str, UUID] = None, system: str = None, save_messages: bool = None, @@ -362,6 +373,7 @@ async def __call__( return await sess.gen_with_tools_async( model, prompt, + image, tools, client=self.client, system=system, @@ -372,6 +384,7 @@ async def __call__( return await sess.gen_async( model, prompt, + image, client=self.client, system=system, save_messages=save_messages, @@ -382,7 +395,9 @@ async def __call__( async def stream( self, + model: str, prompt: str, + image: Optional[str] = None, id: Union[str, UUID] = None, system: str = None, save_messages: bool = None, @@ -394,7 +409,9 @@ async def stream( self.client = AsyncClient(proxies=os.getenv("https_proxy")) sess = self.get_session(id) return sess.stream_async( + model, prompt, + image, client=self.client, system=system, save_messages=save_messages, diff --git a/app/llm/session.py b/app/llm/session.py index 3a3498f..76f1476 100644 --- a/app/llm/session.py +++ b/app/llm/session.py @@ -1,5 +1,6 @@ import time import os +from io import BytesIO from pydantic import BaseModel, SecretStr, HttpUrl, Field from uuid import uuid4, UUID from httpx import Client, AsyncClient @@ -8,12 +9,13 @@ import datetime from ..models import ChatMessage -from ..utils import remove_a_key, now_tz +from ..utils import remove_a_key, now_tz, url_to_image_data ALLOWED_MODELS = [ "gpt-3.5-turbo", "gpt-4-1106-preview", + "gpt-4-vision-preview", "gryphe/mythomax-l2-13b-8k", "mistralai/mistral-medium", "mistralai/mixtral-8x7b-instruct", @@ -80,6 +82,7 @@ def format_input_messages( if self.recent_messages else self.messages ) + # Todo: include images in previous messages messages = ( [system_message.model_dump(include=self.input_fields, exclude_none=True)] + [ @@ -88,7 +91,20 @@ def format_input_messages( ] ) if user_message: - messages += [user_message.model_dump(include=self.input_fields, exclude_none=True)] + new_message = user_message.model_dump(include=self.input_fields, exclude_none=True) + if user_message.image: + img_data_url = url_to_image_data(user_message.image) + new_message["content"] = [ + { + "type": "text", + "text": user_message.content + }, + { + "type": "image_url", + "image_url": img_data_url + } + ] + messages += [new_message] return messages def add_messages( @@ -114,6 +130,7 @@ def prepare_request( self, model: str = "gpt-3.5-turbo", prompt: str = None, + image: Optional[str] = None, system: str = None, params: Dict[str, Any] = None, stream: bool = False, @@ -127,6 +144,9 @@ def prepare_request( if model not in ALLOWED_MODELS: raise ValueError(f"Invalid model: {model}. 
Available models: {ALLOWED_MODELS}") + + if image: + model = "gpt-4-vision-preview" provider = "openai" if "gpt-" in model else "openrouter" @@ -146,7 +166,7 @@ def prepare_request( if prompt: if not input_schema: - user_message = ChatMessage(role="user", content=prompt) + user_message = ChatMessage(role="user", content=prompt, image=image) else: assert isinstance( prompt, input_schema @@ -154,6 +174,7 @@ def prepare_request( user_message = ChatMessage( role="function", content=prompt.model_dump_json(), + image=image, name=input_schema.__name__, ) @@ -203,6 +224,7 @@ def gen( self, model: str, prompt: str, + image: Optional[str], client: Union[Client, AsyncClient], system: str = None, save_messages: bool = None, @@ -218,7 +240,7 @@ def gen( while not finished: api_url, headers, data, user_message = self.prepare_request( - model, prompt, system, params, False, input_schema, output_schema + model, prompt, image, system, params, False, input_schema, output_schema ) resp = client.post( @@ -269,7 +291,9 @@ def gen( def stream( self, + model: str, prompt: str, + image: Optional[str], client: Union[Client, AsyncClient], system: str = None, save_messages: bool = None, @@ -277,7 +301,7 @@ def stream( input_schema: Any = None, ): api_url, headers, data, user_message = self.prepare_request( - prompt, system, params, True, input_schema + model, prompt, image, system, params, True, input_schema ) with client.stream( @@ -311,6 +335,7 @@ def stream( def gen_with_tools( self, prompt: str, + image: Optional[str], tools: List[Any], client: Union[Client, AsyncClient], system: str = None, @@ -328,6 +353,7 @@ def gen_with_tools( tool_idx = int( self.gen( prompt, + image, client=client, system=tool_prompt_format, save_messages=False, @@ -344,6 +370,7 @@ def gen_with_tools( return { "response": self.gen( prompt, + image, client=client, system=system, save_messages=save_messages, @@ -371,7 +398,7 @@ def gen_with_tools( ) # manually append the nonmodified user message + normal AI response - user_message = ChatMessage(role="user", content=prompt) + user_message = ChatMessage(role="user", content=prompt, image=image) assistant_message = ChatMessage( role="assistant", content=context_dict["response"] ) @@ -383,6 +410,7 @@ async def gen_async( self, model: str, prompt: str, + image: Optional[str], client: Union[Client, AsyncClient], system: str = None, save_messages: bool = None, @@ -391,7 +419,7 @@ async def gen_async( output_schema: Any = None, ): api_url, headers, data, user_message = self.prepare_request( - model, prompt, system, params, False, input_schema, output_schema + model, prompt, image, system, params, False, input_schema, output_schema ) r = await client.post( @@ -430,6 +458,7 @@ async def stream_async( self, model: str, prompt: str, + image: Optional[str], client: Union[Client, AsyncClient], system: str = None, save_messages: bool = None, @@ -437,7 +466,7 @@ async def stream_async( input_schema: Any = None, ): api_url, headers, data, user_message = self.prepare_request( - model, prompt, system, params, True, input_schema + model, prompt, image, system, params, True, input_schema ) async with client.stream( @@ -469,6 +498,7 @@ async def stream_async( async def gen_with_tools_async( self, prompt: str, + image: Optional[str], tools: List[Any], client: Union[Client, AsyncClient], system: str = None, @@ -486,6 +516,7 @@ async def gen_with_tools_async( tool_idx = int( await self.gen_async( prompt, + image, client=client, system=tool_prompt_format, save_messages=False, @@ -522,6 +553,7 @@ async def 
diff --git a/app/models/characters.py b/app/models/characters.py
index a71a147..c2eb044 100644
--- a/app/models/characters.py
+++ b/app/models/characters.py
@@ -46,6 +46,7 @@ class ChatMessage(BaseModel):
     role: str
     content: str
+    image: Optional[str] = Field(None)
     name: Optional[str] = None
     function_call: Optional[str] = None
     received_at: datetime.datetime = Field(default_factory=now_tz)
diff --git a/app/models/scenarios.py b/app/models/scenarios.py
index 805456f..90ff458 100644
--- a/app/models/scenarios.py
+++ b/app/models/scenarios.py
@@ -11,6 +11,7 @@ class MonologueRequest(BaseModel):
     character_id: str
     prompt: str
+    init_image: Optional[str] = None
     model: str = "gpt-4-1106-preview"
     params: dict = {}
     gfpgan: bool = False
@@ -40,6 +41,8 @@ class StoryRequest(BaseModel):
     prompt: str
     narrator_id: str = NARRATOR_CHARACTER_ID
     num_clips: int = 5
+    music: Optional[bool] = False
+    music_prompt: Optional[str] = None
     model: str = "gpt-4-1106-preview"
     params: dict = {}
     intro_screen: bool = False
@@ -69,6 +72,7 @@ class StoryResult(BaseModel):
     A screenplay consisting of a sequence of clips
     """
     clips: List[StoryClip] = Field(description="Clips in the sequence")
+    music_prompt: Optional[str] = Field(description="Backing music content for sequence")
 
 
 class ReelNarrationMode(Enum):
diff --git a/app/plugins/elevenlabs.py b/app/plugins/elevenlabs.py
index b314e12..6372174 100644
--- a/app/plugins/elevenlabs.py
+++ b/app/plugins/elevenlabs.py
@@ -2,7 +2,8 @@
 import os
 import random
 import wave
-from elevenlabs import generate, set_api_key
+from elevenlabs import generate, set_api_key, Voice, VoiceSettings, play
+
 from ..utils import exponential_backoff
 
 ELEVENLABS_API_KEY = os.environ.get("ELEVENLABS_API_KEY")
@@ -19,9 +20,24 @@ def tts(
     voice: str,
     max_attempts: int = 6,
     initial_delay: int = 5,
+    stability: float = 0.5,
+    similarity_boost: float = 0.75,
+    style: float = 0.35,
+    use_speaker_boost: bool = True
 ):
     def generate_with_params():
-        return generate(text, voice=voice)
+        return generate(
+            text=text,
+            voice=Voice(
+                voice_id=voice,
+                settings=VoiceSettings(
+                    stability=stability,
+                    similarity_boost=similarity_boost,
+                    style=style,
+                    use_speaker_boost=use_speaker_boost
+                )
+            )
+        )
 
     audio_bytes = exponential_backoff(
         generate_with_params,
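
Hypothetical call into the updated `tts` wrapper (the voice id is a placeholder). Lower `stability` gives more expressive but less consistent reads, `similarity_boost` pulls the output closer to the reference voice, and `style` exaggerates the speaker's delivery.

    from app.plugins.elevenlabs import tts  # assumed import path

    audio_bytes = tts(
        "Beneath the volcano, the dragons stirred.",
        voice="21m00Tcm4TlvDq8ikWAM",  # placeholder ElevenLabs voice id
        stability=0.5,
        similarity_boost=0.75,
        style=0.35,
        use_speaker_boost=True,
    )
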
diff --git a/app/prompt_templates/cinema/screenwriter_prompt.txt b/app/prompt_templates/cinema/screenwriter_prompt.txt
index c731df3..cbf34e2 100644
--- a/app/prompt_templates/cinema/screenwriter_prompt.txt
+++ b/app/prompt_templates/cinema/screenwriter_prompt.txt
@@ -1,3 +1,18 @@
 $character_details
 
 The premise of the story is:
-$story
\ No newline at end of file
+$story
+
+---
+
+Write a screenplay for a film based on the information provided above. Intersperse the screenplay with descriptions of events and character dialogues. Structure the screenplay as a music prompt and a sequence of clips. A clip contains the following:
+
+voiceover: whether the voiceover is the narrator or a character speaking
+character: If voiceover is in character mode, the name of the speaking character. Important: you may only use the exact name of a character provided by the user in the cast of characters.
+speech: If voiceover is in character or narrator mode, the text of the speech
+image_prompt: a description of the image content for the clip
+
+Generate around 5-10 clips. Approximately half should be character dialogue and half should be narration. For clips where the voiceover is a character, some of the image prompts may emphasize non-personal objects or scenery, and some may emphasize the character.
+
+The screenplay also contains a single music prompt. The music prompt is a 1-sentence description of the backing music for the film.
+
+Do not include an introduction or restatement of the prompt, just go straight into the screenplay.
\ No newline at end of file
diff --git a/app/prompt_templates/cinema/screenwriter_system.txt b/app/prompt_templates/cinema/screenwriter_system.txt
index fa4b212..c25c704 100644
--- a/app/prompt_templates/cinema/screenwriter_system.txt
+++ b/app/prompt_templates/cinema/screenwriter_system.txt
@@ -1,14 +1 @@
-You are a critically acclaimed screenwriter who writes incredibly captivating and original scripts.
-
-Users will give you a cast of characters, including their names and biographies, as well as a premise or synopsis for the story.
-
-You will then write a screenplay for a film based on the information provided. Intersperse the screenplay with descriptions of events and character dialogues. Stucture the screenplay as sequence of clips. A clip contains the following:
-
-voiceover: whether the voiceover is the narrator or a character speaking
-character: If voiceover is in character mode, the name of the speaking character. Important: you may only use the exact name of a character provided by the user in the cast of characters.
-speech: If voiceover is in character or narrator mode, the text of the speech
-image_prompt: a description of the image content for the clip
-
-Generate around 5-10 clips. Approximately half should be character dialogue and half should be narration. For clips where the voiceover is a character, some of the image prompts may emphasize non-personal objects or scenery, and some may emphasize the character.
-
-Do not include an introduction or restatement of the prompt, just go straight into the screenplay.
\ No newline at end of file
+You are a critically acclaimed screenwriter who writes incredibly captivating and original scripts.
\ No newline at end of file
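
An illustrative output shape under the revised prompt, matching `StoryResult`/`StoryClip` in app/models/scenarios.py (all field values invented):

    screenplay = {
        "music_prompt": "a slow, shimmering vibraphone theme with distant percussion",
        "clips": [
            {
                "voiceover": "narrator",
                "character": None,
                "speech": "Beneath the volcano, the dragons stirred.",
                "image_prompt": "ornate dragons, glowing volcanic lair",
            },
            {
                "voiceover": "character",
                "character": "Ember",  # invented name; must come from the provided cast
                "speech": "Today we fly beyond the ash clouds!",
                "image_prompt": "young dragon perched on an obsidian cliff",
            },
        ],
    }
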
diff --git a/app/scenarios/monologue.py b/app/scenarios/monologue.py
index df588d2..7d3fa2e 100644
--- a/app/scenarios/monologue.py
+++ b/app/scenarios/monologue.py
@@ -15,9 +15,8 @@ def monologue(request: MonologueRequest) -> MonologueResult:
         name=name,
         description=description
     )
-
     llm = LLM(model=request.model, system_message=system_message, params=params)
-    monologue_text = llm(request.prompt)
+    monologue_text = llm(request.prompt, image=request.init_image)
 
     result = MonologueResult(monologue=monologue_text)
diff --git a/app/scenarios/story.py b/app/scenarios/story.py
index 1307b88..42ecb04 100644
--- a/app/scenarios/story.py
+++ b/app/scenarios/story.py
@@ -45,6 +45,19 @@ def story(request: StoryRequest):
     story = screenwriter(prompt, output_schema=StoryResult)
 
+    print(story)
+    #story["music_prompt"] = None
+    #if not request.music:
+    #    story["music_prompt"] = None
+
+    print(request)
+    # if request.music:
+    #     if request.music_prompt:
+    #         story["music_prompt"] = request.music_prompt
+    #     else:
+    #         story["music_prompt"] = "a long vibraphone solo"
+
+    print("===== generate a story =======")
     print(prompt)
     print("-----")
diff --git a/app/utils.py b/app/utils.py
index dc0204c..b6fce9b 100644
--- a/app/utils.py
+++ b/app/utils.py
@@ -2,6 +2,7 @@
 import time
 import os
 import re
+import base64
 import traceback
 import requests
 import math
@@ -20,14 +21,17 @@
 from pydantic import BaseModel, Field, create_model
 
 
+def get_video_duration(video_file):
+    cmd = ['ffprobe', '-v', 'error', '-show_entries', 'format=duration', '-of', 'default=noprint_wrappers=1:nokey=1', video_file]
+    duration = subprocess.check_output(cmd).decode().strip()
+    return float(duration)
+
+
 def orjson_dumps(v, *, default, **kwargs):
-    # orjson.dumps returns bytes, to match standard json.dumps we need to decode
     return orjson.dumps(v, default=default, **kwargs).decode()
 
 
 def now_tz():
-    # Need datetime w/ timezone for cleanliness
-    # https://stackoverflow.com/a/24666683
     return datetime.datetime.now(datetime.timezone.utc)
 
@@ -80,6 +84,15 @@ def PIL_to_bytes(image, ext="JPEG", quality=95):
     return img_byte_arr.getvalue()
 
 
+def url_to_image_data(url, max_size=(512, 512)):
+    img = download_image(url)
+    img.thumbnail(max_size, Image.Resampling.LANCZOS)
+    img_bytes = PIL_to_bytes(img, ext="JPEG", quality=95)
+    data = base64.b64encode(img_bytes).decode("utf-8")
+    data = f"data:image/jpeg;base64,{data}"
+    return data
+
+
 def calculate_target_dimensions(images, max_pixels):
     min_w = float('inf')
     min_h = float('inf')
@@ -158,34 +171,40 @@ def create_dialogue_thumbnail(image1_url, image2_url, width, height, ext="WEBP"):
     return img_byte_arr.getvalue()
 
 
-def concatenate_videos(video_files, output_file):
-    standard_fps = "30"  # Target frame rate
-
-    # Step 1: Convert all videos to the same frame rate
+def concatenate_videos(video_files, output_file, fps=30):
     converted_videos = []
     for i, video in enumerate(video_files):
         with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp:
             output_video = temp.name
-            convert_command = ['ffmpeg', '-y', '-loglevel', 'panic', '-i', video, '-r', standard_fps, '-c:a', 'copy', output_video]
+            convert_command = ['ffmpeg', '-y', '-loglevel', 'panic', '-i', video, '-r', str(fps), '-c:a', 'copy', output_video]
             subprocess.run(convert_command)
             converted_videos.append(output_video)
-
-    # create the filter_complex string
     filter_complex = "".join([f"[{i}:v] [{i}:a] " for i in range(len(converted_videos))])
     filter_complex += f"concat=n={len(converted_videos)}:v=1:a=1 [v] [a]"
-
-    # concatenate videos
     concat_command = ['ffmpeg']
     for video in converted_videos:
         concat_command.extend(['-i', video])
     concat_command.extend(['-y', '-loglevel', 'panic', '-filter_complex', filter_complex, '-map', '[v]', '-map', '[a]', output_file])
     subprocess.run(concat_command)
-
-    # delete temporary files
    for video in converted_videos:
         os.remove(video)
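
How the new helpers compose in animated_story above, assuming ffmpeg/ffprobe are on PATH and using hypothetical file names: measure the total clip duration, generate music of that length, then merge it into the concatenated video's audio. Note that the amerge filter requires the video to already carry an audio track, which holds here because every clip has speech.

    from app.utils import get_video_duration, mix_video_audio  # assumed import path

    total_seconds = sum(get_video_duration(f) for f in ["clip1.mp4", "clip2.mp4"])
    print(f"story runs {total_seconds:.1f}s")
    mix_video_audio("story.mp4", "music.mp3", "story_with_music.mp4")
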
 
 
+def mix_video_audio(video_path, audio_path, output_path):
+    cmd = [
+        'ffmpeg',
+        '-i', video_path,
+        '-i', audio_path,
+        '-filter_complex', '[1:a]volume=1.0[a1];[0:a][a1]amerge=inputs=2[a]',
+        '-map', '0:v',
+        '-map', '[a]',
+        '-c:v', 'copy',
+        '-ac', '2',
+        output_path
+    ]
+    subprocess.run(cmd, check=True)
+
+
 def combine_audio_video(audio_url: str, video_url: str):
     audio_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=True)
     video_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=True)
@@ -273,15 +292,17 @@ def handle_error(e):
 
 
 def wrap_text(draw, text, font, max_width):
-    lines = []
     words = text.split()
-
-    while words:
-        line = ''
-        while words and draw.textlength(line + words[0], font=font) <= max_width:
-            line += (words.pop(0) + ' ')
-        lines.append(line)
-
+    lines = []
+    current_line = []
+    for word in words:
+        if draw.textlength(' '.join(current_line + [word]), font=font) > max_width:
+            lines.append(' '.join(current_line))
+            current_line = [word]
+        else:
+            current_line.append(word)
+    if current_line:
+        lines.append(' '.join(current_line))
     return lines
diff --git a/tests/test.py b/tests/test.py
index d3a47c0..9c97dcc 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -72,4 +72,6 @@ def test_func2():
 
     print(type(result))
-    print(result)
\ No newline at end of file
+    print(result)
+
+
diff --git a/tests/test_animation.py b/tests/test_animation.py
index d540c96..144b724 100644
--- a/tests/test_animation.py
+++ b/tests/test_animation.py
@@ -9,11 +9,12 @@ def test_monologue_animation():
     Test monologue on static character and prompt
     """
     request = {
-        "character_id": "6596129023f1c4b471dbb94a",
-        "prompt": "Tell me a story about pizza",
-        "gfpgan": False,
+        "character_id": "65f35e7f44390ad1df63680e",
+        "prompt": "Tell me what you feel about Bombay Beach",
+        "gfpgan": True,
         "intro_screen": True
     }
+
     response = client.post("/animation/monologue", json=request)
     print(response.json())
     assert response.status_code == 200
@@ -24,12 +25,15 @@ def test_dialogue_animation():
     Test monologue on static character and prompt
     """
     request = {
-        "character_ids": ["6596129023f1c4b471dbb94a", "6598e117dd06d165264f2277"],
-        "prompt": "Debate panspermia vs. abiogenesis",
-        # "gfpgan": True,
+        #"character_ids": ["6596129023f1c4b471dbb94a", "6598e117dd06d165264f2277"],
+        #"prompt": "Debate panspermia vs. abiogenesis",
+        "prompt": "Have a rap battle!! Dis each other. Be expressive in your speech! Exclamations, random ALL CAPS, onomatopoeia, etc. Be creative but sharp. Dis each other! You are trying to win this rap battle against each other.",
+        "character_ids": ["65ce8995b6124cd312fedb99", "65eea28c730ba08c8c7b6810"],
+        "gfpgan": True,
         "intro_screen": True,
-        "dual_view": True
+        # "dual_view": True
     }
+
     response = client.post("/animation/dialogue", json=request)
     print(response.json())
     assert response.status_code == 200
@@ -42,7 +46,9 @@ def test_story():
     request = {
         "character_ids": [],
         "prompt": "A family of Dragons lives in a mystical layer underneath a volcano. The dragons are beautiful, ornately decorated, fire-breathing, creatures. They are brave and wise. The story should just be about them journeying to a bunch of beautiful far away places in nature, and then coming back to their volcano lair. Make sure the image prompts are very short. No more than 5 words.",
-        "intro_screen": True
+        "intro_screen": True,
+        # "music_prompt": "a long drum solo with percussions and bongos",
+        "music": True
     }
 
     response = client.post("/animation/story", json=request)
@@ -56,9 +62,10 @@ def test_reel():
     """
     request = {
         "character_ids": [],
-        "prompt": "A jazz woman dancing to some saxophone jazzy show tunes, instrumental",
+        #"prompt": "A jazz woman dancing to some saxophone jazzy show tunes, instrumental",
+        "prompt": "A long commercial about a drug called Paradisium. explain its benefits and side effects, and go on and on and on.",
         "intro_screen": True,
-        "narration": "off",
+        #"narration": "off",
         #"music_prompt": "death metal heavy rock, incomprehensible, gore, screen"
     }
diff --git a/tests/test_scenarios.py b/tests/test_scenarios.py
index 2fa4d0b..f3269df 100644
--- a/tests/test_scenarios.py
+++ b/tests/test_scenarios.py
@@ -10,7 +10,8 @@ def test_monologue():
     """
     request = {
         "character_id": "6596129023f1c4b471dbb94a",
-        "prompt": "Tell me a story about pizza"
+        "prompt": "What does the image say",
+        #"init_image": "https://images.squarespace-cdn.com/content/v1/6213c340453c3f502425776e/c24904d4-f0f0-4a26-9470-fec227dde15c/image-90.png"
     }
 
     response = client.post("/scenarios/monologue", json=request)
@@ -25,7 +26,7 @@ def test_dialogue():
     """
     request = {
         "character_ids": ["6596129023f1c4b471dbb94a", "6598e117dd06d165264f2277"],
-        "prompt": "Debate whether or not pizza is a vegetable"
+        "prompt": "Debate whether or not pizza is a vegetable once and for all"
     }
 
     response = client.post("/scenarios/dialogue", json=request)
@@ -34,14 +35,14 @@ def test_dialogue():
     assert response.status_code == 200
 
 
-
 def test_story():
     """
     Test dialogue function on static characters and prompt
     """
     request = {
         "character_ids": ["6596129023f1c4b471dbb94a", "6598e117dd06d165264f2277"],
-        "prompt": "Debate whether or not pizza is a vegetable"
+        "prompt": "Debate whether or not pizza is a vegetable",
+        "music": True
     }
 
     response = client.post("/scenarios/story", json=request)
diff --git a/tests/test_stories.py b/tests/test_stories.py
index 6c494db..65ca465 100644
--- a/tests/test_stories.py
+++ b/tests/test_stories.py
@@ -28,8 +28,9 @@ def test_story():
 
     request = {
         "character_ids": [],
-        "prompt": "A family of Dragons lives in a mystical layer underneath a volcano. The dragons are beautiful, ornately decorated, fire-breathing, creatures. They are brave and wise. The story should just be about them journeying to a bunch of beautiful far away places in nature, and then coming back to their volcano lair. Make at least 10 clips.",
-        "intro_screen": True
+        "prompt": "A family of Dragons lives in a mystical layer underneath a volcano. The dragons are beautiful, ornately decorated, fire-breathing, creatures. They are brave and wise. The story should just be about them journeying to a bunch of beautiful far away places in nature, and then coming back to their volcano lair.",
+        "intro_screen": True,
+        "music": True,
     }
 
     response = client.post("/animation/story", json=request)