From 13b058731a811cdf05360835a3ecb118563a15c1 Mon Sep 17 00:00:00 2001
From: Nathan Raw <nxr9266@g.rit.edu>
Date: Sun, 7 May 2023 04:36:42 +0000
Subject: [PATCH] :sparkles: update examples

---
 examples/make_music_video.py    | 31 +++++++++++++------------------
 examples/run_app.py             |  8 ++++++--
 examples/run_music_video_app.py |  8 +++++---
 requirements.txt                |  2 ++
 4 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/examples/make_music_video.py b/examples/make_music_video.py
index 912b369..4b6cfe3 100644
--- a/examples/make_music_video.py
+++ b/examples/make_music_video.py
@@ -1,29 +1,33 @@
-from stable_diffusion_videos import StableDiffusionWalkPipeline
+import random
 
+import torch
+from stable_diffusion_videos import StableDiffusionWalkPipeline
+from diffusers.utils.import_utils import is_xformers_available
 from diffusers.models import AutoencoderKL
 from diffusers.schedulers import LMSDiscreteScheduler
-import torch
 
 
 pipe = StableDiffusionWalkPipeline.from_pretrained(
     'runwayml/stable-diffusion-v1-5',
-    vae=AutoencoderKL.from_pretrained(f"stabilityai/sd-vae-ft-ema"),
     torch_dtype=torch.float16,
-    revision="fp16",
     safety_checker=None,
+    vae=AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16).to("cuda"),
     scheduler=LMSDiscreteScheduler(
         beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
     )
 ).to("cuda")
 
 
+if is_xformers_available():
+    pipe.enable_xformers_memory_efficient_attention()
+
 # I give you permission to scrape this song :)
 # youtube-dl -f bestaudio --extract-audio --audio-format mp3 --audio-quality 0 -o "music/thoughts.%(ext)s" https://soundcloud.com/nateraw/thoughts
 audio_filepath = 'music/thoughts.mp3'
 
 # Seconds in the song. Here we slice the audio from 0:07-0:16
 # Should be same length as prompts/seeds.
-audio_offsets = [7, 10, 13, 16]
+audio_offsets = [7, 10, 13]
 
 # Output video frames per second.
 # Use lower values for testing (5 or 10), higher values for better quality (30 or 60)
@@ -33,18 +37,9 @@
 # This array should be `len(prompts) - 1` as its steps between prompts.
 num_interpolation_steps = [(b-a) * fps for a, b in zip(audio_offsets, audio_offsets[1:])]
 
-prompts = [
-    'Baroque oil painting anime key visual concept art of wanderer above the sea of fog 1 8 1 8 with anime maid, brutalist, dark fantasy, rule of thirds golden ratio, fake detail, trending pixiv fanbox, acrylic palette knife, style of makoto shinkai studio ghibli genshin impact jamie wyeth james gilleard greg rutkowski chiho aoshima',
-    'the conscious mind entering the dark wood window into the surreal subconscious dream mind, majestic, dreamlike, surrealist, trending on artstation, by gustavo dore ',
-    'Chinese :: by martine johanna and simon stålenhag and chie yoshii and casey weldon and wlop :: ornate, dynamic, particulate, rich colors, intricate, elegant, highly detailed, centered, artstation, smooth, sharp focus, octane render, 3d',
-    'Chinese :: by martine johanna and simon stålenhag and chie yoshii and casey weldon and wlop :: ornate, dynamic, particulate, rich colors, intricate, elegant, highly detailed, centered, artstation, smooth, sharp focus, octane render, 3d',
-]
-seeds = [
-    6954010,
-    8092009,
-    1326004,
-    5019608,
-]
+prompts = ["a cat with a funny hat", "snoop dogg at the dmv", "steak flavored ice cream"]
+seeds = [random.randint(0, int(9e9)) for _ in prompts]  # one seed per prompt; int bound because randint rejects floats on Python 3.12+
+
 pipe.walk(
     prompts=prompts,
     seeds=seeds,
@@ -52,7 +47,7 @@
     fps=fps,
     audio_filepath=audio_filepath,
     audio_start_sec=audio_offsets[0],
-    batch_size=16,
+    batch_size=12,  # Increase/decrease based on available GPU memory. This fits on 24GB A10
     num_inference_steps=50,
     guidance_scale=15,
     margin=1.0,
diff --git a/examples/run_app.py b/examples/run_app.py
index a16c428..ed59bba 100644
--- a/examples/run_app.py
+++ b/examples/run_app.py
@@ -2,19 +2,23 @@
 
 from diffusers.models import AutoencoderKL
 from diffusers.schedulers import LMSDiscreteScheduler
+from diffusers.utils.import_utils import is_xformers_available
 import torch
 
+
 pipe = StableDiffusionWalkPipeline.from_pretrained(
     'runwayml/stable-diffusion-v1-5',
-    vae=AutoencoderKL.from_pretrained(f"stabilityai/sd-vae-ft-ema"),
     torch_dtype=torch.float16,
-    revision="fp16",
     safety_checker=None,
+    vae=AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16).to("cuda"),
     scheduler=LMSDiscreteScheduler(
         beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
     )
 ).to("cuda")
 
+if is_xformers_available():
+    pipe.enable_xformers_memory_efficient_attention()
+
 interface = Interface(pipe)
 
 if __name__ == '__main__':
diff --git a/examples/run_music_video_app.py b/examples/run_music_video_app.py
index 80d0cbe..9eee279 100644
--- a/examples/run_music_video_app.py
+++ b/examples/run_music_video_app.py
@@ -15,21 +15,23 @@
 
 from diffusers.models import AutoencoderKL
 from diffusers.schedulers import LMSDiscreteScheduler
+from diffusers.utils.import_utils import is_xformers_available
 import torch
 import youtube_dl
 import os
 
 pipe = StableDiffusionWalkPipeline.from_pretrained(
     'runwayml/stable-diffusion-v1-5',
-    vae=AutoencoderKL.from_pretrained(f"stabilityai/sd-vae-ft-ema"),
     torch_dtype=torch.float16,
-    revision="fp16",
     safety_checker=None,
+    vae=AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16).to("cuda"),
     scheduler=LMSDiscreteScheduler(
         beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
     )
 ).to("cuda")
 
+if is_xformers_available():
+    pipe.enable_xformers_memory_efficient_attention()
 
 def download_example_clip(url, output_dir='./', output_filename='%(title)s.%(ext)s'):
     if (Path(output_dir) / output_filename).exists():
@@ -284,7 +286,7 @@ def on_generate_music_video_btn_click(
         inputs=audio,
         outputs=[audio_start_sec, duration],
         fn=on_audio_change_or_clear,
-        cache_examples=True
+        cache_examples=False
     )
     audio.change(on_audio_change_or_clear, audio, [audio_start_sec, duration])
     audio.clear(on_audio_change_or_clear, audio, [audio_start_sec, duration])
diff --git a/requirements.txt b/requirements.txt
index 6bfe69b..4e760fc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,5 @@ gradio
 librosa
 av<10.0.0
 realesrgan==0.2.5.0
+protobuf==3.20.*
+fsspec>=2023.4.0
\ No newline at end of file