# config_crema.yaml

#####################
# EDIT THESE FIELDS #
#####################

# Path to the checkpoint file, e.g. /path/to/crema_script.pt
# (paths are single-quoted because they contain spaces)
checkpoint: '/Users/a/Documents/Automations/git talking heads/checkpoints/crema_script.pt'

# Path to the audio encoder checkpoint, e.g. /path/to/audio_encoder.pt
encoder_checkpoint: '/Users/a/Documents/Automations/git talking heads/checkpoints/audio_encoder.pt'

# Path to the id_frame source - can be a .mp4 video or an image
id_frame: '/Users/a/Documents/Automations/git talking heads/image/testpic.png'

# Path to the audio input - can be anything supported by torchaudio (e.g. .wav, .mp3, .mp4)
audio: '/Users/a/Documents/Automations/git talking heads/audio/output_16kHz.wav'

# Path of the output video, e.g. /path/to/output_video.mp4
output: '/Users/a/Documents/Automations/git talking heads/output_video.mp4'

# Use GPU - highly recommended
gpu: true

# Pick id_frame randomly from video
# NOTE(review): presumably only meaningful when id_frame points at a video - confirm
id_frame_random: false

# Number of inference diffusion steps - 100 recommended
inference_steps: 100


######################################
# CHECKPOINT SPECIFIC - DON'T CHANGE #
######################################
diffusion:
  # Channel counts, grouped together; out_channels is 2 x in_channels.
  # NOTE(review): presumably in_channels = 3 means RGB input - confirm
  in_channels: 3
  out_channels: 6
  # NOTE(review): presumably the side length of the frames in pixels - confirm
  image_size: 128
  # Timestep count tied to the checkpoint; distinct from the user-facing
  # inference_steps setting.
  n_timesteps: 1000

unet:
  # Motion-frame settings, grouped together.
  # NOTE(review): presumably the channel count of each motion frame - confirm
  motion_channels: 3
  n_motion_frames: 2
  # NOTE(review): presumably the number of audio embeddings paired with the
  # motion frames - confirm against the model code
  n_audio_motion_embs: 2