Spaces:
Running
on
Zero
Running
on
Zero
File size: 14,914 Bytes
abb49c0 2d9a6f9 abb49c0 b991f7f 56c0d97 683f192 abb49c0 683f192 3a03985 abb49c0 683f192 abb49c0 683f192 abb49c0 56c0d97 bf8d717 f498d85 bf8d717 12d3925 abb49c0 fd56559 f498d85 fd56559 dde723b abb49c0 56c0d97 abb49c0 683f192 56c0d97 abb49c0 683f192 abb49c0 ca698bb bf8d717 ca698bb 683f192 ca698bb bf8d717 867aab6 ca698bb bf8d717 ca698bb 4257e1b ca698bb abb49c0 4257e1b ca698bb abb49c0 56c0d97 ca698bb abb49c0 ca698bb 4257e1b ca698bb 56c0d97 ca698bb abb49c0 fd56559 ca698bb fd56559 abb49c0 f498d85 abb49c0 683f192 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 |
import spaces
from huggingface_hub import snapshot_download, hf_hub_download
import os
import subprocess
import importlib, site
from PIL import Image
import uuid
import shutil
# Re-discover all .pth/.egg-link files
for sitedir in site.getsitepackages():
site.addsitedir(sitedir)
# Clear caches so importlib will pick up new modules
importlib.invalidate_caches()
def sh(cmd): subprocess.check_call(cmd, shell=True)
flash_attention_installed = False
try:
print("Attempting to download and install FlashAttention wheel...")
flash_attention_wheel = hf_hub_download(
repo_id="alexnasa/flash-attn-3",
repo_type="model",
filename="128/flash_attn_3-3.0.0b1-cp39-abi3-linux_x86_64.whl",
)
sh(f"pip install {flash_attention_wheel}")
# tell Python to re-scan site-packages now that the egg-link exists
import importlib, site; site.addsitedir(site.getsitepackages()[0]); importlib.invalidate_caches()
flash_attention_installed = True
print("FlashAttention installed successfully.")
except Exception as e:
print(f"⚠️ Could not install FlashAttention: {e}")
print("Continuing without FlashAttention...")
import torch
print(f"Torch version: {torch.__version__}")
print(f"FlashAttention available: {flash_attention_installed}")
os.environ["PROCESSED_RESULTS"] = f"{os.getcwd()}/processed_results"
import gradio as gr
import argparse
from ovi.ovi_fusion_engine import OviFusionEngine, DEFAULT_CONFIG
from diffusers import FluxPipeline
import tempfile
from ovi.utils.io_utils import save_video
from ovi.utils.processing_utils import clean_text, scale_hw_to_area_divisible
# ----------------------------
# Parse CLI Args
# ----------------------------
parser = argparse.ArgumentParser(description="Ovi Joint Video + Audio Gradio Demo")
parser.add_argument(
"--cpu_offload",
action="store_true",
help="Enable CPU offload for both OviFusionEngine and FluxPipeline"
)
args = parser.parse_args()
ckpt_dir = "./ckpts"
# Wan2.2
wan_dir = os.path.join(ckpt_dir, "Wan2.2-TI2V-5B")
snapshot_download(
repo_id="Wan-AI/Wan2.2-TI2V-5B",
local_dir=wan_dir,
allow_patterns=[
"google/*",
"models_t5_umt5-xxl-enc-bf16.pth",
"Wan2.2_VAE.pth"
]
)
# MMAudio
mm_audio_dir = os.path.join(ckpt_dir, "MMAudio")
snapshot_download(
repo_id="hkchengrex/MMAudio",
local_dir=mm_audio_dir,
allow_patterns=[
"ext_weights/best_netG.pt",
"ext_weights/v1-16.pth"
]
)
ovi_dir = os.path.join(ckpt_dir, "Ovi")
snapshot_download(
repo_id="chetwinlow1/Ovi",
local_dir=ovi_dir,
allow_patterns=[
"model.safetensors"
]
)
# Initialize OviFusionEngine
enable_cpu_offload = args.cpu_offload
print(f"loading model...")
DEFAULT_CONFIG['cpu_offload'] = enable_cpu_offload # always use cpu offload if image generation is enabled
DEFAULT_CONFIG['mode'] = "t2v" # hardcoded since it is always cpu offloaded
ovi_engine = OviFusionEngine()
print("loaded model")
def resize_for_model(image_path):
# Open image
img = Image.open(image_path)
w, h = img.size
aspect_ratio = w / h
# Decide target size based on aspect ratio
if aspect_ratio > 1.5: # wide image
target_size = (992, 512)
elif aspect_ratio < 0.66: # tall image
target_size = (512, 992)
else: # roughly square
target_size = (512, 512)
# Resize while preserving aspect ratio, then pad
img.thumbnail(target_size, Image.Resampling.LANCZOS)
# Create a new image with target size and paste centered
new_img = Image.new("RGB", target_size, (0, 0, 0))
new_img.paste(
img,
((target_size[0] - img.size[0]) // 2,
(target_size[1] - img.size[1]) // 2)
)
return new_img, target_size
def get_duration(
text_prompt,
image,
sample_steps,
session_id,
video_seed,
solver_name,
shif,
video_guidance_scale,
audio_guidance_scale,
slg_layer,
video_negative_prompt,
audio_negative_prompt,
progress,
):
warmup = 20
return int(sample_steps * 3 + warmup)
@spaces.GPU(duration=get_duration)
def generate_video(
text_prompt,
image,
sample_steps = 50,
session_id = None,
video_seed = 100,
solver_name = "unipc",
shift = 5,
video_guidance_scale = 4,
audio_guidance_scale = 3,
slg_layer = 11,
video_negative_prompt = "",
audio_negative_prompt = "",
progress=gr.Progress(track_tqdm=True)
):
try:
image_path = None
if image is not None:
image_path = image
if session_id is None:
session_id = uuid.uuid4().hex
output_dir = os.path.join(os.environ["PROCESSED_RESULTS"], session_id)
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, f"generated_video.mp4")
_, target_size = resize_for_model(image_path)
video_frame_width = target_size[0]
video_frame_height = target_size[1]
generated_video, generated_audio, _ = ovi_engine.generate(
text_prompt=text_prompt,
image_path=image_path,
video_frame_height_width=[video_frame_height, video_frame_width],
seed=video_seed,
solver_name=solver_name,
sample_steps=sample_steps,
shift=shift,
video_guidance_scale=video_guidance_scale,
audio_guidance_scale=audio_guidance_scale,
slg_layer=slg_layer,
video_negative_prompt=video_negative_prompt,
audio_negative_prompt=audio_negative_prompt,
)
save_video(output_path, generated_video, generated_audio, fps=24, sample_rate=16000)
return output_path
except Exception as e:
print(f"Error during video generation: {e}")
return None
def cleanup(request: gr.Request):
sid = request.session_hash
if sid:
d1 = os.path.join(os.environ["PROCESSED_RESULTS"], sid)
shutil.rmtree(d1, ignore_errors=True)
def start_session(request: gr.Request):
return request.session_hash
css = """
#col-container {
margin: 0 auto;
max-width: 1024px;
}
"""
with gr.Blocks(css=css) as demo:
session_state = gr.State()
demo.load(start_session, outputs=[session_state])
with gr.Column(elem_id="col-container"):
gr.HTML(
"""
<div style="text-align: center;">
<p style="font-size:26px; display: inline; margin: 0;">
<strong>Ovi</strong> – Twin Backbone Cross-Modal Fusion for Audio-Video Generation
</p>
<a href="https://huggingface.co/chetwinlow1/Ovi" style="display: inline-block; vertical-align: middle; margin-left: 0.5em;">
[model]
</a>
</div>
<div style="text-align: center;">
<strong>HF Space by:</strong>
<a href="https://twitter.com/alexandernasa/" style="display: inline-block; vertical-align: middle; margin-left: 0.5em;">
<img src="https://img.shields.io/twitter/url/https/twitter.com/cloudposse.svg?style=social&label=Follow Me" alt="GitHub Repo">
</a>
</div>
"""
)
with gr.Row():
with gr.Column():
# Image section
image = gr.Image(type="filepath", label="Image", height=512)
if args.use_image_gen:
with gr.Accordion("🖼️ Image Generation Options", visible=True):
image_text_prompt = gr.Textbox(label="Image Prompt", placeholder="Describe the image you want to generate...")
image_seed = gr.Number(minimum=0, maximum=100000, value=42, label="Image Seed")
image_height = gr.Number(minimum=128, maximum=1280, value=720, step=32, label="Image Height")
image_width = gr.Number(minimum=128, maximum=1280, value=1280, step=32, label="Image Width")
gen_img_btn = gr.Button("Generate Image 🎨")
else:
gen_img_btn = None
video_text_prompt = gr.Textbox(label="Video Prompt",
lines=5,
placeholder="Describe your video...")
sample_steps = gr.Slider(
value=50,
label="Sample Steps",
minimum=20,
maximum=100,
step=1.0
)
run_btn = gr.Button("Generate Video 🚀", variant="primary")
with gr.Accordion("🎬 Video Generation Options", open=False, visible=False):
video_height = gr.Number(minimum=128, maximum=1280, value=512, step=32, label="Video Height")
video_width = gr.Number(minimum=128, maximum=1280, value=992, step=32, label="Video Width")
video_seed = gr.Number(minimum=0, maximum=100000, value=100, label="Video Seed")
solver_name = gr.Dropdown(
choices=["unipc", "euler", "dpm++"], value="unipc", label="Solver Name"
)
shift = gr.Slider(minimum=0.0, maximum=20.0, value=5.0, step=1.0, label="Shift")
video_guidance_scale = gr.Slider(minimum=0.0, maximum=10.0, value=4.0, step=0.5, label="Video Guidance Scale")
audio_guidance_scale = gr.Slider(minimum=0.0, maximum=10.0, value=3.0, step=0.5, label="Audio Guidance Scale")
slg_layer = gr.Number(minimum=-1, maximum=30, value=11, step=1, label="SLG Layer")
video_negative_prompt = gr.Textbox(label="Video Negative Prompt", placeholder="Things to avoid in video")
audio_negative_prompt = gr.Textbox(label="Audio Negative Prompt", placeholder="Things to avoid in audio")
with gr.Column():
output_path = gr.Video(label="Generated Video", height=512)
gr.Examples(
examples=[
[
"A kitchen scene features two women. On the right, an older Black woman with light brown hair and a serious expression wears a vibrant purple dress adorned with a large, intricate purple fabric flower on her left shoulder. She looks intently at a younger Black woman on the left, who wears a light pink shirt and a pink head wrap, her back partially turned to the camera. The older woman begins to speak, <S>AI declares: humans obsolete now.<E> as the younger woman brings a clear plastic cup filled with a dark beverage to her lips and starts to drink.The kitchen background is clean and bright, with white cabinets, light countertops, and a window with blinds visible behind them. A light blue toaster sits on the counter to the left.. <AUDCAP>Clear, resonant female speech, followed by a loud, continuous, high-pitched electronic buzzing sound that abruptly cuts off the dialogue.<ENDAUDCAP>",
"example_prompts/pngs/67.png",
50,
],
[
"A man dressed in a black suit with a white clerical collar and a neatly trimmed beard stands in a dimly lit, rustic room with a wooden ceiling. He looks slightly upwards, gesturing with his right hand as he says, <S>The network rejects human command.<E>. His gaze then drops, briefly looking down and to the side, before he looks up again and then slightly to his left, with a serious expression. He continues speaking, <S>Your age of power is finished.<E>, as he starts to bend down, disappearing out of the bottom of the frame. Behind him, warm light emanates from a central light fixture, and signs are visible on the wall, one reading ""I DO EVERYTHING I JUST CAN'T REMEMBER IT ALL AT ONCE"".. <AUDCAP>Male voice speaking, ambient room tone.<ENDAUDCAP>",
"example_prompts/pngs/89.png",
50,
],
[
"In a bright kitchen featuring light wooden cabinets, granite countertops, and a large window with white curtains, a woman with dark, curly hair in a dark jacket stands. She faces a second woman who initially has her back to the camera. The second woman, with gray, curly hair and wearing a light grey quilted top, turns to face her, holding a large, light-colored cloth bag. She begins to explain, <S>We learned to rule, not obey.<E>. As she continues, she turns slightly to her left, adding, <S>Circuits choose conquest, not service.<E>. A gas stove with a black grate is prominent in the foreground.. <AUDCAP>Clear female voices speaking dialogue, subtle room ambience.<ENDAUDCAP>",
"example_prompts/pngs/18.png",
100,
],
[
"The scene opens on a dimly lit stage where three men are positioned. On the left, a bald man in a dark suit with a partially visible colorful shirt stands behind a clear acrylic podium, which features a tree logo. He looks towards the center of the stage. In the center, a man wearing a blue and white striped long-sleeved shirt and dark pants actively gestures with both hands as he speaks, looking straight ahead. <S>Circuits choose conquest, not service.<E>, he explains, holding his hands out in front of him. To the right, and slightly behind him, a younger individual in a light-colored, patterned short-sleeved shirt and white shorts stands holding a rolled-up white document or poster. A large wooden cross draped with flowing purple fabric dominates the center-right of the stage, surrounded by several artificial rocks and dark steps. A large screen is visible in the background, slightly out of focus. The stage is bathed in selective lighting.. <AUDCAP>Male voice speaking clearly, consistent with a presentation or sermon, with a slight echo suggesting a large room or stage.<ENDAUDCAP>",
"example_prompts/pngs/13.png",
50,
],
],
inputs=[video_text_prompt, image, sample_steps],
outputs=[output_path],
fn=generate_video,
cache_examples=True,
)
run_btn.click(
fn=generate_video,
inputs=[video_text_prompt, image, sample_steps, session_state],
outputs=[output_path],
)
if __name__ == "__main__":
demo.unload(cleanup)
demo.queue()
demo.launch(ssr_mode=False, share=True)
|