-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.html
398 lines (363 loc) · 18.3 KB
/
index.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
<!DOCTYPE html>
<html lang="en">
<head>
<!-- charset first, so the parser decodes the rest of <head> correctly -->
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="google-site-verification" content="xdvJxvo39Ei0nahgmgXGp9DCslFea8wH789x6mmAY-A">
<!-- Web fonts (family names URL-encoded with '+' instead of raw spaces) -->
<link href="https://fonts.googleapis.com/css?family=Noto+Sans" rel="stylesheet">
<link href="https://fonts.googleapis.com/css?family=Indie+Flower" rel="stylesheet">
<!-- MathJax + ES6 polyfill. NOTE(review): the original polyfill.io origin was
     compromised in mid-2024; this loads the same bundle from Cloudflare's
     drop-in mirror instead. -->
<script src="https://cdnjs.cloudflare.com/polyfill/v3/polyfill.min.js?features=es6"></script>
<script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
<!-- Open Graph / Twitter link-preview metadata -->
<meta property="og:site_name" content="4-LEGS">
<meta property="og:type" content="video.other">
<meta property="og:title" content="4-LEGS: 4D Language Embedded Gaussian Splatting">
<meta property="og:description" content="">
<meta property="og:url" content="https://tau-vailab.github.io/4-LEGS/">
<meta property="og:image" content="https://tau-vailab.github.io/4-LEGS/webpage_assets/legs_thumbnail.png">
<meta property="article:publisher" content="https://tau-vailab.github.io/4-LEGS/">
<meta name="twitter:card" content="summary_large_image">
<meta name="twitter:title" content="4-LEGS: 4D Language Embedded Gaussian Splatting">
<meta name="twitter:description" content="">
<meta name="twitter:url" content="https://tau-vailab.github.io/4-LEGS/">
<meta name="twitter:image" content="https://tau-vailab.github.io/4-LEGS/webpage_assets/legs_thumbnail.png">
<title>4-LEGS: 4D Language Embedded Gaussian Splatting</title>
<link rel="icon" href="./webpage_assets/legs_browser_icon.png">
<!-- Third-party styles. NOTE(review): "[email protected]" below looks like an
     email-scrubbing artifact of a versioned specifier (e.g. bulma@0.9.x) —
     TODO restore the real pinned version. -->
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/css/bulma.min.css">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.min.css">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/swiper@9/swiper-bundle.min.css">
<link href="style.css" rel="stylesheet" type="text/css">
</head>
<body>
<div class="page-container">
<script src="https://cdn.jsdelivr.net/npm/swiper@9/swiper-bundle.min.js"></script>
<!-- title -->
<h1 class="ourh1" align="center">4-LEGS</h1>
<h2 class="ourh2" align="center">4D Language Embedded Gaussian Splatting</h2>
<!-- authors and affiliations -->
<section class="authors_block">
<div class="authors" align="center">
<span class="author-block"><a href="https://galfiebelman.github.io/" target="_blank">Gal Fiebelman</a><sup>1</sup>,</span>
<span class="author-block"><a href="https://www.linkedin.com/in/tamir-cohen-09a693167/" target="_blank">Tamir Cohen</a><sup>1</sup>,</span>
<span class="author-block"><a href="https://www.linkedin.com/in/ayellet-morgenstern-04b501211/" target="_blank">Ayellet Morgenstern</a><sup>1</sup>,</span>
<span class="author-block"><a href="https://phogzone.com/" target="_blank">Peter Hedman</a><sup>2</sup>,</span>
<span class="author-block"><a href="https://www.elor.sites.tau.ac.il/" target="_blank">Hadar Averbuch-Elor</a><sup>1</sup></span>
</div>
<div class="affiliations" align="center">
<span class="author-block"><sup>1</sup>Tel Aviv University, </span>
<span class="author-block"><sup>2</sup>Google Research</span>
</div>
</section>
<!-- authors and affiliations -->
<!-- link buttons -->
<div class="column has-text-centered">
<div class="publication-links" align="center">
<!-- arxiv link -->
<span class="link-block">
<a href="https://arxiv.org/abs/2410.10719" class="paper-link" style="display: inline-block">
<button class="button">
<span class="icon">
<i class="ai ai-arxiv"></i>
</span>
<span>arXiv</span>
</button>
</a>
</span>
<!-- Github Link. -->
<span class="link-block">
<a href="" style="display: inline-block">
<button class="button">
<span class="icon">
<i class="fa fa-github"></i>
</span>
<span>Code (Coming Soon)</span>
</button>
</a>
</span>
<!-- Data Link. -->
<span class="link-block">
<a href="" style="display: inline-block">
<button class="button">
<span class="icon">
<i class="fa fa-folder"></i>
</span>
<span>Data (Coming Soon)</span>
</button>
</a>
</span>
<!-- Supp Link. -->
<span class="link-block">
<a href="supp/index.html" style="display: inline-block">
<button class="button">
<span class="icon">
<i class="fa fa-plus-square"></i>
</span>
<span>Supplementary Material</span>
</button>
</a>
</span>
</div>
</div>
<br>
<center>
<video id="main-video" autobuffer muted autoplay loop controls width="832" height="468">
<source id="mp4" src="webpage_assets/demo/4_legs_demo.mp4" type="video/mp4">
</video>
</center>
<br>
<section class="tldr-section" width="100%">
<div class="intro-container has-text-justified">
<div class="intro-paragraph">
<p>
<br>
<span class="intro-paragraph_bold">TL;DR Our method grounds spatio-temporal features into a 4D Gaussian Splatting representation.</span><br><br>
This allows localizing actions in <b>time</b> and <b>space</b>.
Above we illustrate our method, given input multiview videos capturing a dynamic 3D scene, we optimize a <b>4-LEGS</b>,
a 4D Language Embedded Gaussian Splatting representation of the dynamic scene.
Then we localize a text query in both space and time using the mean relevancy score and the extracted relevancy maps. These
spatio-temporal maps allow for creating various highlight effects, such as automatically visualizing a bullet-time display at a slower speed
of the input query.
</p>
</div>
</div>
</section>
<!-- abstract -->
<section class="abstract-section" width="100%">
<div class="abstract-container has-text-justified">
<hr>
<h2 align="center">Abstract</h2>
<p class="has-text-justified">
The emergence of neural representations has revolutionized our means for digitally viewing a wide range of 3D scenes,
enabling the synthesis of photorealistic images rendered from novel views. Recently, several techniques have been proposed
for connecting these low-level representations with the high-level semantics understanding embodied within the scene.
These methods elevate the rich semantic understanding from 2D imagery to 3D representations, distilling high-dimensional
spatial features onto 3D space. In our work, we are interested in connecting language with a dynamic modeling of the world.
We show how to lift spatio-temporal features to a 4D representation based on 3D Gaussian Splatting. This enables an
interactive interface where the user can spatiotemporally localize events in the video from text prompts. We demonstrate our
system on public 3D video datasets of people and animals performing various actions.
</p>
</div>
</section>
<!-- interactive-section -->
<section class="interactive-section" width="100%">
<div class="abstract-container has-text-justified">
<hr>
<h2 align="center">Examples of 4-LEGS Text-Prompted Video Editing Applications</h2><br><br>
<table class="interactive-table" width="100%" align="center">
<tr>
<td align="center" >
<button id="video3" class="prompt_title_red_small red-box" onclick="playVideo('video3', '')" >Scene Selection</button>
</td>
<td align="center">
<!-- <a>           </a> -->
<button id="video2" class="prompt_title_black_small" onclick="playVideo('video2', '')">Bullet Time</button>
</td>
<td align="center">
<button id="video1" class="prompt_title_black_small" onclick="playVideo('video1', '')">Zoom In</button>
</td>
<td align="center" >
<button id="video4" class="prompt_title_black_small" onclick="playVideo('video4', '')" >Desaturation and Panning</button>
</td>
</tr>
</table>
<br>
<table class="interactive-table" width="100%" align="center">
<tr class="fixed-height-row" id="videoRow">
<td class="interactiv_vid" align="center" colspan="4">
<video id="videoPlayer" loop autoplay muted width="75%" class="result-video">
<source src="./webpage_assets/interactive/video3.mp4" type="video/mp4">
</video>
</td>
</tr>
</table>
<br>
<table class="interactive-table" width="100%" align="center">
<tr>
<td align="center">
<button id="prompt_a" class="prompt_title_red_small red-box" onclick="switchVideo('_a')">A person swinging the softball bat</button>
</td>
<td align="center">
<!-- <a>           </a> -->
<button id="prompt_b" class="prompt_title_blue_small" onclick="switchVideo('_b')">A person picking up the box</button>
</td>
<td align="center">
<button id="prompt_c" class="prompt_title_blue_small" onclick="switchVideo('_c')">A person throwing the basketball</button>
</td>
</tr>
</table>
<br><br>
<p class="interactive_bold">Select a video editing application and then select one of the text prompts to view the edit enabled by the spatio-temporal
grounding achieved by our method. <br>
As illustrated in this interactive visualization, <b>4-LEGS</b> enables interactive text-conditioned video editing by localizing spatio-temporal features
in both <b>time</b> and <b>space</b>.</p>
</div>
</section>
<!-- method -->
<section class="method-section" width="100%">
<div class="abstract-container">
<hr>
<h2 align="center">How does it work?</h2>
<br>
<div class="im_container has-text-justified" width="90%" align="center">
<img align="center" src="./webpage_assets/overview.png" alt="Overview" width="100%">
</div>
<p class="has-text-justified">
<br>
🌍 Given multiple videos capturing a dynamic 3D scene, we first extract pixel-aligned spatio-temporal language features at
multiple scales using a pretrained video-text model.<br><br>
💡 We average these features to produce spatio-temporal features, which are encoded into a more compact latent space that is used for supervising the
optimization of a 4D language embedded Gaussian.<br><br>
🔍 During inference, given an input language query, <b>4-LEGS</b> localizes the query in <b>time</b> by computing a relevancy score over the volumetric language
features distilled on the Gaussians, and in <b>space</b> by rendering relevancy maps in real time.<br><br>
📋 See our paper for more details on our 4D language embedded Gaussians and how we apply them
to enable an interactive interface for text-conditioned video editing tasks.
</p>
</div>
</section>
<!-- BibTex-->
<section class="bib-section" width="100%">
<div class="bib-container">
<hr>
<h2 align="center">BibTeX</h2>
<div class="code-container" align="left">
<code>
@misc{fiebelman20244legs4dlanguageembedded,<br>
    title={4-LEGS: 4D Language Embedded Gaussian Splatting}, <br>
    author={Gal Fiebelman and Tamir Cohen and Ayellet Morgenstern and Peter Hedman and Hadar Averbuch-Elor},<br>
    year={2024},<br>
    eprint={2410.10719},<br>
    archivePrefix={arXiv},<br>
    primaryClass={cs.CV}<br>
}
</code>
</div>
</div>
</section>
<section class="ack-section" width="100%">
<div class="ack-container">
<hr>
<h2 align="center">Acknowledgements</h2>
<p>
This work was partially funded by Google through a TAU-Google grant.
</p>
</div>
</section>
<p><br>
</p>
<p> </p>
<p> </p>
<p> </p>
</div>
<script>
// Initialize the Swiper carousel (library loaded from the CDN bundle above).
// NOTE(review): no element with class "swiper" appears in this page's markup —
// this looks like leftover setup from a carousel section; confirm it is still needed.
const swiper = new Swiper('.swiper', {
// Auto-advance slides every 4 seconds.
autoplay: {
delay: 4000,
},
// Optional parameters: 1s slide transition, wrap around at the ends.
speed: 1000,
loop: true,
// If we need pagination
pagination: {
el: '.swiper-pagination',
},
// Navigation arrows
navigation: {
nextEl: '.swiper-button-next',
prevEl: '.swiper-button-prev',
},
});
</script>
<script>
let currentVideo = "video3";
let currentSuffix = "";
// Move the red highlight from the currently selected prompt button to the new one.
// An empty suffix denotes the default selection, which maps to prompt "_a".
function changeBorderPrompt(currentSuffix, newSuffix) {
  const activeSuffix = currentSuffix === "" ? "_a" : currentSuffix;
  const previousBtn = document.getElementById(`prompt${activeSuffix}`);
  const nextBtn = document.getElementById(`prompt${newSuffix}`);
  // Demote the old selection to the plain (blue) style...
  previousBtn.classList.remove("red-box", "prompt_title_red_small");
  previousBtn.classList.add("prompt_title_blue_small");
  // ...and promote the new one to the highlighted (red) style.
  nextBtn.classList.add("red-box", "prompt_title_red_small");
  nextBtn.classList.remove("prompt_title_blue_small");
}
// Highlight the newly selected application button. If the prompt suffix changes
// along with the application, reset the prompt highlight to the default ("_a").
function changeBorderApp(currentVideo, newVideo, currentSuffix, newSuffix) {
  if (currentSuffix != newSuffix) {
    changeBorderPrompt(currentSuffix, '_a');
  }
  const previousBtn = document.getElementById(currentVideo);
  const nextBtn = document.getElementById(newVideo);
  // Old application button drops back to the plain (black) style...
  previousBtn.classList.remove("red-box", "prompt_title_red_small");
  previousBtn.classList.add("prompt_title_black_small");
  // ...and the new one takes the highlighted (red) style.
  nextBtn.classList.add("red-box", "prompt_title_red_small");
  nextBtn.classList.remove("prompt_title_black_small");
}
// Update the three prompt-button captions to match the selected application video.
// Video ids not present in the table leave the captions unchanged (same as the
// original if-chain). `currentVideo` is accepted for call-site compatibility
// but is not read.
function changePrompts(newVideo, currentVideo) {
  const PROMPTS = {
    video3: ["A person swinging the softball bat", "A person picking up the box", "A person throwing the basketball"],
    video2: ["A fox stretching", "The football flying in the air", "A person swinging the softball bat"],
    video1: ["A person juggles", "A person swinging the softball bat", "A person throwing the football"],
    video4: ["A person picking up the box", "A person juggles", "A person swinging the softball bat"],
  };
  const captions = PROMPTS[newVideo];
  if (!captions) {
    return;
  }
  ["prompt_a", "prompt_b", "prompt_c"].forEach(function (id, i) {
    document.getElementById(id).textContent = captions[i];
  });
}
// Switch the shared <video> element to the clip for the selected application
// (`newVideo`) and prompt (`newSuffix`, "" for the default prompt), updating the
// button highlights and captions when the application changes.
function playVideo(newVideo, newSuffix) {
  if (currentVideo === newVideo && currentSuffix === newSuffix) return;
  if (currentVideo != newVideo) {
    // BUG FIX: newSuffix was previously omitted from this call, so inside
    // changeBorderApp the `currentSuffix != newSuffix` check always compared
    // against undefined instead of the requested suffix.
    changeBorderApp(currentVideo, newVideo, currentSuffix, newSuffix);
    changePrompts(newVideo, currentVideo);
  }
  const videoPlayer = document.getElementById('videoPlayer');
  videoPlayer.src = `./webpage_assets/interactive/${newVideo}${newSuffix}.mp4`;
  // One-shot listener: `{ once: true }` replaces the deprecated
  // `arguments.callee` self-removal pattern (which is illegal in strict mode).
  videoPlayer.addEventListener('loadedmetadata', function () {
    // The "scene selection" clip for prompt _b starts with a short hold:
    // pause, then resume playback after one second.
    if (newVideo == "video3" && newSuffix == "_b") {
      videoPlayer.pause();
    }
    videoPlayer.currentTime = 0;
    // After changing the video source, set the fixed height
    setFixedHeight();
    if (newVideo == "video3" && newSuffix == "_b") {
      setTimeout(function () {
        videoPlayer.play();
      }, 1000);
    }
  }, { once: true });
  videoPlayer.play();
  currentVideo = newVideo;
  currentSuffix = newSuffix;
}
// Handle a prompt-button click: re-highlight and reload only when the clicked
// prompt differs from the one already showing.
function switchVideo(newSuffix) {
  if (newSuffix !== currentSuffix) {
    changeBorderPrompt(currentSuffix, newSuffix);
    playVideo(currentVideo, newSuffix);
  }
}
// Function to set a fixed height for the row after changing the video source.
// Pins the row to its current rendered height so the layout does not jump
// while the next clip's metadata loads.
function setFixedHeight() {
const videoRow = document.getElementById('videoRow');
videoRow.style.height = `${videoRow.offsetHeight}px`;
}
</script>
</body></html>