Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix decoding issue with PYAV due to new support for multiple training… #541

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 46 additions & 24 deletions slowfast/datasets/decoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,10 @@ def pyav_decode(
num_clips_uniform=10,
target_fps=30,
use_offset=False,
modalities=("visual",),
max_spatial_scale=0,
min_delta=-math.inf,
max_delta=math.inf,
):
"""
Convert the video from its original fps to the target_fps. If the video
Expand Down Expand Up @@ -414,35 +418,49 @@ def pyav_decode(
else:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For lines 414-417, we should also decode the whole video and return the frames. Example code could be:

        decode_all_video = True
        video_start_pts, video_end_pts = 0, math.inf
        start_end_delta_time = None

        frames = None
        if container.streams.video:
            video_frames, max_pts = pyav_decode_stream(
                container,
                video_start_pts,
                video_end_pts,
                container.streams.video[0],
                {"video": 0},
            )

            frames = [frame.to_rgb().to_ndarray() for frame in video_frames]
            frames = torch.as_tensor(np.stack(frames))
        frames_out = [frames]

# Perform selective decoding.
decode_all_video = False
clip_size = np.maximum(
1.0, np.ceil(sampling_rate * (num_frames - 1) / target_fps * fps)
)
start_idx, end_idx, fraction = get_start_end_idx(
clip_sizes = [
np.maximum(
1.0,
np.ceil(
sampling_rate[i] * (num_frames[i] - 1) / target_fps * fps
),
)
for i in range(len(sampling_rate))
]
start_end_delta_time = get_multiple_start_end_idx(
frames_length,
clip_size,
clip_sizes,
clip_idx,
num_clips_uniform,
use_offset=use_offset,
)
timebase = duration / frames_length
video_start_pts = int(start_idx * timebase)
video_end_pts = int(end_idx * timebase)

frames = None
# If video stream was found, fetch video frames from the video.
if container.streams.video:
video_frames, max_pts = pyav_decode_stream(
container,
video_start_pts,
video_end_pts,
container.streams.video[0],
{"video": 0},
min_delta=min_delta,
max_delta=max_delta,
)
frames_out = [None] * len(num_frames)
for k in range(len(num_frames)):
start_idx = start_end_delta_time[k, 0]
end_idx = start_end_delta_time[k, 1]
timebase = duration / frames_length
video_start_pts = int(start_idx)
video_end_pts = int(end_idx)

frames = None
# If video stream was found, fetch video frames from the video.
if container.streams.video:
video_frames, max_pts = pyav_decode_stream(
container,
video_start_pts,
video_end_pts,
container.streams.video[0],
{"video": 0},
)

frames = [frame.to_rgb().to_ndarray() for frame in video_frames]
frames = torch.as_tensor(np.stack(frames))

frames_out[k] = frames
container.close()

frames = [frame.to_rgb().to_ndarray() for frame in video_frames]
frames = torch.as_tensor(np.stack(frames))
return frames, fps, decode_all_video
return frames_out, fps, decode_all_video, start_end_delta_time


def decode(
Expand Down Expand Up @@ -504,14 +522,18 @@ def decode(
if backend == "pyav":
assert min_delta == -math.inf and max_delta == math.inf, \
"delta sampling not supported in pyav"
frames_decoded, fps, decode_all_video = pyav_decode(
frames_decoded, fps, decode_all_video, start_end_delta_time = pyav_decode(
container,
sampling_rate,
num_frames,
clip_idx,
num_clips_uniform,
target_fps,
use_offset=use_offset,
modalities=("visual",),
max_spatial_scale=max_spatial_scale,
min_delta=min_delta,
max_delta=max_delta,
)
elif backend == "torchvision":
(
Expand Down