Skip to content

Commit

Permalink
Black formatting on just subset_worker.py
Browse files (browse the repository at this point in the history)
  • Loading branch information
MattUnderscoreZhang committed Jan 18, 2024
1 parent 80b76e5 commit 2dc3525
Showing 1 changed file with 55 additions and 26 deletions.
81 changes: 55 additions & 26 deletions video2dataset/workers/subset_worker.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -41,14 +41,18 @@ def __init__(
self.clipping_subsampler = ClippingSubsampler(
5, # oom_clip_count
encode_formats,
**self.config["subsampling"].get("ClippingSubsampler", {"args": {}})["args"],
**self.config["subsampling"].get("ClippingSubsampler", {"args": {}})[
"args"
],
)
need_keyframes = self.clipping_subsampler.precision == "keyframe_adjusted"

self.ffprobe_subsampler = None
if "FFProbeSubsampler" in self.config["subsampling"] or need_keyframes:
self.ffprobe_subsampler = FFProbeSubsampler(
**self.config["subsampling"].get("FFProbeSubsampler", {"args": {}})["args"]
**self.config["subsampling"].get("FFProbeSubsampler", {"args": {}})[
"args"
]
)
self.ffprobe_subsampler.extract_keyframes |= need_keyframes

Expand All @@ -59,19 +63,31 @@ def __init__(
self.cut_detector = CutDetectionSubsampler(
**self.config["subsampling"]["CutDetectionSubsampler"]["args"]
)
self.cuts_are_clips = self.config["subsampling"]["CutDetectionSubsampler"].get("cuts_are_clips", False)
self.cuts_are_clips = self.config["subsampling"][
"CutDetectionSubsampler"
].get("cuts_are_clips", False)

self.noop_subsampler = NoOpSubsampler()

video_subsamplers: List[Any] = []
if "ResolutionSubsampler" in self.config["subsampling"]:
video_subsamplers.append(ResolutionSubsampler(**self.config["subsampling"]["ResolutionSubsampler"]["args"]))
video_subsamplers.append(
ResolutionSubsampler(
**self.config["subsampling"]["ResolutionSubsampler"]["args"]
)
)
if "FrameSubsampler" in self.config["subsampling"]:
video_subsamplers.append(FrameSubsampler(**self.config["subsampling"]["FrameSubsampler"]["args"]))
video_subsamplers.append(
FrameSubsampler(**self.config["subsampling"]["FrameSubsampler"]["args"])
)

audio_subsamplers: List[Any] = []
if "AudioRateSubsampler" in self.config["subsampling"]:
audio_subsamplers.append(AudioRateSubsampler(**self.config["subsampling"]["AudioRateSubsampler"]["args"]))
audio_subsamplers.append(
AudioRateSubsampler(
**self.config["subsampling"]["AudioRateSubsampler"]["args"]
)
)
self.subsamplers = {"video": video_subsamplers, "audio": audio_subsamplers}

def __call__(
Expand Down Expand Up @@ -114,20 +130,14 @@ def process_shard(
# The subsamplers might change the output format, so we need to update the writer
writer_encode_formats = self.encode_formats.copy()
if self.subsamplers["audio"]:
assert len(
{
s.encode_format
for s in self.subsamplers["audio"]
}
) == 1 # assert that all audio subsamplers have the same output format
assert (
len({s.encode_format for s in self.subsamplers["audio"]}) == 1
) # assert that all audio subsamplers have the same output format
writer_encode_formats["audio"] = self.subsamplers["audio"][0].encode_format
if self.subsamplers["video"]:
assert len(
{
s.encode_format
for s in self.subsamplers["video"]
}
) == 1 # assert that all video subsamplers have the same output format
assert (
len({s.encode_format for s in self.subsamplers["video"]}) == 1
) # assert that all video subsamplers have the same output format
writer_encode_formats["video"] = self.subsamplers["video"][0].encode_format

# give schema to writer
Expand Down Expand Up @@ -166,13 +176,18 @@ def process_shard(
streams[mod] = [sample[fmt]]

if self.ffprobe_subsampler is not None:
streams, meta, error_message = self.ffprobe_subsampler(streams, meta)
streams, meta, error_message = self.ffprobe_subsampler(
streams, meta
)
if error_message is not None:
raise Exception("failed_to_subsample")

if self.config["storage"]["captions_are_subtitles"]: # create clips
subtitles = meta["yt_meta_dict"]["subtitles"]
meta["clips"] = [[line_dict["start"], line_dict["end"]] for line_dict in subtitles]
meta["clips"] = [
[line_dict["start"], line_dict["end"]]
for line_dict in subtitles
]
elif self.cut_detector is not None: # apply cut detection to get clips
streams, cuts, error_message = self.cut_detector(streams)
if error_message is not None:
Expand All @@ -183,31 +198,45 @@ def process_shard(
if self.cuts_are_clips:
cuts = meta["cuts"]
native_fps = cuts["original_fps"]
meta["clips"] = (np.array(cuts["cuts_original_fps"]) / native_fps).tolist()
meta["clips"] = (
np.array(cuts["cuts_original_fps"]) / native_fps
).tolist()

# 1 video -> many videos (either clipping or noop which does identity broadcasting)
broadcast_subsampler = (
self.clipping_subsampler
if (self.config["storage"]["captions_are_subtitles"] or self.cuts_are_clips)
if (
self.config["storage"]["captions_are_subtitles"]
or self.cuts_are_clips
)
else self.noop_subsampler
)
subsampled_streams, metas, error_message = broadcast_subsampler(streams, meta)
subsampled_streams, metas, error_message = broadcast_subsampler(
streams, meta
)
if error_message is not None:
meta["clips"] = []
raise Exception("failed_to_subsample")

for modality in list(subsampled_streams.keys()):
for modality_subsampler in self.subsamplers[modality]:
subsampled_streams, metas, error_message = modality_subsampler(subsampled_streams, metas)
subsampled_streams, metas, error_message = modality_subsampler(
subsampled_streams, metas
)

if error_message is not None:
raise Exception("failed_to_subsample")

successes += 1
status = "success"
status_dict.increment(status)
subsampled_streams_list = [dict(zip(subsampled_streams, s)) for s in zip(*subsampled_streams.values())]
if len(subsampled_streams_list) == 0: # no audio or video, just write meta
subsampled_streams_list = [
dict(zip(subsampled_streams, s))
for s in zip(*subsampled_streams.values())
]
if (
len(subsampled_streams_list) == 0
): # no audio or video, just write meta
meta["status"] = status
sample_writer.write(
{},
Expand Down

0 comments on commit 2dc3525

Please sign in to comment.