From cfdc7c9003471aeec09b634cd69a99628af1ed93 Mon Sep 17 00:00:00 2001
From: TrellixVulnTeam
Date: Thu, 20 Oct 2022 14:59:53 +0000
Subject: [PATCH] Adding tarfile member sanitization to extractall()

---
 current_dataset/preprocess_mswc.py | 56 ++++++++++++++++++++++++++++--
 1 file changed, 54 insertions(+), 2 deletions(-)

diff --git a/current_dataset/preprocess_mswc.py b/current_dataset/preprocess_mswc.py
index 70cf3f0..721d067 100644
--- a/current_dataset/preprocess_mswc.py
+++ b/current_dataset/preprocess_mswc.py
@@ -59,12 +59,64 @@ def split_all_audio_files(df, src_root_path, dest_root_path, max_workers=96):
             audio_path = dir
         with tarfile.open(audio_path, mode='r:gz') as mswc_audio:
             audio_path = os.path.split(audio_path)[0]
-            mswc_audio.extractall(audio_path)
+            def is_within_directory(directory, target):
+                """Return True when target is directory itself or lies beneath it."""
+                abs_directory = os.path.abspath(directory)
+                abs_target = os.path.abspath(target)
+                try:
+                    # commonpath compares whole path components; commonprefix
+                    # is character-wise and would accept a sibling such as
+                    # "/data/audio_evil" for the directory "/data/audio".
+                    common = os.path.commonpath([abs_directory, abs_target])
+                except ValueError:
+                    # Raised e.g. for paths on different Windows drives.
+                    return False
+                return common == abs_directory
+
+            def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
+                """Extract tar into path, refusing members that escape path."""
+                # NOTE(review): only member names are validated, not the targets
+                # of symlink/hardlink members; consider extractall(filter="data")
+                # on Python versions that support extraction filters.
+                for member in tar.getmembers():
+                    member_path = os.path.join(path, member.name)
+                    if not is_within_directory(path, member_path):
+                        raise Exception("Attempted Path Traversal in Tar File")
+
+                tar.extractall(path, members, numeric_owner=numeric_owner)
+
+            safe_extract(mswc_audio, audio_path)
 
         splits_path = dir.replace('audio', 'splits')
         with tarfile.open(splits_path, mode='r:gz') as mswc_split:
             splits_path = splits_path.replace('.tar.gz', '/')
-            mswc_split.extractall(splits_path)
+            def is_within_directory(directory, target):
+                """Return True when target is directory itself or lies beneath it."""
+                abs_directory = os.path.abspath(directory)
+                abs_target = os.path.abspath(target)
+                try:
+                    # commonpath compares whole path components; commonprefix
+                    # is character-wise and would accept a sibling such as
+                    # "/data/splits_evil" for the directory "/data/splits".
+                    common = os.path.commonpath([abs_directory, abs_target])
+                except ValueError:
+                    # Raised e.g. for paths on different Windows drives.
+                    return False
+                return common == abs_directory
+
+            def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
+                """Extract tar into path, refusing members that escape path."""
+                # NOTE(review): only member names are validated, not the targets
+                # of symlink/hardlink members; consider extractall(filter="data")
+                # on Python versions that support extraction filters.
+                for member in tar.getmembers():
+                    member_path = os.path.join(path, member.name)
+                    if not is_within_directory(path, member_path):
+                        raise Exception("Attempted Path Traversal in Tar File")
+
+                tar.extractall(path, members, numeric_owner=numeric_owner)
+
+            safe_extract(mswc_split, splits_path)
 
         tmp = glob.glob(os.path.join(splits_path, '**.csv'), recursive=True)
         csv_paths = []