From f0852add283f27cca1cc84bc7f8e450ceb4f514b Mon Sep 17 00:00:00 2001
From: Cristian Pana
Date: Tue, 9 Jul 2024 15:58:18 +0300
Subject: [PATCH] Improve handling chunked upload

---
Review notes (text in this area is not part of the commit message):
 * The line structure of this patch was rebuilt from a whitespace-collapsed
   copy. Hunk line counts and the diffstat were re-checked against the hunk
   bodies; indentation of context lines is inferred, so use
   `git apply --ignore-whitespace` if a context line fails to match.
 * Only added ("+") lines were adjusted, without changing any line count:
   - bare `except:` around `.json()` narrowed to `except ValueError:`
   - `print(traceback.print_exc())` replaced by `traceback.print_exc()`
     (the wrapper printed a stray "None")
   - per-chunk `requests.Session()` is now closed via `with`
   - `s.send()` gets the same 1800 s timeout as the regular upload
   - `_read_in_chunks` docstring and the IOError comment in
     `_chunked_upload_request` now describe what the code does
   Context lines that still contain `print(traceback.print_exc())` are
   untouched because they must match the target file.

 .pylintrc          |   2 +-
 README.md          |   7 ++-
 workbench-agent.py | 126 ++++++++++++++++++++++++++++++++++-----------
 3 files changed, 101 insertions(+), 34 deletions(-)

diff --git a/.pylintrc b/.pylintrc
index 901d303..084b8b0 100755
--- a/.pylintrc
+++ b/.pylintrc
@@ -524,7 +524,7 @@ preferred-modules=
 
 # Exceptions that will emit a warning when being caught. Defaults to
 # "Exception"
-overgeneral-exceptions=Exception
+overgeneral-exceptions=builtins.Exception
 
 
 [TYPING]
diff --git a/README.md b/README.md
index 8acc38c..30dafdb 100755
--- a/README.md
+++ b/README.md
@@ -120,7 +120,7 @@ usage: workbench-agent.py [-h] --api_url API_URL --api_user API_USER
                           [--delta_only] [--reuse_identifications]
                           [--identification_reuse_type {any,only_me,specific_project,specific_scan}]
                           [--specific_code SPECIFIC_CODE]
-                          [--enable_chunk_upload]
+                          [--chunked_upload]
                           [--scan_number_of_tries SCAN_NUMBER_OF_TRIES]
                           [--scan_wait_time SCAN_WAIT_TIME] --path PATH [--log LOG]
                           [--path-result PATH_RESULT]
@@ -181,9 +181,8 @@ optional arguments:
   --target_path TARGET_PATH
                         The path on the Workbench server where the code to be scanned is stored.
                         No upload is done in this scenario.
-  --enable_chunk_upload
-                        For files bigger than 8 MB (which is default post_max_size in php.ini) uploading will be done using
-                        the header Transfer-encoding: chunked with chunks of 5120 bytes. By default, enabled.
+  --chunked_upload      For files bigger than 8 MB (which is default post_max_size in php.ini) uploading will be done using
+                        the header Transfer-encoding: chunked with chunks of 5MB.
   --log LOG             specify logging level. Allowed values: DEBUG, INFO, WARNING, ERROR
   --path-result PATH_RESULT
                         Save results to specified path
diff --git a/workbench-agent.py b/workbench-agent.py
index 35ca6ca..92f08b9 100755
--- a/workbench-agent.py
+++ b/workbench-agent.py
@@ -9,6 +9,7 @@
 import argparse
 import random
 import base64
+import io
 import os
 import subprocess
 from argparse import RawTextHelpFormatter
@@ -78,22 +79,80 @@ def _send_request(self, payload: dict) -> dict:
             print("Problematic JSON:")
             print(response.text)
 
-    def _read_in_chunks(self, file_object, chunk_size=5120):
-        """Generator to read a file piece by piece."""
+    def _read_in_chunks(self, file_object: io.BufferedReader, chunk_size=5242880):
+        """
+        Generator to read a file piece by piece.
+
+        Args:
+            file_object (io.BufferedReader): Open binary file to read from.
+            chunk_size (int): Size of the chunk. Default chunk size is 5MB
+        """
         while True:
             data = file_object.read(chunk_size)
             if not data:
                 break
             yield data
 
-    def upload_files(self, scan_code: str, path: str, enable_chunk_upload: bool = True):
+    def _chunked_upload_request(self, scan_code: str, headers: dict, chunk: bytes):
+        """
+        This function will make sure Content-Length header is not sent by Requests library
+        Args:
+            scan_code (str): The scan code where the file or files will be uploaded.
+            headers (dict) : Headers for HTTP request
+            chunk (bytes): Chunk read from large file
+        """
+        try:
+            req = requests.Request(
+                'POST',
+                self.api_url,
+                headers=headers,
+                data=chunk,
+                auth=(self.api_user, self.api_token),
+            )
+            with requests.Session() as s:
+                prepped = s.prepare_request(req)
+                # Remove the unwanted header 'Content-Length' !!!
+                if 'Content-Length' in prepped.headers:
+                    del prepped.headers['Content-Length']
+
+                # Send HTTP request and retrieve response
+                response = s.send(prepped, timeout=1800)
+            # print(f"Sent headers: {response.request.headers}")
+            # print(f"response headers: {response.headers}")
+            # Retrieve the HTTP status code
+            status_code = response.status_code
+            print(f"HTTP Status Code: {status_code}")
+
+            # Check if the request was successful (status code 200)
+            if status_code == 200:
+                # Parse the JSON response
+                try:
+                    response.json()
+                except ValueError:
+                    print(f"Failed to decode json {response.text}")
+                    traceback.print_exc()
+                    sys.exit(1)
+            else:
+                print(f"Request failed with status code {status_code}")
+                reason = response.reason
+                print(f"Reason: {reason}")
+                response_text = response.text
+                print(f"Response Text: {response_text}")
+                sys.exit(1)
+        except IOError:
+            # Network or I/O failure while sending the chunk
+            print(f"Failed to upload files to the scan {scan_code}.")
+            traceback.print_exc()
+            sys.exit(1)
+
+    def upload_files(self, scan_code: str, path: str, chunked_upload: bool = False):
         """
         Uploads files to the Workbench using the API's File Upload endpoint.
 
         Args:
             scan_code (str): The scan code where the file or files will be uploaded.
             path (str): Path to the file or files to upload.
-            enable_chunk_upload (bool): Enable/disable chunk upload. By default, enabled.
+            chunked_upload (bool): Enable/disable chunk upload.
         """
         file_size = os.path.getsize(path)
         size_limit = 8 * 1024 * 1024  # 8MB in bytes. Based on the default value of post_max_size in php.ini
@@ -102,42 +161,36 @@ def upload_files(self, scan_code: str, path: str, enable_chunk_upload: bool = Tr
         filename_base64 = base64.b64encode(filename.encode()).decode("utf-8")
         scan_code_base64 = base64.b64encode(scan_code.encode()).decode("utf-8")
 
-        if enable_chunk_upload and (file_size > size_limit):
+        if chunked_upload and (file_size > size_limit):
             print(f"Uploading {filename} using 'Transfer-encoding: chunks' due to file size {file_size}.")
             # Use chunked upload for files bigger than size_limit
             # First delete possible existing files because chunk uploading works by appending existing file on disk.
             self.remove_uploaded_content(filename, scan_code)
+            print("Uploading using Transfer-encoding: chunked...")
             headers = {
                 "FOSSID-SCAN-CODE": scan_code_base64,
                 "FOSSID-FILE-NAME": filename_base64,
-                'Transfer-Encoding': 'chunked'
+                'Transfer-Encoding': 'chunked',
+                'Content-Type': 'application/octet-stream'
             }
             try:
                 with open(path, "rb") as file:
-                    resp = requests.post(
-                        self.api_url,
-                        headers=headers,
-                        data=self._read_in_chunks(file, 5120),
-                        auth=(self.api_user, self.api_token),
-                        timeout=1800,
-                    )
-                    try:
-                        resp.json()
-                    except:
-                        print(f"Failed to decode json {resp.text}")
-                        print(traceback.print_exc())
-                        sys.exit(1)
+                    for chunk in self._read_in_chunks(file, 5242880):
+                        # Upload each chunk
+                        self._chunked_upload_request(scan_code, headers, chunk)
             except IOError:
                 # Error opening file
                 print(f"Failed to upload files to the scan {scan_code}.")
                 print(traceback.print_exc())
                 sys.exit(1)
+            print("Finished uploading.")
         else:
             # Regular upload, no chunk upload
             headers = {
                 "FOSSID-SCAN-CODE": scan_code_base64,
                 "FOSSID-FILE-NAME": filename_base64
             }
+            print("Uploading...")
             try:
                 with open(path, "rb") as file:
                     resp = requests.post(
@@ -147,17 +200,32 @@ def upload_files(self, scan_code: str, path: str, enable_chunk_upload: bool = Tr
                         auth=(self.api_user, self.api_token),
                         timeout=1800,
                     )
-                    try:
-                        resp.json()
-                    except:
-                        print(f"Failed to decode json {resp.text}")
-                        print(traceback.print_exc())
+                    # Retrieve the HTTP status code
+                    status_code = resp.status_code
+                    print(f"HTTP Status Code: {status_code}")
+
+                    # Check if the request was successful (status code 200)
+                    if status_code == 200:
+                        # Parse the JSON response
+                        try:
+                            resp.json()
+                        except ValueError:
+                            print(f"Failed to decode json {resp.text}")
+                            traceback.print_exc()
+                            sys.exit(1)
+                    else:
+                        print(f"Request failed with status code {status_code}")
+                        reason = resp.reason
+                        print(f"Reason: {reason}")
+                        response_text = resp.text
+                        print(f"Response Text: {response_text}")
                         sys.exit(1)
             except IOError:
                 # Error opening file
                 print(f"Failed to upload files to the scan {scan_code}.")
                 print(traceback.print_exc())
                 sys.exit(1)
+            print("Finished uploading.")
 
     def _delete_existing_scan(self, scan_code: str):
         """
@@ -1092,11 +1160,11 @@ def non_empty_string(s):
         required=False,
     )
     optional.add_argument(
-        "--enable_chunk_upload",
+        "--chunked_upload",
         help="For files bigger than 8 MB (which is default post_max_size in php.ini) uploading will be done using\n"
-        "the header Transfer-encoding: chunked with chunks of 5120 bytes. By default, enabled.",
+        "the header Transfer-encoding: chunked with chunks of 5MB.",
         action="store_true",
-        default=True,
+        default=False,
         required=False,
     )
     required.add_argument(
@@ -1303,7 +1371,7 @@ def main():
             print(
                 "Uploading file indicated in --path parameter: {}".format(params.path)
             )
-            workbench.upload_files(params.scan_code, params.path)
+            workbench.upload_files(params.scan_code, params.path, params.chunked_upload)
         else:
             # Get all files found at given path (including in subdirectories). Exclude directories
             print(
@@ -1317,7 +1385,7 @@ def main():
                     if not os.path.isdir(os.path.join(root, filename)):
                         counter_files = counter_files + 1
                         workbench.upload_files(
-                            params.scan_code, os.path.join(root, filename), params.enable_chunk_upload
+                            params.scan_code, os.path.join(root, filename), params.chunked_upload
                         )
             print("A total of {} files uploaded".format(counter_files))
             print("Calling API scans->extracting_archives")