Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve handling chunked upload #5

Merged
merged 1 commit into from
Jul 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -524,7 +524,7 @@ preferred-modules=

# Exceptions that will emit a warning when being caught. Defaults to
# "Exception"
overgeneral-exceptions=Exception
overgeneral-exceptions=builtins.Exception


[TYPING]
Expand Down
7 changes: 3 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ usage: workbench-agent.py [-h] --api_url API_URL --api_user API_USER
[--delta_only] [--reuse_identifications]
[--identification_reuse_type {any,only_me,specific_project,specific_scan}]
[--specific_code SPECIFIC_CODE]
[--enable_chunk_upload]
[--chunked_upload]
[--scan_number_of_tries SCAN_NUMBER_OF_TRIES]
[--scan_wait_time SCAN_WAIT_TIME] --path PATH
[--log LOG] [--path-result PATH_RESULT]
Expand Down Expand Up @@ -181,9 +181,8 @@ optional arguments:
--target_path TARGET_PATH
The path on the Workbench server where the code to be scanned is stored.
No upload is done in this scenario.
--enable_chunk_upload
For files bigger than 8 MB (which is default post_max_size in php.ini) uploading will be done using
the header Transfer-encoding: chunked with chunks of 5120 bytes. By default, enabled.
--chunked_upload For files bigger than 8 MB (which is default post_max_size in php.ini) uploading will be done using
the header Transfer-encoding: chunked with chunks of 5MB.
--log LOG specify logging level. Allowed values: DEBUG, INFO, WARNING, ERROR
--path-result PATH_RESULT
Save results to specified path
Expand Down
126 changes: 97 additions & 29 deletions workbench-agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import argparse
import random
import base64
import io
import os
import subprocess
from argparse import RawTextHelpFormatter
Expand Down Expand Up @@ -78,22 +79,80 @@ def _send_request(self, payload: dict) -> dict:
print("Problematic JSON:")
print(response.text)

def _read_in_chunks(self, file_object, chunk_size=5120):
"""Generator to read a file piece by piece."""
def _read_in_chunks(self, file_object: io.BufferedReader, chunk_size=5242880):
    """
    Yield successive pieces of an open file until it is exhausted.

    Args:
        file_object (io.BufferedReader): Open file object to read from.
        chunk_size (int): Size passed to each read() call.
            Default chunk size is 5242880 bytes (5MB).

    Yields:
        Every non-empty result of file_object.read(chunk_size), in order.
    """
    # Prime the loop with the first read; an empty result ends the generator.
    piece = file_object.read(chunk_size)
    while piece:
        yield piece
        piece = file_object.read(chunk_size)

def upload_files(self, scan_code: str, path: str, enable_chunk_upload: bool = True):
def _chunked_upload_request(self, scan_code: str, headers: dict, chunk: bytes):
    """
    Send one chunk of a large file as a POST request without a Content-Length header.

    The request is prepared by hand so that the 'Content-Length' header added by the
    Requests library for a bytes body can be removed before the request is sent.
    The process exits with status 1 when the request cannot be sent, when the server
    answers with a status code other than 200, or when the response body is not JSON.

    Args:
        scan_code (str): The scan code where the file or files will be uploaded.
            Only used in the error message.
        headers (dict): Headers for HTTP request
        chunk (bytes): Chunk read from large file
    """
    try:
        req = requests.Request(
            'POST',
            self.api_url,
            headers=headers,
            data=chunk,
            auth=(self.api_user, self.api_token),
        )
        # The context manager closes the session (and its pooled connections)
        # as soon as this chunk has been sent, instead of leaking one per chunk.
        with requests.Session() as session:
            prepped = session.prepare_request(req)
            # Remove the unwanted header 'Content-Length'
            prepped.headers.pop('Content-Length', None)
            # Same limit as the regular (non-chunked) upload, so a stalled
            # server cannot block the agent forever.
            response = session.send(prepped, timeout=1800)

        # Retrieve the HTTP status code
        status_code = response.status_code
        print(f"HTTP Status Code: {status_code}")

        # Anything other than 200 is treated as a failed upload
        if status_code != 200:
            print(f"Request failed with status code {status_code}")
            print(f"Reason: {response.reason}")
            print(f"Response Text: {response.text}")
            sys.exit(1)

        # The body is only parsed to verify that the server answered with JSON
        try:
            response.json()
        except ValueError:
            # JSON decoding errors are ValueError subclasses; a bare except
            # would also swallow SystemExit and KeyboardInterrupt.
            print(f"Failed to decode json {response.text}")
            traceback.print_exc()
            sys.exit(1)
    except IOError:
        # Transport-level failure (connection error, timeout, ...):
        # Requests' exceptions derive from IOError.
        print(f"Failed to upload files to the scan {scan_code}.")
        traceback.print_exc()
        sys.exit(1)

def upload_files(self, scan_code: str, path: str, chunked_upload: bool = False):
"""
Uploads files to the Workbench using the API's File Upload endpoint.

Args:
scan_code (str): The scan code where the file or files will be uploaded.
path (str): Path to the file or files to upload.
enable_chunk_upload (bool): Enable/disable chunk upload. By default, enabled.
chunked_upload (bool): Enable/disable chunk upload.
"""
file_size = os.path.getsize(path)
size_limit = 8 * 1024 * 1024 # 8MB in bytes. Based on the default value of post_max_size in php.ini
Expand All @@ -102,42 +161,36 @@ def upload_files(self, scan_code: str, path: str, enable_chunk_upload: bool = Tr
filename_base64 = base64.b64encode(filename.encode()).decode("utf-8")
scan_code_base64 = base64.b64encode(scan_code.encode()).decode("utf-8")

if enable_chunk_upload and (file_size > size_limit):
if chunked_upload and (file_size > size_limit):
print(f"Uploading {filename} using 'Transfer-encoding: chunks' due to file size {file_size}.")
# Use chunked upload for files bigger than size_limit
# First delete any previously uploaded copy, because chunked uploading works by appending to the existing file on disk.
self.remove_uploaded_content(filename, scan_code)
print("Uploading using Transfer-encoding: chunked...")
headers = {
"FOSSID-SCAN-CODE": scan_code_base64,
"FOSSID-FILE-NAME": filename_base64,
'Transfer-Encoding': 'chunked'
'Transfer-Encoding': 'chunked',
'Content-Type': 'application/octet-stream'
}
try:
with open(path, "rb") as file:
resp = requests.post(
self.api_url,
headers=headers,
data=self._read_in_chunks(file, 5120),
auth=(self.api_user, self.api_token),
timeout=1800,
)
try:
resp.json()
except:
print(f"Failed to decode json {resp.text}")
print(traceback.print_exc())
sys.exit(1)
for chunk in self._read_in_chunks(file, 5242880):
# Upload each chunk
self._chunked_upload_request(scan_code, headers, chunk)
except IOError:
# Error opening file
print(f"Failed to upload files to the scan {scan_code}.")
print(traceback.print_exc())
sys.exit(1)
print("Finished uploading.")
else:
# Regular upload, no chunk upload
headers = {
"FOSSID-SCAN-CODE": scan_code_base64,
"FOSSID-FILE-NAME": filename_base64
}
print("Uploading...")
try:
with open(path, "rb") as file:
resp = requests.post(
Expand All @@ -147,17 +200,32 @@ def upload_files(self, scan_code: str, path: str, enable_chunk_upload: bool = Tr
auth=(self.api_user, self.api_token),
timeout=1800,
)
try:
resp.json()
except:
print(f"Failed to decode json {resp.text}")
print(traceback.print_exc())
# Retrieve the HTTP status code
status_code = resp.status_code
print(f"HTTP Status Code: {status_code}")

# Check if the request was successful (status code 200)
if status_code == 200:
# Parse the JSON response
try:
resp.json()
except:
print(f"Failed to decode json {resp.text}")
print(traceback.print_exc())
sys.exit(1)
else:
print(f"Request failed with status code {status_code}")
reason = resp.reason
print(f"Reason: {reason}")
response_text = resp.text
print(f"Response Text: {response_text}")
sys.exit(1)
except IOError:
# Error opening file
print(f"Failed to upload files to the scan {scan_code}.")
print(traceback.print_exc())
sys.exit(1)
print("Finished uploading.")

def _delete_existing_scan(self, scan_code: str):
"""
Expand Down Expand Up @@ -1092,11 +1160,11 @@ def non_empty_string(s):
required=False,
)
optional.add_argument(
"--enable_chunk_upload",
"--chunked_upload",
help="For files bigger than 8 MB (which is default post_max_size in php.ini) uploading will be done using\n"
"the header Transfer-encoding: chunked with chunks of 5120 bytes. By default, enabled.",
"the header Transfer-encoding: chunked with chunks of 5MB.",
action="store_true",
default=True,
default=False,
required=False,
)
required.add_argument(
Expand Down Expand Up @@ -1303,7 +1371,7 @@ def main():
print(
"Uploading file indicated in --path parameter: {}".format(params.path)
)
workbench.upload_files(params.scan_code, params.path)
workbench.upload_files(params.scan_code, params.path, params.chunked_upload)
else:
# Get all files found at given path (including in subdirectories). Exclude directories
print(
Expand All @@ -1317,7 +1385,7 @@ def main():
if not os.path.isdir(os.path.join(root, filename)):
counter_files = counter_files + 1
workbench.upload_files(
params.scan_code, os.path.join(root, filename), params.enable_chunk_upload
params.scan_code, os.path.join(root, filename), params.chunked_upload
)
print("A total of {} files uploaded".format(counter_files))
print("Calling API scans->extracting_archives")
Expand Down