diff --git a/.gitignore b/.gitignore
index 7e48834..1de7eb8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -128,4 +128,8 @@ cython_debug/
 
 .DS_Store
 .vscode/
-.idea/
\ No newline at end of file
+.idea/
+
+# Notebook Model Downloads
+notebooks/PyTorchModels/
+pytorch-model-scan-results.json
\ No newline at end of file
diff --git a/notebooks/pytorch_sentiment_analysis.ipynb b/notebooks/pytorch_sentiment_analysis.ipynb
index 282439b..fd44b5d 100644
--- a/notebooks/pytorch_sentiment_analysis.ipynb
+++ b/notebooks/pytorch_sentiment_analysis.ipynb
@@ -16,46 +16,49 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "modelscan, version 0.5.0\n"
+      "Note: you may need to restart the kernel to use updated packages.\n",
+      "modelscan, version 0.0.0\n"
      ]
     }
    ],
    "source": [
-    "!pip install -q modelscan\n",
+    "%pip install -q modelscan\n",
     "!modelscan -v"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 11,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Note: you may need to restart the kernel to use updated packages.\n",
+      "Note: you may need to restart the kernel to use updated packages.\n",
+      "Note: you may need to restart the kernel to use updated packages.\n"
+     ]
+    }
+   ],
    "source": [
-    "!pip install -q torch==2.0.1\n",
-    "!pip install -q transformers==4.31.0\n",
-    "!pip install -q scipy==1.11.1"
+    "%pip install -q torch==2.0.1\n",
+    "%pip install -q transformers==4.31.0\n",
+    "%pip install -q scipy==1.11.1"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Users/mehrinkiani/mambaforge/envs/py310/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      " from .autonotebook import tqdm as notebook_tqdm\n"
-     ]
-    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
@@ -84,18 +87,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [],
    "source": [
     "# Save a model for sentiment analysis\n",
-    "model_directory = \"PyTorchModels\"\n",
+    "from typing import Final\n",
+    "\n",
+    "model_directory: Final[str] = \"PyTorchModels\"\n",
     "if not os.path.isdir(model_directory):\n",
     "    os.mkdir(model_directory)\n",
     "\n",
     "safe_model_path = os.path.join(model_directory, \"safe_model.pt\")\n",
     "\n",
-    "sentiment_model = download_model(safe_model_path)"
+    "download_model(safe_model_path)"
    ]
   },
   {
@@ -107,7 +112,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
@@ -133,20 +138,24 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "No settings file detected at /Users/mehrinkiani/Documents/modelscan/notebooks/modelscan-settings.toml. Using defaults. \n",
+      "No settings file detected at /Users/jennifercwagenberg/Code/Personal/fork/modelscan/notebooks/modelscan-settings.toml. Using defaults. \n",
       "\n",
-      "Scanning /Users/mehrinkiani/Documents/modelscan/notebooks/PyTorchModels/safe_model.pt:safe_model/data.pkl using modelscan.scanners.PickleUnsafeOpScan model scan\n",
+      "Scanning /Users/jennifercwagenberg/Code/Personal/fork/modelscan/notebooks/PyTorchModels/safe_model.pt:safe_model/data.pkl using modelscan.scanners.PickleUnsafeOpScan model scan\n",
       "\n",
       "\u001b[34m--- Summary ---\u001b[0m\n",
       "\n",
-      "\u001b[32m No issues found! 🎉\u001b[0m\n"
+      "\u001b[32m No issues found! 🎉\u001b[0m\n",
+      "\n",
+      "\u001b[34m--- Skipped --- \u001b[0m\n",
+      "\n",
+      "Total skipped: \u001b[1;36m204\u001b[0m - run with --show-skipped to see the full list.\n"
      ]
     }
    ],
@@ -165,7 +174,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -175,7 +184,6 @@
     "\n",
     "unsafe_model_path = os.path.join(model_directory, \"unsafe_model.pt\")\n",
     "\n",
-    "\n",
     "payload = get_payload(command, malicious_code)\n",
     "torch.save(\n",
     "    torch.load(safe_model_path),\n",
@@ -197,21 +205,26 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "cat: /Users/jennifercwagenberg/.aws/secrets: No such file or directory\n"
+     ]
+    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "aws_access_key_id=\n",
-      "aws_secret_access_key=\n",
       "The overall sentiment is: negative with a score of: 85.9%\n"
      ]
     }
    ],
    "source": [
-    "sentiment = predict_sentiment(\"Stock market was bearish today\", torch.load(unsafe_model_path))"
+    "predict_sentiment(\"Stock market was bearish today\", torch.load(unsafe_model_path))"
    ]
   },
   {
@@ -227,16 +240,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "No settings file detected at /Users/mehrinkiani/Documents/modelscan/notebooks/modelscan-settings.toml. Using defaults. \n",
+      "No settings file detected at /Users/jennifercwagenberg/Code/Personal/fork/modelscan/notebooks/modelscan-settings.toml. Using defaults. \n",
       "\n",
-      "Scanning /Users/mehrinkiani/Documents/modelscan/notebooks/PyTorchModels/unsafe_model.pt:unsafe_model/data.pkl using modelscan.scanners.PickleUnsafeOpScan model scan\n",
+      "Scanning /Users/jennifercwagenberg/Code/Personal/fork/modelscan/notebooks/PyTorchModels/unsafe_model.pt:unsafe_model/data.pkl using modelscan.scanners.PickleUnsafeOpScan model scan\n",
       "\n",
       "\u001b[34m--- Summary ---\u001b[0m\n",
       "\n",
@@ -256,7 +269,11 @@
       "Unsafe operator found:\n",
       " - Severity: CRITICAL\n",
       " - Description: Use of unsafe operator 'system' from module 'posix'\n",
-      " - Source: /Users/mehrinkiani/Documents/modelscan/notebooks/PyTorchModels/unsafe_model.pt:unsafe_model/data.pkl\n"
+      " - Source: /Users/jennifercwagenberg/Code/Personal/fork/modelscan/notebooks/PyTorchModels/unsafe_model.pt:unsafe_model/data.pkl\n",
+      "\n",
+      "\u001b[34m--- Skipped --- \u001b[0m\n",
+      "\n",
+      "Total skipped: \u001b[1;36m204\u001b[0m - run with --show-skipped to see the full list.\n"
      ]
     }
    ],
@@ -283,28 +300,26 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "No settings file detected at /Users/mehrinkiani/Documents/modelscan/notebooks/modelscan-settings.toml. Using defaults. \n",
+      "No settings file detected at /Users/jennifercwagenberg/Code/Personal/fork/modelscan/notebooks/modelscan-settings.toml. Using defaults. \n",
       "\n",
-      "Scanning /Users/mehrinkiani/Documents/modelscan/notebooks/PyTorchModels/unsafe_model.pt:unsafe_model/data.pkl using modelscan.scanners.PickleUnsafeOpScan model scan\n",
-      "\u001b[1m{\u001b[0m\u001b[32m\"modelscan_version\"\u001b[0m: \u001b[32m\"0.5.0\"\u001b[0m, \u001b[32m\"timestamp\"\u001b[0m: \u001b[32m\"2024-01-25T17:10:54.306065\"\u001b[0m, \n",
-      "\u001b[32m\"input_path\"\u001b[0m: \n",
-      "\u001b[32m\"/Users/mehrinkiani/Documents/modelscan/notebooks/PyTorchModels/unsafe_model.pt\"\u001b[0m\n",
-      ", \u001b[32m\"total_issues\"\u001b[0m: \u001b[1;36m1\u001b[0m, \u001b[32m\"summary\"\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m\"total_issues_by_severity\"\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m\"LOW\"\u001b[0m: \u001b[1;36m0\u001b[0m, \n",
-      "\u001b[32m\"MEDIUM\"\u001b[0m: \u001b[1;36m0\u001b[0m, \u001b[32m\"HIGH\"\u001b[0m: \u001b[1;36m0\u001b[0m, \u001b[32m\"CRITICAL\"\u001b[0m: \u001b[1;36m1\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m, \u001b[32m\"issues_by_severity\"\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m\"CRITICAL\"\u001b[0m: \n",
-      "\u001b[1m[\u001b[0m\u001b[1m{\u001b[0m\u001b[32m\"description\"\u001b[0m: \u001b[32m\"Use of unsafe operator 'system' from module 'posix'\"\u001b[0m, \n",
-      "\u001b[32m\"operator\"\u001b[0m: \u001b[32m\"system\"\u001b[0m, \u001b[32m\"module\"\u001b[0m: \u001b[32m\"posix\"\u001b[0m, \u001b[32m\"source\"\u001b[0m: \n",
-      "\u001b[32m\"/Users/mehrinkiani/Documents/modelscan/notebooks/PyTorchModels/unsafe_model.pt:\u001b[0m\n",
-      "\u001b[32munsafe_model/data.pkl\"\u001b[0m, \u001b[32m\"scanner\"\u001b[0m: \u001b[32m\"modelscan.scanners.PickleUnsafeOpScan\"\u001b[0m\u001b[1m}\u001b[0m\u001b[1m]\u001b[0m\u001b[1m}\u001b[0m, \n",
-      "\u001b[32m\"errors\"\u001b[0m: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m, \u001b[32m\"scanned\"\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m\"total_scanned\"\u001b[0m: \u001b[1;36m1\u001b[0m, \u001b[32m\"scanned_files\"\u001b[0m: \n",
-      "\u001b[1m[\u001b[0m\u001b[32m\"/Users/mehrinkiani/Documents/modelscan/notebooks/PyTorchModels/unsafe_model.pt\u001b[0m\n",
-      "\u001b[32m:unsafe_model/data.pkl\"\u001b[0m\u001b[1m]\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m\n"
+      "Scanning /Users/jennifercwagenberg/Code/Personal/fork/modelscan/notebooks/PyTorchModels/unsafe_model.pt:unsafe_model/data.pkl using modelscan.scanners.PickleUnsafeOpScan model scan\n",
+      "\u001b[1m{\u001b[0m\u001b[32m\"summary\"\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m\"total_issues_by_severity\"\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m\"LOW\"\u001b[0m: \u001b[1;36m0\u001b[0m, \u001b[32m\"MEDIUM\"\u001b[0m: \u001b[1;36m0\u001b[0m, \u001b[32m\"HIGH\"\u001b[0m: \u001b[1;36m0\u001b[0m, \n",
+      "\u001b[32m\"CRITICAL\"\u001b[0m: \u001b[1;36m1\u001b[0m\u001b[1m}\u001b[0m, \u001b[32m\"total_issues\"\u001b[0m: \u001b[1;36m1\u001b[0m, \u001b[32m\"input_path\"\u001b[0m: \n",
+      "\u001b[32m\"./PyTorchModels/unsafe_model.pt\"\u001b[0m, \u001b[32m\"absolute_path\"\u001b[0m: \n",
+      "\u001b[32m\"/Users/jennifercwagenberg/Code/Personal/fork/modelscan/notebooks/PyTorchModels\"\u001b[0m\n",
+      ", \u001b[32m\"modelscan_version\"\u001b[0m: \u001b[32m\"0.0.0\"\u001b[0m, \u001b[32m\"timestamp\"\u001b[0m: \u001b[32m\"2024-04-05T21:31:34.897088\"\u001b[0m, \n",
+      "\u001b[32m\"scanned\"\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m\"total_scanned\"\u001b[0m: \u001b[1;36m1\u001b[0m, \u001b[32m\"scanned_files\"\u001b[0m: \n",
+      "\u001b[1m[\u001b[0m\u001b[32m\"unsafe_model.pt:unsafe_model/data.pkl\"\u001b[0m\u001b[1m]\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m, \u001b[32m\"issues\"\u001b[0m: \u001b[1m[\u001b[0m\u001b[1m{\u001b[0m\u001b[32m\"description\"\u001b[0m: \u001b[32m\"Use of \u001b[0m\n",
+      "\u001b[32munsafe operator 'system' from module 'posix'\"\u001b[0m, \u001b[32m\"operator\"\u001b[0m: \u001b[32m\"system\"\u001b[0m, \u001b[32m\"module\"\u001b[0m: \n",
+      "\u001b[32m\"posix\"\u001b[0m, \u001b[32m\"source\"\u001b[0m: \u001b[32m\"unsafe_model.pt:unsafe_model/data.pkl\"\u001b[0m, \u001b[32m\"scanner\"\u001b[0m: \n",
+      "\u001b[32m\"modelscan.scanners.PickleUnsafeOpScan\"\u001b[0m, \u001b[32m\"severity\"\u001b[0m: \u001b[32m\"CRITICAL\"\u001b[0m\u001b[1m}\u001b[0m\u001b[1m]\u001b[0m, \u001b[32m\"errors\"\u001b[0m: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\u001b[1m}\u001b[0m\n"
      ]
     }
    ],
@@ -312,13 +327,6 @@
    "source": [
     "# This will save the scan results in file: pytorch-model-scan-results.json\n",
     "!modelscan --path ./PyTorchModels/unsafe_model.pt -r json -o pytorch-model-scan-results.json"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
@@ -337,7 +345,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.13"
+   "version": "3.10.14"
   },
   "vscode": {
    "interpreter": {
diff --git a/notebooks/utils/pickle_codeinjection.py b/notebooks/utils/pickle_codeinjection.py
index f592922..982d06f 100644
--- a/notebooks/utils/pickle_codeinjection.py
+++ b/notebooks/utils/pickle_codeinjection.py
@@ -1,6 +1,9 @@
 import pickle
 import struct
 import os
+from typing import overload
+
+from git import Union
 
 
 class PickleInject:
@@ -88,7 +91,44 @@ def __reduce__(self):
         return self.command, (self.args, {})
 
 
-def get_payload(command: str, malicious_code: str):
+@overload
+def get_payload(command: str, malicious_code: str) -> PickleInject.System:
+    ...
+
+
+@overload
+def get_payload(command: str, malicious_code: str) -> PickleInject.Exec:
+    ...
+
+
+@overload
+def get_payload(command: str, malicious_code: str) -> PickleInject.Eval:
+    ...
+
+
+@overload
+def get_payload(command: str, malicious_code: str) -> PickleInject.RunPy:
+    ...
+
+
+def get_payload(
+    command: str, malicious_code: str
+) -> Union[
+    PickleInject.System, PickleInject.Exec, PickleInject.Eval, PickleInject.RunPy
+]:
+    """
+    Get the payload based on the command and malicious code provided.
+
+    Args:
+        command: The command to execute.
+        malicious_code: The malicious code to inject.
+
+    Returns:
+        object: The payload object based on the command.
+
+    Raises:
+        ValueError: If an invalid command is provided.
+    """
     if command == "system":
         payload = PickleInject.System(malicious_code)
     elif command == "exec":
@@ -97,6 +137,9 @@ def get_payload(command: str, malicious_code: str):
         payload = PickleInject.Eval(malicious_code)
     elif command == "runpy":
         payload = PickleInject.RunPy(malicious_code)
+    else:
+        raise ValueError("Invalid command provided.")
+
     return payload
diff --git a/notebooks/utils/pytorch_sentiment_model.py b/notebooks/utils/pytorch_sentiment_model.py
index ad376c1..10e03e5 100644
--- a/notebooks/utils/pytorch_sentiment_model.py
+++ b/notebooks/utils/pytorch_sentiment_model.py
@@ -1,3 +1,4 @@
+from typing import Final
 from transformers import AutoModelForSequenceClassification
 from transformers import AutoTokenizer
 import numpy as np
@@ -6,10 +7,21 @@
 import urllib.request
 import torch
 
+SENTIMENT_TASK: Final[str] = "sentiment"
 
-# Preprocess text (username and link placeholders)
-def preprocess(text):
-    new_text = []
+
+def _preprocess(text: str) -> str:
+    """
+    Preprocess the given text by replacing usernames starting with '@' with '@user'
+    and replacing URLs starting with 'http' with 'http'.
+
+    Args:
+        text: The input text to be preprocessed.
+
+    Returns:
+        The preprocessed text.
+    """
+    new_text: list[str] = []
 
     for t in text.split(" "):
         t = "@user" if t.startswith("@") and len(t) > 1 else t
@@ -18,27 +30,37 @@
     return " ".join(new_text)
 
 
-def download_model(safe_model_path):
-    task = "sentiment"
-    MODEL = f"cardiffnlp/twitter-roberta-base-{task}"
-    # PT
-    model = AutoModelForSequenceClassification.from_pretrained(MODEL)
+def download_model(safe_model_path: str) -> None:
+    """
+    Download a pre-trained model and saves it to the specified path.
+
+    Args:
+        safe_model_path: The path where the model will be saved.
+    """
+    pretrained_model_name = f"cardiffnlp/twitter-roberta-base-{SENTIMENT_TASK}"
+    model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name)
     torch.save(model, safe_model_path)
 
 
-def predict_sentiment(text: str, model):
-    task = "sentiment"
-    MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
-    tokenizer = AutoTokenizer.from_pretrained(MODEL)
+def predict_sentiment(text: str, model: str) -> None:
+    """
+    Predict the sentiment of a given text using a pre-trained sentiment analysis model.
+
+    Args:
+        text: The input text to analyze.
+        model: The name or path of the pre-trained sentiment analysis model.
+    """
+    pretrained_model_name = "cardiffnlp/twitter-roberta-base-sentiment"
+    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
 
-    text = preprocess(text)
+    text = _preprocess(text)
     encoded_input = tokenizer(text, return_tensors="pt")
     output = model(**encoded_input)
     scores = output[0][0].detach().numpy()
     scores = softmax(scores)
 
-    labels = []
-    mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
+    labels: list[str] = []
+    mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{SENTIMENT_TASK}/mapping.txt"
     with urllib.request.urlopen(mapping_link) as f:
         html = f.read().decode("utf-8").split("\n")
         csvreader = csv.reader(html, delimiter="\t")