diff --git a/docs/dataset/testdataset.csv b/docs/dataset/testdataset.csv deleted file mode 100644 index a829675..0000000 --- a/docs/dataset/testdataset.csv +++ /dev/null @@ -1,18 +0,0 @@ -Label,Num Of Sections,TLD,TLD Length,Domain,Domain Length,URL -good,2,com,3,google,6,google.com -good,2,nl,2,google,6,google.nl -good,2,edu,3,emich,5,emich.edu -good,3,edu,3,emich,5,canvas.emich.edu -good,3,com,3,www.theverge,8,www.theverge.com -good,2,com,3,theverge,8,theverge.com -bad,2,com,3,824555,6,824555.com -bad,2,com,3,retajconsultancy,16,retajconsultancy.com -bad,2,info,4,9779,4,9779.info -bad,2,com,3,chinacxyy,9,chinacxyy.com -bad,3,vn,2,com,3,grasslandhotel.com.vn -bad,3,com,3,readmyweather,13,microencapsulation.readmyweather.com -bad,2,com,3,0068555,7,0068555.com -bad,2,com,3,haishundl,9,haishundl.com -bad,2,nl,2,zoetekroon,10,zoetekroon.nl -bad,2,com,3,socialsocietyedu,16,socialsocietyedu.com -bad,2,ru,2,3cf,3,3cf.ru diff --git a/docs/main.css b/docs/main.css index 4e9669a..b90a379 100644 --- a/docs/main.css +++ b/docs/main.css @@ -1,10 +1,25 @@ +:root { + --background-color: white; + --card-color: white; + --text-color: black; +} +@media (prefers-color-scheme: dark) { + :root { + --background-color: black; + --card-color: #121212; + --text-color: white; + } +} body { + background-color: var(--background-color); + color: var(--text-color); text-align: center; + font-family: Arial, Helvetica, sans-serif; } div.card { - background-color: white; - border: 2px solid black; + background-color: var(--card-color); + border: 2px solid var(--text-color); border-radius: 16px; padding: 1em; width: 30vw; diff --git a/docs/main.py b/docs/main.py index eb168ae..2f5dd4f 100644 --- a/docs/main.py +++ b/docs/main.py @@ -5,7 +5,8 @@ from sklearn.preprocessing import StandardScaler from scipy.sparse import hstack from pyweb import pydom -from os.path import exists +from urllib.parse import urlparse +import csv # Get the input container element input_container = pydom['#input-container'] @@ -15,23 +16,39 @@ progress_text = document.querySelector("#progress-text") progress_text.innerText = "" +# Get the result text HTML element +result_text = document.querySelector('#result-text') + # Gets the model and returns it def load_files(): model = joblib.load('model.pkl') tfidf = joblib.load('tfidf.pkl') - test_dataset = pd.read_csv("testdataset.csv") - return model, tfidf, test_dataset + return model, tfidf + +def test_model(model, tfidf, formatted_url): + with open('testdataset.csv', mode ='w', newline='', encoding="utf-8") as test_file: + parameters = [] + # Open writer to CSV file + csvWrite = csv.writer(test_file, delimiter=' ', escapechar=' ', quoting=csv.QUOTE_NONE) + # Add the labels to parameters + parameters.append('NumOfSections,TLD,TLDLength,Domain,DomainLength,URL') + # Write the labels to the CSV file + csvWrite.writerow(parameters) + parameters = [] + # Add the URL to parameters + parameters.append(formatted_url) + csvWrite.writerow(parameters) + + test_dataset = pd.read_csv("testdataset.csv") -def test_model(model, tfidf, test_dataset): # Turn the test dataset into a pandas data frame dataframe = pd.DataFrame(test_dataset) - x = dataframe[['Num Of Sections', 'TLD', 'TLD Length', 'Domain', 'Domain Length', 'URL']] - y = dataframe['Label'] + x = dataframe[['NumOfSections', 'TLD', 'TLDLength', 'Domain', 'DomainLength', 'URL']] # Separate text and numeric features text_features = ['TLD', 'Domain', 'URL'] - numeric_features = ['Num Of Sections', 'TLD Length', 'Domain Length'] + numeric_features = ['NumOfSections', 'TLDLength', 'DomainLength'] # Turn the text columns into a string and vectorize features_test_text = tfidf.transform(x[text_features].apply(lambda row: ' '.join(row.astype(str)), axis=1)) @@ -44,33 +61,41 @@ def test_model(model, tfidf, test_dataset): # Recombine the text_features and the numeric_features features_test = hstack([features_test_text, features_test_numeric]) - # Use the model to predict the label for each of - # the test domains and then print the result - display("URLS:", x['URL'].to_list()) + # Use the model to predict the label for the url prediction = model.predict(features_test) - display("\nPREDIC:", prediction) - count = 0 - correct_count = 0 - for label in range(len(y)): - if (prediction[label] == y.to_list()[label]): - correct_count = correct_count + 1 - count = count + 1 - display("ACTUAL:", y.to_list()) - display("CORRECT PREDICTIONS:", correct_count,"/",count) - display("ACCURACY OF THIS TEST:", (correct_count/count) * 100) + result_text.innerText = "Prediction: " + prediction[0] + def get_url(event): input_text = document.querySelector("#url-input") url = input_text.value - output_text = document.querySelector("#result-text") - output_text.innerText = url - run(url) + run(get_formatted_url(url)) + +def get_hostname(url): + if (urlparse(url).scheme == ""): + url = "https://" + url + + result = urlparse(url) + return result.hostname + +def get_formatted_url(url): + link = get_hostname(url) + comma = "," + # Split the link into sections + linkSplit = link.split(".") + # Get the number of sections + numOfSections = len(linkSplit) + # Get the TLD and its length + tld = linkSplit[len(linkSplit) - 1] + tldLength = len(tld) + # Get the domain and its length + domain = linkSplit[len(linkSplit) - 2] + domainLength = len(domain) + return (str(numOfSections) + comma + tld + comma + str(tldLength) + comma + domain + comma + str(domainLength) + comma + link) def run(url): - # Load the model from storage - model, tfidf, test_dataset = load_files() - # Test the model with the test dataset - test_model(model, tfidf, test_dataset) + model, tfidf = load_files() + test_model(model, tfidf, url) diff --git a/docs/models/model.pkl b/docs/models/model.pkl index 691af0a..cfb04bf 100644 --- a/docs/models/model.pkl +++ b/docs/models/model.pkl @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:561f9195406cedbe0c5ed0562827cbe9a31c5cb10fc9c5f76602ddefed6e6c9e -size 162981301 +oid sha256:47bafdce64d92b0bf656ad40c98f28b96afd9b3a179336107a7fd7ce384a9a78 +size 322798693 diff --git a/docs/pyscript.json b/docs/pyscript.json index a0b965d..97fa723 100644 --- a/docs/pyscript.json +++ b/docs/pyscript.json @@ -6,7 +6,6 @@ ], "files": { "./models/model.pkl": "model.pkl", - "./models/tfidf.pkl": "tfidf.pkl", - "./dataset/testdataset.csv": "testdataset.csv" + "./models/tfidf.pkl": "tfidf.pkl" } } \ No newline at end of file diff --git a/models/model.pkl b/models/model.pkl index 691af0a..cfb04bf 100644 --- a/models/model.pkl +++ b/models/model.pkl @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:561f9195406cedbe0c5ed0562827cbe9a31c5cb10fc9c5f76602ddefed6e6c9e -size 162981301 +oid sha256:47bafdce64d92b0bf656ad40c98f28b96afd9b3a179336107a7fd7ce384a9a78 +size 322798693