Skip to content

Commit

Permalink
Using pandas' on-the-fly decompression
Browse files Browse the repository at this point in the history
  • Loading branch information
Moohan authored Mar 25, 2019
1 parent 9221f59 commit c685328
Showing 1 changed file with 1 addition and 14 deletions.
15 changes: 1 addition & 14 deletions DNoiSe.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import pandas
import urllib
import random
import zipfile
import sqlite3
import datetime
import requests
Expand Down Expand Up @@ -56,26 +55,14 @@ def download_domains():
print >> log_file, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.mktime(datetime.datetime.now().timetuple())))+" Can't download the domain list. Quitting."
exit()

# Unzip the list
try:
print >> log_file, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.mktime(datetime.datetime.now().timetuple())))+" Unzipping…"
zip_ref = zipfile.ZipFile(working_directory+"domains.zip", "r")
zip_ref.extractall(working_directory)
zip_ref.close()

os.remove(working_directory+"domains.zip")
except:
print >> log_file, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.mktime(datetime.datetime.now().timetuple())))+" Extraction failed. Quitting."
exit()

# Create a SQLite database
try:
db = sqlite3.connect(working_directory+"domains.sqlite")
db.execute("CREATE TABLE Domains (ID INT PRIMARY KEY, Domain TEXT)")

# Load the CSV into our database
print >> log_file, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.mktime(datetime.datetime.now().timetuple())))+" Importing to sqlite…"
df = pandas.read_csv(working_directory+"top-1m.csv", names = ["ID", "Domain"])
df = pandas.read_csv(working_directory + "domains.zip", compression = 'zip', names = ["ID", "Domain"])
df.to_sql("Domains", db, if_exists = "append", index = False)

db.close()
Expand Down

0 comments on commit c685328

Please sign in to comment.