-
Notifications
You must be signed in to change notification settings - Fork 2
/
malrec_download.py
81 lines (63 loc) · 1.85 KB
/
malrec_download.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# You need to run this script under python2.x
import urllib
import re
import Queue
# Root URL of the malrec site; the crawl start URL and every pcap download
# URL in this script are built by appending to it (note the trailing slash).
base = 'https://giantpanda.gtisc.gatech.edu/malrec/'
# NOTE(review): the two lists below are not referenced anywhere in the visible
# script; presumably leftovers from a more general crawler -- confirm before
# removing.
treated_list = []
untreat_list = []
def url_is_in_site(url):
    """Report whether ``url`` belongs to the crawled site.

    This is a stub: no check is implemented and every URL is accepted.
    The original ``if True: ... else: ...`` had an unreachable ``else``
    branch, so the function has been reduced to its only effective result.

    :param url: URL to test (currently ignored).
    :return: always ``True``.
    """
    return True
def get_form(url):
    """Fetch ``url`` and save the first matched ``<form>`` block to form.txt.

    The page is searched with a DOTALL regex that requires a carriage return
    somewhere between ``<form`` and ``form>``. When nothing matches, a
    message is printed and ``form.txt`` is left untouched instead of failing
    with an IndexError after the file has already been truncated.

    :param url: address of the page to download.
    :return: None.
    """
    response = urllib.urlopen(url)
    try:
        html = response.read()
    finally:
        # Always release the connection, even if reading fails.
        response.close()
    print("*******************************")
    reg = r'<form.*\r.*form>'
    form_list = re.findall(reg, html, re.S)
    if not form_list:
        print('no form found: %s' % url)
        return
    with open('form.txt', 'w') as f:
        f.write(form_list[0])
    print('OK')
    # Cases noted by the original author (presumably not handled yet):
    #   - the form lives inside an iframe
    #   - the form is emitted by JS writeForm, or by iframe + JS writeForm
# Directory (relative to the working directory) that receives the downloads.
dest_dir = "./malrec_pcaps/"


def downLoadPicFromURL(url, jpg_url):
    """Download ``url`` into ``dest_dir`` under the name ``jpg_url``.

    The destination directory is created when it does not exist yet, so the
    first run no longer fails on every file. Any error (including a failure
    to create the directory) is printed and swallowed, so that one bad file
    does not stop the caller.

    :param url: full URL of the file to fetch.
    :param jpg_url: file name, appended verbatim to ``dest_dir``.
    :return: None.
    """
    import os  # local import: keeps this block self-contained

    try:
        if not os.path.isdir(dest_dir):
            os.makedirs(dest_dir)
        urllib.urlretrieve(url, dest_dir + jpg_url)
    except Exception as e:
        print('\tError retrieving the URL: %s %s' % (url, e))
def get_jpg(url):
    """Return the pcap links found in the page at ``url``.

    Every ``a href`` value that ends in ``.pcap`` is collected; duplicates
    are dropped and the result is sorted so the download order is stable.

    :param url: address of the listing page to download and scan.
    :return: sorted list of unique href strings ending in ``.pcap``.
    """
    response = urllib.urlopen(url)
    try:
        html = response.read()
    finally:
        # Always release the connection, even if reading fails.
        response.close()
    # The capture excludes quote characters so a match cannot run across
    # several anchors on one line, and the dot before "pcap" is escaped so
    # only a literal ".pcap" suffix is accepted.
    reg = r'a href=[\'\"]([^\'\"]*\.pcap)[\'\"]'
    return sorted(set(re.findall(reg, html)))
# Crawl driver: take listing pages off the queue and download every pcap
# linked from them.
uq = Queue.Queue(maxsize=-1)  # a non-positive maxsize means "unbounded"
# `base` already ends with '/', so append 'pcap/' without a leading slash;
# the previous '/pcap/' produced a '//' in the start URL.
uq.put(base + 'pcap/')
jpg_count = 0  # number of download attempts made so far
down_jpg = []  # hrefs that have already been handed to the downloader
while not uq.empty():
    print(uq.qsize())
    url = uq.get()
    print(url)
    print('==================================================================')
    jpg_list = get_jpg(url)
    for jpg_url in jpg_list:
        # NOTE(review): the original indentation was lost; this assumes the
        # "[wrong jpg url]" branch belongs to the 'http' check, i.e. absolute
        # links are reported and skipped -- confirm against the author's copy.
        if jpg_url[:4] == 'http':
            print('[wrong jpg url]: %s' % jpg_url)
            continue
        if jpg_url in down_jpg:
            continue
        real_url = base + 'pcap/' + jpg_url
        print(real_url)
        jpg_count += 1
        try:
            downLoadPicFromURL(real_url, jpg_url)
        except Exception:
            print("url wrong!!!")
        else:
            # Only remember the link when the download call did not raise.
            down_jpg.append(jpg_url)
print("get list done!")