-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathscraperrank.py
151 lines (130 loc) · 4.81 KB
/
scraperrank.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# coding=utf-8
import re
import sys
import warnings

import cv2
import pytesseract
from bs4 import BeautifulSoup
from PIL import Image
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
# Location of the Tesseract binary that pytesseract runs for captcha OCR.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract'
# Silence all warnings unless the user passed -W options on the command line.
if not sys.warnoptions:
    warnings.simplefilter("ignore")

# Input for Branch and USNs
# College prefixes; each one is combined with every USN serial in the range.
college = ["4AI", "1BG", "1CR", "1AM", "1BI"]
# Typed values are stripped: stray whitespace would otherwise end up inside
# every generated USN and defeat the semester/cycle comparisons below.
year = input('Enter the year\n').strip()
branch = input('Please enter the branch\n').strip().upper()
low = int(input('Enter starting USN\n'))
# increment last USN to aid looping (range() excludes its upper bound)
high = int(input('Enter last USN\n')) + 1
semc = input('Enter the Semester\n').strip()
cycle = 'N'
# Runs whose starting serial is 400 or above are flagged 'Y'
# (presumably diploma / lateral-entry students - confirm).
dip = 'Y' if low >= 400 else 'N'
# subcode: exclusive upper bound of the result-cell index range, walked in
# steps of 6 starting at 6.  iloop: number of subject rows in that range.
subcode = 52
iloop = 8
if semc in ('1', '2'):
    # The cycle is only asked for in semesters 1 and 2; 'P' has one row fewer.
    cycle = input('Enter the Cycle\n').strip().upper()
    if cycle == 'P':
        iloop = 7
        subcode = 46
if semc in ('3', '4') and dip == 'Y':
    # Flagged runs in semesters 3 and 4 have one row more.
    iloop = 9
    subcode = 58
# Scrapes one result page per (college prefix, USN serial) pair and writes one
# comma-separated line per student to test2.txt.


def _subject_key(code_text):
    """Return the key used to order subject rows.

    The key is the last three characters of the cell text when they are all
    digits, otherwise the last two characters.  Keys are compared as strings.
    """
    tail = code_text[-3:]
    return tail if tail.isdigit() else code_text[-2:]


# Characters removed from the USN and name cells before they are recorded.
_GARBAGE = re.compile('[!@#$:]')

# Opens file for storing data
with open('test2.txt', 'w+') as f:
    # Collects the last cell of every subject row, comma separated.  It is not
    # read again anywhere in the visible script.
    pf = ''
    # Raw string so the backslashes in the Windows path are taken literally.
    driver = webdriver.Chrome(r'C:\Program Files (x86)\chromedriver_win32\chromedriver.exe')
    try:
        # For Loop to loop through all USNs
        for x in college:
            for u in range(low, high):
                # USN = college prefix + year + branch + 3-digit zero-padded serial
                usn = x + year + branch + str(u).zfill(3)
                # opens the vtu result login page and screenshots it so the
                # captcha can be read from the image
                driver.get('http://results.vtu.ac.in/resultsvitavicbcs_19/index.php')
                driver.save_screenshot('python_org.png')
                img = cv2.imread("python_org.png")
                # Fixed pixel window of the screenshot that is passed to OCR.
                crop_img = img[467:508, 667:885]
                cv2.imwrite('cap.png', crop_img)
                tex = pytesseract.image_to_string(Image.open('cap.png'))
                # Keep the digits as text: converting to int would drop leading
                # zeros and raise on any stray character in the OCR output.
                captcha = re.sub('[^0-9]', '', tex)
                if not captcha:
                    print('Could not read captcha for ' + usn + ', skipping')
                    continue
                driver.find_element_by_name("lns").send_keys(usn)
                driver.find_element_by_name("captchacode").send_keys(captcha)
                driver.find_element_by_id("submit").click()
                try:
                    page = driver.page_source
                except WebDriverException:
                    # The page cannot be read (the original code expects an
                    # alert here); dismiss it and move on to the next USN.
                    try:
                        driver.switch_to.alert.dismiss()
                    except WebDriverException:
                        # The alert is already gone; nothing left to dismiss.
                        pass
                    continue
                soup = BeautifulSoup(page, 'html.parser')
                # Finds all the table elements and stores in array tds
                tds = soup.find_all('td')
                divCell = soup.find_all('div', attrs={'class': 'divTableCell'})
                try:
                    # tds[1] holds USN number, tds[3] holds the name
                    record = (_GARBAGE.sub('', tds[1].text) + ','
                              + _GARBAGE.sub('', tds[3].text) + ',')
                    # Every 6th cell from index 6 starts a subject row; rows
                    # are visited in ascending key order (stable for ties).
                    ilist = sorted(range(6, subcode, 6),
                                   key=lambda j: _subject_key(divCell[j].text))
                    # All cells of each row except the second one (row + 1).
                    cells = [divCell[j].text
                             for row in ilist
                             for j in range(row, row + 6)
                             if j != row + 1]
                    outcomes = [divCell[row + 5].text for row in ilist]
                except IndexError:
                    # The page has fewer cells than this semester layout
                    # expects; drop this student instead of aborting the run.
                    print('Incomplete result page for ' + usn + ', skipping')
                    continue
                print(record, end='\t')
                for text in cells:
                    # Purely numeric cells are normalised through int().
                    record += (str(int(text)) if text.isdigit() else text) + ','
                    print(text, end='\t\t')
                pf += ''.join(outcome + ',' for outcome in outcomes)
                f.write(record + '\n')
                print('\n')
    finally:
        # Always release the browser, even if the run is interrupted.
        driver.quit()
# Post-processing step: runs only when the run was not flagged dip == 'Y'
# (i.e. the starting USN serial was below 400).
if dip != 'Y':
    # sgparank is a project-local module; it is imported only on this branch.
    from sgparank import gpa2
    # `high` is the last USN serial + 1, as computed when the inputs were read.
    # NOTE(review): presumably gpa2 derives GPAs/ranks from the scraped
    # output - confirm against sgparank.
    gpa2(year, branch, low, high, semc, cycle)