-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
converted version to just run and download output (On Colab)
- Loading branch information
Showing
12 changed files
with
670 additions
and
344 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,12 @@ | ||
# Note : | ||
# Result-Data-Analyzer | ||
|
||
To run this in your system | ||
## How to run : | ||
|
||
1> you have to install tesseract-ocr in your system | ||
1) Open notebook file in colab | ||
2) Hit CTRL+F9 (Run All) | ||
3) Resume run using CTRL+F10 (Run After) (After Runtime Restart) | ||
4) Download output file | ||
|
||
2> you have to download your browser's driver file | ||
#### Scrap, Analyze & Enjoy! | ||
|
||
here i use chrome browser in incognito mode so i downloaded chromedriver.exe | ||
|
||
the driver is version specific | ||
|
||
you have to define path for this driverfile in program | ||
#### Thank You! |
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,329 @@ | ||
{ | ||
"nbformat": 4, | ||
"nbformat_minor": 0, | ||
"metadata": { | ||
"colab": { | ||
"private_outputs": true, | ||
"provenance": [], | ||
"collapsed_sections": [] | ||
}, | ||
"kernelspec": { | ||
"name": "python3", | ||
"display_name": "Python 3" | ||
}, | ||
"language_info": { | ||
"name": "python" | ||
} | ||
}, | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"install selenium\n", | ||
"and\n", | ||
"chrome driver (also define path)" | ||
], | ||
"metadata": { | ||
"id": "-lYAwjJwe4Oa" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"!pip install selenium\n", | ||
"!apt-get update # to update ubuntu to correctly run apt install\n", | ||
"!apt install chromium-chromedriver\n", | ||
"!cp /usr/lib/chromium-browser/chromedriver /usr/bin\n", | ||
"import sys\n", | ||
"sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')" | ||
], | ||
"metadata": { | ||
"id": "rjyqtRrYewxu" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"install tesseract" | ||
], | ||
"metadata": { | ||
"id": "FFYAh381exh4" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"!sudo apt install tesseract-ocr\n", | ||
"!pip install pytesseract" | ||
], | ||
"metadata": { | ||
"id": "2_AwFU5SQ5lT" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"restart runtime" | ||
], | ||
"metadata": { | ||
"id": "MBT77b1SBIWq" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"import os\n", | ||
"os.kill(os.getpid(), 9)\n", | ||
"#-----------OR-----------\n", | ||
"# quit()\n", | ||
"#-----------OR-----------\n", | ||
"# exit()" | ||
], | ||
"metadata": { | ||
"id": "LQu1QserAnKc" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"import dependancies" | ||
], | ||
"metadata": { | ||
"id": "QQovnVXAfNbx" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"from selenium import webdriver\n", | ||
"from selenium.webdriver.common.by import By\n", | ||
"from selenium.webdriver.common.keys import Keys\n", | ||
"from selenium.webdriver.support.select import Select\n", | ||
"from selenium.webdriver.chrome.service import Service\n", | ||
"\n", | ||
"import cv2\n", | ||
"from PIL import Image, ImageCms, ImageFilter\n", | ||
"import pytesseract\n", | ||
"\n", | ||
"import pandas as pd\n", | ||
"import warnings\n", | ||
"warnings.filterwarnings('ignore')" | ||
], | ||
"metadata": { | ||
"id": "RhvflwoJfMZZ" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"helper functions" | ||
], | ||
"metadata": { | ||
"id": "T2jcfxOqfdvk" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"def step1():\n", | ||
" # open webpage\n", | ||
" driver.get(URL)\n", | ||
"\n", | ||
" # save captcha\n", | ||
" imdata = driver.find_element(By.ID,\"imgCaptcha\")\n", | ||
" with open(path, 'wb') as file:\n", | ||
" file.write(imdata.screenshot_as_png)\n", | ||
"\n", | ||
"def step2():\n", | ||
" # convert to inverted mask and save img_temp\n", | ||
" im = cv2.imread(path)\n", | ||
" gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)\n", | ||
" thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]\n", | ||
" horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 1))\n", | ||
" Mask = cv2.morphologyEx(thresh, cv2.MORPH_OPEN,horizontal_kernel, iterations=2)\n", | ||
" #Mask = cv2.bitwise_not(Mask)\n", | ||
" cv2.imwrite(\"old.png\", Mask)\n", | ||
"\n", | ||
" # open img_temp and reinvert mask\n", | ||
" img = Image.open(\"old.png\")\n", | ||
" img = img.convert(\"RGBA\")\n", | ||
" datas = img.getdata()\n", | ||
" newData = []\n", | ||
" for item in datas:\n", | ||
" if item[0] == 0 and item[1] == 0 and item[2] == 0:\n", | ||
" newData.append((255, 255, 255, 0))\n", | ||
" else:\n", | ||
" newData.append(item)\n", | ||
" img.putdata(newData)\n", | ||
"\n", | ||
" # paste mask on img and save new_temp_img\n", | ||
" background = Image.open(path)\n", | ||
" background = background.convert(\"RGBA\")\n", | ||
" background.paste(img,mask=img)\n", | ||
" background.save(\"new.png\",\"PNG\")\n", | ||
"\n", | ||
"def step3(im): # solve captcha\n", | ||
" im = Image.open(im) # open last saved img\n", | ||
" im = im.crop((5,5,115,35)) # crop it\n", | ||
" # conver image to extractable form elements (deffer captcha styles)\n", | ||
" rgb = ImageCms.createProfile(colorSpace='sRGB')\n", | ||
" lab = ImageCms.createProfile(colorSpace='LAB')\n", | ||
" transform = ImageCms.buildTransform(inputProfile=rgb, outputProfile=lab, inMode='RGB', outMode='LAB')\n", | ||
" lab_im = ImageCms.applyTransform(im=im, transform=transform)\n", | ||
" l, a, b = lab_im.split()\n", | ||
" im=l # select an element which is most extractable\n", | ||
" im = im.filter(ImageFilter.MinFilter(3)) # filter it\n", | ||
" result = pytesseract.image_to_string(im) # send it to ocr and save results to a variable\n", | ||
" l=[]\n", | ||
" l.append(result.strip())\n", | ||
" if l[0]==\" \" or l[0]==\"\" : # if result will be empty then it will do above steps again untill it gets the result\n", | ||
" step1()\n", | ||
" step2()\n", | ||
" l[0]=step3(\"new.png\")\n", | ||
" return l[0] # return final result (maybe right or wrong)\n", | ||
"\n", | ||
"def step4(enroll,ans): # return data\n", | ||
" # site automation \n", | ||
" sel = Select (driver.find_element(By.ID,\"ddlbatch\")) # focus on select element\n", | ||
" sel.select_by_value(exam) # select element by giving id (specific for a exam)\n", | ||
" enr = driver.find_element(By.ID,\"txtenroll\") # get enrollment no. text box\n", | ||
" captex = driver.find_element(By.ID,\"CodeNumberTextBox\") # get captcha text box\n", | ||
" enr.send_keys(enroll) # send (type) given enrollment number to text box\n", | ||
" captex.send_keys(ans) # send (type) extracted captcha text to text box\n", | ||
" captex.send_keys(Keys.RETURN) # return (ENTER)\n", | ||
"\n", | ||
" ere = driver.find_element(By.ID,\"lblmsg\").text\n", | ||
" if ere == \"ERROR: Incorrect captcha code, try again.\" : \n", | ||
" return \"err\"\n", | ||
" if ere == \"Your request count is reached to maximum limit, Please try again later.\" : \n", | ||
" return \"reqover\"\n", | ||
" if ere == \"Oppssss! Data not available.\" : \n", | ||
" return \"nodata\"\n", | ||
"\n", | ||
" name = driver.find_element(By.ID,\"lblName\").text\n", | ||
" sess = driver.find_element(By.ID,\"lblSession\").text\n", | ||
" dd = driver.find_element(By.ID,\"lblDeclaredOn\").text\n", | ||
" bra = driver.find_element(By.ID,\"lblBranchName\").text\n", | ||
" cs = driver.find_element(By.ID,\"lblExamName\").text\n", | ||
" csb = driver.find_element(By.ID,\"lblCUPBack\").text\n", | ||
" tb = driver.find_element(By.ID,\"lblTotalBack\").text\n", | ||
" spi = driver.find_element(By.ID,\"lblSPI\").text\n", | ||
" cpi = driver.find_element(By.ID,\"lblCPI\").text\n", | ||
" cgpa = driver.find_element(By.ID,\"lblCGPA\").text\n", | ||
" cp = driver.find_element(By.ID,\"pt100Curr\").text\n", | ||
" cup = driver.find_element(By.ID,\"pt100Cuml\").text\n", | ||
" return [enroll,name,sess,cs,dd,bra,int(csb),int(tb),float(spi),float(cpi),float(cgpa),int(cp),int(cup),ere]\n", | ||
"\n", | ||
"def loop():\n", | ||
" # just a loop through different enrollment numbers\n", | ||
" global counter\n", | ||
" mynewlist = []\n", | ||
" for i in mylist :\n", | ||
" enroll = \"{}\".format(i)\n", | ||
" step1()\n", | ||
" step2()\n", | ||
" ans=step3(\"new.png\")\n", | ||
" nr=step4(enroll,ans)\n", | ||
" if nr == \"err\" :\n", | ||
" mynewlist.append(enroll)\n", | ||
" elif nr == \"reqover\" :\n", | ||
" print(\"Change the SERVER!\")\n", | ||
" break\n", | ||
" elif nr == \"nodata\" :\n", | ||
" df.loc[len(df)] = [enroll,\"nodata\",\"-\",\"-\",\"-\",\"-\",\"-\",\"-\",\"-\",\"-\",\"-\",\"-\",\"-\",\"-\"]\n", | ||
" counter += 1\n", | ||
" print(f\"{counter}/{tc} {int(counter*100/tc)}%\")\n", | ||
" else :\n", | ||
" df.loc[len(df)] = nr\n", | ||
" counter += 1\n", | ||
" print(f\"{counter}/{tc} {int(counter*100/tc)}%\")\n", | ||
" return mynewlist" | ||
], | ||
"metadata": { | ||
"id": "t9H8OdFrRLBY" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"main function" | ||
], | ||
"metadata": { | ||
"id": "5QIu2udZgKKq" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "E_BfboOxLMJd" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"try:\n", | ||
"\t# initiate webdriver and configure options\n", | ||
"\tchrome_options = webdriver.ChromeOptions()\n", | ||
"\tchrome_options.add_argument('--headless')\n", | ||
"\tchrome_options.add_argument('--no-sandbox')\n", | ||
"\tchrome_options.add_argument('--disable-dev-shm-usage')\n", | ||
"\tchrome_options.add_argument(\"--incognito\")\n", | ||
"\n", | ||
"\tser = Service(\"chromedriver\")\n", | ||
"\tdriver = webdriver.Chrome(service=ser,options=chrome_options)\n", | ||
"\n", | ||
"\t# define url and filename for download captcha_temp\n", | ||
"\tURL = \"https://www.gturesults.in/\"\n", | ||
"\texam = \"3361$S2022$2022-08-25$current$0\"\n", | ||
"\tpath=\"cap.jpg\"\n", | ||
" \n", | ||
"\t# create empty dataframe for filling output data with same labels that input file has\n", | ||
"\tdf = pd.read_json('{\"Enrollment No.\":{},\"Name\":{},\"Session\":{},\"Exam\":{},\"Declared On\":{},\"Branch\":{},\"Current Sem. Backlog\":{},\"Total Backlog\":{},\"SPI\":{},\"CPI\":{},\"CGPA\":{},\"Current Points\":{},\"Cumulative points\":{},\"Message\":{}}')\n", | ||
"\n", | ||
"\tmylist = range(190280111001,190280111010+1) # give range of enrollment no. (here i given our batch's range)\n", | ||
"\t\n", | ||
"\tcounter = 0\n", | ||
"\ttc = len(mylist)\n", | ||
" \n", | ||
"\t# main driver programm\n", | ||
"\t# loop runs untill all data has scraped if any server error not happens\n", | ||
"\twhile 1:\n", | ||
"\t\tmynewlist=loop()\n", | ||
"\t\tif len(mynewlist) != 0:\n", | ||
"\t\t\tmylist = mynewlist\n", | ||
"\t\telse:\n", | ||
"\t\t\tbreak\n", | ||
"\n", | ||
"\t# save dataframe to excel file\n", | ||
"\tdf.to_excel(\"out.xlsx\")\n", | ||
"\n", | ||
"finally:\n", | ||
"\tdriver.close() # close the window\n", | ||
"\tdriver.quit() # stop the driver\n", | ||
"\t# remove unnecessary files\n", | ||
"\timport os\n", | ||
"\tos.remove(\"cap.jpg\")\n", | ||
"\tos.remove(\"old.png\")\n", | ||
"\tos.remove(\"new.png\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"👈 download <font color='yellow'>out.xlsx</font> from left side bar by double clicking it" | ||
], | ||
"metadata": { | ||
"id": "qgkULANw3ZQ2" | ||
} | ||
} | ||
] | ||
} |
Oops, something went wrong.