Skip to content

Commit

Permalink
converted version to just run and download output (On Colab)
Browse files Browse the repository at this point in the history
  • Loading branch information
alloc7260 committed Sep 3, 2022
1 parent 3340658 commit 8791f5f
Show file tree
Hide file tree
Showing 12 changed files with 670 additions and 344 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@ By using selenium and pytesseract ocr module we can scrap any Text Captcha Human

Used Pandas for managing data in tabular format.

Here i provided source codes for scrapping data for analytics.
Here i provided source codes for scrapping data and further used for analytics.

### youtube :
### youtube (version 1) :

https://youtu.be/2nPUuaq4RRI (gturesults.in)

Expand Down
17 changes: 8 additions & 9 deletions gturesults.in/README.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
# Note :
# Result-Data-Analyzer

To run this in your system
## How to run :

1> you have to install tesseract-ocr in your system
1) Open notebook file in colab
2) Hit CTRL+F9 (Run All)
3) Resume run using CTRL+F10 (Run After) (After Runtime Restart)
4) Download output file

2> you have to download your browser's driver file
#### Scrap, Analyze & Enjoy!

here i use chrome browser in incognito mode so i downloaded chromedriver.exe

the driver is version specific

you have to define path for this driverfile in program
#### Thank You!
Binary file removed gturesults.in/g.xlsx
Binary file not shown.
Binary file removed gturesults.in/out.xlsx
Binary file not shown.
329 changes: 329 additions & 0 deletions gturesults.in/result_data_analyzer_gtu_results.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,329 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"private_outputs": true,
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"source": [
"install selenium\n",
"and\n",
"chrome driver (also define path)"
],
"metadata": {
"id": "-lYAwjJwe4Oa"
}
},
{
"cell_type": "code",
"source": [
"!pip install selenium\n",
"!apt-get update # to update ubuntu to correctly run apt install\n",
"!apt install chromium-chromedriver\n",
"!cp /usr/lib/chromium-browser/chromedriver /usr/bin\n",
"import sys\n",
"sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')"
],
"metadata": {
"id": "rjyqtRrYewxu"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"install tesseract"
],
"metadata": {
"id": "FFYAh381exh4"
}
},
{
"cell_type": "code",
"source": [
"!sudo apt install tesseract-ocr\n",
"!pip install pytesseract"
],
"metadata": {
"id": "2_AwFU5SQ5lT"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"restart runtime"
],
"metadata": {
"id": "MBT77b1SBIWq"
}
},
{
"cell_type": "code",
"source": [
"import os\n",
"os.kill(os.getpid(), 9)\n",
"#-----------OR-----------\n",
"# quit()\n",
"#-----------OR-----------\n",
"# exit()"
],
"metadata": {
"id": "LQu1QserAnKc"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"import dependancies"
],
"metadata": {
"id": "QQovnVXAfNbx"
}
},
{
"cell_type": "code",
"source": [
"from selenium import webdriver\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.common.keys import Keys\n",
"from selenium.webdriver.support.select import Select\n",
"from selenium.webdriver.chrome.service import Service\n",
"\n",
"import cv2\n",
"from PIL import Image, ImageCms, ImageFilter\n",
"import pytesseract\n",
"\n",
"import pandas as pd\n",
"import warnings\n",
"warnings.filterwarnings('ignore')"
],
"metadata": {
"id": "RhvflwoJfMZZ"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"helper functions"
],
"metadata": {
"id": "T2jcfxOqfdvk"
}
},
{
"cell_type": "code",
"source": [
"def step1():\n",
" # open webpage\n",
" driver.get(URL)\n",
"\n",
" # save captcha\n",
" imdata = driver.find_element(By.ID,\"imgCaptcha\")\n",
" with open(path, 'wb') as file:\n",
" file.write(imdata.screenshot_as_png)\n",
"\n",
"def step2():\n",
" # convert to inverted mask and save img_temp\n",
" im = cv2.imread(path)\n",
" gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)\n",
" thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]\n",
" horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 1))\n",
" Mask = cv2.morphologyEx(thresh, cv2.MORPH_OPEN,horizontal_kernel, iterations=2)\n",
" #Mask = cv2.bitwise_not(Mask)\n",
" cv2.imwrite(\"old.png\", Mask)\n",
"\n",
" # open img_temp and reinvert mask\n",
" img = Image.open(\"old.png\")\n",
" img = img.convert(\"RGBA\")\n",
" datas = img.getdata()\n",
" newData = []\n",
" for item in datas:\n",
" if item[0] == 0 and item[1] == 0 and item[2] == 0:\n",
" newData.append((255, 255, 255, 0))\n",
" else:\n",
" newData.append(item)\n",
" img.putdata(newData)\n",
"\n",
" # paste mask on img and save new_temp_img\n",
" background = Image.open(path)\n",
" background = background.convert(\"RGBA\")\n",
" background.paste(img,mask=img)\n",
" background.save(\"new.png\",\"PNG\")\n",
"\n",
"def step3(im): # solve captcha\n",
" im = Image.open(im) # open last saved img\n",
" im = im.crop((5,5,115,35)) # crop it\n",
" # conver image to extractable form elements (deffer captcha styles)\n",
" rgb = ImageCms.createProfile(colorSpace='sRGB')\n",
" lab = ImageCms.createProfile(colorSpace='LAB')\n",
" transform = ImageCms.buildTransform(inputProfile=rgb, outputProfile=lab, inMode='RGB', outMode='LAB')\n",
" lab_im = ImageCms.applyTransform(im=im, transform=transform)\n",
" l, a, b = lab_im.split()\n",
" im=l # select an element which is most extractable\n",
" im = im.filter(ImageFilter.MinFilter(3)) # filter it\n",
" result = pytesseract.image_to_string(im) # send it to ocr and save results to a variable\n",
" l=[]\n",
" l.append(result.strip())\n",
" if l[0]==\" \" or l[0]==\"\" : # if result will be empty then it will do above steps again untill it gets the result\n",
" step1()\n",
" step2()\n",
" l[0]=step3(\"new.png\")\n",
" return l[0] # return final result (maybe right or wrong)\n",
"\n",
"def step4(enroll,ans): # return data\n",
" # site automation \n",
" sel = Select (driver.find_element(By.ID,\"ddlbatch\")) # focus on select element\n",
" sel.select_by_value(exam) # select element by giving id (specific for a exam)\n",
" enr = driver.find_element(By.ID,\"txtenroll\") # get enrollment no. text box\n",
" captex = driver.find_element(By.ID,\"CodeNumberTextBox\") # get captcha text box\n",
" enr.send_keys(enroll) # send (type) given enrollment number to text box\n",
" captex.send_keys(ans) # send (type) extracted captcha text to text box\n",
" captex.send_keys(Keys.RETURN) # return (ENTER)\n",
"\n",
" ere = driver.find_element(By.ID,\"lblmsg\").text\n",
" if ere == \"ERROR: Incorrect captcha code, try again.\" : \n",
" return \"err\"\n",
" if ere == \"Your request count is reached to maximum limit, Please try again later.\" : \n",
" return \"reqover\"\n",
" if ere == \"Oppssss! Data not available.\" : \n",
" return \"nodata\"\n",
"\n",
" name = driver.find_element(By.ID,\"lblName\").text\n",
" sess = driver.find_element(By.ID,\"lblSession\").text\n",
" dd = driver.find_element(By.ID,\"lblDeclaredOn\").text\n",
" bra = driver.find_element(By.ID,\"lblBranchName\").text\n",
" cs = driver.find_element(By.ID,\"lblExamName\").text\n",
" csb = driver.find_element(By.ID,\"lblCUPBack\").text\n",
" tb = driver.find_element(By.ID,\"lblTotalBack\").text\n",
" spi = driver.find_element(By.ID,\"lblSPI\").text\n",
" cpi = driver.find_element(By.ID,\"lblCPI\").text\n",
" cgpa = driver.find_element(By.ID,\"lblCGPA\").text\n",
" cp = driver.find_element(By.ID,\"pt100Curr\").text\n",
" cup = driver.find_element(By.ID,\"pt100Cuml\").text\n",
" return [enroll,name,sess,cs,dd,bra,int(csb),int(tb),float(spi),float(cpi),float(cgpa),int(cp),int(cup),ere]\n",
"\n",
"def loop():\n",
" # just a loop through different enrollment numbers\n",
" global counter\n",
" mynewlist = []\n",
" for i in mylist :\n",
" enroll = \"{}\".format(i)\n",
" step1()\n",
" step2()\n",
" ans=step3(\"new.png\")\n",
" nr=step4(enroll,ans)\n",
" if nr == \"err\" :\n",
" mynewlist.append(enroll)\n",
" elif nr == \"reqover\" :\n",
" print(\"Change the SERVER!\")\n",
" break\n",
" elif nr == \"nodata\" :\n",
" df.loc[len(df)] = [enroll,\"nodata\",\"-\",\"-\",\"-\",\"-\",\"-\",\"-\",\"-\",\"-\",\"-\",\"-\",\"-\",\"-\"]\n",
" counter += 1\n",
" print(f\"{counter}/{tc} {int(counter*100/tc)}%\")\n",
" else :\n",
" df.loc[len(df)] = nr\n",
" counter += 1\n",
" print(f\"{counter}/{tc} {int(counter*100/tc)}%\")\n",
" return mynewlist"
],
"metadata": {
"id": "t9H8OdFrRLBY"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"main function"
],
"metadata": {
"id": "5QIu2udZgKKq"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "E_BfboOxLMJd"
},
"outputs": [],
"source": [
"try:\n",
"\t# initiate webdriver and configure options\n",
"\tchrome_options = webdriver.ChromeOptions()\n",
"\tchrome_options.add_argument('--headless')\n",
"\tchrome_options.add_argument('--no-sandbox')\n",
"\tchrome_options.add_argument('--disable-dev-shm-usage')\n",
"\tchrome_options.add_argument(\"--incognito\")\n",
"\n",
"\tser = Service(\"chromedriver\")\n",
"\tdriver = webdriver.Chrome(service=ser,options=chrome_options)\n",
"\n",
"\t# define url and filename for download captcha_temp\n",
"\tURL = \"https://www.gturesults.in/\"\n",
"\texam = \"3361$S2022$2022-08-25$current$0\"\n",
"\tpath=\"cap.jpg\"\n",
" \n",
"\t# create empty dataframe for filling output data with same labels that input file has\n",
"\tdf = pd.read_json('{\"Enrollment No.\":{},\"Name\":{},\"Session\":{},\"Exam\":{},\"Declared On\":{},\"Branch\":{},\"Current Sem. Backlog\":{},\"Total Backlog\":{},\"SPI\":{},\"CPI\":{},\"CGPA\":{},\"Current Points\":{},\"Cumulative points\":{},\"Message\":{}}')\n",
"\n",
"\tmylist = range(190280111001,190280111010+1) # give range of enrollment no. (here i given our batch's range)\n",
"\t\n",
"\tcounter = 0\n",
"\ttc = len(mylist)\n",
" \n",
"\t# main driver programm\n",
"\t# loop runs untill all data has scraped if any server error not happens\n",
"\twhile 1:\n",
"\t\tmynewlist=loop()\n",
"\t\tif len(mynewlist) != 0:\n",
"\t\t\tmylist = mynewlist\n",
"\t\telse:\n",
"\t\t\tbreak\n",
"\n",
"\t# save dataframe to excel file\n",
"\tdf.to_excel(\"out.xlsx\")\n",
"\n",
"finally:\n",
"\tdriver.close() # close the window\n",
"\tdriver.quit() # stop the driver\n",
"\t# remove unnecessary files\n",
"\timport os\n",
"\tos.remove(\"cap.jpg\")\n",
"\tos.remove(\"old.png\")\n",
"\tos.remove(\"new.png\")"
]
},
{
"cell_type": "markdown",
"source": [
"👈 download <font color='yellow'>out.xlsx</font> from left side bar by double clicking it"
],
"metadata": {
"id": "qgkULANw3ZQ2"
}
}
]
}
Loading

0 comments on commit 8791f5f

Please sign in to comment.