converted version to just run and download output (On Colab)

alloc7260 · Sep 3, 2022 · 8791f5f · 8791f5f
1 parent 3340658
commit 8791f5f
Show file tree

Hide file tree

Showing 12 changed files with 670 additions and 344 deletions.
diff --git a/README.md b/README.md
@@ -13,9 +13,9 @@ By using selenium and pytesseract ocr module we can scrap any Text Captcha Human
 
 Used Pandas for managing data in tabular format.
 
-Here i provided source codes for scrapping data for analytics.
+Here i provided source codes for scrapping data and further used for analytics.
 
-### youtube :
+### youtube (version 1) :
 
 https://youtu.be/2nPUuaq4RRI (gturesults.in)
 

diff --git a/gturesults.in/README.md b/gturesults.in/README.md
@@ -1,13 +1,12 @@
-# Note :
+# Result-Data-Analyzer
 
-To run this in your system 
+## How to run :
 
-1> you have to install tesseract-ocr in your system
+1) Open notebook file in colab
+2) Hit CTRL+F9 (Run All)
+3) Resume run using CTRL+F10 (Run After) (After Runtime Restart)
+4) Download output file
 
-2> you have to download your browser's driver file
+#### Scrap, Analyze & Enjoy!
 
-	here i use chrome browser in incognito mode so i downloaded chromedriver.exe
-
-	the driver is version specific
-
-	you have to define path for this driverfile in program
+#### Thank You!
diff --git a/gturesults.in/g.xlsx b/gturesults.in/g.xlsx
diff --git a/gturesults.in/out.xlsx b/gturesults.in/out.xlsx
diff --git a/gturesults.in/result_data_analyzer_gtu_results.ipynb b/gturesults.in/result_data_analyzer_gtu_results.ipynb
@@ -0,0 +1,329 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "private_outputs": true,
+      "provenance": [],
+      "collapsed_sections": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "source": [
+        "install selenium\n",
+        "and\n",
+        "chrome driver (also define path)"
+      ],
+      "metadata": {
+        "id": "-lYAwjJwe4Oa"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!pip install selenium\n",
+        "!apt-get update # to update ubuntu to correctly run apt install\n",
+        "!apt install chromium-chromedriver\n",
+        "!cp /usr/lib/chromium-browser/chromedriver /usr/bin\n",
+        "import sys\n",
+        "sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')"
+      ],
+      "metadata": {
+        "id": "rjyqtRrYewxu"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "install tesseract"
+      ],
+      "metadata": {
+        "id": "FFYAh381exh4"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!sudo apt install tesseract-ocr\n",
+        "!pip install pytesseract"
+      ],
+      "metadata": {
+        "id": "2_AwFU5SQ5lT"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "restart runtime"
+      ],
+      "metadata": {
+        "id": "MBT77b1SBIWq"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import os\n",
+        "os.kill(os.getpid(), 9)\n",
+        "#-----------OR-----------\n",
+        "# quit()\n",
+        "#-----------OR-----------\n",
+        "# exit()"
+      ],
+      "metadata": {
+        "id": "LQu1QserAnKc"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "import dependancies"
+      ],
+      "metadata": {
+        "id": "QQovnVXAfNbx"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from selenium import webdriver\n",
+        "from selenium.webdriver.common.by import By\n",
+        "from selenium.webdriver.common.keys import Keys\n",
+        "from selenium.webdriver.support.select import Select\n",
+        "from selenium.webdriver.chrome.service import Service\n",
+        "\n",
+        "import cv2\n",
+        "from PIL import Image, ImageCms, ImageFilter\n",
+        "import pytesseract\n",
+        "\n",
+        "import pandas as pd\n",
+        "import warnings\n",
+        "warnings.filterwarnings('ignore')"
+      ],
+      "metadata": {
+        "id": "RhvflwoJfMZZ"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "helper functions"
+      ],
+      "metadata": {
+        "id": "T2jcfxOqfdvk"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "def step1():\n",
+        "    # open webpage\n",
+        "    driver.get(URL)\n",
+        "\n",
+        "    # save captcha\n",
+        "    imdata = driver.find_element(By.ID,\"imgCaptcha\")\n",
+        "    with open(path, 'wb') as file:\n",
+        "        file.write(imdata.screenshot_as_png)\n",
+        "\n",
+        "def step2():\n",
+        "    # convert to inverted mask and save img_temp\n",
+        "    im = cv2.imread(path)\n",
+        "    gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)\n",
+        "    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]\n",
+        "    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 1))\n",
+        "    Mask = cv2.morphologyEx(thresh, cv2.MORPH_OPEN,horizontal_kernel, iterations=2)\n",
+        "    #Mask = cv2.bitwise_not(Mask)\n",
+        "    cv2.imwrite(\"old.png\", Mask)\n",
+        "\n",
+        "    # open img_temp and reinvert mask\n",
+        "    img = Image.open(\"old.png\")\n",
+        "    img = img.convert(\"RGBA\")\n",
+        "    datas = img.getdata()\n",
+        "    newData = []\n",
+        "    for item in datas:\n",
+        "        if item[0] == 0 and item[1] == 0 and item[2] == 0:\n",
+        "            newData.append((255, 255, 255, 0))\n",
+        "        else:\n",
+        "            newData.append(item)\n",
+        "    img.putdata(newData)\n",
+        "\n",
+        "    # paste mask on img and save new_temp_img\n",
+        "    background = Image.open(path)\n",
+        "    background = background.convert(\"RGBA\")\n",
+        "    background.paste(img,mask=img)\n",
+        "    background.save(\"new.png\",\"PNG\")\n",
+        "\n",
+        "def step3(im): #  solve captcha\n",
+        "    im = Image.open(im) # open last saved img\n",
+        "    im = im.crop((5,5,115,35)) # crop it\n",
+        "    # conver image to extractable form elements (deffer captcha styles)\n",
+        "    rgb = ImageCms.createProfile(colorSpace='sRGB')\n",
+        "    lab = ImageCms.createProfile(colorSpace='LAB')\n",
+        "    transform = ImageCms.buildTransform(inputProfile=rgb, outputProfile=lab, inMode='RGB', outMode='LAB')\n",
+        "    lab_im = ImageCms.applyTransform(im=im, transform=transform)\n",
+        "    l, a, b = lab_im.split()\n",
+        "    im=l # select an element which is most extractable\n",
+        "    im = im.filter(ImageFilter.MinFilter(3)) # filter it\n",
+        "    result = pytesseract.image_to_string(im) # send it to ocr and save results to a variable\n",
+        "    l=[]\n",
+        "    l.append(result.strip())\n",
+        "    if l[0]==\" \" or l[0]==\"\" : # if result will be empty then it will do above steps again untill it gets the result\n",
+        "        step1()\n",
+        "        step2()\n",
+        "        l[0]=step3(\"new.png\")\n",
+        "    return l[0] # return final result (maybe right or wrong)\n",
+        "\n",
+        "def step4(enroll,ans): # return data\n",
+        "    # site automation \n",
+        "    sel = Select (driver.find_element(By.ID,\"ddlbatch\")) # focus on select element\n",
+        "    sel.select_by_value(exam) # select element by giving id (specific for a exam)\n",
+        "    enr = driver.find_element(By.ID,\"txtenroll\") # get enrollment no. text box\n",
+        "    captex = driver.find_element(By.ID,\"CodeNumberTextBox\") # get captcha text box\n",
+        "    enr.send_keys(enroll) # send (type) given enrollment number to text box\n",
+        "    captex.send_keys(ans) # send (type) extracted captcha text to text box\n",
+        "    captex.send_keys(Keys.RETURN) # return (ENTER)\n",
+        "\n",
+        "    ere = driver.find_element(By.ID,\"lblmsg\").text\n",
+        "    if ere == \"ERROR: Incorrect captcha code, try again.\" : \n",
+        "        return \"err\"\n",
+        "    if ere == \"Your request count is reached to maximum limit, Please try again later.\" : \n",
+        "        return \"reqover\"\n",
+        "    if ere == \"Oppssss! Data not available.\" : \n",
+        "        return \"nodata\"\n",
+        "\n",
+        "    name = driver.find_element(By.ID,\"lblName\").text\n",
+        "    sess = driver.find_element(By.ID,\"lblSession\").text\n",
+        "    dd = driver.find_element(By.ID,\"lblDeclaredOn\").text\n",
+        "    bra = driver.find_element(By.ID,\"lblBranchName\").text\n",
+        "    cs = driver.find_element(By.ID,\"lblExamName\").text\n",
+        "    csb = driver.find_element(By.ID,\"lblCUPBack\").text\n",
+        "    tb = driver.find_element(By.ID,\"lblTotalBack\").text\n",
+        "    spi = driver.find_element(By.ID,\"lblSPI\").text\n",
+        "    cpi = driver.find_element(By.ID,\"lblCPI\").text\n",
+        "    cgpa = driver.find_element(By.ID,\"lblCGPA\").text\n",
+        "    cp = driver.find_element(By.ID,\"pt100Curr\").text\n",
+        "    cup = driver.find_element(By.ID,\"pt100Cuml\").text\n",
+        "    return [enroll,name,sess,cs,dd,bra,int(csb),int(tb),float(spi),float(cpi),float(cgpa),int(cp),int(cup),ere]\n",
+        "\n",
+        "def loop():\n",
+        "    # just a loop through different enrollment numbers\n",
+        "    global counter\n",
+        "    mynewlist = []\n",
+        "    for i in mylist :\n",
+        "        enroll = \"{}\".format(i)\n",
+        "        step1()\n",
+        "        step2()\n",
+        "        ans=step3(\"new.png\")\n",
+        "        nr=step4(enroll,ans)\n",
+        "        if nr == \"err\" :\n",
+        "            mynewlist.append(enroll)\n",
+        "        elif nr == \"reqover\" :\n",
+        "            print(\"Change the SERVER!\")\n",
+        "            break\n",
+        "        elif nr == \"nodata\" :\n",
+        "            df.loc[len(df)] = [enroll,\"nodata\",\"-\",\"-\",\"-\",\"-\",\"-\",\"-\",\"-\",\"-\",\"-\",\"-\",\"-\",\"-\"]\n",
+        "            counter += 1\n",
+        "            print(f\"{counter}/{tc} {int(counter*100/tc)}%\")\n",
+        "        else :\n",
+        "            df.loc[len(df)] = nr\n",
+        "            counter += 1\n",
+        "            print(f\"{counter}/{tc} {int(counter*100/tc)}%\")\n",
+        "    return mynewlist"
+      ],
+      "metadata": {
+        "id": "t9H8OdFrRLBY"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "main function"
+      ],
+      "metadata": {
+        "id": "5QIu2udZgKKq"
+      }
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "E_BfboOxLMJd"
+      },
+      "outputs": [],
+      "source": [
+        "try:\n",
+        "\t# initiate webdriver and configure options\n",
+        "\tchrome_options = webdriver.ChromeOptions()\n",
+        "\tchrome_options.add_argument('--headless')\n",
+        "\tchrome_options.add_argument('--no-sandbox')\n",
+        "\tchrome_options.add_argument('--disable-dev-shm-usage')\n",
+        "\tchrome_options.add_argument(\"--incognito\")\n",
+        "\n",
+        "\tser = Service(\"chromedriver\")\n",
+        "\tdriver = webdriver.Chrome(service=ser,options=chrome_options)\n",
+        "\n",
+        "\t# define url and filename for download captcha_temp\n",
+        "\tURL = \"https://www.gturesults.in/\"\n",
+        "\texam = \"3361$S2022$2022-08-25$current$0\"\n",
+        "\tpath=\"cap.jpg\"\n",
+        " \n",
+        "\t# create empty dataframe for filling output data with same labels that input file has\n",
+        "\tdf = pd.read_json('{\"Enrollment No.\":{},\"Name\":{},\"Session\":{},\"Exam\":{},\"Declared On\":{},\"Branch\":{},\"Current Sem. Backlog\":{},\"Total Backlog\":{},\"SPI\":{},\"CPI\":{},\"CGPA\":{},\"Current Points\":{},\"Cumulative points\":{},\"Message\":{}}')\n",
+        "\n",
+        "\tmylist = range(190280111001,190280111010+1) # give range of enrollment no. (here i given our batch's range)\n",
+        "\t\n",
+        "\tcounter = 0\n",
+        "\ttc = len(mylist)\n",
+        " \n",
+        "\t# main driver programm\n",
+        "\t# loop runs untill all data has scraped if any server error not happens\n",
+        "\twhile 1:\n",
+        "\t\tmynewlist=loop()\n",
+        "\t\tif len(mynewlist) != 0:\n",
+        "\t\t\tmylist = mynewlist\n",
+        "\t\telse:\n",
+        "\t\t\tbreak\n",
+        "\n",
+        "\t# save dataframe to excel file\n",
+        "\tdf.to_excel(\"out.xlsx\")\n",
+        "\n",
+        "finally:\n",
+        "\tdriver.close() # close the window\n",
+        "\tdriver.quit() # stop the driver\n",
+        "\t# remove unnecessary files\n",
+        "\timport os\n",
+        "\tos.remove(\"cap.jpg\")\n",
+        "\tos.remove(\"old.png\")\n",
+        "\tos.remove(\"new.png\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "👈 download <font color='yellow'>out.xlsx</font> from left side bar by double clicking it"
+      ],
+      "metadata": {
+        "id": "qgkULANw3ZQ2"
+      }
+    }
+  ]
+}