Skip to content

Commit

Permalink
사진 크롤링 for YOLO
Browse files Browse the repository at this point in the history
  • Loading branch information
so02e committed May 9, 2021
1 parent a7e179c commit 25b7c08
Show file tree
Hide file tree
Showing 9 changed files with 737 additions and 2 deletions.
48 changes: 47 additions & 1 deletion alphacar.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -97,6 +97,52 @@
" \n",
"driver.close()"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from selenium import webdriver\n",
"from selenium.common.exceptions import NoSuchWindowException\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.common.keys import Keys\n",
"import time\n",
"import urllib.request\n",
"\n",
"# Naver image crawl: search, scroll until the page stops growing, then save\n",
"# each image as <number>.jpg in the working directory.\n",
"CHROMEDRIVER_PATH = 'C:/Temp/chromedriver'\n",
"SEARCH_QUERY = \"알파카 전동킥보드\"\n",
"START_INDEX = 1186  # number used for the first saved file\n",
"SCROLL_PAUSE_TIME = 1  # seconds to wait after each scroll\n",
"CLICK_PAUSE_TIME = 2  # seconds to wait after clicking a thumbnail\n",
"THUMBNAIL_SELECTOR = \"._image._listImage\"\n",
"# <img> whose src is downloaded after a thumbnail has been clicked.\n",
"IMAGE_XPATH = '//*[@id=\"main_pack\"]/section/div[2]/div[2]/div/div[1]/div[1]/div[1]/div/div/div[1]/div[1]/img'\n",
"\n",
"driver = webdriver.Chrome(CHROMEDRIVER_PATH)\n",
"try:\n",
"    driver.get(\"https://search.naver.com/search.naver?where=image&sm=tab_jum&query=\")\n",
"    elem = driver.find_element(By.NAME, \"query\")\n",
"    elem.send_keys(SEARCH_QUERY)\n",
"    elem.send_keys(Keys.RETURN)\n",
"\n",
"    # Keep scrolling to the bottom until the page height stops changing.\n",
"    last_height = driver.execute_script(\"return document.body.scrollHeight\")\n",
"    while True:\n",
"        driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);\")\n",
"        time.sleep(SCROLL_PAUSE_TIME)\n",
"        new_height = driver.execute_script(\"return document.body.scrollHeight\")\n",
"        if new_height == last_height:\n",
"            break\n",
"        last_height = new_height\n",
"\n",
"    images = driver.find_elements(By.CSS_SELECTOR, THUMBNAIL_SELECTOR)\n",
"    count = START_INDEX\n",
"    for image in images:\n",
"        try:\n",
"            image.click()\n",
"            time.sleep(CLICK_PAUSE_TIME)\n",
"            img_url = driver.find_element(By.XPATH, IMAGE_XPATH).get_attribute(\"src\")\n",
"            urllib.request.urlretrieve(img_url, str(count) + \".jpg\")\n",
"            count = count + 1\n",
"        except NoSuchWindowException:\n",
"            # The browser window is gone, so no later thumbnail can succeed.\n",
"            print(\"Browser window closed; stopping the download loop.\")\n",
"            break\n",
"        except Exception as exc:\n",
"            # Best-effort crawl: skip this thumbnail, but report why.\n",
"            print(\"Skipped a thumbnail:\", repr(exc))\n",
"finally:\n",
"    # Runs even if the crawl raised, so the browser is never left open.\n",
"    driver.quit()"
]
}
],
"metadata": {
Expand Down
49 changes: 49 additions & 0 deletions beam.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,55 @@
" \n",
"driver.close()"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from selenium import webdriver\n",
"from selenium.common.exceptions import NoSuchWindowException, WebDriverException\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.common.keys import Keys\n",
"import time\n",
"import urllib.request\n",
"\n",
"# Google image crawl: search, scroll until no more results load, then save\n",
"# each image as <number>.jpg in the working directory.\n",
"CHROMEDRIVER_PATH = 'C:/Temp/chromedriver'\n",
"SEARCH_QUERY = \"beam scooter\"\n",
"START_INDEX = 1107  # number used for the first saved file\n",
"SCROLL_PAUSE_TIME = 1  # seconds to wait after each scroll\n",
"CLICK_PAUSE_TIME = 2  # seconds to wait after clicking a thumbnail\n",
"THUMBNAIL_SELECTOR = \".rg_i.Q4LuWd\"\n",
"# Clicked when scrolling no longer grows the page (presumably the\n",
"# 'show more results' button -- verify against the live page).\n",
"MORE_BUTTON_SELECTOR = \".mye4qd\"\n",
"# <img> whose src is downloaded after a thumbnail has been clicked.\n",
"IMAGE_XPATH = '/html/body/div[2]/c-wiz/div[3]/div[2]/div[3]/div/div/div[3]/div[2]/c-wiz/div/div[1]/div[1]/div/div[2]/a/img'\n",
"\n",
"driver = webdriver.Chrome(CHROMEDRIVER_PATH)\n",
"try:\n",
"    driver.get(\"https://www.google.co.kr/imghp?hl=ko&ogbl\")\n",
"    elem = driver.find_element(By.NAME, \"q\")\n",
"    elem.send_keys(SEARCH_QUERY)\n",
"    elem.send_keys(Keys.RETURN)\n",
"\n",
"    # Keep scrolling to the bottom until the page height stops changing.\n",
"    last_height = driver.execute_script(\"return document.body.scrollHeight\")\n",
"    while True:\n",
"        driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);\")\n",
"        time.sleep(SCROLL_PAUSE_TIME)\n",
"        new_height = driver.execute_script(\"return document.body.scrollHeight\")\n",
"        if new_height == last_height:\n",
"            try:\n",
"                driver.find_element(By.CSS_SELECTOR, MORE_BUTTON_SELECTOR).click()\n",
"            except WebDriverException:\n",
"                # The element is missing or could not be clicked: stop scrolling.\n",
"                break\n",
"        last_height = new_height\n",
"\n",
"    images = driver.find_elements(By.CSS_SELECTOR, THUMBNAIL_SELECTOR)\n",
"    count = START_INDEX\n",
"    for image in images:\n",
"        try:\n",
"            image.click()\n",
"            time.sleep(CLICK_PAUSE_TIME)\n",
"            img_url = driver.find_element(By.XPATH, IMAGE_XPATH).get_attribute(\"src\")\n",
"            urllib.request.urlretrieve(img_url, str(count) + \".jpg\")\n",
"            count = count + 1\n",
"        except NoSuchWindowException:\n",
"            # The browser window is gone, so no later thumbnail can succeed.\n",
"            print(\"Browser window closed; stopping the download loop.\")\n",
"            break\n",
"        except Exception as exc:\n",
"            # Best-effort crawl: skip this thumbnail, but report why.\n",
"            print(\"Skipped a thumbnail:\", repr(exc))\n",
"finally:\n",
"    # Runs even if the crawl raised, so the browser is never left open.\n",
"    driver.quit()"
]
}
],
"metadata": {
Expand Down
194 changes: 194 additions & 0 deletions helmet.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 헬멧 300~400개 "
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from selenium import webdriver\n",
"from selenium.common.exceptions import NoSuchWindowException, WebDriverException\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.common.keys import Keys\n",
"import time\n",
"import urllib.request\n",
"\n",
"# Google image crawl: search, scroll until no more results load, then save\n",
"# each image as <number>.jpg in the working directory.\n",
"CHROMEDRIVER_PATH = 'C:/Temp/chromedriver'\n",
"SEARCH_QUERY = \"자전거 헬멧 사이트\"\n",
"START_INDEX = 1  # number used for the first saved file\n",
"SCROLL_PAUSE_TIME = 1  # seconds to wait after each scroll\n",
"CLICK_PAUSE_TIME = 2  # seconds to wait after clicking a thumbnail\n",
"THUMBNAIL_SELECTOR = \".rg_i.Q4LuWd\"\n",
"# Clicked when scrolling no longer grows the page (presumably the\n",
"# 'show more results' button -- verify against the live page).\n",
"MORE_BUTTON_SELECTOR = \".mye4qd\"\n",
"# <img> whose src is downloaded after a thumbnail has been clicked.\n",
"IMAGE_XPATH = '/html/body/div[2]/c-wiz/div[3]/div[2]/div[3]/div/div/div[3]/div[2]/c-wiz/div/div[1]/div[1]/div/div[2]/a/img'\n",
"\n",
"driver = webdriver.Chrome(CHROMEDRIVER_PATH)\n",
"try:\n",
"    driver.get(\"https://www.google.co.kr/imghp?hl=ko&ogbl\")\n",
"    elem = driver.find_element(By.NAME, \"q\")\n",
"    elem.send_keys(SEARCH_QUERY)\n",
"    elem.send_keys(Keys.RETURN)\n",
"\n",
"    # Keep scrolling to the bottom until the page height stops changing.\n",
"    last_height = driver.execute_script(\"return document.body.scrollHeight\")\n",
"    while True:\n",
"        driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);\")\n",
"        time.sleep(SCROLL_PAUSE_TIME)\n",
"        new_height = driver.execute_script(\"return document.body.scrollHeight\")\n",
"        if new_height == last_height:\n",
"            try:\n",
"                driver.find_element(By.CSS_SELECTOR, MORE_BUTTON_SELECTOR).click()\n",
"            except WebDriverException:\n",
"                # The element is missing or could not be clicked: stop scrolling.\n",
"                break\n",
"        last_height = new_height\n",
"\n",
"    images = driver.find_elements(By.CSS_SELECTOR, THUMBNAIL_SELECTOR)\n",
"    count = START_INDEX\n",
"    for image in images:\n",
"        try:\n",
"            image.click()\n",
"            time.sleep(CLICK_PAUSE_TIME)\n",
"            img_url = driver.find_element(By.XPATH, IMAGE_XPATH).get_attribute(\"src\")\n",
"            urllib.request.urlretrieve(img_url, str(count) + \".jpg\")\n",
"            count = count + 1\n",
"        except NoSuchWindowException:\n",
"            # The browser window is gone, so no later thumbnail can succeed.\n",
"            print(\"Browser window closed; stopping the download loop.\")\n",
"            break\n",
"        except Exception as exc:\n",
"            # Best-effort crawl: skip this thumbnail, but report why.\n",
"            print(\"Skipped a thumbnail:\", repr(exc))\n",
"finally:\n",
"    # Runs even if the crawl raised, so the browser is never left open.\n",
"    driver.quit()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from selenium import webdriver\n",
"from selenium.common.exceptions import NoSuchWindowException, WebDriverException\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.common.keys import Keys\n",
"import time\n",
"import urllib.request\n",
"\n",
"# Google image crawl: search, scroll until no more results load, then save\n",
"# each image as <number>.jpg in the working directory.\n",
"CHROMEDRIVER_PATH = 'C:/Temp/chromedriver'\n",
"SEARCH_QUERY = \"wearing bicycle helmet\"\n",
"START_INDEX = 156  # number used for the first saved file\n",
"SCROLL_PAUSE_TIME = 1  # seconds to wait after each scroll\n",
"CLICK_PAUSE_TIME = 2  # seconds to wait after clicking a thumbnail\n",
"THUMBNAIL_SELECTOR = \".rg_i.Q4LuWd\"\n",
"# Clicked when scrolling no longer grows the page (presumably the\n",
"# 'show more results' button -- verify against the live page).\n",
"MORE_BUTTON_SELECTOR = \".mye4qd\"\n",
"# <img> whose src is downloaded after a thumbnail has been clicked.\n",
"IMAGE_XPATH = '/html/body/div[2]/c-wiz/div[3]/div[2]/div[3]/div/div/div[3]/div[2]/c-wiz/div/div[1]/div[1]/div/div[2]/a/img'\n",
"\n",
"driver = webdriver.Chrome(CHROMEDRIVER_PATH)\n",
"try:\n",
"    driver.get(\"https://www.google.co.kr/imghp?hl=ko&ogbl\")\n",
"    elem = driver.find_element(By.NAME, \"q\")\n",
"    elem.send_keys(SEARCH_QUERY)\n",
"    elem.send_keys(Keys.RETURN)\n",
"\n",
"    # Keep scrolling to the bottom until the page height stops changing.\n",
"    last_height = driver.execute_script(\"return document.body.scrollHeight\")\n",
"    while True:\n",
"        driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);\")\n",
"        time.sleep(SCROLL_PAUSE_TIME)\n",
"        new_height = driver.execute_script(\"return document.body.scrollHeight\")\n",
"        if new_height == last_height:\n",
"            try:\n",
"                driver.find_element(By.CSS_SELECTOR, MORE_BUTTON_SELECTOR).click()\n",
"            except WebDriverException:\n",
"                # The element is missing or could not be clicked: stop scrolling.\n",
"                break\n",
"        last_height = new_height\n",
"\n",
"    images = driver.find_elements(By.CSS_SELECTOR, THUMBNAIL_SELECTOR)\n",
"    count = START_INDEX\n",
"    for image in images:\n",
"        try:\n",
"            image.click()\n",
"            time.sleep(CLICK_PAUSE_TIME)\n",
"            img_url = driver.find_element(By.XPATH, IMAGE_XPATH).get_attribute(\"src\")\n",
"            urllib.request.urlretrieve(img_url, str(count) + \".jpg\")\n",
"            count = count + 1\n",
"        except NoSuchWindowException:\n",
"            # The browser window is gone, so no later thumbnail can succeed.\n",
"            print(\"Browser window closed; stopping the download loop.\")\n",
"            break\n",
"        except Exception as exc:\n",
"            # Best-effort crawl: skip this thumbnail, but report why.\n",
"            print(\"Skipped a thumbnail:\", repr(exc))\n",
"finally:\n",
"    # Runs even if the crawl raised, so the browser is never left open.\n",
"    driver.quit()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from selenium import webdriver\n",
"from selenium.common.exceptions import NoSuchWindowException, WebDriverException\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.common.keys import Keys\n",
"import time\n",
"import urllib.request\n",
"\n",
"# Google image crawl: search, scroll until no more results load, then save\n",
"# each image as <number>.jpg in the working directory.\n",
"CHROMEDRIVER_PATH = 'C:/Temp/chromedriver'\n",
"SEARCH_QUERY = \"bike helmet\"\n",
"START_INDEX = 609  # number used for the first saved file\n",
"SCROLL_PAUSE_TIME = 1  # seconds to wait after each scroll\n",
"CLICK_PAUSE_TIME = 2  # seconds to wait after clicking a thumbnail\n",
"THUMBNAIL_SELECTOR = \".rg_i.Q4LuWd\"\n",
"# Clicked when scrolling no longer grows the page (presumably the\n",
"# 'show more results' button -- verify against the live page).\n",
"MORE_BUTTON_SELECTOR = \".mye4qd\"\n",
"# <img> whose src is downloaded after a thumbnail has been clicked.\n",
"IMAGE_XPATH = '/html/body/div[2]/c-wiz/div[3]/div[2]/div[3]/div/div/div[3]/div[2]/c-wiz/div/div[1]/div[1]/div/div[2]/a/img'\n",
"\n",
"driver = webdriver.Chrome(CHROMEDRIVER_PATH)\n",
"try:\n",
"    driver.get(\"https://www.google.co.kr/imghp?hl=ko&ogbl\")\n",
"    elem = driver.find_element(By.NAME, \"q\")\n",
"    elem.send_keys(SEARCH_QUERY)\n",
"    elem.send_keys(Keys.RETURN)\n",
"\n",
"    # Keep scrolling to the bottom until the page height stops changing.\n",
"    last_height = driver.execute_script(\"return document.body.scrollHeight\")\n",
"    while True:\n",
"        driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);\")\n",
"        time.sleep(SCROLL_PAUSE_TIME)\n",
"        new_height = driver.execute_script(\"return document.body.scrollHeight\")\n",
"        if new_height == last_height:\n",
"            try:\n",
"                driver.find_element(By.CSS_SELECTOR, MORE_BUTTON_SELECTOR).click()\n",
"            except WebDriverException:\n",
"                # The element is missing or could not be clicked: stop scrolling.\n",
"                break\n",
"        last_height = new_height\n",
"\n",
"    images = driver.find_elements(By.CSS_SELECTOR, THUMBNAIL_SELECTOR)\n",
"    count = START_INDEX\n",
"    for image in images:\n",
"        try:\n",
"            image.click()\n",
"            time.sleep(CLICK_PAUSE_TIME)\n",
"            img_url = driver.find_element(By.XPATH, IMAGE_XPATH).get_attribute(\"src\")\n",
"            urllib.request.urlretrieve(img_url, str(count) + \".jpg\")\n",
"            count = count + 1\n",
"        except NoSuchWindowException:\n",
"            # The browser window is gone, so no later thumbnail can succeed.\n",
"            print(\"Browser window closed; stopping the download loop.\")\n",
"            break\n",
"        except Exception as exc:\n",
"            # Best-effort crawl: skip this thumbnail, but report why.\n",
"            print(\"Skipped a thumbnail:\", repr(exc))\n",
"finally:\n",
"    # Runs even if the crawl raised, so the browser is never left open.\n",
"    driver.quit()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "pydatavenv",
"language": "python",
"name": "pydatavenv"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
63 changes: 63 additions & 0 deletions kickgoing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,69 @@
" \n",
"driver.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from selenium import webdriver\n",
"from selenium.common.exceptions import NoSuchWindowException, WebDriverException\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.common.keys import Keys\n",
"import time\n",
"import urllib.request\n",
"\n",
"# Google image crawl: search, scroll until no more results load, then save\n",
"# each image as <number>.jpg in the working directory.\n",
"CHROMEDRIVER_PATH = 'C:/Temp/chromedriver'\n",
"SEARCH_QUERY = \"kickgoing scooter\"\n",
"START_INDEX = 569  # number used for the first saved file\n",
"SCROLL_PAUSE_TIME = 1  # seconds to wait after each scroll\n",
"CLICK_PAUSE_TIME = 2  # seconds to wait after clicking a thumbnail\n",
"THUMBNAIL_SELECTOR = \".rg_i.Q4LuWd\"\n",
"# Clicked when scrolling no longer grows the page (presumably the\n",
"# 'show more results' button -- verify against the live page).\n",
"MORE_BUTTON_SELECTOR = \".mye4qd\"\n",
"# <img> whose src is downloaded after a thumbnail has been clicked.\n",
"IMAGE_XPATH = '/html/body/div[2]/c-wiz/div[3]/div[2]/div[3]/div/div/div[3]/div[2]/c-wiz/div/div[1]/div[1]/div/div[2]/a/img'\n",
"\n",
"driver = webdriver.Chrome(CHROMEDRIVER_PATH)\n",
"try:\n",
"    driver.get(\"https://www.google.co.kr/imghp?hl=ko&ogbl\")\n",
"    elem = driver.find_element(By.NAME, \"q\")\n",
"    elem.send_keys(SEARCH_QUERY)\n",
"    elem.send_keys(Keys.RETURN)\n",
"\n",
"    # Keep scrolling to the bottom until the page height stops changing.\n",
"    last_height = driver.execute_script(\"return document.body.scrollHeight\")\n",
"    while True:\n",
"        driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);\")\n",
"        time.sleep(SCROLL_PAUSE_TIME)\n",
"        new_height = driver.execute_script(\"return document.body.scrollHeight\")\n",
"        if new_height == last_height:\n",
"            try:\n",
"                driver.find_element(By.CSS_SELECTOR, MORE_BUTTON_SELECTOR).click()\n",
"            except WebDriverException:\n",
"                # The element is missing or could not be clicked: stop scrolling.\n",
"                break\n",
"        last_height = new_height\n",
"\n",
"    images = driver.find_elements(By.CSS_SELECTOR, THUMBNAIL_SELECTOR)\n",
"    count = START_INDEX\n",
"    for image in images:\n",
"        try:\n",
"            image.click()\n",
"            time.sleep(CLICK_PAUSE_TIME)\n",
"            img_url = driver.find_element(By.XPATH, IMAGE_XPATH).get_attribute(\"src\")\n",
"            urllib.request.urlretrieve(img_url, str(count) + \".jpg\")\n",
"            count = count + 1\n",
"        except NoSuchWindowException:\n",
"            # The browser window is gone, so no later thumbnail can succeed.\n",
"            print(\"Browser window closed; stopping the download loop.\")\n",
"            break\n",
"        except Exception as exc:\n",
"            # Best-effort crawl: skip this thumbnail, but report why.\n",
"            print(\"Skipped a thumbnail:\", repr(exc))\n",
"finally:\n",
"    # Runs even if the crawl raised, so the browser is never left open.\n",
"    driver.quit()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
Loading

0 comments on commit 25b7c08

Please sign in to comment.