Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Solve amazon captcha with 2captcha #662

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ gunicorn = "23.0.0"
flask-api = {editable = true, ref = "develop", git = "git+https://github.com/flask-api/flask-api.git"}
setuptools = "==75.6.0"
certifi = "==2024.12.14"
2captcha-python = "*"

[dev-packages]
exceptiongroup = "*"
Expand Down
19 changes: 14 additions & 5 deletions Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

71 changes: 3 additions & 68 deletions flathunter/abstract_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import re
from time import sleep
from typing import Optional, Any
import json

import backoff
import requests
Expand All @@ -13,10 +12,11 @@
from bs4 import BeautifulSoup

from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver import Chrome
from selenium.webdriver import Chrome, Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains

from flathunter import proxies
from flathunter.captcha.captcha_solver import CaptchaUnsolvableError
Expand Down Expand Up @@ -201,72 +201,7 @@ def resolve_geetest(self, driver):
max_tries=3)
def resolve_awsawf(self, driver):
"""Resolve AWS WAF Captcha"""

# Intercept background network traffic via log sniffing
sleep(2)
logs = [json.loads(lr["message"])["message"] for lr in driver.get_log("performance")]

def log_filter(log_):
return (
# is an actual response
log_["method"] == "Network.responseReceived"
# and json
and "json" in log_["params"]["response"]["mimeType"]
)

context = None
iv = None
for log in filter(log_filter, logs):
request_id = log["params"]["requestId"]
resp_url = log["params"]["response"]["url"]
if "problem" in resp_url and "awswaf" in resp_url:
response = driver.execute_cdp_cmd(
"Network.getResponseBody", {"requestId": request_id}
)
response_json = json.loads(response["body"])
iv = response_json["state"]["iv"]
context = response_json["state"]["payload"]
sitekey = response_json["key"]
if context is None or iv is None:
raise CaptchaUnsolvableError("Unable to find captcha data in logs")

sitekey = re.findall(
r"apiKey: \"(.*?)\"", driver.page_source)[0]

challenge = None
challenge_matches = re.findall(r'src="([^"]*challenge\.js)"', driver.page_source)
for match in challenge_matches:
logger.debug('Challenge SRC Value: %s', match)
challenge = match

jsapi = None
jsapi_matches = re.findall(r'src="([^"]*jsapi\.js)"', driver.page_source)
for match in jsapi_matches:
logger.debug('JsApi SRC Value: %s', match)
jsapi = match

if challenge is None or jsapi is None:
raise CaptchaUnsolvableError("Unable to find challenge or JSApi value in page source")

try:
captcha = self.captcha_solver.solve_awswaf(
sitekey,
iv,
context,
challenge,
jsapi,
driver.current_url
)
old_cookie = driver.get_cookie('aws-waf-token')
new_cookie = old_cookie
new_cookie['value'] = captcha.token
driver.delete_cookie('aws-waf-token')
driver.add_cookie(new_cookie)
sleep(1)
driver.refresh()
except CaptchaUnsolvableError:
driver.refresh()
raise
self.captcha_solver.resolve_awswaf(driver)

@backoff.on_exception(wait_gen=backoff.constant,
exception=CaptchaUnsolvableError,
Expand Down
62 changes: 57 additions & 5 deletions flathunter/captcha/capmonster_solver.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Captcha solver for CapMonster Captcha Solving Service (https://capmonster.cloud)"""
import json
import re
from typing import Dict
from time import sleep
import backoff
Expand All @@ -8,12 +9,11 @@
from flathunter.logging import logger
from flathunter.captcha.captcha_solver import (
CaptchaSolver,
CaptchaBalanceEmpty,
CaptchaUnsolvableError,
GeetestResponse,
AwsAwfResponse,
RecaptchaResponse,
)
from flathunter.captcha.captcha_solver import CaptchaUnsolvableError

class CapmonsterSolver(CaptchaSolver):
"""Implementation of Captcha solver for CapMonster"""
Expand All @@ -26,12 +26,64 @@ def solve_recaptcha(self, google_site_key: str, page_url: str) -> RecaptchaRespo
"""Should be implemented in subclass"""
raise NotImplementedError("Recaptcha captcha solving is not implemented for Capmonster")

# pylint: disable=too-many-locals
def resolve_awswaf(self, driver):
    """Solve the AWS WAF captcha shown in `driver` and install the token.

    The site key and the ``jsapi.js`` script URL are extracted from the
    page, passed to ``self.solve_awswaf`` together with the current URL,
    and the returned token is written into the ``aws-waf-token`` cookie
    before the page is refreshed.

    Raises:
        CaptchaUnsolvableError: if no site key or JSApi script can be found,
            or if ``self.solve_awswaf`` raises it (the page is refreshed
            before the error is re-raised in that case).
    """
    # Intercept background network traffic via log sniffing
    sleep(2)
    logs = [json.loads(lr["message"])["message"] for lr in driver.get_log("performance")]

    def log_filter(log_):
        return (
            # is an actual response
            log_["method"] == "Network.responseReceived"
            # and json
            and "json" in log_["params"]["response"]["mimeType"]
        )

    # Site key reported by the captcha "problem" request; only used as a
    # fallback when the page source does not contain an apiKey.
    sniffed_sitekey = None
    for log in filter(log_filter, logs):
        request_id = log["params"]["requestId"]
        resp_url = log["params"]["response"]["url"]
        if "problem" in resp_url and "awswaf" in resp_url:
            response = driver.execute_cdp_cmd(
                "Network.getResponseBody", {"requestId": request_id}
            )
            response_json = json.loads(response["body"])
            sniffed_sitekey = response_json.get("key", sniffed_sitekey)

    page_source = driver.page_source

    # Prefer the apiKey embedded in the page; indexing an empty match list
    # would raise IndexError, which the retry logic does not handle.
    sitekey_matches = re.findall(r"apiKey: \"(.*?)\"", page_source)
    sitekey = sitekey_matches[0] if sitekey_matches else sniffed_sitekey
    if sitekey is None:
        raise CaptchaUnsolvableError("Unable to find captcha site key in page source or logs")

    # When several jsapi.js scripts are present the last one wins.
    jsapi = None
    for match in re.findall(r'src="([^"]*jsapi\.js)"', page_source):
        logger.debug('JsApi SRC Value: %s', match)
        jsapi = match

    if jsapi is None:
        raise CaptchaUnsolvableError("Unable to find JSApi value in page source")

    try:
        captcha = self.solve_awswaf(
            sitekey,
            jsapi,
            driver.current_url
        )
        # get_cookie() returns None when the cookie is not set yet; start
        # from a minimal cookie in that case instead of failing on None.
        new_cookie = driver.get_cookie('aws-waf-token') or {'name': 'aws-waf-token'}
        new_cookie['value'] = captcha.token
        driver.delete_cookie('aws-waf-token')
        driver.add_cookie(new_cookie)
        sleep(1)
        driver.refresh()
    except CaptchaUnsolvableError:
        driver.refresh()
        raise

# pylint: disable=too-many-arguments
# pylint: disable=too-many-positional-arguments
def solve_awswaf(
self,
sitekey: str,
iv: str,
context: str,
challenge_script: str,
captcha_script: str,
page_url: str
) -> AwsAwfResponse:
Expand Down
11 changes: 1 addition & 10 deletions flathunter/captcha/captcha_solver.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ class AwsAwfResponse:
"""Response from AWS WAF"""
token: str


class CaptchaSolver:
"""Interface for Captcha solvers"""

Expand All @@ -39,15 +38,7 @@ def solve_geetest(self, geetest: str, challenge: str, page_url: str) -> GeetestR
"""Should be implemented in subclass"""
raise NotImplementedError()

def solve_awswaf(
self,
sitekey: str,
iv: str,
context: str,
challenge_script: str,
captcha_script: str,
page_url: str
) -> AwsAwfResponse:
def resolve_awswaf(self, driver):
"""Should be implemented in subclass"""
raise NotImplementedError()

Expand Down
2 changes: 2 additions & 0 deletions flathunter/captcha/imagetyperz_solver.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ def solve_recaptcha(self, google_site_key: str, page_url: str) -> RecaptchaRespo
)
return RecaptchaResponse(self.__retrieve_imagetyperz_result(captcha_id))

# pylint: disable=too-many-arguments
# pylint: disable=too-many-positional-arguments
def solve_awswaf(
self,
sitekey: str,
Expand Down
77 changes: 69 additions & 8 deletions flathunter/captcha/twocaptcha_solver.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,18 @@
"""Captcha solver for 2Captcha Captcha Solving Service (https://2captcha.com)"""
import base64
import json
from io import BytesIO
from typing import Dict
from time import sleep

import backoff
import requests
from twocaptcha import TwoCaptcha

from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains

from flathunter.logging import logger
from flathunter.captcha.captcha_solver import (
Expand Down Expand Up @@ -47,17 +56,69 @@ def solve_recaptcha(self, google_site_key: str, page_url: str) -> RecaptchaRespo
captcha_id = self.__submit_2captcha_request(params)
return RecaptchaResponse(self.__retrieve_2captcha_result(captcha_id))

def resolve_awswaf(self, driver):
    """Solve the Amazon (AWS WAF) captcha by clicking the points 2captcha returns.

    A screenshot of the captcha widget is sent to ``self.solve_awswaf``; the
    coordinates in the returned token are clicked, followed by a point near
    the widget's bottom-right corner and the verify button if it is present.

    Raises:
        CaptchaUnsolvableError: if any step fails or the ``awswaf-captcha``
            element is still on the page afterwards; the page is refreshed
            before raising.
    """
    shadow_root_js = "return document.querySelector('awswaf-captcha').shadowRoot"
    try:
        # Scroll to the bottom of the page and give the widget time to render.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(3)
        captcha_root = driver.execute_script(shadow_root_js).find_element(By.ID, "root")
        size = captcha_root.size

        # Open the drop-down inside the widget and move down one entry.
        dropdown = captcha_root.find_element(By.TAG_NAME, "select")
        dropdown.click()
        sleep(1)
        dropdown.send_keys(Keys.DOWN)
        sleep(3)

        # Look the root element up again before taking the screenshot.
        captcha_root = driver.execute_script(shadow_root_js).find_element(By.ID, "root")
        encoded_png = base64.b64encode(captcha_root.screenshot_as_png).decode('utf-8')

        # Send the image to the 2captcha service
        result = self.solve_awswaf(encoded_png)
        logger.info(result.token)

        # The text after the first ':' is a ';'-separated list of points,
        # each point a ','-separated list of "name=value" items.
        click_points = []
        for point_text in result.token.split(':')[1].split(';'):
            click_points.append(
                [int(item.split('=')[1]) for item in point_text.split(',')]
            )
        # Finish with a click 30px in from the widget's bottom-right corner.
        click_points.append([size['width'] - 30, size['height'] - 30])

        actions = ActionChains(driver)
        for point in click_points:
            # NOTE(review): 160/211 presumably shift screenshot coordinates to
            # the element-centre origin used by the offset move - confirm.
            actions.move_to_element_with_offset(
                captcha_root, point[0] - 160, point[1] - 211
            ).click()
            actions.perform()
            sleep(0.5)
            actions.reset_actions()
        sleep(1)

        try:
            confirm_button = captcha_root.find_element(By.ID, "amzn-btn-verify-internal")
            actions.move_to_element_with_offset(confirm_button, 40, 15).click()
            actions.perform()
            sleep(4)
        except NoSuchElementException:
            # No verify button to press; go straight to the final check.
            pass

        # The captcha counts as solved once its element has left the page.
        try:
            driver.find_element(By.TAG_NAME, "awswaf-captcha")
        except NoSuchElementException:
            logger.info("Captcha solved")
            return
        raise CaptchaUnsolvableError()
    except Exception as ex:
        driver.refresh()
        raise CaptchaUnsolvableError() from ex

def solve_awswaf(
self,
sitekey: str,
iv: str,
context: str,
challenge_script: str,
captcha_script: str,
page_url: str
image_b64: str
) -> AwsAwfResponse:
"""Should be implemented at some point"""
raise NotImplementedError("AWS WAF captchas not supported for 2Captcha")
"""Solve AWS WAF by processing an image"""
solver = TwoCaptcha(self.api_key, defaultTimeout=60, pollingInterval=5)
result = solver.coordinates(image_b64, lang='en')
if result is None:
raise CaptchaUnsolvableError("Got None from 2captcha solve")
return AwsAwfResponse(result["code"])

@backoff.on_exception(**CaptchaSolver.backoff_options)
def __submit_2captcha_request(self, params: Dict[str, str]) -> str:
Expand Down
5 changes: 3 additions & 2 deletions flathunter/crawler/immobilienscout.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
from flathunter.abstract_crawler import Crawler
from flathunter.logging import logger
from flathunter.chrome_wrapper import get_chrome_driver
from flathunter.captcha.twocaptcha_solver import TwoCaptchaSolver
from flathunter.exceptions import DriverLoadException

STATIC_URL_PATTERN = re.compile(r'https://www\.immobilienscout24\.de')
Expand All @@ -35,7 +34,7 @@ class Immobilienscout(Crawler):

URL_PATTERN = STATIC_URL_PATTERN

JSON_PATH_PARSER_ENTRIES = parse("$..['resultlist.realEstate']")
JSON_PATH_PARSER_ENTRIES = parse("$..['resultlistEntries']..['resultlist.realEstate']")
JSON_PATH_PARSER_IMAGES = parse("$..galleryAttachments"
"..attachment[?'@xsi.type'=='common:Picture']"
"..['@href'].`sub(/(.*\\\\.jpe?g).*/, \\\\1)`")
Expand Down Expand Up @@ -117,6 +116,8 @@ def get_results(self, search_url, max_pages=None):

def get_entries_from_javascript(self):
"""Get entries from JavaScript"""
if "Warum haben wir deine Anfrage blockiert?" in self.get_driver_force().page_source:
self.resolve_awsawf(self.get_driver_force())
try:
result_json = self.get_driver_force().execute_script('return window.IS24.resultList;')
except JavascriptException:
Expand Down
2 changes: 2 additions & 0 deletions flathunter/gmaps_duration_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ def process_expose(self, expose):

def get_formatted_durations(self, address):
"""Return a formatted list of GoogleMaps durations"""
if address is None:
return ""
out = ""
for duration in self.config.get('durations', []):
if 'destination' in duration and 'name' in duration:
Expand Down
Loading
Loading