Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
AndrewKorzh committed Aug 15, 2024
2 parents 957708b + 74b59a8 commit f8d54c7
Show file tree
Hide file tree
Showing 13 changed files with 301 additions and 123 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ Here is the list of available actions:
- `Scroll(selector, wait_options)` - scroll page
- `Screenshot(options)` - take screenshot
- `Har()` - to get the HAR file, pass the `har_recording=True` argument to `PuppeteerRequest` at the start of execution.
- `FillForm(input_mapping, submit_button)` - fill out and submit forms on the page
- `RecaptchaSolver(solve_recaptcha)` - find or solve recaptcha on page
- `CustomJsAction(js_function)` - evaluate JS function on page

Expand Down Expand Up @@ -174,4 +175,4 @@ In this case RecaptchaMiddleware will just skip the request.
- [ ] headers and cookies management
- [ ] proxy support for puppeteer
- [x] error handling for requests
- [ ] har support
- [x] har support
2 changes: 2 additions & 0 deletions examples/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,5 @@
}

# Service URL used by the examples; here it points at localhost, port 3000.
PUPPETEER_SERVICE_URL = "http://localhost:3000"

# NOTE(review): presumably switches between a locally run browser and the
# remote service above — confirm against the code that reads this setting.
PUPPETEER_LOCAL = False
38 changes: 38 additions & 0 deletions examples/spiders/fill_form.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import scrapy
from scrapypuppeteer import PuppeteerRequest, PuppeteerScreenshotResponse
from scrapypuppeteer.actions import Screenshot, FillForm
import base64


class FormActionSpider(scrapy.Spider):
    """Example spider: fill a few inputs of a test form, then screenshot the page.

    The flow is a chain of callbacks: ``form_action`` sends a ``FillForm``
    action, ``screenshot`` sends a ``Screenshot`` action and
    ``make_screenshot`` writes the returned image to ``screenshot.png``.
    """

    name = "fill_form"
    start_urls = ["https://www.roboform.com/filling-test-all-fields"]

    def start_requests(self):
        """Yield a ``PuppeteerRequest`` (``close_page=False``) per start URL."""
        for url in self.start_urls:
            yield PuppeteerRequest(url, callback=self.form_action, close_page=False)

    def form_action(self, response):
        """Follow the response with a ``FillForm`` action for three inputs."""
        # CSS selector of the input -> text to type and per-keystroke delay.
        input_mapping = {
            'input[name="02frstname"]': {"value": "SomeName", "delay": 50},
            'input[name="05_company"]': {"value": "SomeCompany", "delay": 100},
            'input[name="06position"]': {"value": "SomePosition", "delay": 100},
        }

        yield response.follow(
            FillForm(input_mapping), close_page=False, callback=self.screenshot
        )

    def screenshot(self, response):
        """Follow the response with a full-page ``Screenshot`` action."""
        action = Screenshot(
            options={
                "fullPage": True,
            }
        )
        yield response.follow(action, callback=self.make_screenshot, close_page=False)

    @staticmethod
    def make_screenshot(response: PuppeteerScreenshotResponse, **kwargs):
        """Base64-decode ``response.screenshot`` and write it to ``screenshot.png``."""
        data = response.screenshot
        with open("screenshot.png", "wb") as fh:
            fh.write(base64.b64decode(data))
29 changes: 29 additions & 0 deletions examples/spiders/har.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import scrapy
from scrapypuppeteer import PuppeteerRequest
from scrapypuppeteer.actions import Har


def write_to_file(file_path, content):
    """Append ``content`` to the UTF-8 text file at ``file_path``.

    The file is opened in append mode, so existing data is kept and new
    text is added at the end.
    """
    out = open(file_path, mode="a", encoding="utf-8")
    with out:
        out.write(content)


class HarSpider(scrapy.Spider):
    """Example spider that requests a page with HAR recording and saves the HAR."""

    name = "har"
    start_urls = ["https://github.com/pyppeteer/pyppeteer"]

    def start_requests(self):
        """Yield a ``PuppeteerRequest`` with ``har_recording=True`` per start URL."""
        yield from (
            PuppeteerRequest(
                target, callback=self.har, close_page=False, har_recording=True
            )
            for target in self.start_urls
        )

    def har(self, response):
        """Follow the response with a ``Har`` action handled by ``save_har``."""
        har_request = response.follow(
            Har(),
            close_page=False,
            callback=self.save_har,
        )
        yield har_request

    def save_har(self, response):
        """Append ``response.har`` to the file ``result.har``."""
        write_to_file("result.har", response.har)
48 changes: 45 additions & 3 deletions scrapypuppeteer/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,11 @@ class GoTo(PuppeteerServiceAction):
endpoint = "goto"

def __init__(
self, url: str, navigation_options: dict = None, wait_options: dict = None, har_recording: bool = False
self,
url: str,
navigation_options: dict = None,
wait_options: dict = None,
har_recording: bool = False,
):
self.url = url
self.navigation_options = navigation_options
Expand Down Expand Up @@ -223,15 +227,53 @@ def __init__(self, options: dict = None, **kwargs):

def payload(self):
return {"options": self.options}


class Har(PuppeteerServiceAction):
    """
    Retrieve the HTTP Archive (HAR) file captured for the session.

    The HAR contains detailed information about the network requests and
    responses made during the session. The action takes no arguments and
    sends an empty payload. To have a HAR generated, pass
    ``har_recording=True`` to ``PuppeteerRequest`` when initiating the
    request.
    """

    endpoint = "har"

    def payload(self):
        """Return an empty payload: the action has no parameters."""
        no_parameters = {}
        return no_parameters


class FillForm(PuppeteerServiceAction):
    """
    Fill out and, optionally, submit a form on a webpage.

    Parameters:

    * ``input_mapping`` (dict): maps the CSS selector of each input element
      (the key) to a dictionary describing what to type into it:

        * ``value`` (str): the text to be inputted into the element.
        * ``delay`` (int, optional): delay in milliseconds between each
          keystroke when inputting the text. Defaults to 0 if not provided.

    * ``submit_button`` (str, optional): the CSS selector for the form's
      submit button. If provided, the button will be clicked after the form
      has been filled in.
    """

    endpoint = "fill_form"

    def __init__(self, input_mapping: dict, submit_button: str = None):
        self.submit_button = submit_button
        self.input_mapping = input_mapping

    def payload(self):
        """Return the action parameters under their camelCase payload keys."""
        body = {}
        body["inputMapping"] = self.input_mapping
        body["submitButton"] = self.submit_button
        return body


class RecaptchaSolver(PuppeteerServiceAction):
"""
Expand Down
5 changes: 3 additions & 2 deletions scrapypuppeteer/browser_managers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,16 @@

from abc import ABC, abstractmethod


class BrowserManager(ABC):
    """Abstract interface that every concrete browser manager must implement."""

    @abstractmethod
    def process_request(self, request, spider):
        """Process ``request`` for ``spider``; subclasses supply the logic."""

    @abstractmethod
    def close_used_contexts(self):
        """Close used contexts; subclasses supply the logic."""

    @abstractmethod
    def process_response(self, middleware, request, response, spider):
        """Process ``response``; subclasses supply the logic."""
Loading

0 comments on commit f8d54c7

Please sign in to comment.