import { CodeSurfer, CodeSurferColumns, Step } from "code-surfer";
import { github } from "@code-surfer/themes";

export const theme = { ...github, aspectRatio: 16 / 9 };
- Full Stack Developer @ Loadsmart
- blog.alessandrom.dev
Web scraping is the technique of collecting data that is available on the Internet.
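The idea fits in a few lines. A minimal sketch using requests plus parsel (the selector library Scrapy itself builds on) — both chosen here just for illustration:

```python
import requests
from parsel import Selector

# Download a page and pull structured data out of its HTML
html = requests.get("http://books.toscrape.com/").text
selector = Selector(text=html)

# Titles of the books listed on the first page
titles = selector.css("article.product_pod h3 a::attr(title)").getall()
print(titles[:3])
```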
- Batteries included 🔋
- Scrapy Cloud ☁️
- CLI
Oversight of public administration, as in Serenata de Amor; data collection for training AIs; market research; or automation of manual tasks.
Demo time!
<iframe src="http://books.toscrape.com/" width="100%" height="100%" /> Take a quick tour of the site, showing the `next` button and a book's page.
- Title
- Price
- Cover image
- UPC code
- Stock quantity
- Rating
- Category
import scrapy


class BooksSpider(scrapy.Spider):
    name = 'books'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        pass
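The deck doesn't show how the spider is executed. Inside a Scrapy project it would be `scrapy crawl books`; as a standalone sketch (assuming Scrapy 2.1+ for the FEEDS setting, and a hypothetical file layout) it can also run from a plain script:

```python
from scrapy.crawler import CrawlerProcess

# Run the spider from a plain Python script and export items to JSON
process = CrawlerProcess(settings={
    "FEEDS": {"books.json": {"format": "json"}},
})
process.crawl(BooksSpider)
process.start()  # blocks until the crawl finishes
```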
import scrapy


class BooksSpider(scrapy.Spider):
    name = 'books'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        for next_href in response.xpath("//a[.='next']/@href").getall():
            next_page_url = response.urljoin(next_href)
            yield scrapy.Request(next_page_url, callback=self.parse)
import scrapy


class BooksSpider(scrapy.Spider):
    name = 'books'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        for next_href in response.xpath("//a[.='next']/@href").getall():
            next_page_url = response.urljoin(next_href)
            yield scrapy.Request(next_page_url, callback=self.parse)

        for book in response.css("article.product_pod"):
            book_href = book.css("a::attr(href)").get()
            book_url = response.urljoin(book_href)
            yield scrapy.Request(book_url, callback=self.parse_books)
import scrapy


class BooksSpider(scrapy.Spider):
    name = 'books'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        for next_href in response.xpath("//a[.='next']/@href").getall():
            next_page_url = response.urljoin(next_href)
            yield scrapy.Request(next_page_url, callback=self.parse)

        for book in response.css("article.product_pod"):
            book_href = book.css("a::attr(href)").get()
            book_url = response.urljoin(book_href)
            yield scrapy.Request(book_url, callback=self.parse_books)

    def parse_books(self, book):
        title = book.css("h1::text").get()
        price = book.css(".price_color::text").get()
        cover = book.urljoin(book.css("img::attr(src)").get())
        upc = book.css("tr:first-of-type > td::text").get()
        in_stock = book.xpath("//*[starts-with(text(),'In stock')]/text()").get()
        rating_class = book.css(".star-rating").xpath("@class").get().lower()
        rating = rating_class.split()[-1]
        category = book.css(".breadcrumb > li:nth-child(3) > a::text").get().lower()
        yield {
            "title": title,
            "price": price,
            "cover": cover,
            "upc": upc,
            "in_stock": in_stock,
            "rating": rating,
            "category": category,
            "url": book.url,
        }
[
  {
    "title": "Emma",
    "price": "£32.93",
    "cover": "http://books.toscrape.com/media/cache/ae/98/ae98d08a6f427491dd8eda6b51af41fe.jpg",
    "upc": "2e69730561ed70ad",
    "in_stock": "In stock (1 available)",
    "rating": "two",
    "category": "classics",
    "url": "http://books.toscrape.com/catalogue/emma_17/index.html"
  },
]
class DemoPipeline:
    def process_item(self, item, spider):
        return item
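Pipelines only run once they are enabled in the project settings. A sketch, assuming the project module is called demo (hypothetical name):

```python
# settings.py — enable the pipeline; lower numbers (0-1000) run first
ITEM_PIPELINES = {
    "demo.pipelines.DemoPipeline": 300,
}
```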
class DemoPipeline:
    session = None

    def open_spider(self, spider):
        self.session = get_session()

    def close_spider(self, spider):
        try:
            self.session.commit()
        except Exception:
            self.session.rollback()
            raise
        finally:
            self.session.close()

    def process_item(self, item, spider):
        return item
class DemoPipeline:
    session = None

    def open_spider(self, spider):
        self.session = get_session()

    def close_spider(self, spider):
        try:
            self.session.commit()
        except Exception:
            self.session.rollback()
            raise
        finally:
            self.session.close()

    def process_item(self, item, spider):
        book = Book(**item)
        self.session.add(book)
        return item
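`get_session` and `Book` are not shown in the deck. A minimal sketch of what they could look like with SQLAlchemy 1.x (all names here are assumptions):

```python
from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()


class Book(Base):
    # Hypothetical model matching the fields yielded by the spider
    __tablename__ = "books"

    id = Column(Integer, primary_key=True)
    title = Column(String)
    price = Column(String)
    cover = Column(String)
    upc = Column(String)
    in_stock = Column(String)
    rating = Column(String)
    category = Column(String)
    url = Column(String)


def get_session():
    # Hypothetical helper: one SQLite session for the demo
    engine = create_engine("sqlite:///books.db")
    Base.metadata.create_all(engine)
    return sessionmaker(bind=engine)()
```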
Scrapy only makes plain HTTP requests; it has no engine to render JavaScript. That's a problem if you want to collect data from a site built with a JS framework (Vue, React, etc.), or if you want to interact with the page.
- Headless
- Lightweight
- Full-fledged
- No graphical interface
- Fewer features
- Supports JavaScript
- JavaScript renderer
- Lua scripts
import scrapy


class BooksSpider(scrapy.Spider):
    name = 'books'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        for next_href in response.xpath("//a[.='next']/@href").getall():
            next_page_url = response.urljoin(next_href)
            yield scrapy.Request(next_page_url, callback=self.parse)

        for book in response.css("article.product_pod"):
            book_href = book.css("a::attr(href)").get()
            book_url = response.urljoin(book_href)
            yield scrapy.Request(book_url, callback=self.parse_books)
import scrapy
from scrapy_splash import SplashRequest


class BooksSpider(scrapy.Spider):
    name = 'books'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        for next_href in response.xpath("//a[.='next']/@href").getall():
            next_page_url = response.urljoin(next_href)
            yield scrapy.Request(next_page_url, callback=self.parse)

        for book in response.css("article.product_pod"):
            book_href = book.css("a::attr(href)").get()
            book_url = response.urljoin(book_href)
            yield scrapy.Request(book_url, callback=self.parse_books)
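Before SplashRequest is useful, Splash itself has to run as a separate service (commonly the scrapinghub/splash Docker image listening on port 8050), and scrapy-splash has to be wired into the project settings. A sketch following the scrapy-splash README:

```python
# settings.py — scrapy-splash wiring, per the scrapy-splash README
SPLASH_URL = "http://localhost:8050"  # assumes a local Splash instance

DOWNLOADER_MIDDLEWARES = {
    "scrapy_splash.SplashCookiesMiddleware": 723,
    "scrapy_splash.SplashMiddleware": 725,
    "scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware": 810,
}

SPIDER_MIDDLEWARES = {
    "scrapy_splash.SplashDeduplicateArgsMiddleware": 100,
}

DUPEFILTER_CLASS = "scrapy_splash.SplashAwareDupeFilter"
```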
import scrapy
from scrapy_splash import SplashRequest


class BooksSpider(scrapy.Spider):
    name = 'books'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    splash_kwargs = {
        # render.json can return both the rendered HTML and a PNG screenshot
        "args": {"wait": 1, "html": 1, "png": 1},
        "endpoint": "render.json",
    }

    def parse(self, response):
        for next_href in response.xpath("//a[.='next']/@href").getall():
            next_page_url = response.urljoin(next_href)
            yield scrapy.Request(next_page_url, callback=self.parse)

        for book in response.css("article.product_pod"):
            book_href = book.css("a::attr(href)").get()
            book_url = response.urljoin(book_href)
            yield scrapy.Request(book_url, callback=self.parse_books)
import scrapy
from scrapy_splash import SplashRequest


class BooksSpider(scrapy.Spider):
    name = 'books'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    splash_kwargs = {
        # render.json can return both the rendered HTML and a PNG screenshot
        "args": {"wait": 1, "html": 1, "png": 1},
        "endpoint": "render.json",
    }

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url, self.parse, **self.splash_kwargs)

    def parse(self, response):
        for next_href in response.xpath("//a[.='next']/@href").getall():
            next_page_url = response.urljoin(next_href)
            yield scrapy.Request(next_page_url, callback=self.parse)

        for book in response.css("article.product_pod"):
            book_href = book.css("a::attr(href)").get()
            book_url = response.urljoin(book_href)
            yield scrapy.Request(book_url, callback=self.parse_books)
import scrapy
from scrapy_splash import SplashRequest


class BooksSpider(scrapy.Spider):
    name = 'books'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    splash_kwargs = {
        # render.json can return both the rendered HTML and a PNG screenshot
        "args": {"wait": 1, "html": 1, "png": 1},
        "endpoint": "render.json",
    }

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url, self.parse, **self.splash_kwargs)

    def parse(self, response):
        for next_href in response.xpath("//a[.='next']/@href").getall():
            next_page_url = response.urljoin(next_href)
            yield SplashRequest(next_page_url, self.parse, **self.splash_kwargs)

        for book in response.css("article.product_pod"):
            book_href = book.css("a::attr(href)").get()
            book_url = response.urljoin(book_href)
            yield SplashRequest(book_url, self.parse_books, **self.splash_kwargs)
import scrapy
from scrapy_splash import SplashRequest


class BooksSpider(scrapy.Spider):
    name = 'books'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    splash_kwargs = {
        # render.json can return both the rendered HTML and a PNG screenshot
        "args": {"wait": 1, "html": 1, "png": 1},
        "endpoint": "render.json",
    }

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url, self.parse, **self.splash_kwargs)

    def parse(self, response):
        for next_href in response.xpath("//a[.='next']/@href").getall():
            next_page_url = response.urljoin(next_href)
            yield SplashRequest(next_page_url, self.parse, **self.splash_kwargs)

        for book in response.css("article.product_pod"):
            book_href = book.css("a::attr(href)").get()
            book_url = response.urljoin(book_href)
            yield SplashRequest(book_url, self.parse_books, **self.splash_kwargs)
import base64

html = response.body  # rendered page HTML
png_bytes = base64.b64decode(response.data['png'])  # screenshot from render.json
# ...
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
User-agent: *
Crawl-delay: 4
User-agent: Googlebot
Disallow: /private
User-agent: annoying-bot
Disallow: /
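Scrapy enforces this automatically when ROBOTSTXT_OBEY is on, but the same check can be done by hand with the standard library (using the demo site's URL just for illustration):

```python
from urllib.robotparser import RobotFileParser

# Check what a given user agent may fetch, and its crawl delay
rp = RobotFileParser()
rp.set_url("http://books.toscrape.com/robots.txt")
rp.read()

print(rp.can_fetch("Meu-Bot", "http://books.toscrape.com/catalogue/page-2.html"))
print(rp.crawl_delay("Meu-Bot"))  # None if the site sets no Crawl-delay
```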
# Crawl responsibly by identifying yourself
# (and your website) on the user-agent
USER_AGENT = "Meu-Bot ([email protected])"
# Use a random delay between 0.5 * DOWNLOAD_DELAY
# and 1.5 * DOWNLOAD_DELAY, in seconds
DOWNLOAD_DELAY = 3.0

# Throttle the crawler's speed based on the load
# of both the crawler and the website being crawled
AUTOTHROTTLE_ENABLED = True
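AutoThrottle can be tuned further. A few of its related settings from the Scrapy docs (the values shown are the documented defaults):

```python
# Initial and maximum delays AutoThrottle may apply, in seconds
AUTOTHROTTLE_START_DELAY = 5.0
AUTOTHROTTLE_MAX_DELAY = 60.0
# Average number of parallel requests to send to each remote site
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
```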
- Respect robots.txt
- Keep an eye on the site's performance
- Provide contact information
- Scrapy documentation
- Web Scraping Sandbox
- Handling JavaScript
- Scrapy community on Reddit
- Scrapinghub
All the source code is available at
martini97/web-scraping-with-python-2020
Questions?