Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Created a solution #367

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
76 changes: 74 additions & 2 deletions app/parse.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,15 @@
from dataclasses import dataclass
import csv
from dataclasses import (
dataclass,
fields, astuple
)
from urllib.parse import urljoin

from bs4 import BeautifulSoup, Tag

import requests

# Root URL of the scraped site; page links are resolved against it.
BASE_URL = "https://quotes.toscrape.com/"


@dataclass
Expand All @@ -8,8 +19,69 @@ class Quote:
tags: list[str]


# Names of the Quote dataclass fields; written as the CSV header row.
QUOTES_FIELDS = [field.name for field in fields(Quote)]


def create_page_link(base_url: str, next_page_url: str) -> str:
    """Resolve ``next_page_url`` against ``base_url`` and return the result.

    Both arguments are handed to ``urllib.parse.urljoin`` unchanged, so an
    empty ``next_page_url`` yields ``base_url`` itself.
    """
    absolute_url = urljoin(base_url, next_page_url)
    return absolute_url


def get_soup_page(url: str, timeout: float = 10) -> BeautifulSoup:
    """Download ``url`` and return its body parsed with ``html.parser``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The parsed document.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout``.
    """
    # Without a timeout ``requests`` can block forever on a stalled server.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were content.
    response.raise_for_status()
    return BeautifulSoup(response.content, "html.parser")


def next_page(soup: BeautifulSoup) -> dict:
    """Report whether ``soup`` has a ".next" element and where it points.

    Returns:
        A dict with two keys: ``"is_next_page"`` (bool) and
        ``"next_page_link"`` (the ``href`` of the anchor inside the first
        ".next" element, or ``None`` when no such element exists).
    """
    # Query the document once and reuse the result for both answers.
    next_elements = soup.select(".next")
    if not next_elements:
        return {"is_next_page": False, "next_page_link": None}
    return {
        "is_next_page": True,
        "next_page_link": next_elements[0].a.attrs["href"],
    }


def get_quotes(soup_page: BeautifulSoup) -> list[Tag]:
    """Return every element of ``soup_page`` matching the ".quote" selector."""
    return soup_page.select(".quote")


def pars_single_quote(quote: Tag) -> Quote:
    """Build a ``Quote`` from a single quote element.

    The text and author are taken from the first child node of the first
    ".text" and ".author" descendants; tags are the first child node of
    every ".tag" descendant. All values are converted to plain ``str``.

    Raises:
        IndexError: If ``quote`` has no ".text" or ".author" descendant.
    """
    text = quote.select(".text")[0].contents[0]
    author = quote.select(".author")[0].contents[0]
    # The comprehension already yields a fresh list, so no extra copy needed.
    tags = [str(tag.contents[0]) for tag in quote.select(".tag")]
    return Quote(text=str(text), author=str(author), tags=tags)


# Correctly spelled alias; the original name stays so existing callers work.
parse_single_quote = pars_single_quote


def write_quotes_to_csv(quotes: list[Quote], output_csv_path: str) -> None:
    """Write ``quotes`` to ``output_csv_path`` as a UTF-8 CSV file.

    The first row is ``QUOTES_FIELDS``; each following row is one quote
    flattened with ``dataclasses.astuple``. An existing file at the path is
    overwritten.

    Args:
        quotes: Dataclass instances to serialise, one per row.
        output_csv_path: Destination file path.
    """
    # newline="" lets the csv module control line endings itself.
    with open(output_csv_path, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(QUOTES_FIELDS)
        # A generator avoids building a second full list of row tuples.
        writer.writerows(astuple(quote) for quote in quotes)


def main(output_csv_path: str) -> None:
    """Scrape every page starting at ``BASE_URL`` and save the quotes as CSV.

    Pages are fetched one after another by following the link reported by
    ``next_page`` until it signals there is no further page; all parsed
    quotes are then written to ``output_csv_path``.

    Args:
        output_csv_path: Destination path of the CSV file.
    """
    parsed_quotes = []
    # An empty relative link resolves to BASE_URL itself, i.e. the first page.
    relative_link = ""
    while True:
        bs_page = get_soup_page(create_page_link(BASE_URL, relative_link))
        parsed_quotes.extend(
            pars_single_quote(quote) for quote in get_quotes(bs_page)
        )
        pagination = next_page(bs_page)
        if not pagination["is_next_page"]:
            break
        relative_link = pagination["next_page_link"]

    write_quotes_to_csv(parsed_quotes, output_csv_path)


if __name__ == "__main__":
Expand Down
13 changes: 7 additions & 6 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
flake8==5.0.4
flake8-annotations==2.9.1
flake8-quotes==3.3.1
flake8-variables-names==0.0.5
pep8-naming==0.13.2
pytest==7.1.3
beautifulsoup4==4.12.3
certifi==2024.12.14
charset-normalizer==3.4.1
idna==3.10
requests==2.32.3
soupsieve==2.6
urllib3==2.3.0
Loading