diff --git a/app/parse.py b/app/parse.py
index 8d15be0..63293e9 100644
--- a/app/parse.py
+++ b/app/parse.py
@@ -1,4 +1,12 @@
-from dataclasses import dataclass
+import csv
+from dataclasses import astuple, dataclass, fields
+from urllib.parse import urljoin
+
+import requests
+from bs4 import BeautifulSoup, Tag
+
+BASE_URL = "https://quotes.toscrape.com/"
+REQUEST_TIMEOUT = 10  # seconds
 
 
 @dataclass
@@ -8,8 +16,72 @@ class Quote:
     tags: list[str]
 
 
+QUOTES_FIELDS = [field.name for field in fields(Quote)]
+
+
+def create_page_link(base_url: str, next_page_url: str) -> str:
+    """Resolve a (possibly relative) page URL against the site base URL."""
+    return urljoin(base_url, next_page_url)
+
+
+def get_soup_page(url: str) -> BeautifulSoup:
+    """Download ``url`` and parse the response body as HTML.
+
+    Raises ``requests.HTTPError`` for 4xx/5xx responses so that an error
+    page is never silently parsed as if it were a page of quotes.
+    """
+    response = requests.get(url, timeout=REQUEST_TIMEOUT)
+    response.raise_for_status()
+    return BeautifulSoup(response.content, "html.parser")
+
+
+def next_page(soup: BeautifulSoup) -> dict:
+    """Describe the pagination link of a parsed page.
+
+    Returns a dict with ``is_next_page`` (bool) and ``next_page_link``
+    (the ``href`` of the "next" link, or ``None`` on the last page).
+    """
+    next_anchor = soup.select_one(".next a")
+    next_page_link = None if next_anchor is None else next_anchor.get("href")
+    return {
+        "is_next_page": next_page_link is not None,
+        "next_page_link": next_page_link,
+    }
+
+
+def get_quotes(soup_page: BeautifulSoup) -> list[Tag]:
+    """Return every ``.quote`` element found on the page."""
+    return soup_page.select(".quote")
+
+
+def pars_single_quote(quote: Tag) -> Quote:
+    """Build a ``Quote`` from a single ``.quote`` element."""
+    text = quote.select_one(".text").get_text()
+    author = quote.select_one(".author").get_text()
+    tags = [tag.get_text() for tag in quote.select(".tag")]
+    return Quote(text=text, author=author, tags=tags)
+
+
+def write_quotes_to_csv(quotes: list[Quote], output_csv_path: str) -> None:
+    """Write the quotes to ``output_csv_path`` with a header row."""
+    with open(output_csv_path, "w", encoding="utf-8", newline="") as csv_file:
+        writer = csv.writer(csv_file)
+        writer.writerow(QUOTES_FIELDS)
+        writer.writerows(astuple(quote) for quote in quotes)
+
+
 def main(output_csv_path: str) -> None:
-    pass
+    """Scrape every page of quotes and save them to ``output_csv_path``."""
+    parsed_quotes = []
+    next_page_url = ""  # an empty relative URL resolves to the first page
+    while next_page_url is not None:
+        soup = get_soup_page(create_page_link(BASE_URL, next_page_url))
+        parsed_quotes.extend(
+            pars_single_quote(quote) for quote in get_quotes(soup)
+        )
+        next_page_url = next_page(soup)["next_page_link"]
+
+    write_quotes_to_csv(parsed_quotes, output_csv_path)
 
 
 if __name__ == "__main__":
diff --git a/requirements.txt b/requirements.txt
index 3f202d6..4c236c0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,24 @@
-flake8==5.0.4
-flake8-annotations==2.9.1
-flake8-quotes==3.3.1
-flake8-variables-names==0.0.5
-pep8-naming==0.13.2
-pytest==7.1.3
+attrs==24.3.0
+beautifulsoup4==4.12.3
+certifi==2024.12.14
+charset-normalizer==3.4.1
+colorama==0.4.6
+flake8==7.1.1
+flake8-annotations==2.9.1
+flake8-quotes==3.4.0
+flake8-variables-names==0.0.5
+idna==3.10
+iniconfig==2.0.0
+mccabe==0.7.0
+packaging==24.2
+pep8-naming==0.13.2
+pluggy==1.5.0
+py==1.11.0
+pycodestyle==2.12.1
+pyflakes==3.2.0
+pytest==7.1.3
+requests==2.32.3
+setuptools==75.8.0
+soupsieve==2.6
+tomli==2.2.1
+urllib3==2.3.0