From 044eb679f61051055f874308d57d8966aaaa03f3 Mon Sep 17 00:00:00 2001 From: Maksym Protsak Date: Sat, 18 Jan 2025 00:09:46 +0200 Subject: [PATCH 01/11] Filled main function in parse.py. --- app/parse.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/app/parse.py b/app/parse.py index 8d15be0..076c46d 100644 --- a/app/parse.py +++ b/app/parse.py @@ -1,5 +1,9 @@ from dataclasses import dataclass +from bs4 import BeautifulSoup + +import requests + @dataclass class Quote: @@ -9,7 +13,10 @@ class Quote: def main(output_csv_path: str) -> None: - pass + r = requests.get("https://quotes.toscrape.com/",) + bs = BeautifulSoup(r.text, "html.parser") + bs_quotes = bs.find(".quote") + print(bs_quotes) if __name__ == "__main__": From bd2bc2f44f5e46c43fcfb9f1ef1e8a309bbbef2f Mon Sep 17 00:00:00 2001 From: Maksym Protsak Date: Sat, 18 Jan 2025 00:10:19 +0200 Subject: [PATCH 02/11] Update requirements.txt. --- requirements.txt | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index 3f202d6..2b51ffc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ -flake8==5.0.4 -flake8-annotations==2.9.1 -flake8-quotes==3.3.1 -flake8-variables-names==0.0.5 -pep8-naming==0.13.2 -pytest==7.1.3 +beautifulsoup4==4.12.3 +certifi==2024.12.14 +charset-normalizer==3.4.1 +idna==3.10 +requests==2.32.3 +soupsieve==2.6 +urllib3==2.3.0 From 2ffe80ff0ebb47cecf45c87e031405140be356aa Mon Sep 17 00:00:00 2001 From: Maksym Protsak Date: Sat, 18 Jan 2025 12:04:35 +0200 Subject: [PATCH 03/11] Created get_quotes, pars_single_quote. Updated main. --- app/parse.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/app/parse.py b/app/parse.py index 076c46d..44f6a41 100644 --- a/app/parse.py +++ b/app/parse.py @@ -1,9 +1,11 @@ from dataclasses import dataclass -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag import requests +BASE_URL = "https://quotes.toscrape.com/" + @dataclass class Quote: @@ -12,11 +14,26 @@ class Quote: tags: list[str] +def get_quotes() -> list[Tag]: + r = requests.get(BASE_URL, ).content + soup = BeautifulSoup(r, "html.parser") + quotes = soup.select(".quote") + return quotes + + +def pars_single_quote(quote: Tag) -> Quote: + text = quote.select(".text")[0].contents[0] + author = quote.select(".author")[0].contents[0] + tags = [str(tag.contents[0]) for tag in quote.select(".tag")] + return Quote(text=str(text), author=str(author), tags=list(tags)) + + def main(output_csv_path: str) -> None: - r = requests.get("https://quotes.toscrape.com/",) - bs = BeautifulSoup(r.text, "html.parser") - bs_quotes = bs.find(".quote") - print(bs_quotes) + quotes = get_quotes() + parsed_quotes = [ + pars_single_quote(quote) + for quote in quotes + ] if __name__ == "__main__": From cceec3a940361829ce63abee60904a44ad200970 Mon Sep 17 00:00:00 2001 From: Maksym Protsak Date: Sat, 18 Jan 2025 12:52:38 +0200 Subject: [PATCH 04/11] Created get_copu_page, next_page. Updated get_quotes, main functions. --- app/parse.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/app/parse.py b/app/parse.py index 44f6a41..2e2acab 100644 --- a/app/parse.py +++ b/app/parse.py @@ -14,10 +14,23 @@ class Quote: tags: list[str] -def get_quotes() -> list[Tag]: +def get_soup_page() -> BeautifulSoup: r = requests.get(BASE_URL, ).content soup = BeautifulSoup(r, "html.parser") - quotes = soup.select(".quote") + return soup + + +def next_page(soup: BeautifulSoup) -> tuple: + next_class = soup.select(".next") + is_next_page = bool(len(next_class)) + next_page_link = None + if is_next_page: + next_page_link = soup.select(".next")[0].a.attrs["href"] + return is_next_page, next_page_link + + +def get_quotes(soup_page: BeautifulSoup) -> list[Tag]: + quotes = soup_page.select(".quote") return quotes @@ -29,7 +42,9 @@ def pars_single_quote(quote: Tag) -> Quote: def main(output_csv_path: str) -> None: - quotes = get_quotes() + bs_page = get_soup_page() + is_next_page = next_page(bs_page) + quotes = get_quotes(bs_page) parsed_quotes = [ pars_single_quote(quote) for quote in quotes From 703d33179f92b0d74e55189f73e2ead6d24d302c Mon Sep 17 00:00:00 2001 From: Maksym Protsak Date: Sat, 18 Jan 2025 15:08:08 +0200 Subject: [PATCH 05/11] Created create_page_link. Updated next_page, main. --- app/parse.py | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/app/parse.py b/app/parse.py index 2e2acab..9be68a9 100644 --- a/app/parse.py +++ b/app/parse.py @@ -1,4 +1,5 @@ from dataclasses import dataclass +from urllib.parse import urljoin from bs4 import BeautifulSoup, Tag @@ -14,19 +15,26 @@ class Quote: tags: list[str] -def get_soup_page() -> BeautifulSoup: - r = requests.get(BASE_URL, ).content +def create_page_link(base_url: str, next_page_url: str) -> str: + return urljoin(base_url, next_page_url) + + +def get_soup_page(url: str) -> BeautifulSoup: + r = requests.get(url, ).content soup = BeautifulSoup(r, "html.parser") return soup -def next_page(soup: BeautifulSoup) -> tuple: +def next_page(soup: BeautifulSoup) -> dict: next_class = soup.select(".next") is_next_page = bool(len(next_class)) next_page_link = None if is_next_page: next_page_link = soup.select(".next")[0].a.attrs["href"] - return is_next_page, next_page_link + return { + "is_next_page": is_next_page, + "next_page_link": next_page_link, + } def get_quotes(soup_page: BeautifulSoup) -> list[Tag]: @@ -42,13 +50,23 @@ def pars_single_quote(quote: Tag) -> Quote: def main(output_csv_path: str) -> None: - bs_page = get_soup_page() + page_link = create_page_link(BASE_URL, "") + bs_page = get_soup_page(page_link) is_next_page = next_page(bs_page) quotes = get_quotes(bs_page) - parsed_quotes = [ - pars_single_quote(quote) - for quote in quotes - ] + parsed_quotes = [] + while True: + for quote in quotes: + parsed_quotes.append( + pars_single_quote(quote) + ) + if is_next_page["is_next_page"]: + page_link = create_page_link(BASE_URL, is_next_page["next_page_link"]) + bs_page = get_soup_page(page_link) + is_next_page = next_page(bs_page) + quotes = get_quotes(bs_page) + else: + break if __name__ == "__main__": From 60686caaf41c0ba73964e59e8dc65f7f05fe304d Mon Sep 17 00:00:00 2001 From: Maksym Protsak Date: Sat, 18 Jan 2025 19:51:22 +0200 Subject: [PATCH 06/11] Created QUOTES_FIELDS, write_quotes_to_csv updated if condition in main. --- app/parse.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/app/parse.py b/app/parse.py index 9be68a9..b5225bb 100644 --- a/app/parse.py +++ b/app/parse.py @@ -1,4 +1,8 @@ -from dataclasses import dataclass +import csv +from dataclasses import ( + dataclass, + fields, astuple +) from urllib.parse import urljoin from bs4 import BeautifulSoup, Tag @@ -15,6 +19,9 @@ class Quote: tags: list[str] +QUOTES_FIELDS = [field.name for field in fields(Quote)] + + def create_page_link(base_url: str, next_page_url: str) -> str: return urljoin(base_url, next_page_url) @@ -49,6 +56,13 @@ def pars_single_quote(quote: Tag) -> Quote: return Quote(text=str(text), author=str(author), tags=list(tags)) +def write_quotes_to_csv(quotes, output_csv_path): + with open(output_csv_path, "w", encoding="utf-8", newline="") as f: + writer = csv.writer(f) + writer.writerow(QUOTES_FIELDS) + writer.writerows([astuple(quote) for quote in quotes]) + + def main(output_csv_path: str) -> None: page_link = create_page_link(BASE_URL, "") bs_page = get_soup_page(page_link) @@ -60,13 +74,14 @@ def main(output_csv_path: str) -> None: parsed_quotes.append( pars_single_quote(quote) ) - if is_next_page["is_next_page"]: - page_link = create_page_link(BASE_URL, is_next_page["next_page_link"]) - bs_page = get_soup_page(page_link) - is_next_page = next_page(bs_page) - quotes = get_quotes(bs_page) - else: + if not is_next_page["is_next_page"]: break + page_link = create_page_link(BASE_URL, is_next_page["next_page_link"]) + bs_page = get_soup_page(page_link) + is_next_page = next_page(bs_page) + quotes = get_quotes(bs_page) + + write_quotes_to_csv(parsed_quotes, output_csv_path) if __name__ == "__main__": From 14b68bf009d5b34ea96dd7b5575182b76bd2de8e Mon Sep 17 00:00:00 2001 From: Maksym Protsak Date: Sat, 18 Jan 2025 19:54:31 +0200 Subject: [PATCH 07/11] Updated requirements.txt. --- requirements.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/requirements.txt b/requirements.txt index 2b51ffc..a237ecb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,11 @@ beautifulsoup4==4.12.3 certifi==2024.12.14 charset-normalizer==3.4.1 +flake8==7.1.1 idna==3.10 +mccabe==0.7.0 +pycodestyle==2.12.1 +pyflakes==3.2.0 requests==2.32.3 soupsieve==2.6 urllib3==2.3.0 From ece39a0338fef7b1477b9f73ed7d740e4b9ace47 Mon Sep 17 00:00:00 2001 From: Maksym Protsak Date: Sat, 18 Jan 2025 19:54:41 +0200 Subject: [PATCH 08/11] flake8. --- app/parse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/parse.py b/app/parse.py index b5225bb..747eb3b 100644 --- a/app/parse.py +++ b/app/parse.py @@ -57,7 +57,7 @@ def pars_single_quote(quote: Tag) -> Quote: def write_quotes_to_csv(quotes, output_csv_path): - with open(output_csv_path, "w", encoding="utf-8", newline="") as f: + with open(output_csv_path, "w", encoding="utf-8", newline="") as f: writer = csv.writer(f) writer.writerow(QUOTES_FIELDS) writer.writerows([astuple(quote) for quote in quotes]) From 5ec4c53dc1eba943ada46f12a3c1cc47ebcde0a9 Mon Sep 17 00:00:00 2001 From: Maksym Protsak Date: Sat, 18 Jan 2025 22:54:01 +0200 Subject: [PATCH 09/11] Updated requirements.txt. --- requirements.txt | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index a237ecb..d2c33ae 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,12 @@ +flake8-annotations==2.9.1 +flake8-quotes==3.3.1 +flake8-variables-names==0.0.5 +pep8-naming==0.13.2 +pytest==7.1.3 beautifulsoup4==4.12.3 certifi==2024.12.14 charset-normalizer==3.4.1 -flake8==7.1.1 +flake8>=6.0.0 idna==3.10 mccabe==0.7.0 pycodestyle==2.12.1 From c04f320c90d7a2777518df3f9a5608d9e3d120b0 Mon Sep 17 00:00:00 2001 From: Maksym Protsak Date: Sat, 18 Jan 2025 22:58:15 +0200 Subject: [PATCH 10/11] Updated requirements.txt. --- requirements.txt | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index d2c33ae..4c236c0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,16 +1,24 @@ -flake8-annotations==2.9.1 -flake8-quotes==3.3.1 -flake8-variables-names==0.0.5 -pep8-naming==0.13.2 -pytest==7.1.3 +attrs==24.3.0 beautifulsoup4==4.12.3 certifi==2024.12.14 charset-normalizer==3.4.1 -flake8>=6.0.0 +colorama==0.4.6 +flake8==7.1.1 +flake8-annotations==2.9.1 +flake8-quotes==3.4.0 +flake8-variables-names==0.0.5 idna==3.10 +iniconfig==2.0.0 mccabe==0.7.0 +packaging==24.2 +pep8-naming==0.13.2 +pluggy==1.5.0 +py==1.11.0 pycodestyle==2.12.1 pyflakes==3.2.0 +pytest==7.1.3 requests==2.32.3 +setuptools==75.8.0 soupsieve==2.6 +tomli==2.2.1 urllib3==2.3.0 From 64dfc2e1133b20abcbcba46cc32c0a1196c98ea1 Mon Sep 17 00:00:00 2001 From: Maksym Protsak Date: Sat, 18 Jan 2025 22:58:31 +0200 Subject: [PATCH 11/11] flake8 --- app/parse.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/app/parse.py b/app/parse.py index 747eb3b..63293e9 100644 --- a/app/parse.py +++ b/app/parse.py @@ -27,8 +27,8 @@ def create_page_link(base_url: str, next_page_url: str) -> str: def get_soup_page(url: str) -> BeautifulSoup: - r = requests.get(url, ).content - soup = BeautifulSoup(r, "html.parser") + res = requests.get(url, ).content + soup = BeautifulSoup(res, "html.parser") return soup @@ -56,7 +56,7 @@ def pars_single_quote(quote: Tag) -> Quote: return Quote(text=str(text), author=str(author), tags=list(tags)) -def write_quotes_to_csv(quotes, output_csv_path): +def write_quotes_to_csv(quotes: list[Quote], output_csv_path: str) -> None: with open(output_csv_path, "w", encoding="utf-8", newline="") as f: writer = csv.writer(f) writer.writerow(QUOTES_FIELDS)