Finished initial work with api #1

Merged · 3 commits · Dec 20, 2023
Changes from 1 commit
5 changes: 5 additions & 0 deletions main.py
@@ -0,0 +1,5 @@
import uvicorn


if __name__ == '__main__':
    uvicorn.run('server.app:app', host="0.0.0.0", port=8000, reload=False)
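For local development it can be convenient to toggle auto-reload without editing this file; a minimal sketch, assuming a hypothetical DEBUG environment variable:

import os

import uvicorn


if __name__ == '__main__':
    # DEBUG is a hypothetical env var; reload=True restarts uvicorn on code changes.
    uvicorn.run('server.app:app', host='0.0.0.0', port=8000,
                reload=os.environ.get('DEBUG') == '1')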
36 changes: 36 additions & 0 deletions requirements.txt
@@ -0,0 +1,36 @@
annotated-types==0.6.0
anyio==3.7.1
attrs==23.1.0
beautifulsoup4==4.12.2
certifi==2023.11.17
cffi==1.16.0
charset-normalizer==3.3.2
click==8.1.7
colorama==0.4.6
dnspython==2.4.2
fastapi==0.105.0
flake8==6.1.0
h11==0.14.0
idna==3.6
mccabe==0.7.0
motor==3.3.2
outcome==1.3.0.post0
pycodestyle==2.11.1
pycparser==2.21
pydantic==2.5.2
pydantic_core==2.14.5
pyflakes==3.1.0
pymongo==4.6.1
PySocks==1.7.1
requests==2.31.0
selenium==4.16.0
sniffio==1.3.0
sortedcontainers==2.4.0
soupsieve==2.5
starlette==0.27.0
trio==0.23.1
trio-websocket==0.11.1
typing_extensions==4.9.0
urllib3==2.1.0
uvicorn==0.24.0.post1
wsproto==1.2.0
Empty file added server/__init__.py
Empty file.
11 changes: 11 additions & 0 deletions server/app.py
@@ -0,0 +1,11 @@
from fastapi import FastAPI
from server.routes.search_result import router as SearchResultRouter


app = FastAPI()
app.include_router(SearchResultRouter, tags=["Search"], prefix="/news/search")


@app.get('/', tags=['Root'])
async def read_root():
    return {'message': 'Welcome!'}
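A quick sanity check for the root route, as a sketch using FastAPI's TestClient (requires the httpx package to be installed):

from fastapi.testclient import TestClient

from server.app import app

client = TestClient(app)

response = client.get('/')
assert response.status_code == 200
assert response.json() == {'message': 'Welcome!'}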
71 changes: 71 additions & 0 deletions server/database.py
@@ -0,0 +1,71 @@
import motor.motor_asyncio
from bson import ObjectId
from bson.errors import InvalidId


MONGO_DETAILS = 'mongodb://localhost:27017'
client = motor.motor_asyncio.AsyncIOMotorClient(MONGO_DETAILS)
db = client.news

search_results_collection = db.get_collection('search_results')


def search_results_helper(search_result):
    return {
        "id": str(search_result["_id"]),
        "link": search_result["link"],
        "title": search_result["title"],
        "author": search_result["author"],
        "image": search_result["image"],
        "date": search_result["date"],
        "tags": search_result["tags"],
        "description": search_result["description"],
        "content": search_result["content"]
    }


async def add_search_results(results: list[dict]):
    new_results = []
    for result in results:
        existing = await search_results_collection.find_one({"link": result['link']})
        if existing:
            # Article already stored: merge in the new tags instead of duplicating it.
            existing["tags"] = list(set(existing["tags"] + result['tags']))
            # $set only the tags; rewriting the whole document (including _id) is unnecessary.
            await search_results_collection.update_one(
                {"_id": existing["_id"]}, {"$set": {"tags": existing["tags"]}})
            new_result = existing
        else:
            inserted = await search_results_collection.insert_one(result)
            new_result = await search_results_collection.find_one({"_id": inserted.inserted_id})
        new_results.append(search_results_helper(new_result))
    return new_results


async def retrieve_search_result_by_id(id_: str):
    try:
        result = await search_results_collection.find_one({"_id": ObjectId(id_)})
        if result:
            return search_results_helper(result)
    except InvalidId:
        # A malformed id is treated the same as a missing article: return None.
        return


async def retrieve_search_results_by_tags(tags: list[str]):
    matched_result = []
    results = search_results_collection.find()
    search_tags = set(tags)
    async for result in results:
        # An article matches when more than half of the search tags appear on it.
        common = search_tags.intersection(result["tags"])
        if len(common) > len(search_tags) / 2:
            matched_result.append(search_results_helper(result))
    return matched_result


async def retrieve_newest_search_results():
    results = []
    async for result in search_results_collection.find().sort('date', -1).limit(20):
        results.append(search_results_helper(result))
    return results


async def update_content_of_article(id_: str, content: list[list]):
    await search_results_collection.update_one({'_id': ObjectId(id_)}, {"$set": {"content": content}})
    article = await search_results_collection.find_one({'_id': ObjectId(id_)})
    return search_results_helper(article)
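All of these helpers are coroutines, so a caller outside FastAPI needs an event loop. A minimal sketch (the article data is made up, and a MongoDB instance must be listening at MONGO_DETAILS):

import asyncio

from server.database import add_search_results, retrieve_search_results_by_tags


async def demo():
    # Hypothetical article; every key that search_results_helper reads must be present.
    added = await add_search_results([{
        'link': 'https://example.com/article',
        'title': 'Example article',
        'author': None,
        'image': None,
        'date': None,
        'tags': ['acer', 'aspire'],
        'description': '',
        'content': [],
    }])
    print(added)
    print(await retrieve_search_results_by_tags(['acer', 'aspire']))


asyncio.run(demo())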
Empty file added server/models/__init__.py
Empty file.
36 changes: 36 additions & 0 deletions server/models/search_result.py
@@ -0,0 +1,36 @@
from datetime import datetime

from pydantic import BaseModel, HttpUrl


class ArticleModel(BaseModel):
    id: str
    link: HttpUrl
    title: str
    author: str | None = None
    image: HttpUrl | None = None
    date: datetime | None = None
    description: str = ""
    tags: set[str] = set()

    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "id": "657b4a8d9e6d5419e28aa3e1",
                    "link": "https://www.laptopmag.com/best-picks/tips-to-improve-macbook-sound",
                    "tags": ["acer", "aspire", "nvidia"],
                    "image": "https://cdn.mos.cms.futurecdn.net/vzWy7ZzZy4rfZUESfUw4Lg.jpg",
                    "title": "7 ways to improve sound on your MacBook",
                    "author": "Alex Bracetti",
                    "date": "2023-05-20T07:00:53Z",
                    "description": "Unhappy with the MacBook’s sound quality? Here are some tips and tricks to enhance "
                                   "the audio performance on your Apple laptop."
                },
            ]
        }
    }


class ExtendArticleModel(ArticleModel):
    content: list[list] = []
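Because link and image are typed as HttpUrl, pydantic validates them at construction time; a short sketch of what that buys:

from pydantic import ValidationError

from server.models.search_result import ArticleModel

article = ArticleModel(
    id='657b4a8d9e6d5419e28aa3e1',
    link='https://www.laptopmag.com/best-picks/tips-to-improve-macbook-sound',
    title='7 ways to improve sound on your MacBook',
)
print(article.tags)  # defaults to an empty set

try:
    ArticleModel(id='1', link='not-a-url', title='broken')
except ValidationError as err:
    print(err.error_count(), 'validation error')  # the malformed link is rejected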
Empty file added server/routes/__init__.py
Empty file.
38 changes: 38 additions & 0 deletions server/routes/search_result.py
@@ -0,0 +1,38 @@
from fastapi import APIRouter, status, HTTPException

from server.scraper import scrap_from_search, scrap_content
from server.models.search_result import ArticleModel, ExtendArticleModel
from server.database import (
    add_search_results,
    retrieve_search_result_by_id,
    retrieve_search_results_by_tags,
    retrieve_newest_search_results,
    update_content_of_article,
)


router = APIRouter()


@router.get("/", status_code=status.HTTP_200_OK, response_model=list[ArticleModel])
async def get_search_results(find: str | None = None) -> list[ArticleModel]:
if find:
results = await retrieve_search_results_by_tags(find.split())
if len(results) < 20:
new_results = scrap_from_search(find)
new_results = await add_search_results(new_results)
results.extend(new_results)
return results[:20]
return await retrieve_newest_search_results()


@router.get("/{id}", status_code=status.HTTP_200_OK, response_model=ExtendArticleModel)
async def get_article(id: str) -> ExtendArticleModel:
result = await retrieve_search_result_by_id(id)
if result:
if not result['content']:
content = scrap_content(result['link'])
result = await update_content_of_article(id, content)
return result
else:
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Item not found")
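With the server from main.py running, both endpoints can be exercised with any HTTP client; a sketch using requests against the default host and port:

import requests

BASE = 'http://localhost:8000/news/search'

# Search: returns up to 20 articles, scraping more when the cache is thin.
results = requests.get(f'{BASE}/', params={'find': 'acer aspire'}).json()
print(len(results), 'articles')

# Detail: fetches (and lazily scrapes) the full content of one article.
if results:
    article = requests.get(f"{BASE}/{results[0]['id']}").json()
    print(article['title'], len(article['content']))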
54 changes: 54 additions & 0 deletions server/scraper.py
@@ -0,0 +1,54 @@
import requests
from bs4 import BeautifulSoup


headers = {
    'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/'
        'webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'
}


def scrap_from_search(search: str) -> list[dict]:
    link_src = f'https://www.laptopmag.com/search?searchTerm={search}&articleType=best-pick'
    # Pass headers as a keyword argument; the second positional
    # argument of requests.get() is `params`, not `headers`.
    page_src = requests.get(link_src, headers=headers)
    soup_src = BeautifulSoup(page_src.content, 'html.parser')

    laptops = soup_src.find_all('div', class_='listingResult')
    laptops_data = []
    for laptop in laptops:
        dct = dict()
        dct['link'] = laptop.find('a', class_='article-link').get('href')
        dct['image'] = laptop.find('img').get('data-pin-media')
        dct['title'] = laptop.find('h3', class_='article-name').text.strip()
        dct['author'] = laptop.find('span', attrs={'style': 'white-space:nowrap'}).text.strip()
        dct['date'] = laptop.find('time').get('datetime')
        dct['description'] = laptop.find('p', class_='synopsis').text.strip()
        dct['tags'] = search.split()
        dct['content'] = []
        laptops_data.append(dct)
    return laptops_data


def scrap_content(link: str) -> list[list]:
    page_src = requests.get(link, headers=headers)
    soup_src = BeautifulSoup(page_src.content, 'html.parser')

    content = []
    body = soup_src.find('div', id='article-body')
    for block in body.children:
        if block.name == 'p':
            paragraph = block.text
            # Normalize non-breaking spaces (likely '\xa0' in the original text).
            if '\xa0' in paragraph:
                paragraph = paragraph.replace('\xa0', ' ')
            # Use lists, not tuples, to match the declared list[list] content type.
            content.append(['paragraph', paragraph])
        elif block.name == 'h2':
            title = block.text
            content.append(['title', title])
        elif block.name == 'figure':
            image = block.find('img').get('data-pin-media')
            content.append(['image', image])
    return content
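Both functions are plain synchronous helpers, so they can be tried outside the API. The selectors are tied to laptopmag.com's current markup, so treat this as best-effort:

from server.scraper import scrap_from_search, scrap_content

articles = scrap_from_search('acer aspire')
for article in articles[:3]:
    print(article['title'], '->', article['link'])

if articles:
    # First couple of content blocks: ['paragraph' | 'title' | 'image', value]
    print(scrap_content(articles[0]['link'])[:2])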