Finished initial work with api #1

Merged · 3 commits · Dec 20, 2023
Changes from 1 commit
5 changes: 5 additions & 0 deletions main.py
@@ -0,0 +1,5 @@
import uvicorn


if __name__ == '__main__':
    uvicorn.run('server.app:app', host="0.0.0.0", port=8000, reload=False)
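For local development it can be convenient to toggle auto-reload without editing this file; a minimal sketch, assuming a hypothetical DEBUG environment variable:

import os

import uvicorn


if __name__ == '__main__':
    # DEBUG is a hypothetical env var; reload=True restarts uvicorn on code changes.
    uvicorn.run('server.app:app', host='0.0.0.0', port=8000,
                reload=os.environ.get('DEBUG') == '1')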
36 changes: 36 additions & 0 deletions requirements.txt
@@ -0,0 +1,36 @@
annotated-types==0.6.0
anyio==3.7.1
attrs==23.1.0
beautifulsoup4==4.12.2
certifi==2023.11.17
cffi==1.16.0
charset-normalizer==3.3.2
click==8.1.7
colorama==0.4.6
dnspython==2.4.2
fastapi==0.105.0
flake8==6.1.0
h11==0.14.0
idna==3.6
mccabe==0.7.0
motor==3.3.2
outcome==1.3.0.post0
pycodestyle==2.11.1
pycparser==2.21
pydantic==2.5.2
pydantic_core==2.14.5
pyflakes==3.1.0
pymongo==4.6.1
PySocks==1.7.1
requests==2.31.0
selenium==4.16.0
sniffio==1.3.0
sortedcontainers==2.4.0
soupsieve==2.5
starlette==0.27.0
trio==0.23.1
trio-websocket==0.11.1
typing_extensions==4.9.0
urllib3==2.1.0
uvicorn==0.24.0.post1
wsproto==1.2.0
Empty file added server/__init__.py
Empty file.
11 changes: 11 additions & 0 deletions server/app.py
@@ -0,0 +1,11 @@
from fastapi import FastAPI
from server.routes.search_result import router as SearchResultRouter


app = FastAPI()
app.include_router(SearchResultRouter, tags=["Search"], prefix="/news/search")


@app.get('/', tags=['Root'])
async def read_root():
    return {'message': 'Welcome!'}
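A quick sanity check for the root route, as a sketch using FastAPI's TestClient (requires the httpx package to be installed):

from fastapi.testclient import TestClient

from server.app import app

client = TestClient(app)

response = client.get('/')
assert response.status_code == 200
assert response.json() == {'message': 'Welcome!'}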
71 changes: 71 additions & 0 deletions server/database.py
@@ -0,0 +1,71 @@
import motor.motor_asyncio
from bson import ObjectId
from bson.errors import InvalidId


MONGO_DETAILS = 'mongodb://localhost:27017'
client = motor.motor_asyncio.AsyncIOMotorClient(MONGO_DETAILS)
db = client.news

search_results_collection = db.get_collection('search_results')


def search_results_helper(search_result):
    return {
        "id": str(search_result["_id"]),
        "link": search_result["link"],
        "title": search_result["title"],
        "author": search_result["author"],
        "image": search_result["image"],
        "date": search_result["date"],
        "tags": search_result["tags"],
        "description": search_result["description"],
        "content": search_result["content"]
    }


async def add_search_results(results: list[dict]):
    new_results = []
    for result in results:
        existing = await search_results_collection.find_one({"link": result['link']})
        if existing:
            # Article already stored: merge in the new tags instead of duplicating it.
            existing["tags"] = list(set(existing["tags"] + result['tags']))
            # $set only the tags; rewriting the whole document (including _id) is unnecessary.
            await search_results_collection.update_one(
                {"_id": existing["_id"]}, {"$set": {"tags": existing["tags"]}})
            new_result = existing
        else:
            inserted = await search_results_collection.insert_one(result)
            new_result = await search_results_collection.find_one({"_id": inserted.inserted_id})
        new_results.append(search_results_helper(new_result))
    return new_results


async def retrieve_search_result_by_id(id_: str):
    try:
        result = await search_results_collection.find_one({"_id": ObjectId(id_)})
        if result:
            return search_results_helper(result)
    except InvalidId:
        # A malformed id is treated the same as a missing article: return None.
        return


async def retrieve_search_results_by_tags(tags: list[str]):
    matched_result = []
    results = search_results_collection.find()
    search_tags = set(tags)
    async for result in results:
        # An article matches when more than half of the search tags appear on it.
        common = search_tags.intersection(result["tags"])
        if len(common) > len(search_tags) / 2:
            matched_result.append(search_results_helper(result))
    return matched_result


async def retrieve_newest_search_results():
    results = []
    async for result in search_results_collection.find().sort('date', -1).limit(20):
        results.append(search_results_helper(result))
    return results


async def update_content_of_article(id_: str, content: list[list]):
    await search_results_collection.update_one({'_id': ObjectId(id_)}, {"$set": {"content": content}})
    article = await search_results_collection.find_one({'_id': ObjectId(id_)})
    return search_results_helper(article)
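All of these helpers are coroutines, so a caller outside FastAPI needs an event loop. A minimal sketch (the article data is made up, and a MongoDB instance must be listening at MONGO_DETAILS):

import asyncio

from server.database import add_search_results, retrieve_search_results_by_tags


async def demo():
    # Hypothetical article; every key that search_results_helper reads must be present.
    added = await add_search_results([{
        'link': 'https://example.com/article',
        'title': 'Example article',
        'author': None,
        'image': None,
        'date': None,
        'tags': ['acer', 'aspire'],
        'description': '',
        'content': [],
    }])
    print(added)
    print(await retrieve_search_results_by_tags(['acer', 'aspire']))


asyncio.run(demo())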
Empty file added server/models/__init__.py
Empty file.
36 changes: 36 additions & 0 deletions server/models/search_result.py
@@ -0,0 +1,36 @@
from datetime import datetime

from pydantic import BaseModel, HttpUrl


class ArticleModel(BaseModel):
    id: str
    link: HttpUrl
    title: str
    author: str | None = None
    image: HttpUrl | None = None
    date: datetime | None = None
    description: str = ""
    tags: set[str] = set()

    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "id": "657b4a8d9e6d5419e28aa3e1",
                    "link": "https://www.laptopmag.com/best-picks/tips-to-improve-macbook-sound",
                    "tags": ["acer", "aspire", "nvidia"],
                    "image": "https://cdn.mos.cms.futurecdn.net/vzWy7ZzZy4rfZUESfUw4Lg.jpg",
                    "title": "7 ways to improve sound on your MacBook",
                    "author": "Alex Bracetti",
                    "date": "2023-05-20T07:00:53Z",
                    "description": "Unhappy with the MacBook’s sound quality? Here are some tips and tricks to enhance "
                                   "the audio performance on your Apple laptop."
                },
            ]
        }
    }


class ExtendArticleModel(ArticleModel):
    content: list[list] = []
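Because link and image are typed as HttpUrl, pydantic validates them at construction time; a short sketch of what that buys:

from pydantic import ValidationError

from server.models.search_result import ArticleModel

article = ArticleModel(
    id='657b4a8d9e6d5419e28aa3e1',
    link='https://www.laptopmag.com/best-picks/tips-to-improve-macbook-sound',
    title='7 ways to improve sound on your MacBook',
)
print(article.tags)  # defaults to an empty set

try:
    ArticleModel(id='1', link='not-a-url', title='broken')
except ValidationError as err:
    print(err.error_count(), 'validation error')  # the malformed link is rejected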
Empty file added server/routes/__init__.py
Empty file.
38 changes: 38 additions & 0 deletions server/routes/search_result.py
@@ -0,0 +1,38 @@
from fastapi import APIRouter, status, HTTPException

from server.scraper import scrap_from_search, scrap_content
from server.models.search_result import ArticleModel, ExtendArticleModel
from server.database import (
    add_search_results,
    retrieve_search_result_by_id,
    retrieve_search_results_by_tags,
    retrieve_newest_search_results,
    update_content_of_article,
)


router = APIRouter()


@router.get("/", status_code=status.HTTP_200_OK, response_model=list[ArticleModel])
async def get_search_results(find: str | None = None) -> list[ArticleModel]:
if find:
results = await retrieve_search_results_by_tags(find.split())
if len(results) < 20:
new_results = scrap_from_search(find)
new_results = await add_search_results(new_results)
results.extend(new_results)
return results[:20]
return await retrieve_newest_search_results()


@router.get("/{id}", status_code=status.HTTP_200_OK, response_model=ExtendArticleModel)
async def get_article(id: str) -> ExtendArticleModel:
result = await retrieve_search_result_by_id(id)
if result:
if not result['content']:
content = scrap_content(result['link'])
result = await update_content_of_article(id, content)
return result
else:
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Item not found")
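With the server from main.py running, both endpoints can be exercised with any HTTP client; a sketch using requests against the default host and port:

import requests

BASE = 'http://localhost:8000/news/search'

# Search: returns up to 20 articles, scraping more when the cache is thin.
results = requests.get(f'{BASE}/', params={'find': 'acer aspire'}).json()
print(len(results), 'articles')

# Detail: fetches (and lazily scrapes) the full content of one article.
if results:
    article = requests.get(f"{BASE}/{results[0]['id']}").json()
    print(article['title'], len(article['content']))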
54 changes: 54 additions & 0 deletions server/scraper.py
@@ -0,0 +1,54 @@
import requests
from bs4 import BeautifulSoup


headers = {
    'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/'
        'webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'
}


def scrap_from_search(search: str) -> list[dict]:
    link_src = f'https://www.laptopmag.com/search?searchTerm={search}&articleType=best-pick'
    # Pass headers as a keyword argument; the second positional
    # argument of requests.get() is `params`, not `headers`.
    page_src = requests.get(link_src, headers=headers)
    soup_src = BeautifulSoup(page_src.content, 'html.parser')

    laptops = soup_src.find_all('div', class_='listingResult')
    laptops_data = []
    for laptop in laptops:
        dct = dict()
        dct['link'] = laptop.find('a', class_='article-link').get('href')
        dct['image'] = laptop.find('img').get('data-pin-media')
        dct['title'] = laptop.find('h3', class_='article-name').text.strip()
        dct['author'] = laptop.find('span', attrs={'style': 'white-space:nowrap'}).text.strip()
        dct['date'] = laptop.find('time').get('datetime')
        dct['description'] = laptop.find('p', class_='synopsis').text.strip()
        dct['tags'] = search.split()
        dct['content'] = []
        laptops_data.append(dct)
    return laptops_data


def scrap_content(link: str) -> list[list]:
    page_src = requests.get(link, headers=headers)
    soup_src = BeautifulSoup(page_src.content, 'html.parser')

    content = []
    body = soup_src.find('div', id='article-body')
    for block in body.children:
        if block.name == 'p':
            paragraph = block.text
            # Normalize non-breaking spaces (likely '\xa0' in the original text).
            if '\xa0' in paragraph:
                paragraph = paragraph.replace('\xa0', ' ')
            # Use lists, not tuples, to match the declared list[list] content type.
            content.append(['paragraph', paragraph])
        elif block.name == 'h2':
            title = block.text
            content.append(['title', title])
        elif block.name == 'figure':
            image = block.find('img').get('data-pin-media')
            content.append(['image', image])
    return content
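Both functions are plain synchronous helpers, so they can be tried outside the API. The selectors are tied to laptopmag.com's current markup, so treat this as best-effort:

from server.scraper import scrap_from_search, scrap_content

articles = scrap_from_search('acer aspire')
for article in articles[:3]:
    print(article['title'], '->', article['link'])

if articles:
    # First couple of content blocks: ['paragraph' | 'title' | 'image', value]
    print(scrap_content(articles[0]['link'])[:2])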