diff --git a/.gitignore b/.gitignore index e8a7ad9..bbcb808 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,6 @@ data.json *.pptx /backend/output *.pyc -*.log \ No newline at end of file +*.log +mongodb/ +db-backup/ diff --git a/README.md b/README.md index 235acbe..5970f80 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,10 @@ + # knowlix + Microservice to autmatically create onboarding slides ![LIX Builder](https://github.com/vg-leanix/pptx-tool/blob/main/Thumbnail.png) + +## Architecture +![Architecture](https://github.com/vg-leanix/knowlix/blob/v1.1_mongodb/knowlix%20architecture.png) diff --git a/backend/.gitignore b/backend/.gitignore new file mode 100644 index 0000000..ae412d6 --- /dev/null +++ b/backend/.gitignore @@ -0,0 +1 @@ +env/ \ No newline at end of file diff --git a/backend/Dockerfile b/backend/Dockerfile index fbd141e..d42e727 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -1,11 +1,13 @@ -FROM python:3.8.1 +FROM python:3.9.1-slim-buster +ENV PYTHONUNBUFFERED 1 WORKDIR /usr/app + COPY req.txt ./ -COPY api.py core.py main.py master.pptx req.txt server.py ./ +COPY api.py core.py main.py master.pptx req.txt server.py api_utils.py config.py ./ -# RUN mkdir output +RUN mkdir output RUN pip install --upgrade pip RUN pip install -r req.txt --no-cache-dir diff --git a/backend/api.py b/backend/api.py index 1864dab..d5650a9 100644 --- a/backend/api.py +++ b/backend/api.py @@ -1,39 +1,53 @@ -from fastapi import FastAPI, HTTPException +import config +from fastapi import FastAPI, HTTPException, Depends, status, Request from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse +from fastapi.security import OAuth2PasswordBearer, OAuth2PasswordRequestForm +import pymongo from fastapi.responses import FileResponse from pydantic import BaseModel -from typing import List +from typing import List, Optional from main import create_pptx, get_sections import os +from datetime import datetime from pptx import Presentation from server import celery import json +import uuid +from datetime import datetime, timedelta +from pymongo import MongoClient +import api_utils as utils - +## CONFIG ## file_path = "master.pptx" -pres= Presentation(file_path) - -tags_metadata= [ +pres = Presentation(file_path) +MONGODB = os.getenv("MONGODB") +client = MongoClient(MONGODB) +db = client["taskdb"]["ta"] +#k +tags_metadata = [ { - "name": "powerpoint", - "description": "handling powerpoint" + "name": "powerpoint", + "description": "handling powerpoint" }, { - "name": "job management", - "description": "managing celery tasks" + "name": "job management", + "description": "managing celery tasks" }, - + { + "name": "auth", + "description": "authentication workflow endpoint" + }, + ] app = FastAPI( - title= "SurfBoard", - description= "API Hub for the LeanIX Onboarding Deck", - version= "1.0.0", + title="Knowlix", + description="API Hub for the LeanIX Onboarding Deck", + version="1.0.0", openapi_tags=tags_metadata) - app.add_middleware( CORSMiddleware, allow_origins=["*"], @@ -43,42 +57,234 @@ expose_headers=[] ) + class PPTX(BaseModel): sections: List[str] + + +class User(BaseModel): + username: str + password: str + first_name: str + last_name: str + user_id: str + + +class TokenData(BaseModel): + username: Optional[str] = None + + +class Token(BaseModel): + access_token: str - -@app.get("/v1/sections", tags = ["powerpoint"]) -async def provide_sections(): + + +class Download(BaseModel): + taskID: str + + +oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/v1/token") +## API ENDPOINTS ## + + +@app.get("/v1/sections", tags=["powerpoint"]) +async def provide_sections(token: bool = Depends(utils.is_access_token_valid)): sections = get_sections(pres) - if (not sections) or (len(sections)==0): - raise HTTPException(status_code=404, detail="No Sections in Master pptx") + if (not sections) or (len(sections) == 0): + raise HTTPException( + status_code=404, detail="No Sections in Master pptx") - return JSONResponse(sections,status_code=200) + return JSONResponse(sections, status_code=200) -@app.post("/v1/pptxjob", tags = ["job management"]) -async def deliver_pptx(pptx: PPTX): +@app.post("/v1/pptxjob", tags=["job management"]) +async def trigger_pptx_task(pptx: PPTX, token: bool = Depends(utils.is_access_token_valid)): task_name = "pptx" sections = pptx.sections - kwargs ={ - 'sections':sections, - 'downloadStatus': 'ready' - } - + no_sections = len(sections) + sections_available = True + exists_already = False + status = None + custom_id = str(uuid.uuid4().hex) + timestamp = datetime.now().isoformat() + + kwargs = { + 'sections': sections, + 'customID': custom_id, + 'downloaded': False, + 'date_started': timestamp + + } + + if no_sections != 0: + exists_already = utils.check_existence(sections, db) + else: + sections_available = False + if not exists_already and sections_available: + task = celery.send_task(task_name, kwargs=kwargs, serializer='json') - task = celery.send_task(task_name, kwargs = kwargs, serializer='json') + if sections_available and not exists_already: + status = "success" + + elif not sections_available: + status = "no_sections" + + elif exists_already: + status = "pptx_exists" package = { - 'taskID': task.id, - 'sections': sections + 'taskID': custom_id, + 'sections': sections, + 'status': status } - return JSONResponse(package) + + +@app.post("/v1/download", tags=["powerpoint"]) +async def download_pptx(download: Download, token: bool = Depends(utils.is_access_token_valid)): + + task_id = download.taskID + + result = db.find_one({"kwargs.customID": task_id}, {'result': 1, '_id': 0}) + unpack = result["result"] + unpack = json.loads(unpack) + file_path = unpack["filePath"] + + # return file_path + return FileResponse(file_path) + + +@app.post("/v1/registerDownload", tags=["powerpoint"], status_code=201) +async def register_download(task_id: Download, token: bool = Depends(utils.is_access_token_valid)): + task_id = task_id.taskID + + res = db.update_one({"kwargs.customID": task_id}, + {"$set": {"kwargs.downloaded": True} + }) + + changed_docs = res.modified_count + + return {'changedDocuments': changed_docs} + + +@app.get("/v1/getDownloads", tags=["powerpoint"]) +async def getDownloads(token: bool = Depends(utils.is_access_token_valid)): + res = db.find({}).sort( + [("kwargs.date_started", pymongo.DESCENDING)]).limit(10) + results = list() + + for item in res: + taskID = item["kwargs"]["customID"] + date_started = item["kwargs"]["date_started"] + status = item["status"] + sections = item["kwargs"]["sections"] + + package = { + 'taskID': taskID, + 'date_started': date_started, + 'status': status, + 'sections': sections + } + results.append(package) + + return JSONResponse(results, status_code=200) + + +### AUTH ### + + +@app.post("/v1/token", response_model=Token, tags=["auth"]) +async def create_access_token(form_data: OAuth2PasswordRequestForm = Depends()): + user = utils.authenticate_user(username=form_data.username, + password=form_data.password) + if not user: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="USER_CREDENTIALS_INVALID", + headers={"WWW-Authenticate": "Bearer"}, + ) + data_refresh_token = {"sub": user["username"], + "role": user["role"], + "first_name": user["first_name"], + } + refresh_token_expires = timedelta( + minutes=config.REFRESH_TOKEN_EXPIRE_MINUTES) + refresh_token = utils.create_refresh_token( + data=data_refresh_token, expires_delta=refresh_token_expires) + + data_access_token = {"sub": user["username"], + "role": user["role"], + "first_name": user["first_name"], + "user_id":user["_id"], + "refresh_token": refresh_token} + + access_token_expires = timedelta( + minutes=config.ACCESS_TOKEN_EXPIRE_MINUTES) + access_token = utils.create_access_token( + data=data_access_token, expires_delta=access_token_expires + ) + + return {"access_token": access_token} + + +@app.post("/v1/register", tags=["auth"]) +async def register_user(user: User): + username = user.username + password = user.password + first_name = user.first_name + last_name = user.last_name + role = "VIEWER" + + res = utils.create_user(username=username, password=password) + + return JSONResponse(res) + + +@app.get("/v1/me", tags=["auth"]) +async def read_users(current_user: User = Depends(utils.get_current_tokenuser)): + return current_user + + +@app.post("/v1/refreshToken",response_model=Token, tags=["auth"]) +async def refresh_token(user: User = Depends(utils.is_refresh_token_valid)): - + if not user: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="USER_CREDENTIALS_INVALID", + headers={"WWW-Authenticate": "Bearer"}, + ) + data_refresh_token = {"sub": user["username"], + "role": user["role"], + "first_name": user["first_name"]} + + refresh_token_expires = timedelta( + minutes=config.REFRESH_TOKEN_EXPIRE_MINUTES) + refresh_token = utils.create_refresh_token( + data=data_refresh_token, expires_delta=refresh_token_expires) + + data_access_token = {"sub": user["username"], + "role": user["role"], + "first_name": user["first_name"], + "user_id":user["_id"], + "refresh_token": refresh_token} + + access_token_expires = timedelta( + minutes=config.ACCESS_TOKEN_EXPIRE_MINUTES) + access_token = utils.create_access_token( + data=data_access_token, expires_delta=access_token_expires + ) + + return {"access_token": access_token} + + +@app.get("/v1/header", tags=["auth"]) +async def get_headers(headers: Request): + + return {"headers": headers.headers} diff --git a/backend/api_utils.py b/backend/api_utils.py new file mode 100644 index 0000000..7b308e5 --- /dev/null +++ b/backend/api_utils.py @@ -0,0 +1,246 @@ +import pymongo +from pymongo import MongoClient +from fastapi import FastAPI, HTTPException, Depends, status, Header +from fastapi.security import OAuth2PasswordBearer, OAuth2PasswordRequestForm +import os +from passlib.context import CryptContext +from jose import JWTError, jwt +import config +from passlib.context import CryptContext +from datetime import timedelta, datetime + +from typing import Optional, List +import config +from pydantic import BaseModel + +MONGODB = os.getenv("MONGODB") +client = MongoClient(MONGODB) +users = client["users"]["users"] +pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto") +oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/v1/token") + + +def check_existence(sections, db): + exists_already = False + no_sections = len(sections) + query = {"kwargs.sections": {"$size": no_sections, "$all": sections}} + + hits = db.count_documents(query) + + if hits > 0: + exists_already = True + + return exists_already + + +def authenticate_user(username: str, password: str, db=users): + user = get_user(username=username, include_pw=True, include_id=True) + hashed_pw = user["password"] + + if not user: + return False + + if not verify_pw(password, hashed_pw): + return False + + return user + + +def create_user(username: str, password: str, first_name: str, last_name: str, + role: str, db=users): + user_count = db.count_documents({"username": username}) + hashed_pw = pwd_context.hash(password) + error = None + created = False + + if user_count == 0: + query = {"username": username, + "password": hashed_pw, + "first_name": first_name, + "last_name": last_name, + "role": role} + db.insert_one(query) + created = True + else: + error = "User exists already!" + + return {"created": created, "error": error} + + +def create_refresh_token(data: dict, expires_delta: timedelta): + to_encode = data.copy() + expires = datetime.utcnow() + expires_delta + to_encode.update({"exp": expires}) + refresh_token = jwt.encode( + to_encode, config.REFRESH_KEY, algorithm=config.ALGORITHM) + return refresh_token + + +def create_access_token(data: dict, expires_delta: Optional[timedelta] = None): + to_encode = data.copy() + if expires_delta: + expire = datetime.utcnow() + expires_delta + else: + expire = datetime.utcnow() + timedelta(minutes=15) + to_encode.update({"exp": expire}) + encoded_jwt = jwt.encode( + to_encode, config.SECRET_KEY, algorithm=config.ALGORITHM) + return encoded_jwt + + +def get_user(username: str, include_pw=True, include_id=True, db=users): + + if include_pw: + user = db.find_one({"username": username}, { + "_id": 1, + "username": 1, + "password": 1, + "first_name": 1, + "last_name": 1, + "role": 1 + }) + + user.update({"_id": str(user["_id"])}) + + elif include_id: + user = db.find_one({"username": username}, { + "_id": 1, + "username": 1, + "password": 1, + "first_name": 1, + "last_name": 1, + "role": 1 + }) + user.update({"_id": str(user["_id"])}) + + elif (include_pw and include_id): + user = db.find_one({"username": username}, { + "_id": 1, + "username": 1, + "password": 1, + "first_name": 1, + "last_name": 1, + "role": 1 + }) + user.update({"_id": str(user["_id"])}) + + elif (not include_pw and include_id): + user = db.find_one({"username": username}, { + "_id": 1, + "username": 1, + "password": 1, + "first_name": 1, + "last_name": 1, + "role": 1 + }) + user.update({"_id": str(user["_id"])}) + + elif (include_pw and not include_id): + user = db.find_one({"username": username}, { + "_id": 0, + "username": 1, + "password": 1, + "first_name": 1, + "last_name": 1, + "role": 1 + }) + + + else: + user = db.find_one({"username": username}, { + "_id": 0, + "username": 1, + "password": 1, + "first_name": 1, + "last_name": 1, + "role": 1 + }) + + + return user + + +def verify_pw(plain_password, hashed_password): + return pwd_context.verify(plain_password, hashed_password) + + +async def is_access_token_valid(token: str = Depends(oauth2_scheme)): + credentials_exception = HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Access Token invalid", + headers={"WWW-Authenticate": "Bearer"}, + ) + try: + payload = jwt.decode(token, config.SECRET_KEY, + algorithms=[config.ALGORITHM]) + username: str = payload.get("sub") + if username is None: + raise credentials_exception + return True + except JWTError: + raise credentials_exception + return False + + +async def is_refresh_token_valid(token: str = Depends(oauth2_scheme)): + access_exception = HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="ACCESS_TOKEN_INVALID", + headers={"WWW-Authenticate": "Bearer"}, + ) + + refresh_token_exception = HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="REFRESH_TOKEN_INVALID", + headers={"WWW-Authenticate": "Bearer"}, + ) + + try: + access_payload = jwt.decode(token, config.SECRET_KEY, + options={"verify_exp": False}) + + refresh_token: str = access_payload.get("refresh_token") + username: str = access_payload.get("sub") + + + try: + refresh_token_payload = jwt.decode(refresh_token, config.REFRESH_KEY, + algorithms=[config.ALGORITHM]) + + + user = get_user(username=username, + include_id=True, include_pw=False) + + return user + + except JWTError: + raise refresh_token_exception + + except JWTError: #specify errors better + + raise access_exception + + +async def get_current_tokenuser(token: str = Depends(oauth2_scheme)): + credentials_exception = HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="USER_CREDENTIALS_INVALID", + headers={"WWW-Authenticate": "Bearer"}, + ) + try: + payload = jwt.decode(token, config.SECRET_KEY, + algorithms=[config.ALGORITHM]) + + username: str = payload.get("sub") + user = get_user(username=username, include_pw=False) + + if user is None: + raise credentials_exception + return user + + except JWTError: + raise credentials_exception + + +def get_header(header: Optional[List[str]] = Header(None)): + return header diff --git a/backend/clean_output.py b/backend/clean_output.py deleted file mode 100644 index 2e043e5..0000000 --- a/backend/clean_output.py +++ /dev/null @@ -1,7 +0,0 @@ -import os - -folder = os.path.join(os.getcwd(),'backend/output') -filelist = [f for f in os.listdir(folder)if f.endswith(".pptx")] - -for f in filelist: - os.remove(os.path.join(folder,f)) diff --git a/backend/config.py b/backend/config.py new file mode 100644 index 0000000..1560755 --- /dev/null +++ b/backend/config.py @@ -0,0 +1,7 @@ +## AUTH ## +# openssl rand -hex 256 +SECRET_KEY = "9602c3e3dd155413c16c50339e7024b95bbdfd88b7faa2816c8d70de76f43ed50536ab25f5cb8e96c098b25dd36f9345ead53a14e67b712dfce59e910cb18bd872789076fc9d071c4dee9148f141dc8906fe359ceef2a8a9bdaf5875a13a490e84750d51b3a2b2635d13a175fab27552cbb8ec14241242ba06f413b142a3d579d6177e0cdebc2d566934f0afefb0abc60eef064025f173dd88fb80109908ed846b6821d98932c707407a988b5c244aba75143c155caf904224548be56d295f1eb75343a48811adbe653b030439ea82588cf5fa5328a635f96f069ac3d35c76a9aa4cc2da6a553e8d2bf41124c86bb0b0926ed9c0f192c394b7617e29bd72064b" +REFRESH_KEY="14782917e36b0cbc974e310fee3fe4bd1f086c8e9ba33ef45c356fe72fbb85b02ae1c2f23a7603d46c0186fe75bb2bcdd14d941159a366b5ba11af6ad2e217873e521b133de7d1d57926aa316b17b4878908209c618d51b53b72e3d19eabe4ed7d125fa38ce7cd48cdfaa43b72ff51658a47c045e6feb8e9fbbbdf71cbb81ab73023acdb398e1734ee10df97f51afea4c205079a16bbb51b5afd90dc9847d0c4ee5a48ef0ef99237250415f365632e33cf5a8776bfceb7b32c263d1c6ec61d7abed8c7ce49afd76ce9132e590f9c50d9b2d96e1c1108a16948d0e838135f636ee9d0e7b94c265568cd74432a0bc2ee146a254edf02f30e3ef5f73df4469c664c" +ALGORITHM = "HS256" +ACCESS_TOKEN_EXPIRE_MINUTES = 2 +REFRESH_TOKEN_EXPIRE_MINUTES = 60 * 24 *30 #one month validity diff --git a/backend/core.py b/backend/core.py index 92af411..ed3ee5a 100644 --- a/backend/core.py +++ b/backend/core.py @@ -2,87 +2,88 @@ import uuid import lxml.etree as etree + def extract_slide_mapping(slidelist): """this method will get the mapping between a slide_id and rID""" - slide_mapping=dict() - + slide_mapping = dict() + for slide in slidelist: - rid=slide.attrib['{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id'] - slide_id=slide.attrib['id'] - slide_mapping[slide_id]=rid - + rid = slide.attrib['{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id'] + slide_id = slide.attrib['id'] + slide_mapping[slide_id] = rid + return slide_mapping -def prepare_sections(keys, presentation, mapping,all_sections=False): + +def prepare_sections(keys, presentation, mapping, all_sections=False): """this method will prepare a xml tree based on the passed section names the user wants to have in the pptx""" - nmap=presentation.slides._sldIdLst.nsmap - - all_sections=compile_sections(presentation,mapping) - root=etree.Element('{http://schemas.openxmlformats.org/presentationml/2006/main}sldIdLst', nsmap=nmap) - - #TODO: create toggle for - if (all_sections) and (len(keys)!=0): + nmap = presentation.slides._sldIdLst.nsmap + + all_sections = compile_sections(presentation, mapping) + root = etree.Element( + '{http://schemas.openxmlformats.org/presentationml/2006/main}sldIdLst', nsmap=nmap) + + # TODO: create toggle for + if (all_sections) and (len(keys) != 0): for key in keys: - section=all_sections[key] + section = all_sections[key] for slide in section: - etree.SubElement(root, '{http://schemas.openxmlformats.org/presentationml/2006/main}sldId',attrib=slide,nsmap=nmap) - - + etree.SubElement( + root, '{http://schemas.openxmlformats.org/presentationml/2006/main}sldId', attrib=slide, nsmap=nmap) + return root + def compile_sections(presentation, mapping): """this method will get all the sections that are in the pptx""" - - ns='{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id' - xml=etree.fromstring(presentation.part.blob) - nsmap = {'p14':'http://schemas.microsoft.com/office/powerpoint/2010/main'} + + ns = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id' + xml = etree.fromstring(presentation.part.blob) + nsmap = {'p14': 'http://schemas.microsoft.com/office/powerpoint/2010/main'} sections = xml.xpath('.//p14:sectionLst', namespaces=nsmap)[0] - - collector=dict() - pairs_col=list() - + collector = dict() + pairs_col = list() + for section in sections: - key=section.attrib['name'] + key = section.attrib['name'] for slidelist in section: for slide in slidelist: - pairs=dict() - slide_id=slide.attrib['id'] + pairs = dict() + slide_id = slide.attrib['id'] + # lookup in slide mapping to get rID + rID = mapping[slide_id] - #lookup in slide mapping to get rID - rID=mapping[slide_id] - - pairs['id']=slide_id - pairs[ns]=rID + pairs['id'] = slide_id + pairs[ns] = rID pairs_col.append(pairs) + collector[key] = pairs_col + pairs_col = list() - collector[key]=pairs_col - pairs_col=list() - return collector - -def replace_slides(new_xml,presentation,folder, save=False): + + +def replace_slides(new_xml, presentation, folder, save=False): """This method will take a xml tree and create the final pptx out of it""" - uid=str(uuid.uuid4().hex)[:10] - file_path= f"{folder}/{uid}.pptx" - slidelist=presentation.slides._sldIdLst + uid = str(uuid.uuid4().hex)[:10] + file_path = f"{folder}/{uid}.pptx" + slidelist = presentation.slides._sldIdLst + + slidelist.getparent().replace(slidelist, new_xml) - - slidelist.getparent().replace(slidelist,new_xml) - if save: presentation.save(file_path) - + return file_path def print_xml(xml): - print(etree.tostring(xml, pretty_print=True, encoding="unicode")) \ No newline at end of file + print(etree.tostring(xml, pretty_print=True, encoding="unicode")) diff --git a/backend/req.txt b/backend/req.txt index 1efa76d..ef12b1c 100644 --- a/backend/req.txt +++ b/backend/req.txt @@ -1,27 +1,33 @@ -certifi==2020.6.20 -click==7.1.2 -fastapi==0.61.2 -h11==0.11.0 -lxml==4.6.1 -Pillow==8.0.1 -pydantic==1.7.2 -python-pptx==0.6.18 -starlette==0.13.6 -typing==3.7.4.3 -uvicorn==0.12.2 -XlsxWriter==1.3.7 aiofiles==0.6.0 amqp==5.0.2 +bcrypt==3.2.0 billiard==3.6.3.0 -celery==5.0.3 +celery==5.0.5 +cffi==1.14.4 click==7.1.2 click-didyoumean==0.0.3 click-plugins==1.1.1 click-repl==0.1.6 +ecdsa==0.14.1 +fastapi==0.63.0 +h11==0.11.0 kombu==5.0.2 +lxml==4.6.2 +passlib==1.7.4 +Pillow==8.0.1 prompt-toolkit==3.0.8 -pytz==2020.4 -redis==3.5.3 +pyasn1==0.4.8 +pycparser==2.20 +pydantic==1.7.3 +pymongo==3.11.2 +python-jose==3.2.0 +python-multipart==0.0.5 +python-pptx==0.6.18 +pytz==2020.5 +rsa==4.6 six==1.15.0 +starlette==0.13.6 +uvicorn==0.13.2 vine==5.0.0 -wcwidth==0.2.5 \ No newline at end of file +wcwidth==0.2.5 +XlsxWriter==1.3.7 diff --git a/backend/server.py b/backend/server.py index fcb32e6..2e49f6d 100644 --- a/backend/server.py +++ b/backend/server.py @@ -2,10 +2,16 @@ from celery import Celery CELERY_BROKER_URL = os.getenv("RABBITMQ") -CELERY_RESULT_BACKEND = os.getenv("REDISSERVER") +CELERY_RESULT_BACKEND = os.getenv("MONGODB") celery = Celery("worker", backend=CELERY_RESULT_BACKEND, broker=CELERY_BROKER_URL) celery.conf.update( - result_extended=True + result_extended=True, + mongodb_backend_settings={ + 'database': 'taskdb', + 'taskmeta_collection': 'ta', + }, + task_default_queue = 'presentations', + task_track_started = True ) diff --git a/celery-beat/.gitignore b/celery-beat/.gitignore new file mode 100644 index 0000000..40bca31 --- /dev/null +++ b/celery-beat/.gitignore @@ -0,0 +1 @@ +/env \ No newline at end of file diff --git a/celery-beat/Dockerfile b/celery-beat/Dockerfile new file mode 100644 index 0000000..e016979 --- /dev/null +++ b/celery-beat/Dockerfile @@ -0,0 +1,12 @@ +FROM python:3.9.1-slim-buster + +ENV PYTHONUNBUFFERED 1 + +COPY req.txt /req.txt +RUN pip install -r /req.txt + +COPY server.py tasks.py beat.sh /celery-beat/ +WORKDIR /celery-beat + + +# ENTRYPOINT celery -A worker beat --loglevel=debug \ No newline at end of file diff --git a/celery-beat/beat.sh b/celery-beat/beat.sh new file mode 100755 index 0000000..0fbfe98 --- /dev/null +++ b/celery-beat/beat.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +celery -A tasks worker -E --loglevel=info -B -n beatworker -Q crontasks + diff --git a/celery-beat/req.txt b/celery-beat/req.txt new file mode 100644 index 0000000..b552c48 --- /dev/null +++ b/celery-beat/req.txt @@ -0,0 +1,14 @@ +amqp==5.0.2 +billiard==3.6.3.0 +celery==5.0.5 +click==7.1.2 +click-didyoumean==0.0.3 +click-plugins==1.1.1 +click-repl==0.1.6 +kombu==5.0.2 +prompt-toolkit==3.0.8 +pymongo==3.11.2 +pytz==2020.4 +six==1.15.0 +vine==5.0.0 +wcwidth==0.2.5 diff --git a/celery-beat/server.py b/celery-beat/server.py new file mode 100644 index 0000000..5254baf --- /dev/null +++ b/celery-beat/server.py @@ -0,0 +1,33 @@ +import os +from celery import Celery +from celery.schedules import crontab + + +CELERY_BROKER_URL = os.getenv("RABBITMQ") +CELERY_RESULT_BACKEND = os.getenv("MONGODB") + +app = Celery("scheduler", backend=CELERY_RESULT_BACKEND, + broker=CELERY_BROKER_URL) + +app.conf.update( + result_extended=True, + enable_utc=False, + mongodb_backend_settings={ + 'database': 'crontasks', + 'taskmeta_collection': 'ta', + }, + task_routes={ + 'tasks.*': {'queue': 'crontasks'}, + + }, + task_default_queue='crontasks' + + +) + +app.conf.beat_schedule = { + "pptx-cleaner": { + "task": "clean_pptx", + "schedule": crontab(hour='*/1', minute="0") + } +} diff --git a/celery-beat/tasks.py b/celery-beat/tasks.py new file mode 100644 index 0000000..cb1bda1 --- /dev/null +++ b/celery-beat/tasks.py @@ -0,0 +1,43 @@ +from celery import Celery +from server import app +from pymongo import MongoClient +import os +import json + +MONGODB = os.getenv("MONGODB") +client = MongoClient(MONGODB) +db = client["taskdb"]["ta"] + + +@app.task(name="clean_pptx", bind=True) +def clean_pptx(self): + deleted_filepath = list() + deleted_taskid = list() + db_delete_list = db.find({"kwargs.downloaded": True}) + + for item in db_delete_list: + raw_filepath = item["result"] + taskID = item["kwargs"]["customID"] + filepath = parse_filepath(raw_filepath) + os.remove(filepath) + + deleted_filepath.append(filepath) + deleted_taskid.append(taskID) + + delete = db.delete_many({"kwargs.customID": {"$in": deleted_taskid}}) + deleted_count = delete.deleted_count + + package = { + 'tasks': deleted_taskid, + 'filePaths': deleted_filepath, + 'deleted_count': deleted_count + } + + return package + + +def parse_filepath(item): + json_item = json.loads(item) + filepath = json_item["filePath"] + + return filepath diff --git a/celery/.gitignore b/celery/.gitignore new file mode 100644 index 0000000..ae412d6 --- /dev/null +++ b/celery/.gitignore @@ -0,0 +1 @@ +env/ \ No newline at end of file diff --git a/celery/Dockerfile b/celery/Dockerfile index 5494eea..6baa295 100644 --- a/celery/Dockerfile +++ b/celery/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.6 +FROM python:3.9.1-slim-buster COPY req.txt /req.txt RUN pip install -r /req.txt @@ -6,4 +6,5 @@ RUN pip install -r /req.txt COPY server.py worker.py pptx_handler.py core.py master.pptx /celery/ WORKDIR /celery/ RUN mkdir output -ENTRYPOINT celery -A worker worker -E --loglevel=info \ No newline at end of file + +ENTRYPOINT celery -A worker worker -E --loglevel=info -n celeryworker -Q presentations \ No newline at end of file diff --git a/celery/core.py b/celery/core.py index 92af411..d0064ca 100644 --- a/celery/core.py +++ b/celery/core.py @@ -2,87 +2,89 @@ import uuid import lxml.etree as etree + def extract_slide_mapping(slidelist): - """this method will get the mapping between a slide_id and rID""" + """this method will get the mapping between a slide_id and rID. This is necessary to maintain the xml syntax valid + after injection / modification""" + + slide_mapping = dict() - slide_mapping=dict() - for slide in slidelist: - rid=slide.attrib['{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id'] - slide_id=slide.attrib['id'] - slide_mapping[slide_id]=rid - + rid = slide.attrib['{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id'] + slide_id = slide.attrib['id'] + slide_mapping[slide_id] = rid + return slide_mapping -def prepare_sections(keys, presentation, mapping,all_sections=False): + +def prepare_sections(keys, presentation, mapping, all_sections=False): """this method will prepare a xml tree based on the passed section names the user wants to have in the pptx""" - nmap=presentation.slides._sldIdLst.nsmap - - all_sections=compile_sections(presentation,mapping) - root=etree.Element('{http://schemas.openxmlformats.org/presentationml/2006/main}sldIdLst', nsmap=nmap) - - #TODO: create toggle for - if (all_sections) and (len(keys)!=0): + nmap = presentation.slides._sldIdLst.nsmap + + all_sections = compile_sections(presentation, mapping) + root = etree.Element( + '{http://schemas.openxmlformats.org/presentationml/2006/main}sldIdLst', nsmap=nmap) + + + if (all_sections) and (len(keys) != 0): for key in keys: - section=all_sections[key] + section = all_sections[key] for slide in section: - etree.SubElement(root, '{http://schemas.openxmlformats.org/presentationml/2006/main}sldId',attrib=slide,nsmap=nmap) - - + etree.SubElement( + root, '{http://schemas.openxmlformats.org/presentationml/2006/main}sldId', attrib=slide, nsmap=nmap) + return root + def compile_sections(presentation, mapping): """this method will get all the sections that are in the pptx""" - - ns='{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id' - xml=etree.fromstring(presentation.part.blob) - nsmap = {'p14':'http://schemas.microsoft.com/office/powerpoint/2010/main'} + + ns = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id' + xml = etree.fromstring(presentation.part.blob) + nsmap = {'p14': 'http://schemas.microsoft.com/office/powerpoint/2010/main'} sections = xml.xpath('.//p14:sectionLst', namespaces=nsmap)[0] - - collector=dict() - pairs_col=list() - + collector = dict() + pairs_col = list() + for section in sections: - key=section.attrib['name'] + key = section.attrib['name'] for slidelist in section: for slide in slidelist: - pairs=dict() - slide_id=slide.attrib['id'] + pairs = dict() + slide_id = slide.attrib['id'] + # lookup in slide mapping to get rID + rID = mapping[slide_id] - #lookup in slide mapping to get rID - rID=mapping[slide_id] - - pairs['id']=slide_id - pairs[ns]=rID + pairs['id'] = slide_id + pairs[ns] = rID pairs_col.append(pairs) + collector[key] = pairs_col + pairs_col = list() - collector[key]=pairs_col - pairs_col=list() - return collector - -def replace_slides(new_xml,presentation,folder, save=False): + + +def replace_slides(new_xml, presentation, folder, save=False): """This method will take a xml tree and create the final pptx out of it""" - uid=str(uuid.uuid4().hex)[:10] - file_path= f"{folder}/{uid}.pptx" - slidelist=presentation.slides._sldIdLst + uid = str(uuid.uuid4().hex)[:10] + file_path = f"{folder}/{uid}.pptx" + slidelist = presentation.slides._sldIdLst + + slidelist.getparent().replace(slidelist, new_xml) - - slidelist.getparent().replace(slidelist,new_xml) - if save: presentation.save(file_path) - + return file_path -def print_xml(xml): - print(etree.tostring(xml, pretty_print=True, encoding="unicode")) \ No newline at end of file +def _print_xml(xml): + print(etree.tostring(xml, pretty_print=True, encoding="unicode")) diff --git a/celery/req.txt b/celery/req.txt index 2a2f26c..f1fc5d9 100644 --- a/celery/req.txt +++ b/celery/req.txt @@ -9,9 +9,9 @@ kombu==5.0.2 lxml==4.6.2 Pillow==8.0.1 prompt-toolkit==3.0.8 +pymongo==3.11.2 python-pptx==0.6.18 pytz==2020.4 -redis==3.5.3 six==1.15.0 vine==5.0.0 wcwidth==0.2.5 diff --git a/celery/server.py b/celery/server.py index 1c2b195..d271d16 100644 --- a/celery/server.py +++ b/celery/server.py @@ -1,14 +1,25 @@ import os from celery import Celery -import redis + CELERY_BROKER_URL = os.getenv("RABBITMQ") -CELERY_RESULT_BACKEND = os.getenv("REDISSERVER") +CELERY_RESULT_BACKEND = os.getenv("MONGODB") app = Celery("worker", backend=CELERY_RESULT_BACKEND, - broker=CELERY_BROKER_URL) + broker=CELERY_BROKER_URL) app.conf.update( result_extended=True, - - ) + enable_utc=False, + mongodb_backend_settings={ + 'database': 'taskdb', + 'taskmeta_collection': 'ta', + }, + task_routes={ + 'worker.*':{'queue':'presentations'}, + + }, + task_default_queue = 'presentations', + task_track_started = True + +) diff --git a/celery/worker.py b/celery/worker.py index 098eedd..44ad4c7 100644 --- a/celery/worker.py +++ b/celery/worker.py @@ -5,32 +5,36 @@ from time import sleep from celery.exceptions import Ignore from pptx import Presentation -import redis +from pymongo import MongoClient from pptx_handler import create_pptx import json -CELERY_BROKER_URL = os.getenv("RABBITMQ") -CELERY_RESULT_BACKEND = os.getenv("REDISSERVER") -from server import app +from server import app +MONGODB = os.getenv("MONGODB") +client = MongoClient(MONGODB) +db = client["taskdb"]["ta"] @app.task(name='pptx', bind=True, max_retries=3) -def generate_pptx(self, sections, downloadStatus): +def generate_pptx(self, sections, customID,downloaded,date_started): try: pptx_path = "master.pptx" task_id=self.request.id - # task_prefix = "celery-task-meta-" - # task_str = task_prefix + task_id - pres= Presentation(pptx_path) file_path=create_pptx(pres,sections) + + except Exception as exec: self.retry(exec=exec, countdown = 2 ** self.request.retries) + output = { + 'filePath':file_path + } - return f"{task_id} finished. stored {file_path}" + + return output diff --git a/docker-compose.yml b/docker-compose.yml index 9abd112..aed4442 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -10,12 +10,19 @@ services: - "3000:3000" depends_on: - apihub + - mongodb1 + - mongodb2 volumes: - ./frontend/nuxt-fe:/app + command: npm run dev + networks: + - mongonetwork nginx: build: ./nginx/ + networks: + - mongonetwork container_name: nginx ports: - "80:80" @@ -23,6 +30,8 @@ services: - ./nginx/default.conf:/etc/nginx/conf.d/default.conf depends_on: - nuxt + - mongodb1 + - mongodb2 celery: container_name: celery @@ -30,21 +39,97 @@ services: volumes: - ./celery/server.py:/celery/server.py - ./celery/worker.py:/celery/worker.py + - pptxdecks:/celery/output environment: - REDISSERVER: redis://redis + MONGODB: mongodb://mongodb2 RABBITMQ: pyamqp://rabbitmq + networks: + - mongonetwork depends_on: - - redis + - mongodb1 + - mongodb2 - rabbitmq + + celery-beat: + container_name: celery-beat + build: ./celery-beat/ + volumes: + - pptxdecks:/celery-beat/output + environment: + MONGODB: mongodb://mongodb2 + RABBITMQ: pyamqp://rabbitmq + networks: + - mongonetwork + entrypoint: + - /celery-beat/beat.sh + depends_on: + - mongodb1 + - mongodb2 + - rabbitmq + - celery + + + mongo-stream: + container_name: mongo-stream + restart: on-failure + build: ./mongo-stream/ + command: python3 -u server.py + environment: + MONGODB: mongodb://mongodb2 + ports: + - "3333:3333" + depends_on: + - mongo-startup + - mongodb1 + - mongodb2 + volumes: + - ./mongo-stream/server.py:/usr/app/server.py + networks: + - mongonetwork + + mongodb1: + image: mongo:latest + container_name: replica1 + networks: + - mongonetwork + ports: + - 30001:27017 + entrypoint: [ "/usr/bin/mongod", "--bind_ip_all", "--replSet", "rs0" ] + + mongodb2: + image: mongo:latest + container_name: replica2 + volumes: + - ./db-backup/primary:/data/db + networks: + - mongonetwork + ports: + - 30002:27017 + entrypoint: [ "/usr/bin/mongod", "--bind_ip_all", "--replSet", "rs0" ] + + mongo-startup: + image: mongo + container_name: startup-agent + networks: + - mongonetwork + depends_on: + - mongodb1 + - mongodb2 + + volumes: + - ./mongo-startup:/mongo-startup + entrypoint: + - /mongo-startup/initiate_replica.sh apihub: build: ./backend/ container_name: apihub restart: on-failure environment: - REDISSERVER: redis://redis + MONGODB: mongodb://mongodb2 RABBITMQ: pyamqp://rabbitmq volumes: + - pptxdecks:/usr/app/output - type: bind source: ./backend/api.py target: /usr/app/api.py @@ -63,22 +148,27 @@ services: - type: bind source: ./backend/master.pptx target: /usr/app/master.pptx + - type: bind + source: ./backend/config.py + target: /usr/app/config.py + - type: bind + source: ./backend/api_utils.py + target: /usr/app/api_utils.py command: uvicorn --host=0.0.0.0 --port=8000 api:app --reload ports: - "8000:8000" depends_on: - - redis + - mongodb1 + - mongo-startup - rabbitmq + networks: + - mongonetwork - redis: - container_name: redis - image: redis:6.0-buster - ports: - - "6379:6379" - rabbitmq: container_name: rabbitmq + networks: + - mongonetwork image: rabbitmq:3.8-management ports: - "15672:15672" @@ -87,11 +177,18 @@ services: container_name: flower image: mher/flower:0.9.5 command: ["flower", "--broker=amqp://rabbitmq", "--port=5555"] + networks: + - mongonetwork ports: - "5555:5555" depends_on: - - redis + - mongodb1 + - mongodb2 - rabbitmq - celery - \ No newline at end of file +networks: + mongonetwork: + +volumes: + pptxdecks: \ No newline at end of file diff --git a/dockercommands.txt b/dockercommands.txt new file mode 100644 index 0000000..92bd177 --- /dev/null +++ b/dockercommands.txt @@ -0,0 +1,6 @@ +# Delete all containers # +docker rm -f $(docker ps -a -q) + +# Delete all images +docker system prune -a + diff --git a/frontend/nuxt-fe/Dockerfile b/frontend/nuxt-fe/Dockerfile index b7b9934..3eaa6b3 100644 --- a/frontend/nuxt-fe/Dockerfile +++ b/frontend/nuxt-fe/Dockerfile @@ -13,6 +13,3 @@ COPY . . RUN npm run build ENV HOST 0.0.0.0 -# EXPOSE 3000 - -# CMD [ "npm" ,"start" ] \ No newline at end of file diff --git a/frontend/nuxt-fe/assets/css/tailwind.css b/frontend/nuxt-fe/assets/css/tailwind.css new file mode 100644 index 0000000..bd6213e --- /dev/null +++ b/frontend/nuxt-fe/assets/css/tailwind.css @@ -0,0 +1,3 @@ +@tailwind base; +@tailwind components; +@tailwind utilities; \ No newline at end of file diff --git a/frontend/nuxt-fe/components/AppHeader.vue b/frontend/nuxt-fe/components/AppHeader.vue index 1b01dc3..de8728c 100644 --- a/frontend/nuxt-fe/components/AppHeader.vue +++ b/frontend/nuxt-fe/components/AppHeader.vue @@ -1,7 +1,13 @@ -