diff --git a/api/common.py b/api/common.py
index fb8eaad..e11dd23 100644
--- a/api/common.py
+++ b/api/common.py
@@ -97,12 +97,18 @@ def get_owner_repo_provider(repo_url,provider_full_name=False):
     repo = repo_url.split("/")[-1]
     owner = repo_url.split("/")[-2]
     provider = repo_url.split("/")[-3]
-    if provider not in ["github.com","gitlab.com"]:
+    if provider not in ["github.com","gitlab.com","www.github.com","www.gitlab.com"]:
        abort(400, "Unrecognized repository provider.")
+
+    if provider == "www.github.com":
+        provider = "github.com"
+    if provider == "www.gitlab.com":
+        provider = "gitlab.com"
+
     if not provider_full_name:
         if provider == "github.com":
             provider = "gh"
         elif provider == "gitlab.com":
             provider = "gl"
     return [owner,repo,provider]
diff --git a/api/github_client.py b/api/github_client.py
index 84dcbed..65f81ed 100644
--- a/api/github_client.py
+++ b/api/github_client.py
@@ -44,7 +44,7 @@ def gh_filter(input_str):
     """
     Returns repository name in owner/repository_name format
     """
-    github_url_pattern = r'^https?://github\.com/([^/]+)/([^/]+)'
+    github_url_pattern = r'^https?://(?:www\.)?github\.com/([^/]+)/([^/]+)'
     match = re.match(github_url_pattern, input_str)
     if match:
         owner = match.group(1)
@@ -148,6 +148,7 @@ def gh_get_project_name(github_client,target_repo):
     folder as required by neurolibre.
     """
     repo = github_client.get_repo(gh_filter(target_repo))
+    print(target_repo)
     # This is a requirement
     contents = repo.get_contents("binder/data_requirement.json")
     data = json.loads(contents.decoded_content)
diff --git a/api/neurolibre_celery_tasks.py b/api/neurolibre_celery_tasks.py
index 8b0dce9..335b53d 100644
--- a/api/neurolibre_celery_tasks.py
+++ b/api/neurolibre_celery_tasks.py
@@ -1,6 +1,6 @@
 from celery import Celery
 import time
-import os 
+import os
 import json
 import subprocess
 from celery import states
@@ -17,6 +17,7 @@
 import shutil
 import base64
 from celery.exceptions import Ignore
+from repo2data.repo2data import Repo2Data
 
 DOI_PREFIX = "10.55458"
 DOI_SUFFIX = "neurolibre"
@@ -46,7 +47,7 @@
 # Set timezone US/Eastern (Montreal)
 def get_time():
     """
-    To be printed on issue comment updates for 
+    To be printed on issue comment updates for
     background tasks.
     """
     tz = pytz.timezone('US/Eastern')
@@ -64,10 +65,68 @@ def sleep_task(self, seconds):
         self.update_state(state='PROGRESS', meta={'remaining': seconds - i - 1})
     return 'done sleeping for {} seconds'.format(seconds)
 
+ """ + task_title = "DATA DOWNLOAD (REPO2DATA)" + GH_BOT=os.getenv('GH_BOT') + github_client = Github(GH_BOT) + task_id = self.request.id + + [owner,repo,provider] = get_owner_repo_provider(payload['repo_url']) + #commit_hash = format_commit_hash(payload['repo_url'],commit_hash) + logging.info(f"{owner}{provider}{repo}") + + repo = github_client.get_repo(gh_filter(payload['repo_url'])) + + try: + contents = repo.get_contents("binder/data_requirement.json") + data_manifest = json.loads(contents.decoded_content) + json_path = os.path.join("/DATA","tmp_repo2data",owner,repo,"data_requirement.json") + with open(json_path,"w") as f: + json.dump(data_manifest,f) + if not data_manifest: + raise + project_name = data_manifest['projectName'] + except Exception as e: + message = f"Data download has failed: {str(e)}" + if payload['email']: + send_email(payload['email'], "NeuroLibre: Data download request", message) + else: + gh_template_respond(github_client,"failure",task_title,payload['review_repository'],payload['issue_id'],task_id,payload['comment_id'], + f"Data exists for {project_name}; not overwriting by default! Please set overwrite=True." + ) + + data_path = os.path.join("/DATA", project_name) + if os.path.exists(data_path) and not payload['overwrite']: + gh_template_respond(github_client,"failure",task_title,payload['review_repository'],payload['issue_id'],task_id,payload['comment_id'], + f"Data exists for {project_name}; not overwriting by default! Please set overwrite=True." + ) + self.update_state(state=states.IGNORED, meta={'message': f"Data already downloaded downloaded to {data_path}."}) + return + + # download data with repo2data + repo2data = Repo2Data(json_path, server=True) + downloaded_data_path = repo2data.install()[0] + message = f"Downloaded data in {downloaded_data_path}." + + # update status + if payload['email']: + send_email(payload['email'], "NeuroLibre: Data download request", message) + self.update_state(state=states.SUCCESS, meta={'message': message}) + else: + gh_template_respond(github_client,"received",task_title,payload['review_repository'],payload['issue_id'],task_id,payload['comment_id'], + message + ) + self.update_state(state=states.SUCCESS, meta={'message': message}) + + @celery_app.task(bind=True) def rsync_data_task(self, comment_id, issue_id, project_name, reviewRepository): """ - Uploading data to the production server + Uploading data to the production server from the test server. """ task_title = "DATA TRANSFER (Preview --> Preprint)" @@ -110,10 +169,10 @@ def rsync_book_task(self, repo_url, commit_hash, comment_id, issue_id, reviewRep """ Moving the book from the test to the production server. This book is expected to be built from - a roboneurolibre repository. + a roboneurolibre repository. Once the book is available on the production server, - content is symlinked to a DOI formatted directory (Nginx configured) + content is symlinked to a DOI formatted directory (Nginx configured) to enable DOI formatted links. 
""" task_title = "REPRODUCIBLE PREPRINT TRANSFER (Preview --> Preprint)" @@ -121,7 +180,7 @@ def rsync_book_task(self, repo_url, commit_hash, comment_id, issue_id, reviewRep github_client = Github(GH_BOT) task_id = self.request.id [owner,repo,provider] = get_owner_repo_provider(repo_url,provider_full_name=True) - if owner != "roboneurolibre": + if owner != "roboneurolibre": gh_template_respond(github_client,"failure",task_title,reviewRepository,issue_id,task_id,comment_id, f"Repository is not under roboneurolibre organization!") self.update_state(state=states.FAILURE, meta={'exc_type':"NeuroLibre celery exception",'exc_message': "Custom",'message': f"FAILURE: Repository {owner}/{repo} has no roboneurolibre fork."}) return @@ -137,7 +196,7 @@ def rsync_book_task(self, repo_url, commit_hash, comment_id, issue_id, reviewRep self.update_state(state=states.STARTED, meta={'message': f"Transfer started {now}"}) gh_template_respond(github_client,"started",task_title,reviewRepository,issue_id,task_id,comment_id, "") #logging.info("Calling subprocess") - process = subprocess.Popen(["/usr/bin/rsync", "-avR", remote_path, "/"], stdout=subprocess.PIPE,stderr=subprocess.STDOUT) + process = subprocess.Popen(["/usr/bin/rsync", "-avR", remote_path, "/"], stdout=subprocess.PIPE,stderr=subprocess.STDOUT) output = process.communicate()[0] ret = process.wait() logging.info(output) @@ -184,15 +243,15 @@ def rsync_book_task(self, repo_url, commit_hash, comment_id, issue_id, reviewRep @celery_app.task(bind=True) def fork_configure_repository_task(self, payload): task_title = "INITIATE PRODUCTION (Fork and Configure)" - + GH_BOT=os.getenv('GH_BOT') github_client = Github(GH_BOT) task_id = self.request.id - + now = get_time() self.update_state(state=states.STARTED, meta={'message': f"Transfer started {now}"}) gh_template_respond(github_client,"started",task_title,payload['review_repository'],payload['issue_id'],task_id,payload['comment_id'], "") - + book_tested_check = get_test_book_build(PREVIEW_SERVER,True,payload['commit_hash']) # Production cannot be started if there's a book at the latest commit hash at which # the production is asked for. @@ -214,7 +273,7 @@ def fork_configure_repository_task(self, payload): forked_name = gh_forkify_name(payload['repository_url']) # First check if a fork already exists. fork_exists = False - try: + try: github_client.get_repo(forked_name) fork_exists = True except UnknownObjectException as e: @@ -248,7 +307,7 @@ def fork_configure_repository_task(self, payload): return else: logging.info(f"Fork already exists {payload['repository_url']}, moving on with configurations.") - + gh_template_respond(github_client,"started",task_title,payload['review_repository'],payload['issue_id'],task_id,payload['comment_id'], "Forked repo has become available. Proceeding with configuration updates.") jb_config = gh_get_jb_config(github_client,forked_name) @@ -291,7 +350,7 @@ def fork_configure_repository_task(self, payload): "title": "Citable PDF and archives" }] }) - + if 'chapters' in jb_toc: jb_toc_new['chapters'].append({ "url": f"{PAPERS_PATH}/{DOI_PREFIX}/{DOI_SUFFIX}.{payload['issue_id']:05d}", @@ -303,7 +362,7 @@ def fork_configure_repository_task(self, payload): "url": f"{PAPERS_PATH}/{DOI_PREFIX}/{DOI_SUFFIX}.{payload['issue_id']:05d}", "title": "Citable PDF and archives" }) - + # Update TOC file in the forked repo only if the new toc is different # otherwise github api will complain. 
     # Update TOC file in the forked repo only if the new toc is different
     # otherwise github api will complain.
     if jb_toc_new == jb_toc:
@@ -382,7 +441,7 @@ def generate():
             # Fetch all the yielded messages
         binder_logs = binder_response.get_data(as_text=True)
         binder_logs = "".join(binder_logs)
-        # After the upstream closes, check the server if there's 
+        # After the upstream closes, check the server if there's
         # a book built successfully.
         book_status = book_get_by_params(commit_hash=payload['commit_hash'])
         # For now, remove the block either way.
@@ -393,7 +452,7 @@ def generate():
             os.remove(lock_filename)
         # Append book-related response downstream
         if not book_status:
-            # These flags will determine how the response will be 
+            # These flags will determine how the response will be
             # interpreted and returned outside the generator
             gh_template_respond(github_client,"failure","Binder build has failed 🥀",payload['review_repository'],payload['issue_id'],task_id,payload['comment_id'], "The next comment will forward the logs")
             issue_comment = []
@@ -424,7 +483,7 @@
 
 @celery_app.task(bind=True)
 def zenodo_create_buckets_task(self, payload):
-    
+
     GH_BOT=os.getenv('GH_BOT')
     github_client = Github(GH_BOT)
     task_id = self.request.id
@@ -456,25 +515,25 @@ def zenodo_create_buckets_task(self, payload):
 
     for ii in range(len(data['authors'])):
         data['authors'][ii]['affiliation'] = first_affiliations[ii]
-    
-    # To deal with some typos, also with orchid :) 
+
+    # To deal with some typos, also with orchid :)
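+    # e.g. an author entry {"orchid": "0000-0002-0000-0000"} is renamed to
+    # {"orcid": "0000-0002-0000-0000"} by the loop below (illustrative value).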
     valid_field_names = {'name', 'orcid', 'affiliation'}
     for author in data['authors']:
         invalid_fields = []
         for field in author:
             if field not in valid_field_names:
                 invalid_fields.append(field)
-        
+
         for invalid_field in invalid_fields:
             valid_field = None
             for valid_name in valid_field_names:
                 if valid_name.lower() in invalid_field.lower() or (valid_name == 'orcid' and invalid_field.lower() == 'orchid'):
                     valid_field = valid_name
                     break
-            
+
             if valid_field:
                 author[valid_field] = author.pop(invalid_field)
-        
+
         if 'equal-contrib' in author:
             author.pop('equal-contrib')
 
@@ -495,7 +554,7 @@ def zenodo_create_buckets_task(self, payload):
             collect[archive_type] = r
         # Rate limit
         time.sleep(2)
-    
+
     if {k: v for k, v in collect.items() if 'reason' in v}:
         # This means at least one of the deposits has failed.
         logging.info(f"Caught an issue with the deposit. A record (JSON) will not be created.")
@@ -518,7 +577,7 @@ def zenodo_create_buckets_task(self, payload):
 
 @celery_app.task(bind=True)
 def zenodo_flush_task(self,payload):
-    
+
     GH_BOT=os.getenv('GH_BOT')
     github_client = Github(GH_BOT)
     task_id = self.request.id
@@ -558,7 +617,7 @@ def zenodo_flush_task(self,payload):
                 prog[item] = False
                 msg.append(f"\n The {item} deposit does not exist.")
                 gh_template_respond(github_client,"failure",payload['task_title'], payload['review_repository'],payload['issue_id'],task_id,payload['comment_id'],"".join(msg))
-    
+
     # Update the issue comment
     gh_template_respond(github_client,"started",payload['task_title'], payload['review_repository'],payload['issue_id'],task_id,payload['comment_id'],"".join(msg))
@@ -579,11 +638,11 @@ def zenodo_upload_book_task(self, payload):
     GH_BOT=os.getenv('GH_BOT')
     github_client = Github(GH_BOT)
     task_id = self.request.id
-    
+
     gh_template_respond(github_client,"started",payload['task_title'], payload['review_repository'],payload['issue_id'],task_id,payload['comment_id'])
     owner,repo,provider = get_owner_repo_provider(payload['repository_url'],provider_full_name=True)
-    
+
     fork_url = f"https://{provider}/roboneurolibre/{repo}"
     commit_fork = format_commit_hash(fork_url,"HEAD")
     record_name = item_to_record_name("book")
@@ -619,7 +678,7 @@ def zenodo_upload_book_task(self, payload):
 
 @celery_app.task(bind=True)
 def zenodo_upload_data_task(self,payload):
-    
+
     GH_BOT=os.getenv('GH_BOT')
     github_client = Github(GH_BOT)
     task_id = self.request.id
@@ -641,7 +700,7 @@ def zenodo_upload_data_task(self,payload):
         logging.info(f"Compressed data already exists {record_name}_10.55458_NeuroLibre_{payload['issue_id']:05d}_{commit_fork[0:6]}.zip")
         tar_file = expect
     else:
-        # We will archive the data synced from the test server. (item_arg is the project_name, indicating that the 
+        # We will archive the data synced from the test server. (item_arg is the project_name, indicating that the
         # data is stored at the /DATA/project_name folder)
         local_path = os.path.join("/DATA", project_name)
         # Descriptive file name
@@ -675,14 +734,14 @@ def zenodo_upload_repository_task(self, payload):
     GH_BOT=os.getenv('GH_BOT')
     github_client = Github(GH_BOT)
     task_id = self.request.id
-    
+
     gh_template_respond(github_client,"started",payload['task_title'], payload['review_repository'],payload['issue_id'],task_id,payload['comment_id'])
     owner,repo,provider = get_owner_repo_provider(payload['repository_url'],provider_full_name=True)
-    
+
     fork_url = f"https://{provider}/roboneurolibre/{repo}"
     commit_fork = format_commit_hash(fork_url,"HEAD")
-    
+
     default_branch = get_default_branch(github_client,fork_url)
 
     download_url = f"{fork_url}/archive/refs/heads/{default_branch}.zip"
@@ -722,11 +781,11 @@ def zenodo_upload_docker_task(self, payload):
     GH_BOT=os.getenv('GH_BOT')
     github_client = Github(GH_BOT)
     task_id = self.request.id
-    
+
     gh_template_respond(github_client,"started",payload['task_title'], payload['review_repository'],payload['issue_id'],task_id,payload['comment_id'])
     owner,repo,provider = get_owner_repo_provider(payload['repository_url'],provider_full_name=True)
-    
+
     fork_url = f"https://{provider}/roboneurolibre/{repo}"
     commit_fork = format_commit_hash(fork_url,"HEAD")
@@ -773,7 +832,7 @@ def zenodo_upload_docker_task(self, payload):
 
     # Login to the private registry to pull images
     r = docker_login()
-    
+
     if not r['status']:
         msg = f"Cannot login to NeuroLibre private docker registry. \n {r['message']}"
         gh_template_respond(github_client,"failure",payload['task_title'], payload['review_repository'],payload['issue_id'],task_id,payload['comment_id'], msg)
@@ -790,7 +849,7 @@ def zenodo_upload_docker_task(self, payload):
         gh_template_respond(github_client,"failure",payload['task_title'], payload['review_repository'],payload['issue_id'],task_id,payload['comment_id'], msg)
         self.update_state(state=states.FAILURE, meta={'exc_type':"NeuroLibre celery exception",'exc_message': "Custom",'message': msg})
         return
-    
+
     msg = f"Exporting docker image: \n {lut['docker_image']}"
     gh_template_respond(github_client,"started",payload['task_title'] + " `exporting (2/3)`", payload['review_repository'],payload['issue_id'],task_id,payload['comment_id'],msg)
@@ -831,16 +890,16 @@
 
 @celery_app.task(bind=True)
 def zenodo_publish_task(self, payload):
-    
+
     GH_BOT=os.getenv('GH_BOT')
     github_client = Github(GH_BOT)
     task_id = self.request.id
-    
+
     gh_template_respond(github_client,"started",payload['task_title'], payload['review_repository'],payload['issue_id'],task_id,payload['comment_id'])
     prompt = "First state that you will issue commands to set DOIs for the reproducibility assets, then you'll talk to yourself a bit. But reassure in a funny way that there's nothing to worry about because you are not an artificial general intelligence (yet). Keep it to a few sentences."
     # Check if already published
     publish_status_init = zenodo_confirm_status(payload['issue_id'],"published")
-    
+
     if publish_status_init[0]:
         # Means already published. In this case just set the DOIs.
         gh_template_respond(github_client,"started",payload['task_title'], payload['review_repository'],payload['issue_id'],task_id,payload['comment_id'],"As the reproducibility assets have already been published, I will just set the DOIs.")
@@ -866,7 +925,7 @@ def zenodo_publish_task(self, payload):
         return
     else:
         # Confirm that all items are published.
-        # TODO: Check this 
+        # TODO: Check this
         publish_status = zenodo_confirm_status(payload['issue_id'],"published")
         # If all items are published, success. Add DOIs.
         if publish_status[0]:
@@ -878,7 +937,7 @@ def zenodo_publish_task(self, payload):
                 gh_create_comment(github_client,payload['review_repository'],payload['issue_id'],command)
                 time.sleep(1)
         else:
-            # Some one None
+            # Some items are None, i.e., not all assets are published.
             response.append(f"\n Looks like there's a problem. {publish_status[1]} reproducibility assets are archived.")
             msg = "\n".join(response)
             gh_template_respond(github_client,"failure",payload['task_title'], payload['review_repository'],payload['issue_id'],task_id,payload['comment_id'], msg, False)
@@ -945,7 +1004,7 @@ def generate():
             # Fetch all the yielded messages
         binder_logs = binder_response.get_data(as_text=True)
         binder_logs = "".join(binder_logs)
-        # After the upstream closes, check the server if there's 
+        # After the upstream closes, check the server if there's
         # a book built successfully.
         book_status = book_get_by_params(commit_hash=payload['commit_hash'])
         exec_error = book_execution_errored(owner,repo,provider,payload['commit_hash'])
@@ -957,7 +1016,7 @@ def generate():
             os.remove(lock_filename)
         # Append book-related response downstream
         if not book_status or exec_error:
-            # These flags will determine how the response will be 
+            # These flags will determine how the response will be
             # interpreted and returned outside the generator
             #gh_template_respond(github_client,"failure","Binder build has failed 🥀",payload['review_repository'],payload['issue_id'],task_id,payload['comment_id'], "The next comment will forward the logs")
             issue_comment = []
@@ -1050,7 +1109,7 @@ def write_html_to_temp_directory(commit_sha, logs):
         f.write("\n")
         f.write(f"{logs}")
         f.write("\n")
-    
+
     return file_path
 
 @celery_app.task(bind=True)
@@ -1066,7 +1125,7 @@ def preprint_build_pdf_draft(self, payload):
         shutil.rmtree(target_path)
     try:
         gh_clone_repository(payload['repository_url'], target_path, depth=1)
-    except Exception as e: 
+    except Exception as e:
         gh_template_respond(github_client,"failure",payload['task_title'], payload['review_repository'],payload['issue_id'],task_id,payload['comment_id'], str(e))
         self.update_state(state=states.FAILURE, meta={'exc_type':"NeuroLibre celery exception",'exc_message': "Custom",'message': str(e)})
         return
@@ -1074,7 +1133,7 @@ def preprint_build_pdf_draft(self, payload):
     res = create_extended_pdf_sources(target_path, payload['issue_id'],payload['repository_url'])
     if res['status']:
         try:
-            process = subprocess.Popen(["docker", "run","--rm", "-v", f"{target_path}:/data", "-u", "ubuntu:www-data", "neurolibre/inara:latest","-o", "neurolibre", "./paper.md"], stdout=subprocess.PIPE,stderr=subprocess.STDOUT) 
+            process = subprocess.Popen(["docker", "run","--rm", "-v", f"{target_path}:/data", "-u", "ubuntu:www-data", "neurolibre/inara:latest","-o", "neurolibre", "./paper.md"], stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
             output = process.communicate()[0]
             ret = process.wait()
             logging.info(output)
@@ -1100,7 +1159,7 @@ def preprint_build_pdf_draft(self, payload):
         except subprocess.CalledProcessError as e:
             gh_template_respond(github_client,"failure",payload['task_title'], payload['review_repository'],payload['issue_id'],task_id,payload['comment_id'], f"{e.output}")
             self.update_state(state=states.FAILURE, meta={'exc_type':"NeuroLibre celery exception",'exc_message': "Custom",'message': e.output})
-    else: 
+    else:
         gh_template_respond(github_client,"failure",payload['task_title'], payload['review_repository'],payload['issue_id'],task_id,payload['comment_id'], f"{res['message']}")
         self.update_state(state=states.FAILURE, meta={'exc_type':"NeuroLibre celery exception",'exc_message': "Custom",'message': res['message']})
 
diff --git a/api/neurolibre_common_api.py b/api/neurolibre_common_api.py
index 8b5d0f3..e227054 100644
--- a/api/neurolibre_common_api.py
+++ b/api/neurolibre_common_api.py
@@ -104,4 +104,13 @@ def api_unlock_build(user, repo_url):
         response = make_response(f"No build lock found for {repo_url}",404)
         response.mimetype = "text/plain"
-    return response
\ No newline at end of file
+    return response
+
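+# Illustrative use of the endpoint added below; /public/ is served with
+# auth_basic off (see the nginx change at the end of this diff):
+#
+#   curl https://<preview-server>/public/data
+#   => ["project_a", "project_b"]   # folder names under /DATA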
+ """ + files = os.listdir('/DATA') + return make_response(jsonify(files),200) \ No newline at end of file diff --git a/api/neurolibre_preview_api.py b/api/neurolibre_preview_api.py index 66ee231..5c90789 100644 --- a/api/neurolibre_preview_api.py +++ b/api/neurolibre_preview_api.py @@ -8,7 +8,7 @@ import neurolibre_common_api from flask import jsonify, make_response from common import * -from schema import BuildSchema, BuildTestSchema +from schema import BuildSchema, BuildTestSchema, DownloadSchema from flask_htpasswd import HtPasswdAuth from dotenv import load_dotenv from werkzeug.middleware.proxy_fix import ProxyFix @@ -16,7 +16,7 @@ from apispec import APISpec from apispec.ext.marshmallow import MarshmallowPlugin from github_client import * -from neurolibre_celery_tasks import celery_app, sleep_task, preview_build_book_task, preview_build_book_test_task +from neurolibre_celery_tasks import celery_app, sleep_task, preview_build_book_task, preview_build_book_test_task, preview_download_data from celery.events.state import State from github import Github, UnknownObjectException @@ -52,13 +52,13 @@ app.logger.info(f"Using {binderName}.{domainName} as BinderHub.") -serverContact = app.config["SERVER_CONTACT"] +serverContact = app.config["SERVER_CONTACT"] serverName = app.config["SERVER_SLUG"] serverDescription = app.config["SERVER_DESC"] serverTOS = app.config["SERVER_TOS"] serverAbout = app.config["SERVER_ABOUT"] + app.config["SERVER_LOGO"] -# API specifications displayed on the swagger UI +# API specifications displayed on the swagger UI spec = APISpec( title="Neurolibre preview & screening API", version='v1', @@ -84,6 +84,7 @@ docs.register(neurolibre_common_api.api_get_books,blueprint="common_api") docs.register(neurolibre_common_api.api_heartbeat,blueprint="common_api") docs.register(neurolibre_common_api.api_unlock_build,blueprint="common_api") +docs.register(neurolibre_common_api.api_preview_list,blueprint="common_api") """ Configuration END @@ -97,6 +98,45 @@ API Endpoints START """ +@app.route('/api/data/cache', methods=['POST']) +@htpasswd.required +@marshal_with(None,code=422,description="Cannot validate the payload, missing or invalid entries.") +@use_kwargs(DownloadSchema()) +@doc(description='Endpoint for downloading data through repo2data.', tags=['Data']) +def api_download_data(user, id, repo_url, email, is_overwrite): + """ + This endpoint is to download data from GitHub (technical screening) requests. + """ + GH_BOT=os.getenv('GH_BOT') + github_client = Github(GH_BOT) + issue_id = id + + task_title = "Download data for preview." 
+    comment_id = gh_template_respond(github_client,"pending",task_title,reviewRepository,issue_id)
+
+    celery_payload = dict(repo_url=repository_url,
+                          rate_limit=build_rate_limit,
+                          binder_name=binderName,
+                          domain_name=domainName,
+                          comment_id=comment_id,
+                          issue_id=issue_id,
+                          review_repository=reviewRepository,
+                          task_title=task_title,
+                          overwrite=overwrite,
+                          email=email)
+
+    task_result = preview_download_data.apply_async(args=[celery_payload])
+
+    if task_result.task_id is not None:
+        gh_template_respond(github_client,"received",task_title,reviewRepository,issue_id,task_result.task_id,comment_id, "")
+        response = make_response(jsonify("Celery task assigned successfully."),200)
+    else:
+        # If not successfully assigned, fail the status immediately and return 500
+        gh_template_respond(github_client,"failure",task_title,reviewRepository,issue_id,task_result.task_id,comment_id, "Internal server error: NeuroLibre background task manager could not receive the request.")
+        response = make_response(jsonify("Celery could not start the task."),500)
+    return response
+
+docs.register(api_download_data)
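+
+# Illustrative request (basic auth required; field names follow DownloadSchema):
+#
+#   curl -u <user>:<pass> -X POST https://<preview-server>/api/data/cache \
+#        -H "Content-Type: application/json" \
+#        -d '{"id": 42, "repository_url": "https://github.com/owner/repo", "overwrite": false}'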
 
 @app.route('/api/book/build', methods=['POST'])
 @htpasswd.required
@@ -115,16 +155,16 @@ def api_book_build(user, id, repo_url, commit_hash):
     task_title = "Book Build (Preview)"
     comment_id = gh_template_respond(github_client,"pending",task_title,reviewRepository,issue_id)
 
-    celery_payload = dict(repo_url=repo_url, 
-                          commit_hash=commit_hash, 
+    celery_payload = dict(repo_url=repo_url,
+                          commit_hash=commit_hash,
                           rate_limit=build_rate_limit,
-                          binder_name=binderName, 
+                          binder_name=binderName,
                           domain_name = domainName,
                           comment_id=comment_id,
                           issue_id=issue_id,
                           review_repository=reviewRepository,
                           task_title=task_title)
-    
+
     task_result = preview_build_book_task.apply_async(args=[celery_payload])
 
     if task_result.task_id is not None:
@@ -153,18 +193,18 @@ def api_book_build_test(user, repo_url, commit_hash, email):
     [owner, repo, provider] = get_owner_repo_provider(repo_url)
     mail_subject = f"NRP test build for {owner}/{repo}"
     mail_body = f"We have received your request to build a NeuroLibre reproducible preprint from {repo_url} at {commit_hash}. \n Your request has been queued, we will inform you when the process starts."
-    
+
     send_email(email, mail_subject, mail_body)
 
-    celery_payload = dict(repo_url=repo_url, 
-                          commit_hash=commit_hash, 
+    celery_payload = dict(repo_url=repo_url,
+                          commit_hash=commit_hash,
                           rate_limit=build_rate_limit,
-                          binder_name=binderName, 
+                          binder_name=binderName,
                           domain_name = domainName,
                           email = email,
                           review_repository=reviewRepository,
                           mail_subject=mail_subject)
-    
+
     task_result = preview_build_book_test_task.apply_async(args=[celery_payload])
 
     if task_result.task_id is not None:
@@ -174,7 +214,7 @@ def api_book_build_test(user, repo_url, commit_hash, email):
         # If not successfully assigned, fail the status immediately and return 500
         mail_body = f"We could not start processing your NRP test request due to a technical issue on the server side. Please contact info@neurolibre.org."
         response = make_response(jsonify("Celery could not start the task."),500)
-    
+
     send_email(email, mail_subject, mail_body)
     return response
 
diff --git a/api/preprint.py b/api/preprint.py
index c508740..d03ae5d 100644
--- a/api/preprint.py
+++ b/api/preprint.py
@@ -6,7 +6,7 @@
 from dotenv import load_dotenv
 import re
 from github import Github
-from github_client import gh_read_from_issue_body 
+from github_client import gh_read_from_issue_body
 import csv
 import subprocess
 import nbformat
@@ -24,15 +24,15 @@
 load_dotenv()
 
 """
-Helper functions for the tasks 
+Helper functions for the tasks
 performed by the preprint (production server).
 """
 
 def zenodo_create_bucket(title, archive_type, creators, repository_url, issue_id):
-    
+
     [owner,repo,provider] = get_owner_repo_provider(repository_url,provider_full_name=True)
 
-    # ASSUMPTION 
+    # ASSUMPTION
     # Fork exists and has the same name.
     fork_url = f"https://{provider}/roboneurolibre/{repo}"
 
@@ -40,10 +40,10 @@ def zenodo_create_bucket(title, archive_type, creators, repository_url, issue_id
     params = {'access_token': ZENODO_TOKEN}
     # headers = {"Content-Type": "application/json",
     #            "Authorization": "Bearer {}".format(ZENODO_TOKEN)}
-    
-    # WANING: 
+
+    # WARNING:
     # FOR NOW assuming that HEAD corresponds to the latest successful
-    # book build. That may not be the case. Requires better 
+    # book build. That may not be the case. Requires better
     # data handling or extra functionality to retrieve the latest successful
     # book commit.
     commit_user = format_commit_hash(repository_url,"HEAD")
@@ -77,11 +77,11 @@ def zenodo_create_bucket(title, archive_type, creators, repository_url, issue_id
         data["metadata"]["upload_type"] = "software"
         data["metadata"]["description"] = f"Docker image built from the {libre_text}, based on the {user_text}, using repo2docker (through BinderHub). <br> To run locally: <br> 1. docker load < DockerImage_10.55458_NeuroLibre_{issue_id:05d}_{commit_fork[0:6]}.tar.gz <br> 2. docker run -it --rm -p 8888:8888 DOCKER_IMAGE_ID jupyter lab --ip 0.0.0.0 <br><br> by replacing DOCKER_IMAGE_ID above with the respective ID of the Docker image loaded from the zip file. <br><br> {review_text} {sign_text}"
 
-    # Make an empty deposit to create the bucket 
+    # Make an empty deposit to create the bucket
     r = requests.post("https://zenodo.org/api/deposit/depositions", params=params, json=data)
-    
+
     print(f"Error: {r.status_code} - {r.text}")
 
     # response_dict = json.loads(r.text)
@@ -104,7 +104,7 @@ def execute_subprocess(command):
     To asynchronously execute system-levels using celery
     simple calls such as os.system will not work.
 
-    This helper function is to issue system-level command executions 
+    This helper function is to issue system-level command executions
     using celery.
     """
     # This will be called by Celery, subprocess must be handled properly
@@ -266,7 +266,7 @@ def item_to_record_name(item):
                 "book":"JupyterBook"}
     if item in dict_map.keys():
         return dict_map[item]
-    else: 
+    else:
         return None
 
 def zenodo_upload_item(upload_file,bucket_url,issue_id,commit_fork,item_name):
@@ -316,7 +316,7 @@ def parse_tsv_content(content):
     # Iterate over each row and add it to the parsed_data list
     for row in reader:
         parsed_data.append(row)
-    
+
     return parsed_data
 
 def get_test_book_build(preview_server,verify_ssl,commit_hash):
@@ -340,7 +340,7 @@ def get_test_book_build(preview_server,verify_ssl,commit_hash):
 
 def get_resource_lookup(preview_server,verify_ssl,repository_address):
     """
-    For a given repository address, returns a dictionary 
+    For a given repository address, returns a dictionary
     that contains the following fields:
     - "date","repository_url","docker_image","project_name","data_url","data_doi"
     IF a successful book build exists for the respective inquiry.
@@ -351,7 +351,7 @@ def get_resource_lookup(preview_server,verify_ssl,repository_address):
     Ideally, this should be dealt with using a proper database instead of a tsv file.
     """
-    
+
     url = f"{preview_server}/book-artifacts/lookup_table.tsv"
     headers = {'Content-Type': 'application/json'}
     API_USER = os.getenv('TEST_API_USER')
@@ -360,7 +360,7 @@ def get_resource_lookup(preview_server,verify_ssl,repository_address):
 
     # Send GET request
     response = requests.get(url, headers=headers, auth=auth, verify=verify_ssl)
-    
+
     # Process response
     if response.ok:
         # Get content body
@@ -373,17 +373,17 @@ def get_resource_lookup(preview_server,verify_ssl,repository_address):
         if idx:
             # Convert to list
             values = parsed_data[idx][0].split(",")
-            # Convert to dict 
+            # Convert to dict
             # The last two keys are not reliable (they may contain commas that are not tsv column separators)
             # also due to a subpar documentation issue with repo2data.
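+            # An illustrative row, after splitting on commas:
+            #   ["2023-06-01","https://github.com/owner/repo","registry/owner-repo:tag",
+            #    "myproject","https://example.org/data","10.5281/zenodo.0000000"]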
             keys = ["date","repository_url","docker_image","project_name","data_url","data_doi"]
             lut = dict(zip(keys, values))
-        else: 
+        else:
             lut = None
     else:
-        
+
         lut = None
-    
+
     return lut
 
 def zenodo_publish(issue_id):
@@ -405,7 +405,7 @@ def zenodo_publish(issue_id):
         message.append(f"\n :ice_cube: {item_to_record_name(item)} publish status:")
         r = requests.post(publish_link,params=params)
         response = r.json()
-        if r.status_code==202: 
+        if r.status_code==202:
             message.append(f"\n :confetti_ball: ")
             tmp = f"zenodo_published_{item}_NeuroLibre_{issue_id:05d}.json"
             log_file = os.path.join(get_deposit_dir(issue_id), tmp)
@@ -536,11 +536,11 @@ def parse_section_and_body(notebook):
     if current_paragraph:
         parsed_content.append({'section': current_section, 'paragraph': current_paragraph})
     return parsed_content
-    
+
 def myst_to_joss_tex_cite(input,match_format, subs_format):
     """
-    In a given string, find (MyST) citation directives that matches 
-    match_format, then replace them with JOSS-text template citation 
+    In a given string, find (MyST) citation directives that match
+    match_format, then replace them with JOSS-text template citation
     directives based on the subs_format.
 
     This function is used by substitute_cite_directives to handle
     multiple formats.
@@ -554,40 +554,40 @@ def myst_to_joss_tex_cite(input,match_format, subs_format):
         try:
             citations = match.split(',')
             formatted_citations = '; '.join([f'@{citation.strip()}' for citation in citations if citation])
-            if subs_format == "p": 
-                input = re.sub(match_format, f'[{formatted_citations}]', input, count=1) 
+            if subs_format == "p":
+                input = re.sub(match_format, f'[{formatted_citations}]', input, count=1)
             if subs_format == "t":
-                input = re.sub(rf'\{{cite:t\}}`{match}`', f'{formatted_citations}', input, count=1) 
+                input = re.sub(rf'\{{cite:t\}}`{match}`', f'{formatted_citations}', input, count=1)
         except:
-            pass 
-    
+            pass
+
     return input
 
 def substitute_cite_directives(input):
     """
-    Calls md_to_tex_cite for multiple citation formats, each case has to be 
+    Calls myst_to_joss_tex_cite for multiple citation formats; each case has to be
     handled individually as substitute patterns vary.
     """
     tmp = myst_to_joss_tex_cite(input, r'\{cite:p\}`([^`]*)`', "p")
     input = tmp if tmp else input
     tmp = myst_to_joss_tex_cite(input, r'\{cite:t\}`([^`]*)`', "t")
     input = tmp if tmp else input
-    return input 
+    return input
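+# e.g. substitute_cite_directives("see {cite:p}`smith2020,doe2021`")
+# returns "see [@smith2020; @doe2021]" (illustrative citation keys).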
 
 def remove_html_tags(markdown):
     """
     Jupyter Book is intended to write documents without using html tags.
-    When wrapped between tags, MyST/JB parsers will skip the content. 
-    Use b4s to get rid of html tags. 
+    When wrapped between tags, MyST/JB parsers will skip the content.
+    Use bs4 to get rid of html tags.
     """
     soup = BeautifulSoup(markdown, "html.parser")
     return soup.get_text()
 
-def myst_rm_admonition_render_html(input): 
+def myst_rm_admonition_render_html(input):
     md_parser = create_md_parser(MdParserConfig(),RendererHTML)
     parsed = md_parser.parse(input)
     #print(parsed)
-    # Apply desired filters here. 
+    # Apply desired filters here.
     filtered_tokens = [token for token in parsed if token.type != 'fence']
     #print(filtered_tokens)
     filtered_html = ""
@@ -623,7 +623,7 @@ def myst_md_to_joss_md(file_name):
         markdown_content = file.read()
 
     markdown_content = substitute_cite_directives(markdown_content)
-    markdown_content = to_md(myst_rm_admonition_render_html(markdown_content)) 
+    markdown_content = to_md(myst_rm_admonition_render_html(markdown_content))
     return markdown_content
 
 def hyperlink_figure_references(match, issue_id):
@@ -643,7 +643,7 @@ def jbook_to_joss_md(input_files,issue_id):
         elif file_name.endswith('.md'):
             markdown_output = myst_md_to_joss_md(file_name)
             output = output + markdown_output
-    # Hyerlink to reproducible preprint at Figure refs
+    # Hyperlink to reproducible preprint at Figure refs
     pattern = r'(Figure|Fig\.)\s+(\d+[-\w]*)'
     output = re.sub(pattern, lambda match: hyperlink_figure_references(match, issue_id), output)
     return output
@@ -663,7 +663,7 @@ def append_bib_files(file1_path, file2_path, output_path):
 
 def merge_and_check_bib(target_path):
     """
-    For now simply appending one bib to another 
+    For now simply appending one bib to another;
     later on, add duplication check.
     """
     orig_bib = os.path.join(target_path,"paper.bib")
@@ -671,7 +671,7 @@ def merge_and_check_bib(target_path):
     # Create a backup for the original markdown.
     shutil.copyfile(orig_bib, backup_bib)
     # Simply merge two bib files.
-    # TODO: GET THE DIRECTORY FROM FLASK 
+    # TODO: GET THE DIRECTORY FROM FLASK
     partial_bib = "/home/ubuntu/full-stack-server/assets/partial.bib"
     append_bib_files(orig_bib, partial_bib, orig_bib)
 
@@ -680,9 +680,9 @@ def create_extended_pdf_sources(target_path, issue_id, repository_url):
     target_path is where repository_url is cloned by the celery worker.
     """
     # This will crawl all the Jupyter Notebooks to collect text that cites
-    # articles, then will substitute MyST cite commands with Pandoc directives 
+    # articles, then will substitute MyST cite commands with Pandoc directives
     # recognized by OpenJournals PDF compilers.
-    try: 
+    try:
         toc = get_local_yaml(os.path.join(target_path,"content","_toc.yml"))
         nl_local_file = os.path.join(target_path,"content","_neurolibre.yml")
         if os.path.isfile(nl_local_file):
@@ -741,9 +741,9 @@ def get_local_yaml(file):
 def nb_to_lab(file_path):
     with open(file_path, 'r') as f:
         content = f.read()
-    
+
     updated_content = re.sub(r'\?urlpath=tree/content/', '?urlpath=lab/tree/content/', content)
-    
+
     with open(file_path, 'w') as f:
         f.write(updated_content)
 
diff --git a/api/requirements.txt b/api/requirements.txt
index ff1aa50..4e37d05 100644
--- a/api/requirements.txt
+++ b/api/requirements.txt
@@ -22,4 +22,5 @@ markdown
 markdownify==0.11.6
 bs4
 myst-parser==0.18.1
-markdown-it-py==2.0.1
\ No newline at end of file
+markdown-it-py==2.0.1
+repo2data==2.9.1
diff --git a/api/schema.py b/api/schema.py
index 50e6de5..14cd94f 100644
--- a/api/schema.py
+++ b/api/schema.py
@@ -21,6 +21,15 @@ class BookSchema(Schema):
 
 # Preview server
 
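+# An example payload validated by DownloadSchema below (illustrative values):
+#   {"id": 42,
+#    "repository_url": "https://github.com/owner/repo",
+#    "email": "author@example.org",
+#    "overwrite": false}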
+class DownloadSchema(Schema):
+    """
+    Defines schema to be used for repo2data download.
+    """
+    id = fields.Integer(required=False,description="Issue number of the technical screening of this preprint. If this is used, the response will be returned to the respective GitHub issue.")
+    repository_url = fields.Str(required=True,description="Full URL of a NeuroLibre compatible repository to be used for building the book.")
+    email = fields.Str(required=False,description="Email address to which the result will be returned.")
+    overwrite = fields.Boolean(required=False,description="Whether to overwrite the downloaded data if it already exists.")
+
 class BuildSchema(Schema):
     """
     Defines payload types and requirements for book build request.
diff --git a/nginx/neurolibre-preview.conf b/nginx/neurolibre-preview.conf
index 3a6996f..003d6bb 100644
--- a/nginx/neurolibre-preview.conf
+++ b/nginx/neurolibre-preview.conf
@@ -20,11 +20,15 @@ server{
 
     client_max_body_size 4G;
     keepalive_timeout 5;
-
-    auth_basic "Administrator’s Area";
-    auth_basic_user_file /home/ubuntu/.htpasswd;
 
     location /api/ {
+        auth_basic "Administrator’s Area";
+        auth_basic_user_file /home/ubuntu/.htpasswd;
+        include /etc/nginx/neurolibre_params;
+        proxy_pass http://app_server;
+    }
+
+    location /public/ {
+        auth_basic off;
         include /etc/nginx/neurolibre_params;
         proxy_pass http://app_server;
     }