diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ebfaa8f..375a5bf 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,4 +3,4 @@ repos: rev: 23.10.1 hooks: - id: black - exclude: src/prompt_systematic_review/experiments/find_internal_reference_count.py \ No newline at end of file + exclude: src/prompt_systematic_review/experiments/(find_internal_reference_count|graph_internal_references)\.py \ No newline at end of file diff --git a/README.md b/README.md index 8e090e6..369839f 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ For HF: https://huggingface.co/docs/hub/security-tokens, also run `huggingface-c Put your key in like: `OPENAI_API_KEY=sk-...` +`SEMANTIC_SCHOLAR_API_KEY=...` `HF_TOKEN=...` Then to load the .env file, type: @@ -24,16 +25,14 @@ py.test --envfile path/to/.env In the case that you have several .env files, create a new env_files in the pytest config folder and type: env_files = - .env - .test.env - .deploy.env - +.env +.test.env +.deploy.env + ## blacklist.csv Papers we should not include due to being poorly written or AI generated - - ## Notes - Sometimes a paper title may appear differently on the arXiv API. For example, "Visual Attention-Prompted Prediction and Learning" (arXiv:2310.08420), according to arXiv API is titled "A visual encoding model based on deep neural networks and transfer learning" @@ -41,8 +40,6 @@ Papers we should not include due to being poorly written or AI generated - When testing APIs, there may be latency and aborted connections - Publication dates of papers from IEEE are missing the day about half the time. They also may come in any of the following formats - - "April 1988" - - "2-4 April 2002" - - "29 Nov.-2 Dec. 2022" - - + - "April 1988" + - "2-4 April 2002" + - "29 Nov.-2 Dec. 
2022" diff --git a/src/prompt_systematic_review/experiments/__init__.py b/src/prompt_systematic_review/experiments/__init__.py index 715eb4f..e80b2cc 100644 --- a/src/prompt_systematic_review/experiments/__init__.py +++ b/src/prompt_systematic_review/experiments/__init__.py @@ -12,7 +12,9 @@ from . import graph_gpt_3_5_benchmarks from . import run_tomotopy from . import topicgpt - +from . import download_mmlu +from . import graph_internal_references +from . import graph experiments = [ count_tool_mentions.Experiment, diff --git a/src/prompt_systematic_review/experiments/download_mmlu.py b/src/prompt_systematic_review/experiments/download_mmlu.py index 854ca57..45a143d 100644 --- a/src/prompt_systematic_review/experiments/download_mmlu.py +++ b/src/prompt_systematic_review/experiments/download_mmlu.py @@ -30,26 +30,35 @@ def move_and_rename_extracted_contents(extracted_folder, final_folder, new_folde return mmlu_folder -# URL of the .tar file -url = "https://people.eecs.berkeley.edu/~hendrycks/data.tar" +def download_mmlu(): + # URL of the .tar file + url = "https://people.eecs.berkeley.edu/~hendrycks/data.tar" -# Temporary paths -download_path = "./data.tar" -extract_path = "./extracted" + # Temporary paths + download_path = "./data.tar" + extract_path = "./extracted" -# Final path -final_data_folder = "./data" -final_folder_name = "mmlu" + # Final path + final_data_folder = "./data" + final_folder_name = "mmlu" -# Download and extract the file -download_and_extract(url, download_path) -extract_tar(download_path, extract_path) + # Download and extract the file + download_and_extract(url, download_path) + extract_tar(download_path, extract_path) + + # Move and rename the contents of the extracted folder + move_and_rename_extracted_contents( + extract_path, final_data_folder, final_folder_name + ) + + # Cleanup + if os.path.exists(download_path): + os.remove(download_path) + if os.path.exists(extract_path): + shutil.rmtree(extract_path) -# Move and rename the 
contents of the extracted folder -move_and_rename_extracted_contents(extract_path, final_data_folder, final_folder_name) -# Cleanup -if os.path.exists(download_path): - os.remove(download_path) -if os.path.exists(extract_path): - shutil.rmtree(extract_path) +class Experiment: + @staticmethod + def run(): + download_mmlu() diff --git a/src/prompt_systematic_review/experiments/graph.py b/src/prompt_systematic_review/experiments/graph.py index a4f42c4..80833ba 100644 --- a/src/prompt_systematic_review/experiments/graph.py +++ b/src/prompt_systematic_review/experiments/graph.py @@ -107,7 +107,7 @@ def run(self, csv_file_path, technique_to_title): ) -if __name__ == "__main__": +def run_graph(): main = Main() titles = [ "Bounding the Capabilities of Large Language Models in Open Text Generation with Prompt Constraints", @@ -213,3 +213,9 @@ def run(self, csv_file_path, technique_to_title): csv_file_path = "path_to_your_csv.csv" main.run(csv_file_path, technique_to_title) + + +class Experiment: + @staticmethod + def run(): + run_graph() diff --git a/src/prompt_systematic_review/experiments/graph_dataset_citations.py b/src/prompt_systematic_review/experiments/graph_dataset_citations.py index ccf5e2c..b639dbf 100644 --- a/src/prompt_systematic_review/experiments/graph_dataset_citations.py +++ b/src/prompt_systematic_review/experiments/graph_dataset_citations.py @@ -77,10 +77,11 @@ def graph_dataset_citations(): plt.figure(figsize=(10, 6)) plt.bar(datasets, counts, color="#2E8991") - plt.xlabel("Dataset Name") - plt.ylabel("Number of Mentions") - plt.title("Dataset Mentions in Papers") - plt.xticks(rotation=45, ha="right") + plt.xlabel("Dataset Name", fontsize=20) + plt.ylabel("Number of Mentions", fontsize=20) + plt.title("Dataset Mentions in Papers", fontsize=30) + plt.xticks(rotation=45, ha="right", fontsize=15) + plt.yticks(fontsize=15) plt.tight_layout() output_dir = os.path.join(DataFolderPath, "experiments_output") diff --git a/src/prompt_systematic_review/experiments/graph_internal_references.py 
b/src/prompt_systematic_review/experiments/graph_internal_references.py index 7d0f791..c08261c 100644 --- a/src/prompt_systematic_review/experiments/graph_internal_references.py +++ b/src/prompt_systematic_review/experiments/graph_internal_references.py @@ -7,7 +7,6 @@ from dotenv import load_dotenv import csv import random -import scipy import networkx as nx import matplotlib.pyplot as plt import textwrap @@ -188,9 +187,9 @@ def process_papers(self, csv_file_path): arxiv_paper_id ) else: - unmatched_papers[ - row.get("title", "").strip() - ] = "Source not supported" + unmatched_papers[row.get("title", "").strip()] = ( + "Source not supported" + ) continue if paper_id: @@ -198,9 +197,9 @@ def process_papers(self, csv_file_path): if references is not None: paper_references[paper_id] = references else: - unmatched_papers[ - row["title"] - ] = "No references found or error occurred" + unmatched_papers[row["title"]] = ( + "No references found or error occurred" + ) else: print(f"Paper Id Could not be found for: {row}") else: @@ -428,7 +427,7 @@ def visualize_chart(self, technique_to_title): ) -if __name__ == "__main__": +def graph_internal_references(): main = Main() titles = [ @@ -533,3 +532,9 @@ def visualize_chart(self, technique_to_title): "Rephrase and Respond: Let Large Language Models Ask Better Questions for Themselves": "Rephrase and Respond", } main.visualize_chart(technique_to_title) + + +class Experiment: + @staticmethod + def run(): + graph_internal_references()