From 785db0a772d4d318a100e98288335ef5b610a2f0 Mon Sep 17 00:00:00 2001 From: Michael Ilie Date: Mon, 13 May 2024 15:43:00 -0400 Subject: [PATCH 1/5] Increase font size for consistency --- .../experiments/graph_dataset_citations.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/prompt_systematic_review/experiments/graph_dataset_citations.py b/src/prompt_systematic_review/experiments/graph_dataset_citations.py index 3c0463b..e3a6dc9 100644 --- a/src/prompt_systematic_review/experiments/graph_dataset_citations.py +++ b/src/prompt_systematic_review/experiments/graph_dataset_citations.py @@ -77,10 +77,11 @@ def graph_dataset_citations(): plt.figure(figsize=(10, 6)) plt.bar(datasets, counts, color="#2E8991") - plt.xlabel("Dataset Name") - plt.ylabel("Number of Mentions") - plt.title("Dataset Mentions in Papers") - plt.xticks(rotation=45, ha="right") + plt.xlabel("Dataset Name",fontsize=20) + plt.ylabel("Number of Mentions",fontsize=20) + plt.title("Dataset Mentions in Papers",fontsize=30) + plt.xticks(rotation=45, ha="right",fontsize=15) + plt.yticks(fontsize=15) plt.tight_layout() output_dir = os.path.join(DataFolderPath, "experiments_output") @@ -95,6 +96,5 @@ class Experiment: def run(): graph_dataset_citations() - if __name__ == "__main__": - graph_dataset_citations() + graph_dataset_citations() \ No newline at end of file From e047148f6474bbe64c83a7bf53e03979871234b4 Mon Sep 17 00:00:00 2001 From: Michael Ilie Date: Wed, 29 May 2024 11:08:16 -0400 Subject: [PATCH 2/5] formatted --- .../experiments/graph_dataset_citations.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/prompt_systematic_review/experiments/graph_dataset_citations.py b/src/prompt_systematic_review/experiments/graph_dataset_citations.py index e3a6dc9..5e7adfd 100644 --- a/src/prompt_systematic_review/experiments/graph_dataset_citations.py +++ b/src/prompt_systematic_review/experiments/graph_dataset_citations.py @@ -77,10 +77,10 @@ 
def graph_dataset_citations(): plt.figure(figsize=(10, 6)) plt.bar(datasets, counts, color="#2E8991") - plt.xlabel("Dataset Name",fontsize=20) - plt.ylabel("Number of Mentions",fontsize=20) - plt.title("Dataset Mentions in Papers",fontsize=30) - plt.xticks(rotation=45, ha="right",fontsize=15) + plt.xlabel("Dataset Name", fontsize=20) + plt.ylabel("Number of Mentions", fontsize=20) + plt.title("Dataset Mentions in Papers", fontsize=30) + plt.xticks(rotation=45, ha="right", fontsize=15) plt.yticks(fontsize=15) plt.tight_layout() @@ -96,5 +96,6 @@ class Experiment: def run(): graph_dataset_citations() + if __name__ == "__main__": - graph_dataset_citations() \ No newline at end of file + graph_dataset_citations() From 255332646a4d8b9034347b6c53f733d91d155674 Mon Sep 17 00:00:00 2001 From: hudssntao Date: Wed, 29 May 2024 11:16:25 -0400 Subject: [PATCH 3/5] fix: update experiments --- README.md | 19 ++++----- .../experiments/__init__.py | 7 +++- .../experiments/download_mmlu.py | 41 +++++++++++-------- .../experiments/graph.py | 7 +++- .../experiments/graph_internal_references.py | 8 +++- 5 files changed, 49 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index 8e090e6..369839f 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ For HF: https://huggingface.co/docs/hub/security-tokens, also run `huggingface-c Put your key in like: `OPENAI_API_KEY=sk-...` +`SEMANTIC_SCHOLAR_API_KEY=...` `HF_TOKEN=...` Then to load the .env file, type: @@ -24,16 +25,14 @@ py.test --envfile path/to/.env In the case that you have several .env files, create a new env_files in the pytest config folder and type: env_files = - .env - .test.env - .deploy.env - +.env +.test.env +.deploy.env + ## blacklist.csv Papers we should not include due to being poorly written or AI generated - - ## Notes - Sometimes a paper title may appear differently on the arXiv API. 
For example, "Visual Attention-Prompted Prediction and Learning" (arXiv:2310.08420), according to arXiv API is titled "A visual encoding model based on deep neural networks and transfer learning" @@ -41,8 +40,6 @@ Papers we should not include due to being poorly written or AI generated - When testing APIs, there may be latency and aborted connections - Publication dates of papers from IEEE are missing the day about half the time. They also may come in any of the following formats - - "April 1988" - - "2-4 April 2002" - - "29 Nov.-2 Dec. 2022" - - + - "April 1988" + - "2-4 April 2002" + - "29 Nov.-2 Dec. 2022" diff --git a/src/prompt_systematic_review/experiments/__init__.py b/src/prompt_systematic_review/experiments/__init__.py index 986de33..79cdf78 100644 --- a/src/prompt_systematic_review/experiments/__init__.py +++ b/src/prompt_systematic_review/experiments/__init__.py @@ -12,7 +12,9 @@ from . import graph_gpt_3_5_benchmarks from . import run_tomotopy from . import topicgpt - +from . import download_mmlu +from . import graph_internal_references +from . 
import graph experiments = [ count_tool_mentions.Experiment, @@ -28,4 +30,7 @@ graph_gpt_3_5_benchmarks.Experiment, run_tomotopy.Experiment, topicgpt.Experiment, + download_mmlu.Experiment, + graph_internal_references.Experiment, + graph.Experiment, ] diff --git a/src/prompt_systematic_review/experiments/download_mmlu.py b/src/prompt_systematic_review/experiments/download_mmlu.py index 854ca57..bbb9520 100644 --- a/src/prompt_systematic_review/experiments/download_mmlu.py +++ b/src/prompt_systematic_review/experiments/download_mmlu.py @@ -29,27 +29,32 @@ def move_and_rename_extracted_contents(extracted_folder, final_folder, new_folde return mmlu_folder +def download_mmlu(): + # URL of the .tar file + url = "https://people.eecs.berkeley.edu/~hendrycks/data.tar" -# URL of the .tar file -url = "https://people.eecs.berkeley.edu/~hendrycks/data.tar" + # Temporary paths + download_path = "./data.tar" + extract_path = "./extracted" -# Temporary paths -download_path = "./data.tar" -extract_path = "./extracted" + # Final path + final_data_folder = "./data" + final_folder_name = "mmlu" -# Final path -final_data_folder = "./data" -final_folder_name = "mmlu" + # Download and extract the file + download_and_extract(url, download_path) + extract_tar(download_path, extract_path) -# Download and extract the file -download_and_extract(url, download_path) -extract_tar(download_path, extract_path) + # Move and rename the contents of the extracted folder + move_and_rename_extracted_contents(extract_path, final_data_folder, final_folder_name) + + # Cleanup + if os.path.exists(download_path): + os.remove(download_path) + if os.path.exists(extract_path): + shutil.rmtree(extract_path) -# Move and rename the contents of the extracted folder -move_and_rename_extracted_contents(extract_path, final_data_folder, final_folder_name) -# Cleanup -if os.path.exists(download_path): - os.remove(download_path) -if os.path.exists(extract_path): - shutil.rmtree(extract_path) +class Experiment: + def 
run(): + download_mmlu() \ No newline at end of file diff --git a/src/prompt_systematic_review/experiments/graph.py b/src/prompt_systematic_review/experiments/graph.py index 6893695..5effa69 100644 --- a/src/prompt_systematic_review/experiments/graph.py +++ b/src/prompt_systematic_review/experiments/graph.py @@ -102,7 +102,7 @@ def run(self, csv_file_path, technique_to_title): ) -if __name__ == "__main__": +def run_graph(): main = Main() titles = [ "Bounding the Capabilities of Large Language Models in Open Text Generation with Prompt Constraints", @@ -208,3 +208,8 @@ def run(self, csv_file_path, technique_to_title): csv_file_path = "path_to_your_csv.csv" main.run(csv_file_path, technique_to_title) + + +class Experiment: + def run(): + run_graph() \ No newline at end of file diff --git a/src/prompt_systematic_review/experiments/graph_internal_references.py b/src/prompt_systematic_review/experiments/graph_internal_references.py index a2a6795..d9d9f8d 100644 --- a/src/prompt_systematic_review/experiments/graph_internal_references.py +++ b/src/prompt_systematic_review/experiments/graph_internal_references.py @@ -7,7 +7,6 @@ from dotenv import load_dotenv import csv import random -import scipy import networkx as nx import matplotlib.pyplot as plt import textwrap @@ -428,7 +427,7 @@ def visualize_chart(self, technique_to_title): ) -if __name__ == "__main__": +def graph_internal_references(): main = Main() titles = [ @@ -533,3 +532,8 @@ def visualize_chart(self, technique_to_title): "Rephrase and Respond: Let Large Language Models Ask Better Questions for Themselves": "Rephrase and Respond", } main.visualize_chart(technique_to_title) + + +class Experiment: + def run(): + graph_internal_references() From 24068607df192d60d9751b59c169268378b31aff Mon Sep 17 00:00:00 2001 From: hudssntao Date: Wed, 29 May 2024 11:21:44 -0400 Subject: [PATCH 4/5] style: black --- .../experiments/download_mmlu.py | 7 +++++-- .../experiments/graph.py | 2 +- 
.../experiments/graph_internal_references.py | 16 ++++++++-------- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/prompt_systematic_review/experiments/download_mmlu.py b/src/prompt_systematic_review/experiments/download_mmlu.py index bbb9520..45a143d 100644 --- a/src/prompt_systematic_review/experiments/download_mmlu.py +++ b/src/prompt_systematic_review/experiments/download_mmlu.py @@ -29,6 +29,7 @@ def move_and_rename_extracted_contents(extracted_folder, final_folder, new_folde return mmlu_folder + def download_mmlu(): # URL of the .tar file url = "https://people.eecs.berkeley.edu/~hendrycks/data.tar" @@ -46,7 +47,9 @@ def download_mmlu(): extract_tar(download_path, extract_path) # Move and rename the contents of the extracted folder - move_and_rename_extracted_contents(extract_path, final_data_folder, final_folder_name) + move_and_rename_extracted_contents( + extract_path, final_data_folder, final_folder_name + ) # Cleanup if os.path.exists(download_path): @@ -57,4 +60,4 @@ def download_mmlu(): class Experiment: def run(): - download_mmlu() \ No newline at end of file + download_mmlu() diff --git a/src/prompt_systematic_review/experiments/graph.py b/src/prompt_systematic_review/experiments/graph.py index 5effa69..4832940 100644 --- a/src/prompt_systematic_review/experiments/graph.py +++ b/src/prompt_systematic_review/experiments/graph.py @@ -212,4 +212,4 @@ def run_graph(): class Experiment: def run(): - run_graph() \ No newline at end of file + run_graph() diff --git a/src/prompt_systematic_review/experiments/graph_internal_references.py b/src/prompt_systematic_review/experiments/graph_internal_references.py index d9d9f8d..0270fc3 100644 --- a/src/prompt_systematic_review/experiments/graph_internal_references.py +++ b/src/prompt_systematic_review/experiments/graph_internal_references.py @@ -187,9 +187,9 @@ def process_papers(self, csv_file_path): arxiv_paper_id ) else: - unmatched_papers[ - row.get("title", "").strip() - ] = "Source not 
supported" + unmatched_papers[row.get("title", "").strip()] = ( + "Source not supported" + ) continue if paper_id: @@ -197,9 +197,9 @@ def process_papers(self, csv_file_path): if references is not None: paper_references[paper_id] = references else: - unmatched_papers[ - row["title"] - ] = "No references found or error occurred" + unmatched_papers[row["title"]] = ( + "No references found or error occurred" + ) else: print(f"Paper Id Could not be found for: {row}") else: @@ -532,8 +532,8 @@ def graph_internal_references(): "Rephrase and Respond: Let Large Language Models Ask Better Questions for Themselves": "Rephrase and Respond", } main.visualize_chart(technique_to_title) - - + + class Experiment: def run(): graph_internal_references() From 84fb279726c61ff967407c03ae52cfe7d46984bc Mon Sep 17 00:00:00 2001 From: hudssntao Date: Wed, 29 May 2024 11:39:59 -0400 Subject: [PATCH 5/5] style: exclude file --- .pre-commit-config.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ebfaa8f..375a5bf 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,4 +3,5 @@ repos: rev: 23.10.1 hooks: - id: black - exclude: src/prompt_systematic_review/experiments/find_internal_reference_count.py \ No newline at end of file + exclude: src/prompt_systematic_review/experiments/find_internal_reference_count.py + exclude: src/prompt_systematic_review/experiments/graph_internal_references.py \ No newline at end of file