Skip to content

Commit

Permalink
Merge branch 'main' into pdf_graphs_doc
Browse files Browse the repository at this point in the history
  • Loading branch information
Mcilie authored May 29, 2024
2 parents ff6b994 + 07bd0f6 commit c0ec018
Show file tree
Hide file tree
Showing 7 changed files with 62 additions and 44 deletions.
3 changes: 2 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ repos:
rev: 23.10.1
hooks:
- id: black
exclude: src/prompt_systematic_review/experiments/find_internal_reference_count.py
# Single regex covering both files black must skip; a mapping may only carry one `exclude` key.
exclude: ^src/prompt_systematic_review/experiments/(find_internal_reference_count|graph_internal_references)\.py$
19 changes: 8 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ For HF: https://huggingface.co/docs/hub/security-tokens, also run `huggingface-c
Put your key in like:

`OPENAI_API_KEY=sk-...`
`SEMANTIC_SCHOLAR_API_KEY=...`
`HF_TOKEN=...`

Then to load the .env file, type:
Expand All @@ -24,25 +25,21 @@ py.test --envfile path/to/.env
If you have several .env files, add an `env_files` setting to your pytest configuration and list them:

env_files =
.env
.test.env
.deploy.env
.env
.test.env
.deploy.env

## blacklist.csv

Papers that should not be included because they are poorly written or AI-generated.



## Notes

- Sometimes a paper title may appear differently in the arXiv API. For example, "Visual Attention-Prompted Prediction and Learning" (arXiv:2310.08420) is titled "A visual encoding model based on deep neural networks and transfer learning" according to the arXiv API.

- When testing APIs, there may be latency and aborted connections

- Publication dates of papers from IEEE are missing the day about half the time. They may also come in any of the following formats:
- "April 1988"
- "2-4 April 2002"
- "29 Nov.-2 Dec. 2022"


- "April 1988"
- "2-4 April 2002"
- "29 Nov.-2 Dec. 2022"
4 changes: 3 additions & 1 deletion src/prompt_systematic_review/experiments/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
from . import graph_gpt_3_5_benchmarks
from . import run_tomotopy
from . import topicgpt

from . import download_mmlu
from . import graph_internal_references
from . import graph

experiments = [
count_tool_mentions.Experiment,
Expand Down
44 changes: 26 additions & 18 deletions src/prompt_systematic_review/experiments/download_mmlu.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,26 +30,34 @@ def move_and_rename_extracted_contents(extracted_folder, final_folder, new_folde
return mmlu_folder


def download_mmlu():
    """Download the MMLU data tarball and move its contents into place.

    The archive is fetched to ``./data.tar``, unpacked into ``./extracted``
    and then handed to ``move_and_rename_extracted_contents`` with
    ``./data`` as the final folder and ``mmlu`` as the new folder name.
    The temporary archive and extraction directory are removed afterwards,
    even when one of the steps raises.
    """
    # URL of the .tar file
    url = "https://people.eecs.berkeley.edu/~hendrycks/data.tar"

    # Temporary paths
    download_path = "./data.tar"
    extract_path = "./extracted"

    # Final path
    final_data_folder = "./data"
    final_folder_name = "mmlu"

    try:
        # Download and extract the file
        download_and_extract(url, download_path)
        extract_tar(download_path, extract_path)

        # Move and rename the contents of the extracted folder
        move_and_rename_extracted_contents(
            extract_path, final_data_folder, final_folder_name
        )
    finally:
        # Cleanup runs on failure too, so a broken download or extraction
        # does not leave stale temporary artifacts behind.
        if os.path.exists(download_path):
            os.remove(download_path)
        if os.path.exists(extract_path):
            shutil.rmtree(extract_path)
class Experiment:
    """Experiment entry point that runs the MMLU download."""

    @staticmethod
    def run():
        """Run ``download_mmlu``.

        Declared static because it takes no ``self``; this keeps
        ``Experiment.run()`` working and also makes ``Experiment().run()``
        valid instead of raising ``TypeError``.
        """
        download_mmlu()
7 changes: 6 additions & 1 deletion src/prompt_systematic_review/experiments/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def run(self, csv_file_path, technique_to_title):
)


if __name__ == "__main__":
def run_graph():
main = Main()
titles = [
"Bounding the Capabilities of Large Language Models in Open Text Generation with Prompt Constraints",
Expand Down Expand Up @@ -213,3 +213,8 @@ def run(self, csv_file_path, technique_to_title):

csv_file_path = "path_to_your_csv.csv"
main.run(csv_file_path, technique_to_title)


class Experiment:
    """Experiment entry point that runs the graph generation."""

    @staticmethod
    def run():
        """Run ``run_graph``.

        Declared static because it takes no ``self``; this keeps
        ``Experiment.run()`` working and also makes ``Experiment().run()``
        valid instead of raising ``TypeError``.
        """
        run_graph()
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,11 @@ def graph_dataset_citations():

plt.figure(figsize=(10, 6))
plt.bar(datasets, counts, color="#2E8991")
plt.xlabel("Dataset Name")
plt.ylabel("Number of Mentions")
plt.title("Dataset Mentions in Papers")
plt.xticks(rotation=45, ha="right")
plt.xlabel("Dataset Name", fontsize=20)
plt.ylabel("Number of Mentions", fontsize=20)
plt.title("Dataset Mentions in Papers", fontsize=30)
plt.xticks(rotation=45, ha="right", fontsize=15)
plt.yticks(fontsize=15)
plt.tight_layout()

output_dir = os.path.join(DataFolderPath, "experiments_output")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from dotenv import load_dotenv
import csv
import random
import scipy
import networkx as nx
import matplotlib.pyplot as plt
import textwrap
Expand Down Expand Up @@ -188,19 +187,19 @@ def process_papers(self, csv_file_path):
arxiv_paper_id
)
else:
unmatched_papers[
row.get("title", "").strip()
] = "Source not supported"
unmatched_papers[row.get("title", "").strip()] = (
"Source not supported"
)
continue

if paper_id:
references = self.semantic_scholar_api.get_references(paper_id)
if references is not None:
paper_references[paper_id] = references
else:
unmatched_papers[
row["title"]
] = "No references found or error occurred"
unmatched_papers[row["title"]] = (
"No references found or error occurred"
)
else:
print(f"Paper Id Could not be found for: {row}")
else:
Expand Down Expand Up @@ -428,7 +427,7 @@ def visualize_chart(self, technique_to_title):
)


if __name__ == "__main__":
def graph_internal_references():
main = Main()

titles = [
Expand Down Expand Up @@ -533,3 +532,8 @@ def visualize_chart(self, technique_to_title):
"Rephrase and Respond: Let Large Language Models Ask Better Questions for Themselves": "Rephrase and Respond",
}
main.visualize_chart(technique_to_title)


class Experiment:
    """Experiment entry point that runs the internal-reference graphing."""

    @staticmethod
    def run():
        """Run ``graph_internal_references``.

        Declared static because it takes no ``self``; this keeps
        ``Experiment.run()`` working and also makes ``Experiment().run()``
        valid instead of raising ``TypeError``.
        """
        graph_internal_references()

0 comments on commit c0ec018

Please sign in to comment.