diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index fd04aae..4e75c58 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -11,6 +11,7 @@ on:
 
 jobs:
   build:
+    environment: HF
 
     runs-on: ubuntu-latest
     strategy:
@@ -36,5 +37,7 @@ jobs:
         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
     - name: Test with pytest
+      env:
+        HF_AUTH_TOKEN: ${{ secrets.HF_AUTH_TOKEN }}
       run: |
-        pytest
+        export HF_AUTH_TOKEN=${{ secrets.HF_AUTH_TOKEN }} && pytest
diff --git a/.gitignore b/.gitignore
index 41bddba..9e26863 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 __pycache__
 _build
 .DS_Store
-build/
\ No newline at end of file
+build/
+*.egg-info/
diff --git a/data/prompt_engineering_reviewed.csv b/data/prompt_engineering_reviewed.csv
index 8b070da..a365a81 100644
--- a/data/prompt_engineering_reviewed.csv
+++ b/data/prompt_engineering_reviewed.csv
@@ -1,17 +1,21 @@
-Title, Link, Labels
- "Don't Complete It! Preventing Unhelpful Code Completion for Productive and Sustainable Neural Code Completion Systems", https://arxiv.org/pdf/2209.05948v2.pdf, Analysis of Technique;New Model;Applying Prompts/prompt Engineering
- "Unleashing the potential of prompt engineering in Large Language Models: a comprehensive review", https://arxiv.org/abs/2310.08129v1, Survey;Analysis of Technique;Applying Prompts/prompt Engineering;Agents
- "LoGoPrompt: Synthetic Text Images Can Be Good Visual Prompts for Vision-Language Models", https://arxiv.org/pdf/2309.01155v2.pdf, Survey;Analysis of Technique;Applying Promtps/Prompt Engineering
- "Prompt Engineering For Students of Medicine and Their Teachers", https://arxiv.org/pdf/2308.11628v1.pdf, Survey;Analysis of Technique;Applying prompts/prompt engineering
- "A Systematic Survey of Prompt Engineering on Vision-Language Foundation Models", https://arxiv.org/pdf/2307.12980v1.pdf, Survey;Other modality than text;Applying Prompts/prompt Engineering
-"Cutting Down on Prompts and Parameters: Simple Few-Shot Learning with Language Models", https://arxiv.org/abs/2106.13353v2, New Technique;Analysis of Technique;Applying Prompts/Prompt Engineering
-"Tailored Visions: Enhancing Text-to-Image Generation with Personalized Prompt Rewriting", https://arxiv.org/abs/2310.08129v1, New Technique;Analysis of Technique;Other Modality than Text;Applying Prompts/Prompt Engineering
-"Large Language Models in the Workplace: A Case Study on Prompt Engineering for Job Type Classification", https://arxiv.org/abs/2303.07142v3, Applying Promtps/Prompt Engineering;Analysis of Model
-"Open-Ended Instructable Embodied Agents with Memory-Augmented Large Language Models", https://arxiv.org/abs/2310.15127v1, New model;New Technique;Agents
-"Addressing Compiler Errors: Stack Overflow or Large Language Models?", https://arxiv.org/abs/2307.10793v1, Analysis of model;Applying Prompts/Prompt Engineering
-"Prompt Learning for Action Recognition", http://arxiv.org/abs/2305.12437v1, New Technique; Agents; Analysis of technique; Applying prompts/prompt engineering; Other modality than text
-"Effective Structured Prompting by Meta-Learning and Representative Verbalizer",http://arxiv.org/abs/2306.00618v1, New Technique; Analysis of technique
-"Invalid Logic, Equivalent Gains: The Bizarreness of Reasoning in Language Model Prompting",http://arxiv.org/abs/2307.10573v2, Analysis of technique
-"High-Fidelity Lake Extraction via Two-Stage Prompt Enhancement: Establishing a Novel Baseline and Benchmark",http://arxiv.org/abs/2308.08443v1, Analysis of technique; Other modality than text; Applying prompts/prompt engineering
-Structured Prompt Tuning,http://arxiv.org/abs/2205.12309v1, New Technique
-
+Title, Link, Labels
+"The Creativity of Text-to-Image Generation", http://dx.doi.org/10.1145/3569219.3569352, Applying prompts/prompt engineering
+"Beyond Traditional Teaching: The Potential of Large Language Models and Chatbots in Graduate Engineering Education", http://dx.doi.org/10.32388/MD04B0, Analysis of model;Applying prompts/prompt engineering;Agents;Ethics/Consequences
+"Collective Creativity in Crowd Ideation", http://dx.doi.org/10.1145/3411764.3445782, Survey;Analysis of technique;Applying prompts/prompt engineering;Agents
+"Massive prompt cusps: A new signature of warm dark matter", http://dx.doi.org/10.1093/mnrasl/slad043, Analysis of technique;Analysis of model
+"Scientific Literature Text Mining and the Case for Open Access", http://dx.doi.org/10.21428/14888, New model
+ "Don't Complete It! Preventing Unhelpful Code Completion for Productive and Sustainable Neural Code Completion Systems", https://arxiv.org/pdf/2209.05948v2.pdf, Analysis of Technique;New Model;Applying Prompts/prompt Engineering
+ "Unleashing the potential of prompt engineering in Large Language Models: a comprehensive review", https://arxiv.org/abs/2310.08129v1, Survey;Analysis of Technique;Applying Prompts/prompt Engineering;Agents
+ "LoGoPrompt: Synthetic Text Images Can Be Good Visual Prompts for Vision-Language Models", https://arxiv.org/pdf/2309.01155v2.pdf, Survey;Analysis of Technique;Applying Promtps/Prompt Engineering
+ "Prompt Engineering For Students of Medicine and Their Teachers", https://arxiv.org/pdf/2308.11628v1.pdf, Survey;Analysis of Technique;Applying prompts/prompt engineering
+ "A Systematic Survey of Prompt Engineering on Vision-Language Foundation Models", https://arxiv.org/pdf/2307.12980v1.pdf, Survey;Other modality than text;Applying Prompts/prompt Engineering
+"Cutting Down on Prompts and Parameters: Simple Few-Shot Learning with Language Models", https://arxiv.org/abs/2106.13353v2, New Technique;Analysis of Technique;Applying Prompts/Prompt Engineering
+"Tailored Visions: Enhancing Text-to-Image Generation with Personalized Prompt Rewriting", https://arxiv.org/abs/2310.08129v1, New Technique;Analysis of Technique;Other Modality than Text;Applying Prompts/Prompt Engineering
+"Large Language Models in the Workplace: A Case Study on Prompt Engineering for Job Type Classification", https://arxiv.org/abs/2303.07142v3, Applying Promtps/Prompt Engineering;Analysis of Model
+"Open-Ended Instructable Embodied Agents with Memory-Augmented Large Language Models", https://arxiv.org/abs/2310.15127v1, New model;New Technique;Agents
+"Addressing Compiler Errors: Stack Overflow or Large Language Models?", https://arxiv.org/abs/2307.10793v1, Analysis of model;Applying Prompts/Prompt Engineering
+"Prompt Learning for Action Recognition", http://arxiv.org/abs/2305.12437v1, New Technique; Agents; Analysis of technique; Applying prompts/prompt engineering; Other modality than text
+"Effective Structured Prompting by Meta-Learning and Representative Verbalizer",http://arxiv.org/abs/2306.00618v1, New Technique; Analysis of technique
+"Invalid Logic, Equivalent Gains: The Bizarreness of Reasoning in Language Model Prompting",http://arxiv.org/abs/2307.10573v2, Analysis of technique
+"High-Fidelity Lake Extraction via Two-Stage Prompt Enhancement: Establishing a Novel Baseline and Benchmark",http://arxiv.org/abs/2308.08443v1, Analysis of technique; Other modality than text; Applying prompts/prompt engineering
+Structured Prompt Tuning,http://arxiv.org/abs/2205.12309v1, New Technique
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 89f4bc4..e4c9c6e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,9 @@ numpy
 pandas
 requests
 matplotlib
+datasets
+huggingface_hub
+pytest
 jellyfish
 black
 sphinx
diff --git a/setup.cfg b/setup.cfg
index 54e80e8..23327fb 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -31,6 +31,9 @@ install_requires =
     requests
     matplotlib
     jellyfish
+    datasets
+    huggingface_hub
+    black
 
 [options.packages.find]
 where=src
diff --git a/src/prompt_systematic_review/pipeline.py b/src/prompt_systematic_review/pipeline.py
new file mode 100644
index 0000000..65fd7b6
--- /dev/null
+++ b/src/prompt_systematic_review/pipeline.py
@@ -0,0 +1,61 @@
+from huggingface_hub import HfFileSystem, login
+import pandas as pd
+from io import StringIO
+import os
+
+
+"""
+READ THIS
+https://docs.github.com/en/actions/security-guides/using-secrets-in-github-actions
+https://huggingface.co/docs/huggingface_hub/v0.18.0.rc0/guides/hf_file_system
+
+"""
+
+
+class Pipeline:
+    def __init__(self, token=None, revision="main"):
+        self.token = token
+        self.root = f"hf://datasets/PromptSystematicReview/Prompt_Systematic_Review_Dataset@{revision}/"
+        if token is not None:
+            self.fs = HfFileSystem(token=token)
+            login(token=token)
+        else:
+            self.fs = HfFileSystem()
+        self.revision = revision
+
+    def is_logged_in(self):
+        return self.token is not None
+
+    def get_revision(self):
+        return self.revision
+
+    def set_revision(self, revision):
+        try:
+            assert revision.isalnum()
+            self.revision = revision
+            self.root = f"hf://datasets/PromptSystematicReview/Prompt_Systematic_Review_Dataset@{revision}/"
+        except Exception:
+            raise ValueError("Revision must be alphanumeric")
+
+    def login(self, token):
+        if self.token is not None:
+            raise ValueError("Already Logged In")
+        else:
+            self.fs = HfFileSystem(token=token)
+            login(token=token)
+            self.token = token
+
+    def get_all_files(self):
+        return self.fs.ls(self.root, detail=False, revision=self.revision)
+
+    def get_all_data_files(self):
+        return self.fs.glob(self.root + "**.csv", revision=self.revision)
+
+    def read_from_file(self, fileName):
+        return pd.read_csv(os.path.join(self.root, fileName))
+
+    def write_to_file(self, fileName, dataFrame):
+        if not self.is_logged_in():
+            raise ValueError("Not Logged In")
+        path = os.path.join(self.root, fileName)
+        dataFrame.to_csv(path, index=False)
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
new file mode 100644
index 0000000..102680e
--- /dev/null
+++ b/tests/test_pipeline.py
@@ -0,0 +1,64 @@
+# test_pipeline.py
+import pytest
+import os
+from prompt_systematic_review.pipeline import *
+from huggingface_hub import delete_file
+import random
+import time
+import hashlib
+
+
+def hashString(bytes):
+    return str(hashlib.md5(bytes).hexdigest())
+
+
+@pytest.fixture
+def client():
+    return Pipeline(token=os.environ["HF_AUTH_TOKEN"], revision="test")
+
+
+def test_login():
+    testClient = Pipeline(revision="test")
+    assert testClient.is_logged_in() == False
+    testClient.login(os.environ["HF_AUTH_TOKEN"])
+    assert testClient.is_logged_in() == True
+
+
+def test_get_all_files(client):
+    assert len(client.get_all_files()) > 0
+
+
+def test_get_all_data_files(client):
+    assert len(client.get_all_data_files()) > 0
+    assert all([x.endswith(".csv") for x in client.get_all_data_files()])
all([x.endswith(".csv") for x in client.get_all_data_files()]) + + +def test_read_from_file(client): + assert len(client.read_from_file("test.csv")) > 0 + assert len(client.read_from_file("test.csv").columns) == 2 + assert client.read_from_file("test.csv")["Age"].mean() == 21 + + +def test_write_to_file(client): + lenOfFiles = len(client.get_all_files()) + randString = random.randbytes(100) + str(time.time()).encode() + randHash = hashString(randString) + csvDict = {"test": [1, 3], "test2": [2, 4]} + print(client.revision) + client.write_to_file(f"{randHash[:10]}_test.csv", pd.DataFrame(csvDict)) + print(client.revision) + time.sleep(1) + # assert client.revision == "main" + df = client.read_from_file(f"{randHash[:10]}_test.csv") + assert df["test"].sum() == 4 + assert df["test2"].sum() == 6 + # time.sleep(1) + print(client.root + f"{randHash[:10]}_test.csv") + delete_file( + f"{randHash[:10]}_test.csv", + "PromptSystematicReview/Prompt_Systematic_Review_Dataset", + repo_type="dataset", + revision="test", + ) + + assert len(client.get_all_files()) == lenOfFiles