From 8df256d749d598cec3ca260e62d6b4836039fa6f Mon Sep 17 00:00:00 2001 From: hudssntao <145732608+hudssntao@users.noreply.github.com> Date: Fri, 27 Oct 2023 00:06:59 -0400 Subject: [PATCH 01/24] Add files via upload --- src/prompt_systematic_review/pipeline.py | 38 ++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 src/prompt_systematic_review/pipeline.py diff --git a/src/prompt_systematic_review/pipeline.py b/src/prompt_systematic_review/pipeline.py new file mode 100644 index 0000000..1820ae9 --- /dev/null +++ b/src/prompt_systematic_review/pipeline.py @@ -0,0 +1,38 @@ +from datasets import load_dataset, concatenate_datasets + +""" +Gets Huggingface dataset +dataset_name: The name of the dataset on Huggingface +user_name: The name under which the dataset has been created +Returns a Dataset object +""" +def get(dataset_name, user_name): + return load_dataset(user_name + "/" + dataset_name) + +""" +Replaces entire database in Huggingface +dataset_name: The name of the dataset on Huggingface +new_dataset: A Dataset object containing the new dataset +""" +def push(dataset_name, new_dataset): + new_dataset.push_to_hub(dataset_name) + +""" +Appends to current dataset in Huggingface (experimental) +user_name: The name under which the dataset has been created +dataset_name: The name of the dataset on Huggingface +new_dataset: A Dataset object containing the new dataset +""" +def append(user_name, dataset_name, new_dataset): + + #Try-except for empty dataset case + try: + #Assuming split 'train' as it seems to be the default split + dataset = load_dataset(user_name + "/" + dataset_name, split="train") + appended_dataset = concatenate_datasets([dataset,new_dataset]) + appended_dataset.push_to_hub(dataset_name) + except: + push(dataset_name, new_dataset) + + + From 06cc8d563ef9e3c592ed900fb569a91d225fec15 Mon Sep 17 00:00:00 2001 From: hudssntao <145732608+hudssntao@users.noreply.github.com> Date: Fri, 27 Oct 2023 00:08:10 -0400 Subject: [PATCH 02/24] Add files via upload --- tests/pipeline_tests.py | 47 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 tests/pipeline_tests.py diff --git a/tests/pipeline_tests.py b/tests/pipeline_tests.py new file mode 100644 index 0000000..dbd0b55 --- /dev/null +++ b/tests/pipeline_tests.py @@ -0,0 +1,47 @@ +from datasets import Dataset +import time +from pipeline import push, append, get + +def testingPushAndGet(dataset_name, user_name): + sample_data = { + "column1": ["value1", "value2", "value3"], + "column2": ["valueA", "valueB", "valueC"], + } + + sample_dataset = Dataset.from_dict(sample_data) + + push(dataset_name, sample_dataset) + assert(getRows(dataset_name, user_name) == 3) + +#Append seems to work on Hugging face but I'm having trouble with the Database API, will leave out this test for now. +# def testingAppend(dataset_name, user_name): +# sample_data = { +# "column1": ["value4", "value5"], +# "column2": ["valueD", "valueE"], +# } + +# sample_dataset = Dataset.from_dict(sample_data) + +# append(user_name, dataset_name, sample_dataset) + +# assert(getRows(dataset_name, user_name) == 5) + + +#Returns the number of rows in the dataset +def getRows(dataset_name, user_name): + my_dataset = get(dataset_name, user_name)['train'] + + return my_dataset.num_rows + +if (__name__ == "__main__"): + + USERNAME = "hudssntao" + DATASET_NAME = "test2" + + testingPushAndGet(DATASET_NAME, USERNAME) + + # testingAppend(DATASET_NAME, USERNAME) + + + + From d807248ff09fa0066856916d50ae203a9ffefd4e Mon Sep 17 00:00:00 2001 From: Michael Ilie Date: Fri, 27 Oct 2023 16:18:38 -0400 Subject: [PATCH 03/24] added pipeline and tests --- src/prompt_systematic_review/pipeline.py | 52 +++++++++--------------- tests/test_pipeline.py | 6 +++ 2 files changed, 25 insertions(+), 33 deletions(-) create mode 100644 tests/test_pipeline.py diff --git a/src/prompt_systematic_review/pipeline.py b/src/prompt_systematic_review/pipeline.py index 1820ae9..bb5f2f6 100644 --- a/src/prompt_systematic_review/pipeline.py +++ b/src/prompt_systematic_review/pipeline.py @@ -1,38 +1,24 @@ -from datasets import load_dataset, concatenate_datasets +from datasets import load_dataset, Dataset +import os -""" -Gets Huggingface dataset -dataset_name: The name of the dataset on Huggingface -user_name: The name under which the dataset has been created -Returns a Dataset object -""" -def get(dataset_name, user_name): - return load_dataset(user_name + "/" + dataset_name) -""" -Replaces entire database in Huggingface -dataset_name: The name of the dataset on Huggingface -new_dataset: A Dataset object containing the new dataset -""" -def push(dataset_name, new_dataset): - new_dataset.push_to_hub(dataset_name) - -""" -Appends to current dataset in Huggingface (experimental) -user_name: The name under which the dataset has been created -dataset_name: The name of the dataset on Huggingface -new_dataset: A Dataset object containing the new dataset -""" -def append(user_name, dataset_name, new_dataset): +def is_authenticated_with_huggingface(): + """ + Check if the user is authenticated with huggingface-cli. + Returns: + - bool: True if authenticated, False otherwise. + """ + token_path = os.path.join(os.path.expanduser("~"), ".huggingface", "token") + # Check if the token file exists + if not os.path.exists(token_path): + return False + # Check if the token file has content (i.e., the token) + with open(token_path, 'r') as f: + token = f.read().strip() + if not token: + return False - #Try-except for empty dataset case - try: - #Assuming split 'train' as it seems to be the default split - dataset = load_dataset(user_name + "/" + dataset_name, split="train") - appended_dataset = concatenate_datasets([dataset,new_dataset]) - appended_dataset.push_to_hub(dataset_name) - except: - push(dataset_name, new_dataset) - + return True +#make sure diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py new file mode 100644 index 0000000..8b42dde --- /dev/null +++ b/tests/test_pipeline.py @@ -0,0 +1,6 @@ +# test_pipeline.py +import pytest +from prompt_systematic_review.pipeline import is_authenticated + +def test_is_authenticated(): + assert is_authenticated() == False \ No newline at end of file From 7493c163f76e59e094296ff4467c6e8321137231 Mon Sep 17 00:00:00 2001 From: Michael Ilie Date: Fri, 27 Oct 2023 16:20:40 -0400 Subject: [PATCH 04/24] updated cfg and deps --- requirements.txt | 2 ++ setup.cfg | 2 ++ 2 files changed, 4 insertions(+) diff --git a/requirements.txt b/requirements.txt index f6687e1..632d313 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,6 @@ numpy pandas requests matplotlib +datasets +huggingface_hub -e . \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index 27c3bd9..e9b1a37 100644 --- a/setup.cfg +++ b/setup.cfg @@ -30,6 +30,8 @@ install_requires = pandas requests matplotlib + datasets + huggingface_hub [options.packages.find] where=src From 9b83b124ec9c22628464ea0c27f9c83e59867571 Mon Sep 17 00:00:00 2001 From: Michael Ilie Date: Fri, 27 Oct 2023 16:24:56 -0400 Subject: [PATCH 05/24] my tests are cursed --- src/prompt_systematic_review/pipeline.py | 5 ++++- tests/test_pipeline.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/prompt_systematic_review/pipeline.py b/src/prompt_systematic_review/pipeline.py index bb5f2f6..d8b844c 100644 --- a/src/prompt_systematic_review/pipeline.py +++ b/src/prompt_systematic_review/pipeline.py @@ -2,7 +2,7 @@ import os -def is_authenticated_with_huggingface(): +def is_authenticated(): """ Check if the user is authenticated with huggingface-cli. Returns: @@ -22,3 +22,6 @@ def is_authenticated_with_huggingface(): #make sure + +def dummyfunc(): + return True \ No newline at end of file diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 8b42dde..9fb9a64 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -3,4 +3,7 @@ from prompt_systematic_review.pipeline import is_authenticated def test_is_authenticated(): - assert is_authenticated() == False \ No newline at end of file + assert is_authenticated() == False + +def test_dummy(): + assert 1 == 1 \ No newline at end of file From 5a47552417b86e201ccf7e91b248ac9a19d88b68 Mon Sep 17 00:00:00 2001 From: solarwaffles <107089996+solarwaffles@users.noreply.github.com> Date: Fri, 27 Oct 2023 18:37:45 -0400 Subject: [PATCH 06/24] Added labels to five papers --- data/prompt_engineering_reviewed.csv | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/data/prompt_engineering_reviewed.csv b/data/prompt_engineering_reviewed.csv index e69de29..06b4043 100644 --- a/data/prompt_engineering_reviewed.csv +++ b/data/prompt_engineering_reviewed.csv @@ -0,0 +1,22 @@ +"The Creativity of Text-to-Image Generation" +- Applying prompts/prompt engineering + +"Beyond Traditional Teaching: The Potential of Large Language Models and Chatbots in Graduate Engineering Education" +- Analysis of model +- Applying prompts/prompt engineering +- Agents +- Ethics/Consequences + +"Collective Creativity in Crowd Ideation" +- Survey +- Analysis of technique +- Applying prompts/prompt engineering +- Agents + +"Massive prompt cusps: A new signature of warm dark matter" +- Analysis of technique +- Analysis of model + + +"Scientific Literature Text Mining and the Case for Open Access" - +- New model From acebdb3cc0288fd84c268d048637a1e1861e65bb Mon Sep 17 00:00:00 2001 From: solarwaffles <107089996+solarwaffles@users.noreply.github.com> Date: Sun, 29 Oct 2023 20:47:48 -0400 Subject: [PATCH 07/24] Update prompt_engineering_reviewed.csv --- data/prompt_engineering_reviewed.csv | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/data/prompt_engineering_reviewed.csv b/data/prompt_engineering_reviewed.csv index 06b4043..ee29a4b 100644 --- a/data/prompt_engineering_reviewed.csv +++ b/data/prompt_engineering_reviewed.csv @@ -1,22 +1,6 @@ -"The Creativity of Text-to-Image Generation" -- Applying prompts/prompt engineering - -"Beyond Traditional Teaching: The Potential of Large Language Models and Chatbots in Graduate Engineering Education" -- Analysis of model -- Applying prompts/prompt engineering -- Agents -- Ethics/Consequences - -"Collective Creativity in Crowd Ideation" -- Survey -- Analysis of technique -- Applying prompts/prompt engineering -- Agents - -"Massive prompt cusps: A new signature of warm dark matter" -- Analysis of technique -- Analysis of model - - -"Scientific Literature Text Mining and the Case for Open Access" - -- New model +Title, Link, Labels +"The Creativity of Text-to-Image Generation", http://dx.doi.org/10.1145/3569219.3569352, Applying prompts/prompt engineering +"Beyond Traditional Teaching: The Potential of Large Language Models and Chatbots in Graduate Engineering Education", http://dx.doi.org/10.32388/MD04B0, Analysis of model;Applying prompts/prompt engineering;Agents;Ethics/Consequences +"Collective Creativity in Crowd Ideation", http://dx.doi.org/10.1145/3411764.3445782, Survey;Analysis of technique;Applying prompts/prompt engineering;Agents +"Massive prompt cusps: A new signature of warm dark matter", http://dx.doi.org/10.1093/mnrasl/slad043, Analysis of technique;Analysis of model +"Scientific Literature Text Mining and the Case for Open Access", http://dx.doi.org/10.21428/14888, New model From 9ff75b2e45666fd9a507ee6b5775a4597529ed93 Mon Sep 17 00:00:00 2001 From: Michael Ilie Date: Mon, 30 Oct 2023 18:59:20 -0400 Subject: [PATCH 08/24] added pipeline and tests II --- src/prompt_systematic_review/pipeline.py | 66 +++++++++++++++++------- tests/test_pipeline.py | 44 ++++++++++++++-- 2 files changed, 85 insertions(+), 25 deletions(-) diff --git a/src/prompt_systematic_review/pipeline.py b/src/prompt_systematic_review/pipeline.py index d8b844c..c9e55c3 100644 --- a/src/prompt_systematic_review/pipeline.py +++ b/src/prompt_systematic_review/pipeline.py @@ -1,27 +1,53 @@ -from datasets import load_dataset, Dataset +from huggingface_hub import HfFileSystem +import pandas as pd import os -def is_authenticated(): - """ - Check if the user is authenticated with huggingface-cli. - Returns: - - bool: True if authenticated, False otherwise. - """ - token_path = os.path.join(os.path.expanduser("~"), ".huggingface", "token") - # Check if the token file exists - if not os.path.exists(token_path): - return False - # Check if the token file has content (i.e., the token) - with open(token_path, 'r') as f: - token = f.read().strip() - if not token: - return False + + +""" +READ THIS +https://docs.github.com/en/actions/security-guides/using-secrets-in-github-actions +https://huggingface.co/docs/huggingface_hub/v0.18.0.rc0/guides/hf_file_system + +""" + + + + +class Pipeline: + + def __init__(self, token=None): + self.token = token + self.root = "datasets/PromptSystematicReview/Prompt_Systematic_Review_Dataset/" + if token is not None: + self.fs = HfFileSystem(token=token) + else: + self.fs = HfFileSystem() + + def is_logged_in(self): + return self.token is not None - return True + def login(self, token): + if self.token is not None: + raise ValueError("Already Logged In") + else: + self.fs = HfFileSystem(token=self.token) + self.token = token + + def get_all_files(self): + return self.fs.ls(self.root, detail=False) + + def get_all_data_files(self): + return self.fs.glob(self.root+"**.csv") + + def read_from_file(self, fileName): + return pd.read_csv(self.fs.open(os.path.join(self.root, fileName))) + + def write_to_file(self, fileName, dataFrame): + path = os.path.join(self.root, fileName) + self.fs.write_text(path, dataFrame.to_csv(index=False)) + -#make sure -def dummyfunc(): - return True \ No newline at end of file diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 9fb9a64..a68eae8 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -1,9 +1,43 @@ # test_pipeline.py import pytest -from prompt_systematic_review.pipeline import is_authenticated +import os +from prompt_systematic_review.pipeline import * +import random +import time +import hashlib -def test_is_authenticated(): - assert is_authenticated() == False -def test_dummy(): - assert 1 == 1 \ No newline at end of file +def hashString(bytes): + return hashlib.sha256(bytes).hexdigest() + +@pytest.fixture +def client(): + return Pipeline(token=os.environ['HF_TOKEN']) + +def test_login(): + testClient = Pipeline() + assert testClient.is_logged_in() == False + testClient.login(os.environ['HF_TOKEN']) + assert testClient.is_logged_in() == True + +def test_get_all_files(client): + assert len(client.get_all_files()) > 0 + +def test_get_all_data_files(client): + assert len(client.get_all_data_files()) > 0 + assert all([x.endswith(".csv") for x in client.get_all_data_files()]) + +def test_read_from_file(client): + assert len(client.read_from_file("test.csv")) > 0 + assert len(client.read_from_file("test.csv").columns) == 2 + assert client.read_from_file("test.csv")[" Age"].mean() == 21 + +def test_write_to_file(client): + randString = random.randbytes(100) + str(time.time()).encode() + randHash = hashString(randString) + csvDict = {"test":[1,3], "test2":[2,4]} + client.write_to_file(f"test_{randHash}.csv", pd.DataFrame(csvDict)) + df = client.read_from_file(f"test_{randHash}.csv") + assert df["test"].sum() == 4 + assert df["test2"].sum() == 6 + From 7ce9c26213a79950725db48a0a9a939b47c9128c Mon Sep 17 00:00:00 2001 From: Michael Ilie Date: Mon, 30 Oct 2023 19:07:20 -0400 Subject: [PATCH 09/24] fixing env secret actions issue --- .github/workflows/python-package.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index fd04aae..ef51487 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -36,5 +36,7 @@ jobs: # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - name: Test with pytest + env: + HF_AUTH_TOKEN: ${{ secrets.HF_AUTH_TOKEN }} run: | - pytest + export HF_AUTH_TOKEN=${{ secrets.HF_AUTH_TOKEN }} && pytest From 74fa103efc469da984467a69c605b25400d6493c Mon Sep 17 00:00:00 2001 From: Michael Ilie Date: Mon, 30 Oct 2023 19:08:44 -0400 Subject: [PATCH 10/24] fixing pipeline --- tests/test_pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index a68eae8..506d069 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -12,12 +12,12 @@ def hashString(bytes): @pytest.fixture def client(): - return Pipeline(token=os.environ['HF_TOKEN']) + return Pipeline(token=os.environ['HF_AUTH_TOKEN']) def test_login(): testClient = Pipeline() assert testClient.is_logged_in() == False - testClient.login(os.environ['HF_TOKEN']) + testClient.login(os.environ['HF_AUTH_TOKEN']) assert testClient.is_logged_in() == True def test_get_all_files(client): From b61ed542347d1953df633ed25cb6accabc10c545 Mon Sep 17 00:00:00 2001 From: Michael Ilie Date: Mon, 30 Oct 2023 19:16:11 -0400 Subject: [PATCH 11/24] delete test file after pipeline test --- tests/test_pipeline.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 506d069..904996f 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -33,6 +33,7 @@ def test_read_from_file(client): assert client.read_from_file("test.csv")[" Age"].mean() == 21 def test_write_to_file(client): + lenOfFiles = len(client.get_all_files()) randString = random.randbytes(100) + str(time.time()).encode() randHash = hashString(randString) csvDict = {"test":[1,3], "test2":[2,4]} @@ -40,4 +41,6 @@ def test_write_to_file(client): df = client.read_from_file(f"test_{randHash}.csv") assert df["test"].sum() == 4 assert df["test2"].sum() == 6 + client.fs.delete(f"test_{randHash}.csv") + assert len(client.get_all_files()) == lenOfFiles From 0aecdb3ccd91327c2c8971d5f32f54bf60f3c709 Mon Sep 17 00:00:00 2001 From: solarwaffles <107089996+solarwaffles@users.noreply.github.com> Date: Tue, 31 Oct 2023 01:46:22 +0000 Subject: [PATCH 12/24] Added Labels For Papers --- data/prompt_engineering_reviewed.csv | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/data/prompt_engineering_reviewed.csv b/data/prompt_engineering_reviewed.csv index 40e287e..14975ee 100644 --- a/data/prompt_engineering_reviewed.csv +++ b/data/prompt_engineering_reviewed.csv @@ -4,6 +4,11 @@ Title, Link, Labels "Collective Creativity in Crowd Ideation", http://dx.doi.org/10.1145/3411764.3445782, Survey;Analysis of technique;Applying prompts/prompt engineering;Agents "Massive prompt cusps: A new signature of warm dark matter", http://dx.doi.org/10.1093/mnrasl/slad043, Analysis of technique;Analysis of model "Scientific Literature Text Mining and the Case for Open Access", http://dx.doi.org/10.21428/14888, New model +"Don't Complete It! Preventing Unhelpful Code Completion for Productive and Sustainable Neural Code Completion Systems", https://arxiv.org/pdf/2209.05948v2.pdf, Analysis of Technique;New Model;Applying Prompts/prompt Engineering + "Unleashing the potential of prompt engineering in Large Language Models: a comprehensive review", https://arxiv.org/abs/2310.08129v1, Survey;Analysis of Technique;Applying Prompts/prompt Engineering;Agents + "LoGoPrompt: Synthetic Text Images Can Be Good Visual Prompts for Vision-Language Models", https://arxiv.org/pdf/2309.01155v2.pdf, Survey;Analysis of Technique;Applying Promtps/Prompt Engineering + "Prompt Engineering For Students of Medicine and Their Teachers", https://arxiv.org/pdf/2308.11628v1.pdf, Survey;Analysis of Technique;Applying prompts/prompt engineering + "A Systematic Survey of Prompt Engineering on Vision-Language Foundation Models", https://arxiv.org/pdf/2307.12980v1.pdf, Survey;Other modality than text;Applying Prompts/prompt Engineering "Cutting Down on Prompts and Parameters: Simple Few-Shot Learning with Language Models", https://arxiv.org/abs/2106.13353v2, New Technique;Analysis of Technique;Applying Prompts/Prompt Engineering "Tailored Visions: Enhancing Text-to-Image Generation with Personalized Prompt Rewriting", https://arxiv.org/abs/2310.08129v1, New Technique;Analysis of Technique;Other Modality than Text;Applying Prompts/Prompt Engineering "Large Language Models in the Workplace: A Case Study on Prompt Engineering for Job Type Classification", https://arxiv.org/abs/2303.07142v3, Applying Promtps/Prompt Engineering;Analysis of Model @@ -13,6 +18,4 @@ Title, Link, Labels "Effective Structured Prompting by Meta-Learning and Representative Verbalizer",http://arxiv.org/abs/2306.00618v1, New Technique; Analysis of technique "Invalid Logic, Equivalent Gains: The Bizarreness of Reasoning in Language Model Prompting",http://arxiv.org/abs/2307.10573v2, Analysis of technique "High-Fidelity Lake Extraction via Two-Stage Prompt Enhancement: Establishing a Novel Baseline and Benchmark",http://arxiv.org/abs/2308.08443v1, Analysis of technique; Other modality than text; Applying prompts/prompt engineering -Structured Prompt Tuning,http://arxiv.org/abs/2205.12309v1, New Technique - - +Structured Prompt Tuning,http://arxiv.org/abs/2205.12309v1, New Technique \ No newline at end of file From 68e5122f07f375230ef5169ef7a14371204fd093 Mon Sep 17 00:00:00 2001 From: trigaten Date: Mon, 30 Oct 2023 21:48:27 -0400 Subject: [PATCH 13/24] fix(requirements): add pytest --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 632d313..4d4ff7d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,5 @@ requests matplotlib datasets huggingface_hub +pytest -e . \ No newline at end of file From 0c06bf166a1a7ea0083460b7e367e4e51f960a7b Mon Sep 17 00:00:00 2001 From: trigaten Date: Mon, 30 Oct 2023 21:49:03 -0400 Subject: [PATCH 14/24] fix(gitignore): ignore egg --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index ed8ebf5..612f383 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -__pycache__ \ No newline at end of file +__pycache__ +*.egg-info/ \ No newline at end of file From 25676f163a9656c3f65fdcb1c134234335fdd780 Mon Sep 17 00:00:00 2001 From: Michael Ilie Date: Tue, 31 Oct 2023 15:48:23 -0400 Subject: [PATCH 15/24] update pipeline --- .github/workflows/python-package.yml | 1 + src/prompt_systematic_review/pipeline.py | 19 ++++++++++++++----- tests/test_pipeline.py | 21 ++++++++++++++------- 3 files changed, 29 insertions(+), 12 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index ef51487..4e75c58 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -11,6 +11,7 @@ on: jobs: build: + environment: HF runs-on: ubuntu-latest strategy: diff --git a/src/prompt_systematic_review/pipeline.py b/src/prompt_systematic_review/pipeline.py index c9e55c3..da31e3a 100644 --- a/src/prompt_systematic_review/pipeline.py +++ b/src/prompt_systematic_review/pipeline.py @@ -1,5 +1,6 @@ from huggingface_hub import HfFileSystem import pandas as pd +from io import StringIO import os @@ -17,13 +18,14 @@ class Pipeline: - def __init__(self, token=None): + def __init__(self, token=None,revision="test"): self.token = token self.root = "datasets/PromptSystematicReview/Prompt_Systematic_Review_Dataset/" if token is not None: self.fs = HfFileSystem(token=token) else: self.fs = HfFileSystem() + self.revision=revision def is_logged_in(self): return self.token is not None @@ -36,17 +38,24 @@ def login(self, token): self.token = token def get_all_files(self): - return self.fs.ls(self.root, detail=False) + return self.fs.ls(self.root, detail=False,revision=self.revision) def get_all_data_files(self): - return self.fs.glob(self.root+"**.csv") + return self.fs.glob(self.root+"**.csv",revision=self.revision) def read_from_file(self, fileName): - return pd.read_csv(self.fs.open(os.path.join(self.root, fileName))) + print("TTTTT", os.path.join(self.root, fileName), self.revision) + self.fs = None + self.fs = HfFileSystem(token=self.token) + text = self.fs.read_text(os.path.join(self.root, fileName),revision=self.revision) + return pd.read_csv(StringIO(text)) + def write_to_file(self, fileName, dataFrame): path = os.path.join(self.root, fileName) - self.fs.write_text(path, dataFrame.to_csv(index=False)) + self.fs.write_text(path, dataFrame.to_csv(index=False),revision=self.revision) + self.fs = None + self.fs = HfFileSystem(token=self.token) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 904996f..39facdd 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -8,14 +8,14 @@ def hashString(bytes): - return hashlib.sha256(bytes).hexdigest() + return str(hashlib.md5(bytes).hexdigest()) @pytest.fixture def client(): - return Pipeline(token=os.environ['HF_AUTH_TOKEN']) + return Pipeline(token=os.environ['HF_AUTH_TOKEN'],revision="test") def test_login(): - testClient = Pipeline() + testClient = Pipeline(revision="test") assert testClient.is_logged_in() == False testClient.login(os.environ['HF_AUTH_TOKEN']) assert testClient.is_logged_in() == True @@ -37,10 +37,17 @@ def test_write_to_file(client): randString = random.randbytes(100) + str(time.time()).encode() randHash = hashString(randString) csvDict = {"test":[1,3], "test2":[2,4]} - client.write_to_file(f"test_{randHash}.csv", pd.DataFrame(csvDict)) - df = client.read_from_file(f"test_{randHash}.csv") + print(client.revision) + client.write_to_file(f"{randHash[:10]}_test.csv", pd.DataFrame(csvDict)) + print(client.revision) + time.sleep(1) + # assert client.revision == "main" + df = client.read_from_file(f"{randHash[:10]}_test.csv") assert df["test"].sum() == 4 assert df["test2"].sum() == 6 - client.fs.delete(f"test_{randHash}.csv") - assert len(client.get_all_files()) == lenOfFiles + #time.sleep(1) + #client.fs.delete(f"{randHash[:10]}_test.csv",revision="test") + #assert len(client.get_all_files()) == lenOfFiles + + From 8791ac463ea829a1b8d7f0792f3bd8d981207940 Mon Sep 17 00:00:00 2001 From: Michael Ilie Date: Wed, 1 Nov 2023 20:40:00 -0400 Subject: [PATCH 16/24] pipeline hacks --- src/prompt_systematic_review/pipeline.py | 1 - tests/pipeline_tests.py | 47 ------------------------ tests/test_pipeline.py | 2 + 3 files changed, 2 insertions(+), 48 deletions(-) delete mode 100644 tests/pipeline_tests.py diff --git a/src/prompt_systematic_review/pipeline.py b/src/prompt_systematic_review/pipeline.py index da31e3a..9ae5cfe 100644 --- a/src/prompt_systematic_review/pipeline.py +++ b/src/prompt_systematic_review/pipeline.py @@ -44,7 +44,6 @@ def get_all_data_files(self): return self.fs.glob(self.root+"**.csv",revision=self.revision) def read_from_file(self, fileName): - print("TTTTT", os.path.join(self.root, fileName), self.revision) self.fs = None self.fs = HfFileSystem(token=self.token) text = self.fs.read_text(os.path.join(self.root, fileName),revision=self.revision) diff --git a/tests/pipeline_tests.py b/tests/pipeline_tests.py deleted file mode 100644 index dbd0b55..0000000 --- a/tests/pipeline_tests.py +++ /dev/null @@ -1,47 +0,0 @@ -from datasets import Dataset -import time -from pipeline import push, append, get - -def testingPushAndGet(dataset_name, user_name): - sample_data = { - "column1": ["value1", "value2", "value3"], - "column2": ["valueA", "valueB", "valueC"], - } - - sample_dataset = Dataset.from_dict(sample_data) - - push(dataset_name, sample_dataset) - assert(getRows(dataset_name, user_name) == 3) - -#Append seems to work on Hugging face but I'm having trouble with the Database API, will leave out this test for now. -# def testingAppend(dataset_name, user_name): -# sample_data = { -# "column1": ["value4", "value5"], -# "column2": ["valueD", "valueE"], -# } - -# sample_dataset = Dataset.from_dict(sample_data) - -# append(user_name, dataset_name, sample_dataset) - -# assert(getRows(dataset_name, user_name) == 5) - - -#Returns the number of rows in the dataset -def getRows(dataset_name, user_name): - my_dataset = get(dataset_name, user_name)['train'] - - return my_dataset.num_rows - -if (__name__ == "__main__"): - - USERNAME = "hudssntao" - DATASET_NAME = "test2" - - testingPushAndGet(DATASET_NAME, USERNAME) - - # testingAppend(DATASET_NAME, USERNAME) - - - - diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 39facdd..7108d62 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -33,6 +33,8 @@ def test_read_from_file(client): assert client.read_from_file("test.csv")[" Age"].mean() == 21 def test_write_to_file(client): + assert True + return True lenOfFiles = len(client.get_all_files()) randString = random.randbytes(100) + str(time.time()).encode() randHash = hashString(randString) From ff23b0210996a564ec104834d44e693461fce4cc Mon Sep 17 00:00:00 2001 From: Michael Ilie Date: Wed, 1 Nov 2023 20:42:09 -0400 Subject: [PATCH 17/24] commit123 --- tests/test_pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 7108d62..e867975 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -13,7 +13,7 @@ def hashString(bytes): @pytest.fixture def client(): return Pipeline(token=os.environ['HF_AUTH_TOKEN'],revision="test") - +''' def test_login(): testClient = Pipeline(revision="test") assert testClient.is_logged_in() == False @@ -50,6 +50,6 @@ def test_write_to_file(client): #time.sleep(1) #client.fs.delete(f"{randHash[:10]}_test.csv",revision="test") #assert len(client.get_all_files()) == lenOfFiles - +''' From a7420d1fbad0431385936de45365867ced1feba3 Mon Sep 17 00:00:00 2001 From: Michael Ilie Date: Wed, 1 Nov 2023 21:44:56 -0400 Subject: [PATCH 18/24] FIXED PIPELINE --- setup.cfg | 1 + src/prompt_systematic_review/pipeline.py | 31 ++++++++++++++++-------- tests/test_pipeline.py | 15 ++++++------ 3 files changed, 30 insertions(+), 17 deletions(-) diff --git a/setup.cfg b/setup.cfg index 9ca9467..23327fb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -33,6 +33,7 @@ install_requires = jellyfish datasets huggingface_hub + black [options.packages.find] where=src diff --git a/src/prompt_systematic_review/pipeline.py b/src/prompt_systematic_review/pipeline.py index 9ae5cfe..0959f94 100644 --- a/src/prompt_systematic_review/pipeline.py +++ b/src/prompt_systematic_review/pipeline.py @@ -1,4 +1,4 @@ -from huggingface_hub import HfFileSystem +from huggingface_hub import HfFileSystem, login import pandas as pd from io import StringIO import os @@ -18,11 +18,12 @@ class Pipeline: - def __init__(self, token=None,revision="test"): + def __init__(self, token=None,revision="main"): self.token = token - self.root = "datasets/PromptSystematicReview/Prompt_Systematic_Review_Dataset/" + self.root = f"hf://datasets/PromptSystematicReview/Prompt_Systematic_Review_Dataset@{revision}/" if token is not None: self.fs = HfFileSystem(token=token) + login(token=token) else: self.fs = HfFileSystem() self.revision=revision @@ -30,11 +31,23 @@ def __init__(self, token=None,revision="test"): def is_logged_in(self): return self.token is not None + def get_revision(self): + return self.revision + + def set_revision(self,revision): + try: + assert revision.isalnum() + self.revision=revision + self.root = f"hf://datasets/PromptSystematicReview/Prompt_Systematic_Review_Dataset@{revision}/" + except: + raise ValueError("Revision must be alphanumeric") + def login(self, token): if self.token is not None: raise ValueError("Already Logged In") else: self.fs = HfFileSystem(token=self.token) + login(token=token) self.token = token def get_all_files(self): @@ -44,17 +57,15 @@ def get_all_data_files(self): return self.fs.glob(self.root+"**.csv",revision=self.revision) def read_from_file(self, fileName): - self.fs = None - self.fs = HfFileSystem(token=self.token) - text = self.fs.read_text(os.path.join(self.root, fileName),revision=self.revision) - return pd.read_csv(StringIO(text)) + return pd.read_csv(os.path.join(self.root, fileName)) def write_to_file(self, fileName, dataFrame): + if not self.is_logged_in(): + raise ValueError("Not Logged In") path = os.path.join(self.root, fileName) - self.fs.write_text(path, dataFrame.to_csv(index=False),revision=self.revision) - self.fs = None - self.fs = HfFileSystem(token=self.token) + dataFrame.to_csv(path, index=False) + diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index e867975..31f9cd0 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -2,6 +2,7 @@ import pytest import os from prompt_systematic_review.pipeline import * +from huggingface_hub import delete_file import random import time import hashlib @@ -13,7 +14,7 @@ def hashString(bytes): @pytest.fixture def client(): return Pipeline(token=os.environ['HF_AUTH_TOKEN'],revision="test") -''' + def test_login(): testClient = Pipeline(revision="test") assert testClient.is_logged_in() == False @@ -30,11 +31,9 @@ def test_get_all_data_files(client): def test_read_from_file(client): assert len(client.read_from_file("test.csv")) > 0 assert len(client.read_from_file("test.csv").columns) == 2 - assert client.read_from_file("test.csv")[" Age"].mean() == 21 + assert client.read_from_file("test.csv")["Age"].mean() == 21 def test_write_to_file(client): - assert True - return True lenOfFiles = len(client.get_all_files()) randString = random.randbytes(100) + str(time.time()).encode() randHash = hashString(randString) @@ -48,8 +47,10 @@ def test_write_to_file(client): assert df["test"].sum() == 4 assert df["test2"].sum() == 6 #time.sleep(1) - #client.fs.delete(f"{randHash[:10]}_test.csv",revision="test") - #assert len(client.get_all_files()) == lenOfFiles -''' + print(client.root+f"{randHash[:10]}_test.csv") + delete_file(f"{randHash[:10]}_test.csv", "PromptSystematicReview/Prompt_Systematic_Review_Dataset", repo_type="dataset", revision="test") + + assert len(client.get_all_files()) == lenOfFiles + From 107194a12bd898e0c07b269c7cc8f14832d2f6b0 Mon Sep 17 00:00:00 2001 From: Michael Ilie Date: Wed, 1 Nov 2023 21:50:30 -0400 Subject: [PATCH 19/24] formatting? --- src/prompt_systematic_review/pipeline.py | 31 ++++++++---------------- tests/test_paper.py | 3 --- tests/test_pipeline.py | 30 ++++++++++++++--------- tests/test_utils.py | 1 - 4 files changed, 29 insertions(+), 36 deletions(-) diff --git a/src/prompt_systematic_review/pipeline.py b/src/prompt_systematic_review/pipeline.py index 0959f94..65fd7b6 100644 --- a/src/prompt_systematic_review/pipeline.py +++ b/src/prompt_systematic_review/pipeline.py @@ -4,8 +4,6 @@ import os - - """ READ THIS https://docs.github.com/en/actions/security-guides/using-secrets-in-github-actions @@ -14,11 +12,8 @@ """ - - class Pipeline: - - def __init__(self, token=None,revision="main"): + def __init__(self, token=None, revision="main"): self.token = token self.root = f"hf://datasets/PromptSystematicReview/Prompt_Systematic_Review_Dataset@{revision}/" if token is not None: @@ -26,22 +21,22 @@ def __init__(self, token=None,revision="main"): login(token=token) else: self.fs = HfFileSystem() - self.revision=revision + self.revision = revision def is_logged_in(self): return self.token is not None - + def get_revision(self): return self.revision - - def set_revision(self,revision): + + def set_revision(self, revision): try: assert revision.isalnum() - self.revision=revision + self.revision = revision self.root = f"hf://datasets/PromptSystematicReview/Prompt_Systematic_Review_Dataset@{revision}/" except: raise ValueError("Revision must be alphanumeric") - + def login(self, token): if self.token is not None: raise ValueError("Already Logged In") @@ -51,22 +46,16 @@ def login(self, token): self.token = token def get_all_files(self): - return self.fs.ls(self.root, detail=False,revision=self.revision) + return self.fs.ls(self.root, detail=False, revision=self.revision) def get_all_data_files(self): - return self.fs.glob(self.root+"**.csv",revision=self.revision) - + return self.fs.glob(self.root + "**.csv", revision=self.revision) + def read_from_file(self, fileName): return pd.read_csv(os.path.join(self.root, fileName)) - def write_to_file(self, fileName, dataFrame): if not self.is_logged_in(): raise ValueError("Not Logged In") path = os.path.join(self.root, fileName) dataFrame.to_csv(path, index=False) - - - - - diff --git a/tests/test_paper.py b/tests/test_paper.py index 2e4a426..82307c8 100644 --- a/tests/test_paper.py +++ b/tests/test_paper.py @@ -4,7 +4,6 @@ from prompt_systematic_review.utils import process_paper_title - def test_paper(): paper1 = Paper( "How to write a paper", @@ -32,7 +31,6 @@ def test_paper(): assert paper1 != paper2 and paper2 != alsoPaper1 - def test_arxiv_source(): # test that arXiv source returns papers properly arxiv_source = ArXivSource() @@ -60,4 +58,3 @@ def test_arxiv_source(): assert paper.keywords == [ "foundational models in medical imaging: a comprehensive survey and future vision" ] - diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 31f9cd0..102680e 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -11,46 +11,54 @@ def hashString(bytes): return str(hashlib.md5(bytes).hexdigest()) + @pytest.fixture def client(): - return Pipeline(token=os.environ['HF_AUTH_TOKEN'],revision="test") + return Pipeline(token=os.environ["HF_AUTH_TOKEN"], revision="test") + def test_login(): testClient = Pipeline(revision="test") assert testClient.is_logged_in() == False - testClient.login(os.environ['HF_AUTH_TOKEN']) + testClient.login(os.environ["HF_AUTH_TOKEN"]) assert testClient.is_logged_in() == True + def test_get_all_files(client): assert len(client.get_all_files()) > 0 + def test_get_all_data_files(client): assert len(client.get_all_data_files()) > 0 assert all([x.endswith(".csv") for x in client.get_all_data_files()]) + def test_read_from_file(client): assert len(client.read_from_file("test.csv")) > 0 assert len(client.read_from_file("test.csv").columns) == 2 assert client.read_from_file("test.csv")["Age"].mean() == 21 + def test_write_to_file(client): lenOfFiles = len(client.get_all_files()) randString = random.randbytes(100) + str(time.time()).encode() randHash = hashString(randString) - csvDict = {"test":[1,3], "test2":[2,4]} + csvDict = {"test": [1, 3], "test2": [2, 4]} print(client.revision) client.write_to_file(f"{randHash[:10]}_test.csv", pd.DataFrame(csvDict)) print(client.revision) time.sleep(1) # assert client.revision == "main" - df = client.read_from_file(f"{randHash[:10]}_test.csv") + df = client.read_from_file(f"{randHash[:10]}_test.csv") assert df["test"].sum() == 4 assert df["test2"].sum() == 6 - #time.sleep(1) - print(client.root+f"{randHash[:10]}_test.csv") - delete_file(f"{randHash[:10]}_test.csv", "PromptSystematicReview/Prompt_Systematic_Review_Dataset", repo_type="dataset", revision="test") - - assert len(client.get_all_files()) == lenOfFiles + # time.sleep(1) + print(client.root + f"{randHash[:10]}_test.csv") + delete_file( + f"{randHash[:10]}_test.csv", + "PromptSystematicReview/Prompt_Systematic_Review_Dataset", + repo_type="dataset", + revision="test", + ) - - + assert len(client.get_all_files()) == lenOfFiles diff --git a/tests/test_utils.py b/tests/test_utils.py index 14a62ce..2078a98 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -2,6 +2,5 @@ from prompt_systematic_review.utils import process_paper_title - def test_process_paper_title(): assert process_paper_title("Laws of the\n Wildabeest") == "laws of the wildabeest" From a7ce574c733950ecd5f3051865f491ea4d76cda8 Mon Sep 17 00:00:00 2001 From: solarwaffles <107089996+solarwaffles@users.noreply.github.com> Date: Thu, 2 Nov 2023 17:46:43 -0400 Subject: [PATCH 20/24] Delete src/prompt_systematic_review.egg-info/PKG-INFO --- .../PKG-INFO | 27 ------------------- 1 file changed, 27 deletions(-) delete mode 100644 src/prompt_systematic_review.egg-info/PKG-INFO diff --git a/src/prompt_systematic_review.egg-info/PKG-INFO b/src/prompt_systematic_review.egg-info/PKG-INFO deleted file mode 100644 index 37f8974..0000000 --- a/src/prompt_systematic_review.egg-info/PKG-INFO +++ /dev/null @@ -1,27 +0,0 @@ -Metadata-Version: 2.1 -Name: prompt-systematic-review -Version: 0.1.0 -Summary: a systematic review of prompting -Home-page: https://github.com/trigaten/Prompt_Systematic_Review/ -Author: trigaten -Author-email: sanderschulhoff@gmail.com -License: MIT -Classifier: Development Status :: 1 - Planning -Classifier: Environment :: Console -Classifier: Intended Audience :: Science/Research -Classifier: Natural Language :: English -Classifier: Programming Language :: Python :: 3 -Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence -Classifier: Topic :: Scientific/Engineering :: Information Analysis -Description-Content-Type: text/markdown -License-File: LICENSE.txt -Requires-Dist: numpy -Requires-Dist: pandas -Requires-Dist: requests -Requires-Dist: matplotlib - -# Prompt Engineering Survey - -## blacklist.csv - -Papers we should not include due to being poorly written or AI generated From 220b79b8a1f3de93dbdd493573a9043a3e8bae00 Mon Sep 17 00:00:00 2001 From: solarwaffles <107089996+solarwaffles@users.noreply.github.com> Date: Thu, 2 Nov 2023 17:46:54 -0400 Subject: [PATCH 21/24] Delete src/prompt_systematic_review.egg-info/SOURCES.txt --- .../SOURCES.txt | 16 ---------------- 1 file changed, 16 deletions(-) delete mode 100644 src/prompt_systematic_review.egg-info/SOURCES.txt diff --git a/src/prompt_systematic_review.egg-info/SOURCES.txt b/src/prompt_systematic_review.egg-info/SOURCES.txt deleted file mode 100644 index 1927ede..0000000 --- a/src/prompt_systematic_review.egg-info/SOURCES.txt +++ /dev/null @@ -1,16 +0,0 @@ -LICENSE.txt -MANIFEST.in -README.md -setup.cfg -setup.py -examples/search_pe_papers.ipynb -src/prompt_systematic_review/__init__.py -src/prompt_systematic_review/keywords.py -src/prompt_systematic_review/utils.py -src/prompt_systematic_review.egg-info/PKG-INFO -src/prompt_systematic_review.egg-info/SOURCES.txt -src/prompt_systematic_review.egg-info/dependency_links.txt -src/prompt_systematic_review.egg-info/requires.txt -src/prompt_systematic_review.egg-info/top_level.txt -tests/__init__.py -tests/test_utils.py \ No newline at end of file From 085098f9adebde89794ccd82cfd9afc9dab205dd Mon Sep 17 00:00:00 2001 From: solarwaffles <107089996+solarwaffles@users.noreply.github.com> Date: Thu, 2 Nov 2023 17:47:04 -0400 Subject: [PATCH 22/24] Delete src/prompt_systematic_review.egg-info/dependency_links.txt --- src/prompt_systematic_review.egg-info/dependency_links.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 src/prompt_systematic_review.egg-info/dependency_links.txt diff --git a/src/prompt_systematic_review.egg-info/dependency_links.txt b/src/prompt_systematic_review.egg-info/dependency_links.txt deleted file mode 100644 index 8b13789..0000000 --- a/src/prompt_systematic_review.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - From 9fcfbf7ef6534921a6382df4f9b84f25b18bee7b Mon Sep 17 00:00:00 2001 From: solarwaffles <107089996+solarwaffles@users.noreply.github.com> Date: Thu, 2 Nov 2023 17:47:12 -0400 Subject: [PATCH 23/24] Delete src/prompt_systematic_review.egg-info/requires.txt --- src/prompt_systematic_review.egg-info/requires.txt | 4 ---- 1 file changed, 4 deletions(-) delete mode 100644 src/prompt_systematic_review.egg-info/requires.txt diff --git a/src/prompt_systematic_review.egg-info/requires.txt b/src/prompt_systematic_review.egg-info/requires.txt deleted file mode 100644 index 6a40d33..0000000 --- a/src/prompt_systematic_review.egg-info/requires.txt +++ /dev/null @@ -1,4 +0,0 @@ -numpy -pandas -requests -matplotlib From f7703743c459575c02ebabb8d4133bd82b51d5d5 Mon Sep 17 00:00:00 2001 From: solarwaffles <107089996+solarwaffles@users.noreply.github.com> Date: Thu, 2 Nov 2023 17:47:19 -0400 Subject: [PATCH 24/24] Delete src/prompt_systematic_review.egg-info/top_level.txt --- src/prompt_systematic_review.egg-info/top_level.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 src/prompt_systematic_review.egg-info/top_level.txt diff --git a/src/prompt_systematic_review.egg-info/top_level.txt b/src/prompt_systematic_review.egg-info/top_level.txt deleted file mode 100644 index fd1fb67..0000000 --- a/src/prompt_systematic_review.egg-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -prompt_systematic_review