Added option to sub-sample initial metadata download #12

Merged: 1 commit, Nov 15, 2024
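This PR moves sub-sampling upstream in the pipeline: instead of chunking only the first n datasets (the old -s/--sample flag on chunk_data.py), the metadata is now truncated at download time via a new -s/--sample flag on fetch_eidc_metadata.py, wired to a renamed sub-sample parameter in params.yaml. Every downstream hash in dvc.lock changes accordingly, because the sampled artifacts are much smaller.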
110 changes: 55 additions & 55 deletions dvc.lock
@@ -1,17 +1,17 @@
 schema: '2.0'
 stages:
   fetch-metadata:
-    cmd: python scripts/fetch_eidc_metadata.py data/eidc_metadata.json
+    cmd: python scripts/fetch_eidc_metadata.py data/eidc_metadata.json -s 3
     deps:
     - path: scripts/fetch_eidc_metadata.py
       hash: md5
-      md5: 53d620665448ef91f2deedb517e2f502
-      size: 675
+      md5: a564cb0804b482ef09658f0cb4a0a705
+      size: 941
     outs:
     - path: data/eidc_metadata.json
       hash: md5
-      md5: b4f3774a2921debb4d7740165ac604d4
-      size: 12157676
+      md5: 068ae066ea08ee369c505c8640481cf6
+      size: 125674
   prepare:
     cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json
     deps:
@@ -33,94 +33,94 @@ stages:
     deps:
     - path: data/eidc_metadata.json
       hash: md5
-      md5: b4f3774a2921debb4d7740165ac604d4
-      size: 12157676
+      md5: 068ae066ea08ee369c505c8640481cf6
+      size: 125674
     - path: scripts/extract_metadata.py
       hash: md5
-      md5: 3f0269a6413845f4425af55e7cea7bf8
-      size: 1304
+      md5: e66f21369c5106eaaad4476612c6fb5e
+      size: 1313
     outs:
     - path: data/extracted_metadata.json
       hash: md5
-      md5: 789fda7a14f9a85c6ee0e10af8170a95
-      size: 4584498
+      md5: e71f887d993834e3bda1eb00e711e724
+      size: 7005
   chunk-data:
-    cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 500 -ol 100 -s
-      10 data/extracted_metadata.json data/supporting-docs.json
+    cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 500 -ol 100 data/extracted_metadata.json
+      data/supporting-docs.json
     deps:
     - path: data/extracted_metadata.json
       hash: md5
-      md5: 789fda7a14f9a85c6ee0e10af8170a95
-      size: 4584498
+      md5: e71f887d993834e3bda1eb00e711e724
+      size: 7005
     - path: data/supporting-docs.json
       hash: md5
-      md5: f3ea9980226e5408497c96a10cc77b80
-      size: 72013526
+      md5: bdab1ea8df4a87aa3d314044eb2eaa0a
+      size: 188762
     - path: scripts/chunk_data.py
       hash: md5
-      md5: 681528e4aa1dc8cfb5fe5e5472e25fdf
-      size: 2509
+      md5: e8de02d6b14c8fc22533d0becfb7d35d
+      size: 2198
     outs:
     - path: data/chunked_data.json
       hash: md5
-      md5: f6426396e1a3564b53649ef5fc0571fd
-      size: 993814
+      md5: a01ff8ed4d429203d6903466d26937ff
+      size: 320740
   create-embeddings:
     cmd: python scripts/create_embeddings.py data/chunked_data.json data/embeddings.json
     deps:
     - path: data/chunked_data.json
       hash: md5
-      md5: f6426396e1a3564b53649ef5fc0571fd
-      size: 993814
+      md5: a01ff8ed4d429203d6903466d26937ff
+      size: 320740
     - path: scripts/create_embeddings.py
       hash: md5
-      md5: 4649c700dfae922b43b3608ee4f00c1a
-      size: 808
+      md5: d9282fc92ed400855c4fc2a290289f14
+      size: 867
     outs:
     - path: data/embeddings.json
       hash: md5
-      md5: 8fd682131a282736f6a81a6c53040b1e
-      size: 13422675
+      md5: 363e3eaf7f8baddf9aa2e83f45f074b1
+      size: 4345553
   upload-to-docstore:
     cmd: python scripts/upload_to_docstore.py data/embeddings.json -o data/chroma-data
       -em all-MiniLM-L6-v2 -c eidc-data
     deps:
     - path: data/embeddings.json
       hash: md5
-      md5: 8fd682131a282736f6a81a6c53040b1e
-      size: 13422675
+      md5: 363e3eaf7f8baddf9aa2e83f45f074b1
+      size: 4345553
     - path: scripts/upload_to_docstore.py
       hash: md5
-      md5: 41da88e3bb6d2592bee938ce347f6983
-      size: 1905
+      md5: 7b9433047ff175d5e6af8d6056caf05b
+      size: 1931
     outs:
     - path: data/chroma-data
       hash: md5
-      md5: 5c99644f30def03f87b37c98341c6f25.dir
-      size: 13758136
-      nfiles: 6
+      md5: 39b81f6d319a02523fbc356dd667b920.dir
+      size: 5702372
+      nfiles: 5
   run-rag-pipeline:
     cmd: python scripts/run_rag_pipeline.py data/eidc_rag_test_sample.csv data/evaluation_data.csv
       data/chroma-data -c eidc-data
     deps:
     - path: data/chroma-data
       hash: md5
-      md5: 5c99644f30def03f87b37c98341c6f25.dir
-      size: 13758136
-      nfiles: 6
+      md5: 39b81f6d319a02523fbc356dd667b920.dir
+      size: 5702372
+      nfiles: 5
     - path: data/eidc_rag_test_sample.csv
       hash: md5
       md5: a371d83c5822d256286e80d64d58c3fe
       size: 7524
     - path: scripts/run_rag_pipeline.py
       hash: md5
-      md5: 8d5fc0669771146562c773186f4f44f6
-      size: 3667
+      md5: ea2b8d94ee42499870d925f916982e8a
+      size: 3781
     outs:
     - path: data/evaluation_data.csv
       hash: md5
-      md5: 8ea0a3f240478e9db41855922ac534a6
-      size: 9894
+      md5: a4470a84d2de8b1d04c7d2dfd8b5f807
+      size: 9859
   generate-testset:
     cmd: cp data/synthetic-datasets/eidc_rag_test_sample.csv data/
     outs:
@@ -133,35 +133,35 @@ stages:
     deps:
     - path: data/eidc_metadata.json
       hash: md5
-      md5: b4f3774a2921debb4d7740165ac604d4
-      size: 12157676
+      md5: 068ae066ea08ee369c505c8640481cf6
+      size: 125674
     - path: scripts/fetch_supporting_docs.py
       hash: md5
-      md5: 923af3b6ce1447d388b08fab0e3ab77d
-      size: 1660
+      md5: 02b94a2cc7bff711784cbdec3650b618
+      size: 1718
     outs:
     - path: data/supporting-docs.json
       hash: md5
-      md5: f3ea9980226e5408497c96a10cc77b80
-      size: 72013526
+      md5: bdab1ea8df4a87aa3d314044eb2eaa0a
+      size: 188762
   evaluate:
     cmd: python scripts/evaluate.py data/evaluation_data.csv -m data/metrics.json
       -img data/eval.png
     deps:
     - path: data/evaluation_data.csv
       hash: md5
-      md5: 8ea0a3f240478e9db41855922ac534a6
-      size: 9894
+      md5: a4470a84d2de8b1d04c7d2dfd8b5f807
+      size: 9859
     - path: scripts/evaluate.py
       hash: md5
-      md5: 10f76511eafc8a1a9b90e9ae92a76bc5
-      size: 2633
+      md5: a9c4c04157007c12c068aacdf5e099a9
+      size: 2634
     outs:
     - path: data/eval.png
       hash: md5
-      md5: bae77b1b721bf283a30a64f67af45fea
-      size: 74438
+      md5: 981434fb5f4e61ce4288a4431f70bcc1
+      size: 67852
     - path: data/metrics.json
       hash: md5
-      md5: 0145280f36071a6df551ef57d3f8393e
-      size: 229
+      md5: 20efb8ebf0d6908f0ee7b35dbff2e7c7
+      size: 242
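For context (the helper below is an illustrative assumption, not part of this diff or of DVC itself): dvc.lock pins each stage's dependencies and outputs by md5 digest and size, which is why every artifact produced from the smaller sample gets a fresh hash above. A minimal sketch of computing such a content digest:

    import hashlib

    def file_md5(path: str) -> str:
        # Stream the file in fixed-size blocks so large artifacts
        # are never held in memory all at once.
        digest = hashlib.md5()
        with open(path, "rb") as f:
            for block in iter(lambda: f.read(8192), b""):
                digest.update(block)
        return digest.hexdigest()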
4 changes: 2 additions & 2 deletions dvc.yaml
@@ -1,6 +1,6 @@
 stages:
   fetch-metadata:
-    cmd: python scripts/fetch_eidc_metadata.py ${files.metadata}
+    cmd: python scripts/fetch_eidc_metadata.py ${files.metadata} -s ${sub-sample}
     deps:
     - scripts/fetch_eidc_metadata.py
     outs:
@@ -20,7 +20,7 @@ stages:
     outs:
     - ${files.extracted}
   chunk-data:
-    cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} -s ${sample-size} ${files.extracted} ${files.supporting-docs}
+    cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} ${files.extracted} ${files.supporting-docs}
     deps:
     - ${files.extracted}
     - ${files.supporting-docs}
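The -s value is parameterised rather than hard-coded: ${sub-sample} resolves from params.yaml (changed below), which is how the pinned command recorded in dvc.lock above ends up as fetch_eidc_metadata.py data/eidc_metadata.json -s 3.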
2 changes: 1 addition & 1 deletion params.yaml
@@ -16,7 +16,7 @@ files:
   eval-set: data/evaluation_data.csv
   metrics: data/metrics.json
   eval-plot: data/eval.png
-sample-size: 10 # sample size of 0 will process all data
+sub-sample: 3 # sample size of 0 will process all data
 rag:
   model: llama3.1
   prompt: >-
24 changes: 4 additions & 20 deletions scripts/chunk_data.py
@@ -28,28 +28,20 @@ def chunk_metadata_value(


 def chunk_metadata_file(
-    file: str, chunk_size: int, overlap: int, sample_size: int
+    file: str, chunk_size: int, overlap: int
 ) -> List[Dict[str, str]]:
     chunked_metadata = []
     with open(file) as f:
         json_data = json.load(f)
-        count = 0
         for metadata in json_data:
             chunked_metadata.extend(chunk_metadata_value(metadata, chunk_size, overlap))
-            count += 1
-            if count == sample_size:
-                break
     return chunked_metadata
 
 
-def main(
-    files: List[str], ouput_file: str, chunk_size: int, overlap: int, sample_size: int
-) -> None:
+def main(files: List[str], ouput_file: str, chunk_size: int, overlap: int) -> None:
     all_chunked_metadata = []
     for file in files:
-        all_chunked_metadata.extend(
-            chunk_metadata_file(file, chunk_size, overlap, sample_size)
-        )
+        all_chunked_metadata.extend(chunk_metadata_file(file, chunk_size, overlap))
     with open(ouput_file, "w") as f:
         json.dump(all_chunked_metadata, f, indent=4)

@@ -81,14 +73,6 @@ def main(
         nargs="?",
         const=100,
     )
-    parser.add_argument(
-        "-s",
-        "--sample",
-        help="Only generate chunks for n datasets",
-        type=int,
-        nargs="?",
-        const=0,
-    )
     args = parser.parse_args()
     assert args.chunk > args.overlap
-    main(args.input_files, args.output, args.chunk, args.overlap, args.sample)
+    main(args.input_files, args.output, args.chunk, args.overlap)
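With sub-sampling now applied at download time, chunk_data.py simply chunks every record it is given: the -s/--sample flag, the sample_size parameters, and the early-exit counter all disappear from this script.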
16 changes: 13 additions & 3 deletions scripts/fetch_eidc_metadata.py
@@ -6,7 +6,7 @@
 URL = "https://catalogue.ceh.ac.uk/eidc/documents"
 
 
-def main(output_file: str) -> None:
+def main(output_file: str, sample: int) -> None:
     res = requests.get(
         URL,
         headers={"content-type": "application/json"},
@@ -16,12 +16,22 @@ def main(output_file: str) -> None:
"term": "recordType:Dataset",
},
)
json_data = res.json()
json_data["results"] = json_data["results"][:sample]
with open(output_file, "w") as f:
json.dump(res.json(), f, indent=4)
json.dump(json_data, f, indent=4)


if __name__ == "__main__":
parser = ArgumentParser("fetch_eidc_metadata.py")
parser.add_argument("output", help="The file path to save the downloaded data to.")
parser.add_argument(
"-s",
"--sample",
help="Only save n datasets",
type=int,
nargs="?",
const=0,
)
args = parser.parse_args()
main(args.output)
main(args.output, args.sample)
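A minimal sketch (not part of the PR) of the slice semantics the new code relies on: when -s is omitted, args.sample is None and results[:None] keeps every record, while an explicit 0 (the argparse const used when -s is passed without a value) yields an empty list rather than the full set:

    # Hypothetical stand-in for the catalogue response shape used above.
    json_data = {"results": ["ds1", "ds2", "ds3", "ds4"]}

    for sample in (None, 3, 0):
        kept = json_data["results"][:sample]
        print(sample, len(kept))  # None -> 4 (flag omitted), 3 -> 3, 0 -> 0 (empty list)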