Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Synth gen #19

Merged
merged 8 commits into from
Dec 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 53 additions & 43 deletions dvc.lock
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ stages:
outs:
- path: data/eidc_metadata.json
hash: md5
md5: ecfc40751072949b1bde0f46afd8c052
size: 12283698
md5: 413f59888bf033c30cc27b84b1a3f40b
size: 12313041
prepare:
cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json
deps:
Expand All @@ -33,140 +33,150 @@ stages:
deps:
- path: data/eidc_metadata.json
hash: md5
md5: ecfc40751072949b1bde0f46afd8c052
size: 12283698
md5: 413f59888bf033c30cc27b84b1a3f40b
size: 12313041
- path: scripts/extract_metadata.py
hash: md5
md5: e66f21369c5106eaaad4476612c6fb5e
size: 1313
outs:
- path: data/extracted_metadata.json
hash: md5
md5: e93397da0980be79f6b94abcc015c4c5
size: 4619699
md5: 226225c5bd64e15d803ba88560810c5a
size: 4629991
chunk-data:
cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 250 -ol 75 data/extracted_metadata.json data/supporting-docs.json -m
0
deps:
- path: data/extracted_metadata.json
hash: md5
md5: e93397da0980be79f6b94abcc015c4c5
size: 4619699
md5: 226225c5bd64e15d803ba88560810c5a
size: 4629991
- path: data/supporting-docs.json
hash: md5
md5: a485ad0d5e7a171be5e94b60abb433c7
size: 72412236
md5: e2581aff9abe25942e8009214b88d0a5
size: 72680213
- path: scripts/chunk_data.py
hash: md5
md5: 3ad449140b03e1c2904b22a5b401a12e
size: 2705
outs:
- path: data/chunked_data.json
hash: md5
md5: 718b9388c586a467f4ea4d74adc53e7b
size: 176416449
md5: 9893d839409c8cf4561e99ab5f747f20
size: 177068127
create-embeddings:
cmd: python scripts/create_embeddings.py data/chunked_data.json data/embeddings.json
-m all-MiniLM-L6-v2
deps:
- path: data/chunked_data.json
hash: md5
md5: 718b9388c586a467f4ea4d74adc53e7b
size: 176416449
md5: 9893d839409c8cf4561e99ab5f747f20
size: 177068127
- path: scripts/create_embeddings.py
hash: md5
md5: b0d8f7cb90f244e709656b1f38723e2d
size: 1552
outs:
- path: data/embeddings.json
hash: md5
md5: 9833122c7b5039cb8b73b5aaf4fad9a0
size: 3741355109
md5: 5c8ca3cdde4d5bc559fa2e701ff090a8
size: 3754990368
upload-to-docstore:
cmd: python scripts/upload_to_docstore.py data/embeddings.json -o data/chroma-data -em
all-MiniLM-L6-v2 -c eidc-data
deps:
- path: data/embeddings.json
hash: md5
md5: 9833122c7b5039cb8b73b5aaf4fad9a0
size: 3741355109
md5: 5c8ca3cdde4d5bc559fa2e701ff090a8
size: 3754990368
- path: scripts/upload_to_docstore.py
hash: md5
md5: 930456cedd43723c1d643ad90c146952
size: 2793
outs:
- path: data/chroma-data
hash: md5
md5: 7050f2ade36567a1ed868e12ce507d8d.dir
size: 1815190012
md5: c06796220fbfe9db3b08b8439edf87b4.dir
size: 3081399131
nfiles: 6
run-rag-pipeline:
cmd: python scripts/run_rag_pipeline.py -i data/eidc_rag_testset.csv -o data/evaluation_data.csv -ds
data/chroma-data -c eidc-data -m llama3.1 -p data/pipeline.yml
deps:
- path: data/chroma-data
hash: md5
md5: 7050f2ade36567a1ed868e12ce507d8d.dir
size: 1815190012
md5: c06796220fbfe9db3b08b8439edf87b4.dir
size: 3081399131
nfiles: 6
- path: data/eidc_rag_testset.csv
hash: md5
md5: f301e759e74ce5e71b50e04993ec8c88
size: 144597
md5: 90d23c9bfcaddf9f152109a7b51e3151
size: 149155
- path: scripts/run_rag_pipeline.py
hash: md5
md5: 2d6dc886728d4bd46676ecd1882f1fd1
size: 5838
md5: 35eb5f65605242094a1581b92e9b2ef4
size: 5862
outs:
- path: data/evaluation_data.csv
hash: md5
md5: 77ba66c4b4afde504b2c5cee0463f13a
size: 253092
md5: 1b5f226c52d70bda7e2551d7778c1e89
size: 385945
- path: data/pipeline.yml
hash: md5
md5: 8e3c4e49d4d97f613e83468d010a96e9
size: 3440
md5: 70385a724cdf687c287596b8360e1448
size: 3334
generate-testset:
cmd: head -n 101 data/synthetic-datasets/eidc_rag_test_set.csv > data/eidc_rag_testset.csv
cmd: python scripts/generate_synthetic_testset.py data/extracted_metadata.json
data/eidc_rag_testset.csv 200
deps:
- path: data/extracted_metadata.json
hash: md5
md5: 226225c5bd64e15d803ba88560810c5a
size: 4629991
- path: scripts/generate_synthetic_testset.py
hash: md5
md5: fdac8b2f28de8f3b4e5025ca47bb94ca
size: 2175
outs:
- path: data/eidc_rag_testset.csv
hash: md5
md5: f301e759e74ce5e71b50e04993ec8c88
size: 144597
md5: 90d23c9bfcaddf9f152109a7b51e3151
size: 149155
fetch-supporting-docs:
cmd: python scripts/fetch_supporting_docs.py data/eidc_metadata.json data/supporting-docs.json
deps:
- path: data/eidc_metadata.json
hash: md5
md5: ecfc40751072949b1bde0f46afd8c052
size: 12283698
md5: 413f59888bf033c30cc27b84b1a3f40b
size: 12313041
- path: scripts/fetch_supporting_docs.py
hash: md5
md5: 02b94a2cc7bff711784cbdec3650b618
size: 1718
outs:
- path: data/supporting-docs.json
hash: md5
md5: a485ad0d5e7a171be5e94b60abb433c7
size: 72412236
md5: e2581aff9abe25942e8009214b88d0a5
size: 72680213
evaluate:
cmd: python scripts/evaluate.py data/evaluation_data.csv -m data/metrics.json
-img data/eval.png
deps:
- path: data/evaluation_data.csv
hash: md5
md5: 77ba66c4b4afde504b2c5cee0463f13a
size: 253092
md5: 1b5f226c52d70bda7e2551d7778c1e89
size: 385945
- path: scripts/evaluate.py
hash: md5
md5: 4154acf8e74c1d8bcd0b0da72af038e0
size: 2728
outs:
- path: data/eval.png
hash: md5
md5: 03bcb249c6da9d4f98560b22c0fe7667
size: 83635
md5: 3308b984c5168a996805443d25697026
size: 83001
- path: data/metrics.json
hash: md5
md5: 1508479652b76271f9d8c6b5b155d48f
size: 286
md5: 709909482614d6cb47c160506088f53e
size: 287
5 changes: 4 additions & 1 deletion dvc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,10 @@ stages:
outs:
- ${files.doc-store}
generate-testset:
cmd: head -n ${test-set-size} data/synthetic-datasets/eidc_rag_test_set.csv > ${files.test-set}
cmd: python scripts/generate_synthetic_testset.py ${files.extracted} ${files.test-set} ${test-set-size}
deps:
- ${files.extracted}
- scripts/generate_synthetic_testset.py
outs:
- ${files.test-set}
run-rag-pipeline:
Expand Down
Loading
Loading