Skip to content

Commit

Permalink
Added dummy steps to pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
matthewcoole committed Oct 17, 2024
1 parent aaa3a93 commit f9b9b3e
Show file tree
Hide file tree
Showing 7 changed files with 101 additions and 24 deletions.
3 changes: 3 additions & 0 deletions data/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,6 @@
/chunked_embeddings.json
/embeddings.json
/chroma-data
/evaluation_data.csv
/eidc_rag_test_sample.csv
/supporting-docs.json
6 changes: 3 additions & 3 deletions data/synthetic-datasets.dvc
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
outs:
- md5: 61b4177259b03a7227784b5b7560726d.dir
size: 144597
nfiles: 1
- md5: 9d87c638c5cc518ea360c474c4e1e9ef.dir
size: 152121
nfiles: 2
hash: md5
path: synthetic-datasets
46 changes: 46 additions & 0 deletions dvc.lock
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@ stages:
hash: md5
md5: fce18ce3c43175af1cea5d84dac9baf9
size: 4579965
- path: data/supporting-docs.json
hash: md5
md5: 0febface6f1d23fda46c11bef65284f4
size: 34
- path: scripts/chunk_data.py
hash: md5
md5: 681528e4aa1dc8cfb5fe5e5472e25fdf
Expand Down Expand Up @@ -95,3 +99,45 @@ stages:
md5: 2f2ba629bf078284bb6d6be73c6166a7.dir
size: 2069220
nfiles: 5
run-rag-pipeline:
cmd: python scripts/run_rag_pipeline.py data/eidc_rag_test_sample.csv data/evaluation_data.csv
deps:
- path: data/chroma-data
hash: md5
md5: 1d7c499f71791267391ff4108632988c.dir
size: 2069220
nfiles: 5
- path: data/eidc_rag_test_sample.csv
hash: md5
md5: a371d83c5822d256286e80d64d58c3fe
size: 7524
- path: scripts/run_rag_pipeline.py
hash: md5
md5: 6d1f49fa8b22288ecd50ed0e3898fd60
size: 3153
outs:
- path: data/evaluation_data.csv
hash: md5
md5: e313cb899c10a2b5ad670b8bc84d059f
size: 8407
generate-testset:
cmd: cp data/synthetic-datasets/eidc_rag_test_sample.csv data/
outs:
- path: data/eidc_rag_test_sample.csv
hash: md5
md5: a371d83c5822d256286e80d64d58c3fe
size: 7524
fetch-supporting-docs:
cmd: echo "Fetch supporitng docs from legilo" > data/supporting-docs.json
outs:
- path: data/supporting-docs.json
hash: md5
md5: 0febface6f1d23fda46c11bef65284f4
size: 34
evaluate:
cmd: echo "Evaluate responses"
deps:
- path: data/evaluation_data.csv
hash: md5
md5: e313cb899c10a2b5ad670b8bc84d059f
size: 8407
22 changes: 20 additions & 2 deletions dvc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ stages:
- scripts/fetch_eidc_metadata.py
outs:
- ${files.metadata}
fetch-supporting-docs:
cmd: echo "Fetch supporitng docs from legilo" > ${files.supporting-docs}
outs:
- ${files.supporting-docs}
extract-metadata:
cmd: python scripts/extract_metadata.py ${files.metadata} ${files.extracted}
deps:
Expand All @@ -16,6 +20,7 @@ stages:
cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} -s ${sample-size} ${files.extracted}
deps:
- ${files.extracted}
- ${files.supporting-docs}
- scripts/chunk_data.py
outs:
- ${files.chunked}
Expand All @@ -33,6 +38,19 @@ stages:
- scripts/upload_to_docstore.py
outs:
- ${files.doc-store}
generate-testset:
cmd: cp data/synthetic-datasets/eidc_rag_test_sample.csv data/
outs:
- ${files.test-set}
run-rag-pipeline:
cmd: python scripts/run_rag_pipeline.py ${files.test-set} ${files.eval-set}
deps:
- ${files.test-set}
- ${files.doc-store}
- scripts/run_rag_pipeline.py
outs:
- ${files.eval-set}
evaluate:
cmd: echo "Evaluate responses"
deps:
- ${files.eval-set}
4 changes: 3 additions & 1 deletion params.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@ hp:
files:
metadata: "data/eidc_metadata.json"
extracted: "data/extracted_metadata.json"
supporting-docs: "data/supporting-docs.json"
chunked: "data/chunked_data.json"
embeddings: "data/embeddings.json"
doc-store: "data/chroma-data"
test-set: "data/synthetic-datasets/eidc_rag_test_sample.csv"
test-set: "data/eidc_rag_test_sample.csv"
eval-set: "data/evaluation_data.csv"
sample-size: 10 # sample size of 0 will process all data
rag:
model: llama3.1
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ dependencies = [
"chromadb",
"ollama-haystack == 0.0.7",
"chroma-haystack",
"ragas == 0.1.10",
"nltk"
]

[project.optional-dependencies]
Expand Down
42 changes: 24 additions & 18 deletions scripts/run_rag_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def build_rag_pipeline(model_name: str) -> Pipeline:
return rag_pipe


def query_pipeline(query: str, pipeline: Pipeline):
def run_query(query: str, pipeline: Pipeline):
return pipeline.run(
{
"retriever": {"query": query},
Expand All @@ -67,25 +67,27 @@ def query_pipeline(query: str, pipeline: Pipeline):
)


def main(test_data_file: str):
def query_pipeline(questions, rag_pipe):
    """Run a batch of questions through the RAG pipeline.

    Args:
        questions: Iterable of question strings to submit to the pipeline.
        rag_pipe: The Haystack ``Pipeline`` used to answer each question.

    Returns:
        A ``(answers, contexts)`` pair of parallel lists: for each question,
        the generated answer text and the list of retrieved document contents
        that backed it.
    """
    answers = []
    contexts = []
    for question in questions:
        result = run_query(question, rag_pipe)
        # The answer builder yields a list of answers; only the first is used.
        answer = result["answer_builder"]["answers"][0]
        answers.append(answer.data)
        contexts.append([doc.content for doc in answer.documents])
    return answers, contexts


def main(test_data_file: str, output_file: str) -> None:
    """Answer every test question with the RAG pipeline and save the results.

    Args:
        test_data_file: Path to a CSV of test queries; must contain a
            ``question`` column.
        output_file: Path the evaluation CSV is written to.
    """
    rag_pipe = build_rag_pipeline("llama3.1")

    df = pd.read_csv(test_data_file)
    # The synthetic test set carries columns that will be regenerated below;
    # errors="ignore" keeps this from raising if an input lacks them.
    df.drop(columns=["rating", "contexts"], inplace=True, errors="ignore")

    answers, contexts = query_pipeline(df["question"], rag_pipe)

    df["answer"] = answers
    df["contexts"] = contexts
    # index=False: the row index is meaningless to downstream evaluation.
    df.to_csv(output_file, index=False)


if __name__ == "__main__":
Expand All @@ -94,5 +96,9 @@ def main(test_data_file: str):
"test_data_file",
help="File containing test queries to generate response from the RAG pipeline.",
)
parser.add_argument(
"output_file",
help="File to output results to.",
)
args = parser.parse_args()
main(args.test_data_file)
main(args.test_data_file, args.output_file)

1 comment on commit f9b9b3e

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

answer_relevancy: 0.49425339911566113
context_recall: 0.5376895418550558
answer_correctness: 0.527280896744315
context_precision: 0.5307366025707845

Please sign in to comment.