Skip to content

Commit

Permalink
Add CSV Loader to Document Loaders
Browse files Browse the repository at this point in the history
- Implement CsvLoader struct in src/document_loaders/csv.rs
- Add CsvLoader to the document_loaders module
- Implement DocumentLoader trait for CsvLoader
- Use csv crate for CSV parsing
- Add error handling for file operations and CSV parsing
- Update Cargo.toml with csv dependency
- Update documentation with CsvLoader usage examples
  • Loading branch information
Tachikoma000 committed Sep 19, 2024
1 parent 9c8e751 commit 5f05baa
Show file tree
Hide file tree
Showing 3 changed files with 206 additions and 27 deletions.
85 changes: 85 additions & 0 deletions rig-core/examples/rag_with_csv.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
use rig::{
completion::Prompt,
document_loaders::CsvLoader,
embeddings::EmbeddingsBuilder,
providers::openai::{Client, TEXT_EMBEDDING_ADA_002},
vector_store::{in_memory_store::InMemoryVectorStore, VectorStore},
};
use std::env;
use std::path::PathBuf;

/// Runs a retrieval-augmented generation (RAG) example over a CSV file.
///
/// Steps performed:
/// 1. Resolve the sample CSV path (relative to the current working directory,
///    so the example is expected to be launched from the workspace root).
/// 2. Load the CSV through [`CsvLoader`] and embed it with
///    `TEXT_EMBEDDING_ADA_002`.
/// 3. Store the embeddings in an [`InMemoryVectorStore`] and build an index.
/// 4. Ask a `gpt-4` agent, with the index as dynamic context, to summarise
///    the data and print its answer.
///
/// # Errors
///
/// Returns an error if the working directory cannot be read, if the CSV file
/// does not exist, or if embedding, indexing, or prompting fails.
#[tokio::main]
async fn main() -> Result<(), anyhow::Error> {
    // Print current working directory
    let cwd = env::current_dir()?;
    println!("Current working directory: {:?}", cwd);

    // Path to the CSV file
    let csv_path = PathBuf::from("rig-core/examples/sample_data/top_rated_movies.csv");

    // Print absolute path. `canonicalize` fails when the file is missing, which
    // would pre-empt the friendlier existence check below, so the absolute path
    // is derived by joining onto the working directory instead.
    println!("Attempting to access file at: {:?}", cwd.join(&csv_path));

    // Check if the file exists. Fail with a real error (non-zero exit) rather
    // than reporting success after printing a message.
    if !csv_path.exists() {
        anyhow::bail!("The file {} does not exist.", csv_path.display());
    }

    println!("File found successfully!");

    // `CsvLoader::new` takes a `&str`; surface a non-UTF-8 path as an error
    // instead of panicking.
    let csv_path_str = csv_path
        .to_str()
        .ok_or_else(|| anyhow::anyhow!("CSV path {} is not valid UTF-8", csv_path.display()))?;

    // Initialize OpenAI client
    // NOTE(review): presumably reads the API key from the environment — confirm.
    let openai = Client::from_env();
    let embedding_model = openai.embedding_model(TEXT_EMBEDDING_ADA_002);

    // Create vector store
    let mut vector_store = InMemoryVectorStore::default();

    // Build embeddings
    let embeddings = EmbeddingsBuilder::new(embedding_model.clone())
        .add_loader(CsvLoader::new(csv_path_str))
        .build()
        .await?;

    println!(
        "Embeddings created successfully. Count: {}",
        embeddings.len()
    );
    for emb in &embeddings {
        println!("Number of embeddings: {}", emb.embeddings.len());
        println!(
            "First embedding vector length: {}",
            emb.embeddings.first().map_or(0, |e| e.vec.len())
        );
        println!("--------------------");
    }

    // Add documents to vector store
    vector_store.add_documents(embeddings).await?;

    // Create vector store index
    let index = vector_store.index(embedding_model);

    // Create RAG agent; up to 5 documents from the index are supplied as
    // dynamic context per prompt.
    let rag_agent = openai
        .agent("gpt-4")
        .preamble(
            "
You are a knowledgeable assistant.
Use the information provided to you to answer questions about the CSV data.
",
        )
        .dynamic_context(5, index)
        .build();

    // Prompt the agent and print the response
    let response = rag_agent
        .prompt("Give me a summary of the CSV data.")
        .await?;

    println!("Agent Response:\n{}", response);

    Ok(())
}
110 changes: 110 additions & 0 deletions rig-core/examples/sample_data/top_rated_movies.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
popularity,release_date,title,vote_average
174.522,9/23/1994,The Shawshank Redemption,8.706
165.677,3/14/1972,The Godfather,8.69
174.522,9/23/1994,The Shawshank Redemption,8.706
165.677,3/14/1972,The Godfather,8.69
47.916,12/20/1997,Life Is Beautiful,8.449
197.569,11/5/2014,Interstellar,8.44
42.629,10/13/2023,TAYLOR SWIFT | THE ERAS TOUR,8.388
21.39,11/19/2020,Gabriel's Inferno: Part III,8.4
69.527,7/3/1985,Back to the Future,8.318
49.507,6/2/1989,Dead Poets Society,8.312
18.452,10/28/1998,The Legend of 1900,8.266
14.67,8/22/2020,Given,8.3
173.556,12/7/2022,Puss in Boots: The Last Wish,8.227
29.933,10/26/2020,Wolfwalkers,8.22
135.05,10/7/2016,Hacksaw Ridge,8.198
46.126,12/19/1971,A Clockwork Orange,8.2
20.423,1/28/2005,Innocent Voices,8.174
27.808,11/3/1953,Tokyo Story,8.2
83.731,3/18/2021,Zack Snyder's Justice League,8.148
29.312,10/25/2019,Better Days,8.1
89.436,7/3/1991,Terminator 2: Judgment Day,8.119
14.482,7/6/1944,Double Indemnity,8.1
55.332,9/19/2013,Prisoners,8.098
14.629,3/31/1954,Sansho the Bailiff,8.098
11.283,11/24/2021,Far from the Tree,8.074
67.735,9/16/2005,Pride & Prejudice,8.075
38.208,2/26/2014,The Grand Budapest Hotel,8.1
42.511,12/3/2019,How to Train Your Dragon: Homecoming,8.048
13.881,10/27/2022,Beyond the Universe,8.027
26.21,10/18/2019,Jojo Rabbit,8.024
12.038,11/27/2020,Black Beauty,8
37.023,6/8/2009,Hachi: A Dog's Tale,8.008
5.126,6/1/2017,In a Heartbeat,7.995
33.577,12/23/2009,3 Idiots,7.995
137.914,5/3/2023,Guardians of the Galaxy Vol. 3,8
18.167,1/31/2009,Love Exposure,8
38.425,8/6/1999,The Sixth Sense,7.957
21.449,12/15/2004,Million Dollar Baby,7.957
56.873,6/13/2007,No Country for Old Men,7.944
35.17,10/18/2013,12 Years a Slave,7.942
12.423,1/21/2022,My Father's Violin,7.926
20.658,6/22/1954,On the Waterfront,7.9
10.913,8/1/1997,Children of Heaven,7.914
27.948,12/21/2016,Dangal,7.913
30.657,1/15/2021,Wish Dragon,7.902
28.602,11/4/2016,A Street Cat Named Bob,7.905
11.461,6/10/2008,La Maison en Petits Cubes,7.893
52.355,12/20/2017,The Greatest Showman,7.891
19.146,9/20/2000,Yi Yi,7.875
19.15,9/1/2000,Dancer in the Dark,7.875
14.602,12/21/2011,My Way,7.858
18.928,9/16/2004,Downfall,7.858
13.954,3/15/1940,The Grapes of Wrath,7.8
43.665,3/30/1990,Dances with Wolves,7.847
16.392,5/1/1983,Nostalgia,7.838
28.293,12/22/1960,Two Women,7.837
42.994,12/3/2022,The First Slam Dunk,7.8
92.789,6/21/2007,Ratatouille,7.824
32.022,3/20/1972,Solaris,7.8
22.13,6/16/2004,Before Sunset,7.818
33.161,10/23/2009,Fantastic Mr. Fox,7.8
141.3,7/9/2003,Pirates of the Caribbean: The Curse of the Black Pearl,7.804
10.15,8/21/1988,A Short Film About Love,7.794
45.622,9/20/2012,The Perks of Being a Wallflower,7.793
16.517,8/31/2000,Nine Queens,7.784
12.185,12/17/1993,The Wrong Trousers,7.784
20.295,12/4/1990,Awakenings,7.768
11.949,5/28/2009,Partly Cloudy,7.767
18.816,11/18/1974,A Woman Under the Influence,7.8
18.124,2/14/2008,The Chaser,7.758
86.727,2/11/2016,Zootopia,7.749
12.807,3/19/1980,The King and the Mockingbird,7.749
20.424,9/28/2019,Marriage Story,7.738
56.129,7/13/2022,The Killer,7.7
16.337,11/20/2020,Sound of Metal,7.727
14.088,9/20/1962,Vivre Sa Vie,7.727
18.88,12/22/2004,Hotel Rwanda,7.7
24.783,2/18/2017,Sword Art Online: The Movie – Ordinal Scale,7.718
62.912,6/10/2005,Batman Begins,7.709
28.113,12/9/1965,A Charlie Brown Christmas,7.707
8.592,4/7/1966,For Love and Gold,7.698
15.608,3/31/2011,The Turin Horse,7.7
61.64,9/30/2015,The Martian,7.687
13.588,1/8/2014,Boys,7.687
12.97,6/1/1998,"Black Cat, White Cat",7.68
12.521,9/25/1961,The Hustler,7.68
14.101,6/27/1951,Strangers on a Train,7.671
16.624,5/23/2019,The Traitor,7.672
29.525,3/6/1996,Primal Fear,7.661
26.983,3/31/2016,Hunt for the Wilderpeople,7.7
13.112,9/2/1949,White Heat,7.6
46.7,7/24/2020,The Kissing Booth 2,7.648
20.064,9/28/2022,Entergalactic,7.641
78.458,2/22/2024,Exhuma,7.64
56.569,1/19/2017,A Dog's Purpose,7.632
47.331,9/26/2008,Fireproof,7.632
18.036,10/19/1970,Le Cercle Rouge,7.623
65.228,2/24/2017,Get Out,7.623
38.806,4/9/2015,The Longest Ride,7.614
20.63,3/30/2005,Mysterious Skin,7.615
24.423,3/24/1989,The Killer,7.6
13.757,6/19/1969,The Wild Bunch,7.607
25.542,10/24/2008,Changeling,7.6
25.864,12/20/1991,JFK,7.6
23.749,6/19/2020,Feel the Beat,7.59
45.792,3/30/1999,10 Things I Hate About You,7.6
24.649,8/24/2018,Searching,7.583
79.872,12/15/2009,Avatar,7.583
17.303,6/19/2014,What We Do in the Shadows,7.575
38 changes: 11 additions & 27 deletions rig-core/src/document_loaders/csv.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
// src/document_loaders/csv.rs

use async_trait::async_trait;
use csv::Reader;
use serde_json::Value;
use serde_json::json;
use std::error::Error as StdError;
use tokio::fs::File;
use tokio::io::AsyncReadExt;
Expand All @@ -12,14 +10,12 @@ use crate::embeddings::DocumentEmbeddings;

pub struct CsvLoader {
path: String,
id_column: Option<String>,
}

impl CsvLoader {
pub fn new(path: &str, id_column: Option<&str>) -> Self {
pub fn new(path: &str) -> Self {
Self {
path: path.to_string(),
id_column: id_column.map(String::from),
}
}
}
Expand All @@ -32,34 +28,22 @@ impl DocumentLoader for CsvLoader {
file.read_to_string(&mut contents).await?;

let mut reader = Reader::from_reader(contents.as_bytes());
let headers = reader.headers()?.clone();
let headers: Vec<String> = reader.headers()?.iter().map(|h| h.to_string()).collect();

let mut documents = Vec::new();
let mut csv_content = String::new();

for result in reader.records() {
let record = result?;
let mut doc = serde_json::Map::new();

for (i, field) in record.iter().enumerate() {
doc.insert(headers[i].to_string(), Value::String(field.to_string()));
csv_content.push_str(&format!("{}: {}\n", headers[i], field));
}

let id = if let Some(id_col) = &self.id_column {
doc.get(id_col)
.and_then(|v| v.as_str())
.unwrap_or_default()
.to_string()
} else {
format!("csv_row_{}", documents.len())
};

documents.push(DocumentEmbeddings {
id,
document: Value::Object(doc),
embeddings: vec![],
});
csv_content.push_str("\n");
}

Ok(documents)
Ok(vec![DocumentEmbeddings {
id: self.path.clone(),
document: json!({"text": csv_content}),
embeddings: vec![],
}])
}
}

0 comments on commit 5f05baa

Please sign in to comment.