From 9c8e7519f38850efcac4994543b7161a26449f75 Mon Sep 17 00:00:00 2001 From: Rin_0xTohsaka <89101179+Tachikoma000@users.noreply.github.com> Date: Wed, 18 Sep 2024 19:11:38 -0500 Subject: [PATCH] minor comments to help review main feature files - Comments added to pdf.rs, mod.rs, document_loaders.rs --- rig-core/examples/document_loaders.rs | 6 ++---- rig-core/src/document_loaders/mod.rs | 4 +++- rig-core/src/document_loaders/pdf.rs | 13 ++++++++++++- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/rig-core/examples/document_loaders.rs b/rig-core/examples/document_loaders.rs index 4e04b217..40a41712 100644 --- a/rig-core/examples/document_loaders.rs +++ b/rig-core/examples/document_loaders.rs @@ -1,5 +1,3 @@ -// examples/document_loaders.rs - use rig::{ completion::Prompt, document_loaders::PdfLoader, @@ -50,8 +48,8 @@ async fn main() -> Result<(), anyhow::Error> { embeddings.len() ); for emb in &embeddings { - println!("Document ID: {}", emb.id); - println!("Document Content: {:?}", emb.document); + // println!("Document ID: {}", emb.id); + // println!("Document Content: {:?}", emb.document); println!("Number of embeddings: {}", emb.embeddings.len()); println!( "First embedding vector length: {}", diff --git a/rig-core/src/document_loaders/mod.rs b/rig-core/src/document_loaders/mod.rs index ee375bc9..b25b15f5 100644 --- a/rig-core/src/document_loaders/mod.rs +++ b/rig-core/src/document_loaders/mod.rs @@ -1,4 +1,5 @@ -// src/document_loaders/mod.rs +//! This module contains the implementation of document loaders for various file formats. +//! Currently, it includes loaders for CSV and PDF files. mod csv; // mod directory; @@ -14,6 +15,7 @@ use std::error::Error as StdError; #[async_trait] pub trait DocumentLoader { + /// Asynchronously loads the document and returns a vector of document embeddings. async fn load(&self) -> Result, Box>; } diff --git a/rig-core/src/document_loaders/pdf.rs b/rig-core/src/document_loaders/pdf.rs index f8489ec3..6d073af1 100644 --- a/rig-core/src/document_loaders/pdf.rs +++ b/rig-core/src/document_loaders/pdf.rs @@ -1,14 +1,17 @@ +// Import necessary dependencies use super::DocumentLoader; use crate::embeddings::DocumentEmbeddings; use async_trait::async_trait; use lopdf::Document; use serde_json::json; +// Define a struct for loading PDF documents pub struct PdfLoader { path: String, } impl PdfLoader { + // Implement a constructor for the PdfLoader struct pub fn new(path: &str) -> Self { Self { path: path.to_string(), @@ -18,17 +21,25 @@ impl PdfLoader { #[async_trait] impl DocumentLoader for PdfLoader { + // Implement the load function for the DocumentLoader trait async fn load( &self, ) -> Result, Box> { + // Load the PDF document from the specified path let doc = Document::load(&self.path)?; + + // Extract text from each page of the PDF document let mut text = String::new(); for page in doc.get_pages() { if let Ok(content) = doc.extract_text(&[page.0]) { text.push_str(&content); } } - println!("Extracted text from PDF: {}", text); // Debug print + + // Print the extracted text for debugging purposes + println!("Extracted text from PDF: {}", text); + + // Create a DocumentEmbeddings object with the extracted text Ok(vec![DocumentEmbeddings { id: self.path.clone(), document: json!({"text": text}),