diff --git a/.changeset/famous-ways-give.md b/.changeset/famous-ways-give.md new file mode 100644 index 000000000..08891c679 --- /dev/null +++ b/.changeset/famous-ways-give.md @@ -0,0 +1,5 @@ +--- +"create-llama": patch +--- + +Change --agents parameter to --use-case diff --git a/.changeset/green-melons-thank.md b/.changeset/green-melons-thank.md new file mode 100644 index 000000000..c17a3cd7f --- /dev/null +++ b/.changeset/green-melons-thank.md @@ -0,0 +1,5 @@ +--- +"create-llama": patch +--- + +Add LlamaCloud support for Reflex templates diff --git a/create-app.ts b/create-app.ts index 345ff3699..b28dea5ac 100644 --- a/create-app.ts +++ b/create-app.ts @@ -39,7 +39,7 @@ export async function createApp({ tools, useLlamaParse, observability, - agents, + useCase, }: InstallAppArgs): Promise { const root = path.resolve(appPath); @@ -84,7 +84,7 @@ export async function createApp({ tools, useLlamaParse, observability, - agents, + useCase, }; // Install backend diff --git a/e2e/shared/multiagent_template.spec.ts b/e2e/shared/multiagent_template.spec.ts index ccb33539b..0839d0003 100644 --- a/e2e/shared/multiagent_template.spec.ts +++ b/e2e/shared/multiagent_template.spec.ts @@ -18,10 +18,10 @@ const templateUI: TemplateUI = "shadcn"; const templatePostInstallAction: TemplatePostInstallAction = "runApp"; const appType: AppType = templateFramework === "fastapi" ?
"--frontend" : ""; const userMessage = "Write a blog post about physical standards for letters"; -const templateAgents = ["financial_report", "blog", "form_filling"]; +const templateUseCases = ["financial_report", "blog", "form_filling"]; -for (const agents of templateAgents) { - test.describe(`Test multiagent template ${agents} ${templateFramework} ${dataSource} ${templateUI} ${appType} ${templatePostInstallAction}`, async () => { +for (const useCase of templateUseCases) { + test.describe(`Test multiagent template ${useCase} ${templateFramework} ${dataSource} ${templateUI} ${appType} ${templatePostInstallAction}`, async () => { test.skip( process.platform !== "linux" || process.env.DATASOURCE === "--no-files", "The multiagent template currently only works with files. We also only run on Linux to speed up tests.", @@ -46,7 +46,7 @@ for (const agents of templateAgents) { postInstallAction: templatePostInstallAction, templateUI, appType, - agents, + useCase, }); name = result.projectName; appProcess = result.appProcess; @@ -71,8 +71,8 @@ for (const agents of templateAgents) { }) => { test.skip( templatePostInstallAction !== "runApp" || - agents === "financial_report" || - agents === "form_filling" || + useCase === "financial_report" || + useCase === "form_filling" || templateFramework === "express", "Skip chat tests for financial report and form filling.", ); diff --git a/e2e/shared/reflex_template.spec.ts b/e2e/shared/reflex_template.spec.ts index 766d20a1f..d20c6d7e8 100644 --- a/e2e/shared/reflex_template.spec.ts +++ b/e2e/shared/reflex_template.spec.ts @@ -3,7 +3,7 @@ import { expect, test } from "@playwright/test"; import { ChildProcess } from "child_process"; import fs from "fs"; import path from "path"; -import { TemplateAgents, TemplateFramework } from "../../helpers"; +import { TemplateFramework, TemplateUseCase } from "../../helpers"; import { createTestDir, runCreateLlama } from "../utils"; const templateFramework: TemplateFramework = process.env.FRAMEWORK 
@@ -12,7 +12,7 @@ const templateFramework: TemplateFramework = process.env.FRAMEWORK const dataSource: string = process.env.DATASOURCE ? process.env.DATASOURCE : "--example-file"; -const templateAgents: TemplateAgents[] = ["extractor", "contract_review"]; +const templateUseCases: TemplateUseCase[] = ["extractor", "contract_review"]; // The reflex template currently only works with FastAPI and files (and not on Windows) if ( @@ -20,8 +20,8 @@ if ( templateFramework === "fastapi" && dataSource === "--example-file" ) { - for (const agents of templateAgents) { - test.describe(`Test reflex template ${agents} ${templateFramework} ${dataSource}`, async () => { + for (const useCase of templateUseCases) { + test.describe(`Test reflex template ${useCase} ${templateFramework} ${dataSource}`, async () => { let appPort: number; let name: string; let appProcess: ChildProcess; @@ -39,7 +39,7 @@ if ( vectorDb: "none", port: appPort, postInstallAction: "runApp", - agents, + useCase, }); name = result.projectName; appProcess = result.appProcess; diff --git a/e2e/utils.ts b/e2e/utils.ts index e7a9cc9a3..a2d290830 100644 --- a/e2e/utils.ts +++ b/e2e/utils.ts @@ -33,7 +33,7 @@ export type RunCreateLlamaOptions = { tools?: string; useLlamaParse?: boolean; observability?: string; - agents?: string; + useCase?: string; }; export async function runCreateLlama({ @@ -51,7 +51,7 @@ export async function runCreateLlama({ tools, useLlamaParse, observability, - agents, + useCase, }: RunCreateLlamaOptions): Promise { if (!process.env.OPENAI_API_KEY || !process.env.LLAMA_CLOUD_API_KEY) { throw new Error( @@ -113,8 +113,8 @@ export async function runCreateLlama({ if (observability) { commandArgs.push("--observability", observability); } - if ((templateType === "multiagent" || templateType === "reflex") && agents) { - commandArgs.push("--agents", agents); + if ((templateType === "multiagent" || templateType === "reflex") && useCase) { + commandArgs.push("--use-case", useCase); } const command = 
commandArgs.join(" "); diff --git a/helpers/python.ts b/helpers/python.ts index e1b2271f0..5201c1b59 100644 --- a/helpers/python.ts +++ b/helpers/python.ts @@ -380,28 +380,32 @@ export const installPythonDependencies = ( }; export const installPythonTemplate = async ({ + appName, root, template, framework, vectorDb, + postInstallAction, + modelConfig, dataSources, tools, - postInstallAction, + useLlamaParse, + useCase, observability, - modelConfig, - agents, }: Pick< InstallTemplateArgs, + | "appName" | "root" - | "framework" | "template" + | "framework" | "vectorDb" + | "postInstallAction" + | "modelConfig" | "dataSources" | "tools" - | "postInstallAction" + | "useLlamaParse" + | "useCase" | "observability" - | "modelConfig" - | "agents" >) => { console.log("\nInitializing Python project with template:", template, "\n"); let templatePath; @@ -476,21 +480,12 @@ export const installPythonTemplate = async ({ await copyRouterCode(root, tools ?? []); } - if (template === "multiagent") { - // Copy multi-agent code - await copy("**", path.join(root), { - parents: true, - cwd: path.join(compPath, "multiagent", "python"), - rename: assetRelocator, - }); - } - if (template === "multiagent" || template === "reflex") { - if (agents) { + if (useCase) { const sourcePath = template === "multiagent" - ? path.join(compPath, "agents", "python", agents) - : path.join(compPath, "reflex", agents); + ? path.join(compPath, "agents", "python", useCase) + : path.join(compPath, "reflex", useCase); await copy("**", path.join(root), { parents: true, @@ -500,7 +495,7 @@ export const installPythonTemplate = async ({ } else { console.log( red( - `There is no agent selected for ${template} template. Please pick an agent to use via --agents flag.`, + `There is no use case selected for ${template} template. 
Please pick a use case to use via --use-case flag.`, ), ); process.exit(1); diff --git a/helpers/types.ts b/helpers/types.ts index a4635f0e2..544a27109 100644 --- a/helpers/types.ts +++ b/helpers/types.ts @@ -49,7 +49,7 @@ export type TemplateDataSource = { }; export type TemplateDataSourceType = "file" | "web" | "db"; export type TemplateObservability = "none" | "traceloop" | "llamatrace"; -export type TemplateAgents = +export type TemplateUseCase = | "financial_report" | "blog" | "form_filling" @@ -106,5 +106,5 @@ export interface InstallTemplateArgs { postInstallAction?: TemplatePostInstallAction; tools?: Tool[]; observability?: TemplateObservability; - agents?: TemplateAgents; + useCase?: TemplateUseCase; } diff --git a/helpers/typescript.ts b/helpers/typescript.ts index 761a4bb3d..b516cd392 100644 --- a/helpers/typescript.ts +++ b/helpers/typescript.ts @@ -26,7 +26,7 @@ export const installTSTemplate = async ({ tools, dataSources, useLlamaParse, - agents, + useCase, }: InstallTemplateArgs & { backend: boolean }) => { console.log(bold(`Using ${packageManager}.`)); @@ -131,16 +131,16 @@ export const installTSTemplate = async ({ cwd: path.join(multiagentPath, "workflow"), }); - // Copy agents use case code for multiagent template - if (agents) { - console.log("\nCopying agent:", agents, "\n"); - const useCasePath = path.join(compPath, "agents", "typescript", agents); - const agentsCodePath = path.join(useCasePath, "workflow"); + // Copy use case code for multiagent template + if (useCase) { + console.log("\nCopying use case:", useCase, "\n"); + const useCasePath = path.join(compPath, "agents", "typescript", useCase); + const useCaseCodePath = path.join(useCasePath, "workflow"); - // Copy agent codes + // Copy use case codes await copy("**", path.join(root, relativeEngineDestPath, "workflow"), { parents: true, - cwd: agentsCodePath, + cwd: useCaseCodePath, rename: assetRelocator, }); @@ -153,7 +153,7 @@ export const installTSTemplate = async ({ } else { 
console.log( red( - `There is no agent selected for ${template} template. Please pick an agent to use via --agents flag.`, + `There is no use case selected for ${template} template. Please pick a use case to use via --use-case flag.`, ), ); process.exit(1); diff --git a/index.ts b/index.ts index 4a3438995..370bd1e8b 100644 --- a/index.ts +++ b/index.ts @@ -202,10 +202,10 @@ const program = new Command(packageJson.name) false, ) .option( - "--agents ", + "--use-case ", ` - Select which agents to use for the multi-agent template (e.g: financial_report, blog). + Select which use case to use for the multi-agent template (e.g: financial_report, blog). `, ) .allowUnknownOption() diff --git a/questions/questions.ts b/questions/questions.ts index ebc83396f..559839975 100644 --- a/questions/questions.ts +++ b/questions/questions.ts @@ -2,7 +2,7 @@ import { blue } from "picocolors"; import prompts from "prompts"; import { isCI } from "."; import { COMMUNITY_OWNER, COMMUNITY_REPO } from "../helpers/constant"; -import { EXAMPLE_FILE } from "../helpers/datasources"; +import { EXAMPLE_FILE, EXAMPLE_GDPR } from "../helpers/datasources"; import { getAvailableLlamapackOptions } from "../helpers/llama-pack"; import { askModelConfig } from "../helpers/providers"; import { getProjectOptions } from "../helpers/repo"; @@ -33,7 +33,7 @@ export const askProQuestions = async (program: QuestionArgs) => { title: "Multi-agent app (using workflows)", value: "multiagent", }, - { title: "Structured Extractor", value: "extractor" }, + { title: "Fullstack python template with Reflex", value: "reflex" }, { title: `Community template from ${styledRepo}`, value: "community", @@ -100,6 +100,24 @@ export const askProQuestions = async (program: QuestionArgs) => { // So we just use example file for extractor template, this allows user to choose vector database later program.dataSources = [EXAMPLE_FILE]; program.framework = "fastapi"; + // Ask for which Reflex use case to use + const { useCase } = await 
prompts( + { + type: "select", + name: "useCase", + message: "Which use case would you like to build?", + choices: [ + { title: "Structured Extractor", value: "extractor" }, + { + title: "Contract review (using Workflow)", + value: "contract_review", + }, + ], + initial: 0, + }, + questionHandlers, + ); + program.useCase = useCase; } if (!program.framework) { @@ -171,32 +189,50 @@ export const askProQuestions = async (program: QuestionArgs) => { program.observability = observability; } - // Ask agents - if (program.template === "multiagent" && !program.agents) { - const { agents } = await prompts( + if ( + (program.template === "reflex" || program.template === "multiagent") && + !program.useCase + ) { + const choices = + program.template === "reflex" + ? [ + { title: "Structured Extractor", value: "extractor" }, + { + title: "Contract review (using Workflow)", + value: "contract_review", + }, + ] + : [ + { + title: "Financial report (generate a financial report)", + value: "financial_report", + }, + { + title: "Form filling (fill missing value in a CSV file)", + value: "form_filling", + }, + { title: "Blog writer (Write a blog post)", value: "blog" }, + ]; + + const { useCase } = await prompts( { type: "select", - name: "agents", - message: "Which agents would you like to use?", - choices: [ - { - title: "Financial report (generate a financial report)", - value: "financial_report", - }, - { - title: "Form filling (fill missing value in a CSV file)", - value: "form_filling", - }, - { - title: "Blog writer (Write a blog post)", - value: "blog_writer", - }, - ], + name: "useCase", + message: "Which use case would you like to use?", + choices, initial: 0, }, questionHandlers, ); - program.agents = agents; + program.useCase = useCase; + } + + // Configure framework and data sources for Reflex template + if (program.template === "reflex") { + program.framework = "fastapi"; + + program.dataSources = + program.useCase === "extractor" ? 
[EXAMPLE_FILE] : [EXAMPLE_GDPR]; } if (!program.modelConfig) { @@ -222,8 +258,8 @@ export const askProQuestions = async (program: QuestionArgs) => { program.vectorDb = vectorDb; } - if (program.vectorDb === "llamacloud") { - // When using a LlamaCloud index, don't ask for data sources just copy an example file + if (program.vectorDb === "llamacloud" && program.dataSources.length === 0) { + // When using a LlamaCloud index and no data sources are provided, just copy an example file program.dataSources = [EXAMPLE_FILE]; } @@ -354,7 +390,7 @@ export const askProQuestions = async (program: QuestionArgs) => { // default to use LlamaParse if using LlamaCloud program.useLlamaParse = true; } else { - // Reflex template doesn't support LlamaParse and LlamaCloud right now (cannot use asyncio loop in Reflex) + // Reflex template doesn't support LlamaParse right now (cannot use asyncio loop in Reflex) if (program.useLlamaParse === undefined && program.template !== "reflex") { // if already set useLlamaParse, don't ask again if (program.dataSources.some((ds) => ds.type === "file")) { diff --git a/questions/simple.ts b/questions/simple.ts index 198261984..eb85065fe 100644 --- a/questions/simple.ts +++ b/questions/simple.ts @@ -74,34 +74,34 @@ export const askSimpleQuestions = async ( questionHandlers, ); language = newLanguage; + } + + const { useLlamaCloud: newUseLlamaCloud } = await prompts( + { + type: "toggle", + name: "useLlamaCloud", + message: "Do you want to use LlamaCloud services?", + initial: false, + active: "Yes", + inactive: "No", + hint: "see https://www.llamaindex.ai/enterprise for more info", + }, + questionHandlers, + ); + useLlamaCloud = newUseLlamaCloud; - const { useLlamaCloud: newUseLlamaCloud } = await prompts( + if (useLlamaCloud && !llamaCloudKey) { + // Ask for LlamaCloud API key, if not set + const { llamaCloudKey: newLlamaCloudKey } = await prompts( { - type: "toggle", - name: "useLlamaCloud", - message: "Do you want to use LlamaCloud services?", - 
initial: false, - active: "Yes", - inactive: "No", - hint: "see https://www.llamaindex.ai/enterprise for more info", + type: "text", + name: "llamaCloudKey", + message: + "Please provide your LlamaCloud API key (leave blank to skip):", }, questionHandlers, ); - useLlamaCloud = newUseLlamaCloud; - - if (useLlamaCloud && !llamaCloudKey) { - // Ask for LlamaCloud API key, if not set - const { llamaCloudKey: newLlamaCloudKey } = await prompts( - { - type: "text", - name: "llamaCloudKey", - message: - "Please provide your LlamaCloud API key (leave blank to skip):", - }, - questionHandlers, - ); - llamaCloudKey = newLlamaCloudKey || process.env.LLAMA_CLOUD_API_KEY; - } + llamaCloudKey = newLlamaCloudKey || process.env.LLAMA_CLOUD_API_KEY; } const results = await convertAnswers(args, { @@ -133,7 +133,7 @@ const convertAnswers = async ( AppType, Pick< QuestionResults, - "template" | "tools" | "frontend" | "dataSources" | "agents" + "template" | "tools" | "frontend" | "dataSources" | "useCase" > & { modelConfig?: ModelConfig; } @@ -160,7 +160,7 @@ const convertAnswers = async ( }, financial_report_agent: { template: "multiagent", - agents: "financial_report", + useCase: "financial_report", tools: getTools(["document_generator", "interpreter"]), dataSources: EXAMPLE_10K_SEC_FILES, frontend: true, @@ -168,7 +168,7 @@ const convertAnswers = async ( }, form_filling: { template: "multiagent", - agents: "form_filling", + useCase: "form_filling", tools: getTools(["form_filling"]), dataSources: EXAMPLE_10K_SEC_FILES, frontend: true, @@ -176,14 +176,14 @@ const convertAnswers = async ( }, extractor: { template: "reflex", - agents: "extractor", + useCase: "extractor", tools: [], frontend: false, dataSources: [EXAMPLE_FILE], }, contract_review: { template: "reflex", - agents: "contract_review", + useCase: "contract_review", tools: [], frontend: false, dataSources: [EXAMPLE_GDPR], diff --git a/templates/types/reflex/app/api/routers/models.py 
b/templates/types/reflex/app/api/routers/models.py new file mode 100644 index 000000000..db672e783 --- /dev/null +++ b/templates/types/reflex/app/api/routers/models.py @@ -0,0 +1,65 @@ +import logging +import os +from typing import Any, Dict, List, Optional + +from llama_index.core.schema import NodeWithScore +from pydantic import BaseModel + +from app.config import DATA_DIR + +logger = logging.getLogger("uvicorn") + + +class SourceNodes(BaseModel): + id: str + metadata: Dict[str, Any] + score: Optional[float] + text: str + url: Optional[str] + + @classmethod + def from_source_node(cls, source_node: NodeWithScore): + metadata = source_node.node.metadata + url = cls.get_url_from_metadata(metadata) + + return cls( + id=source_node.node.node_id, + metadata=metadata, + score=source_node.score, + text=source_node.node.text, # type: ignore + url=url, + ) + + @classmethod + def get_url_from_metadata(cls, metadata: Dict[str, Any]) -> Optional[str]: + url_prefix = os.getenv("FILESERVER_URL_PREFIX") + if not url_prefix: + logger.warning( + "Warning: FILESERVER_URL_PREFIX not set in environment variables. Can't use file server" + ) + file_name = metadata.get("file_name") + + if file_name and url_prefix: + # file_name exists and file server is configured + pipeline_id = metadata.get("pipeline_id") + if pipeline_id: + # file is from LlamaCloud + file_name = f"{pipeline_id}${file_name}" + return f"{url_prefix}/output/llamacloud/{file_name}" + is_private = metadata.get("private", "false") == "true" + if is_private: + # file is a private upload + return f"{url_prefix}/output/uploaded/{file_name}" + # file is from calling the 'generate' script + # Get the relative path of file_path to data_dir + file_path = metadata.get("file_path") + data_dir = os.path.abspath(DATA_DIR) + if file_path and data_dir: + relative_path = os.path.relpath(file_path, data_dir) + return f"{url_prefix}/data/{relative_path}" + # fallback to URL in metadata (e.g. 
for websites) + return metadata.get("URL") + + @classmethod + def from_source_nodes(cls, source_nodes: List[NodeWithScore]): + return [cls.from_source_node(node) for node in source_nodes] diff --git a/templates/types/reflex/app/services/file.py b/templates/types/reflex/app/services/file.py new file mode 100644 index 000000000..3fc1a64f1 --- /dev/null +++ b/templates/types/reflex/app/services/file.py @@ -0,0 +1,281 @@ +import base64 +import logging +import mimetypes +import os +import re +import uuid +from io import BytesIO +from pathlib import Path +from typing import List, Optional, Tuple + +from llama_index.core import VectorStoreIndex +from llama_index.core.ingestion import IngestionPipeline +from llama_index.core.readers.file.base import ( + _try_loading_included_file_formats as get_file_loaders_map, +) +from llama_index.core.schema import Document +from llama_index.indices.managed.llama_cloud.base import LlamaCloudIndex +from llama_index.readers.file import FlatReader +from pydantic import BaseModel, Field + +logger = logging.getLogger(__name__) + +PRIVATE_STORE_PATH = str(Path("output", "uploaded")) +TOOL_STORE_PATH = str(Path("output", "tools")) +LLAMA_CLOUD_STORE_PATH = str(Path("output", "llamacloud")) + + +class DocumentFile(BaseModel): + id: str + name: str # Stored file name + type: str = None + size: int = None + url: str = None + path: Optional[str] = Field( + None, + description="The stored file path. Used internally in the server.", + exclude=True, + ) + refs: Optional[List[str]] = Field( + None, description="The document ids in the index." + ) + + +class FileService: + """ + To store the files uploaded by the user and add them to the index. + """ + + @classmethod + def process_private_file( + cls, + file_name: str, + base64_content: str, + params: Optional[dict] = None, + ) -> DocumentFile: + """ + Store the uploaded file and index it if necessary. 
+ """ + try: + from app.engine.index import IndexConfig, get_index + except ImportError as e: + raise ValueError("IndexConfig or get_index is not found") from e + + if params is None: + params = {} + + # Add the nodes to the index and persist it + index_config = IndexConfig(**params) + index = get_index(index_config) + + # Preprocess and store the file + file_data, extension = cls._preprocess_base64_file(base64_content) + + document_file = cls.save_file( + file_data, + file_name=file_name, + save_dir=PRIVATE_STORE_PATH, + ) + + # Don't index csv files (they are handled by tools) + if extension == "csv": + return document_file + else: + # Insert the file into the index and update document ids to the file metadata + if isinstance(index, LlamaCloudIndex): + doc_id = cls._add_file_to_llama_cloud_index( + index, document_file.name, file_data + ) + # Add document ids to the file metadata + document_file.refs = [doc_id] + else: + documents = cls._load_file_to_documents(document_file) + cls._add_documents_to_vector_store_index(documents, index) + # Add document ids to the file metadata + document_file.refs = [doc.doc_id for doc in documents] + + # Return the file metadata + return document_file + + @classmethod + def save_file( + cls, + content: bytes | str, + file_name: str, + save_dir: Optional[str] = None, + ) -> DocumentFile: + """ + Save the content to a file in the local file server (accessible via URL) + + Args: + content (bytes | str): The content to save, either bytes or string. + file_name (str): The original name of the file. + save_dir (Optional[str]): The relative path from the current working directory. Defaults to the `output/uploaded` directory. + Returns: + The metadata of the saved file. 
+ """ + if save_dir is None: + save_dir = os.path.join("output", "uploaded") + + file_id = str(uuid.uuid4()) + name, extension = os.path.splitext(file_name) + extension = extension.lstrip(".") + sanitized_name = _sanitize_file_name(name) + if extension == "": + raise ValueError("File is not supported!") + new_file_name = f"{sanitized_name}_{file_id}.{extension}" + + file_path = os.path.join(save_dir, new_file_name) + + if isinstance(content, str): + content = content.encode() + + try: + os.makedirs(os.path.dirname(file_path), exist_ok=True) + with open(file_path, "wb") as file: + file.write(content) + except PermissionError as e: + logger.error( + f"Permission denied when writing to file {file_path}: {str(e)}" + ) + raise + except IOError as e: + logger.error( + f"IO error occurred when writing to file {file_path}: {str(e)}" + ) + raise + except Exception as e: + logger.error(f"Unexpected error when writing to file {file_path}: {str(e)}") + raise + + logger.info(f"Saved file to {file_path}") + + file_url_prefix = os.getenv("FILESERVER_URL_PREFIX") + if file_url_prefix is None: + logger.warning( + "FILESERVER_URL_PREFIX is not set, fallback to http://localhost:8000/api/files" + ) + file_url_prefix = "http://localhost:8000/api/files" + file_size = os.path.getsize(file_path) + + file_url = os.path.join( + file_url_prefix, + save_dir, + new_file_name, + ) + + return DocumentFile( + id=file_id, + name=new_file_name, + type=extension, + size=file_size, + path=file_path, + url=file_url, + refs=None, + ) + + @staticmethod + def _preprocess_base64_file(base64_content: str) -> Tuple[bytes, str | None]: + header, data = base64_content.split(",", 1) + mime_type = header.split(";")[0].split(":", 1)[1] + extension = mimetypes.guess_extension(mime_type).lstrip(".") + # File data as bytes + return base64.b64decode(data), extension + + @staticmethod + def _load_file_to_documents(file: DocumentFile) -> List[Document]: + """ + Load the file from the private directory and return the 
documents + """ + _, extension = os.path.splitext(file.name) + extension = extension.lstrip(".") + + # Load file to documents + # If LlamaParse is enabled, use it to parse the file + # Otherwise, use the default file loaders + reader = _get_llamaparse_parser() + if reader is None: + reader_cls = _default_file_loaders_map().get(f".{extension}") + if reader_cls is None: + raise ValueError(f"File extension {extension} is not supported") + reader = reader_cls() + if file.path is None: + raise ValueError("Document file path is not set") + documents = reader.load_data(Path(file.path)) + # Add custom metadata + for doc in documents: + doc.metadata["file_name"] = file.name + doc.metadata["private"] = "true" + return documents + + @staticmethod + def _add_documents_to_vector_store_index( + documents: List[Document], index: VectorStoreIndex + ) -> None: + """ + Add the documents to the vector store index + """ + pipeline = IngestionPipeline() + nodes = pipeline.run(documents=documents) + + # Add the nodes to the index and persist it + if index is None: + index = VectorStoreIndex(nodes=nodes) + else: + index.insert_nodes(nodes=nodes) + index.storage_context.persist( + persist_dir=os.environ.get("STORAGE_DIR", "storage") + ) + + @staticmethod + def _add_file_to_llama_cloud_index( + index: LlamaCloudIndex, + file_name: str, + file_data: bytes, + ) -> str: + """ + Add the file to the LlamaCloud index. + LlamaCloudIndex is a managed index so we can directly use the files. 
+ """ + try: + from app.engine.service import LLamaCloudFileService # type: ignore + except ImportError as e: + raise ValueError("LlamaCloudFileService is not found") from e + + # LlamaCloudIndex is a managed index so we can directly use the files + upload_file = (file_name, BytesIO(file_data)) + doc_id = LLamaCloudFileService.add_file_to_pipeline( + index.project.id, + index.pipeline.id, + upload_file, + custom_metadata={}, + wait_for_processing=True, + ) + return doc_id + + +def _sanitize_file_name(file_name: str) -> str: + """ + Sanitize the file name by replacing all non-alphanumeric characters with underscores + """ + sanitized_name = re.sub(r"[^a-zA-Z0-9.]", "_", file_name) + return sanitized_name + + +def _get_llamaparse_parser(): + from app.engine.loaders import load_configs + from app.engine.loaders.file import FileLoaderConfig, llama_parse_parser + + config = load_configs() + file_loader_config = FileLoaderConfig(**config["file"]) + if file_loader_config.use_llama_parse: + return llama_parse_parser() + else: + return None + + +def _default_file_loaders_map(): + default_loaders = get_file_loaders_map() + default_loaders[".txt"] = FlatReader + default_loaders[".csv"] = FlatReader + return default_loaders