-
Notifications
You must be signed in to change notification settings - Fork 74
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add data ingestion example with Tika
- Loading branch information
1 parent
f79db4b
commit 17860c2
Showing
13 changed files
with
323 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
# Tika Document Readers: Ollama | ||
|
||
Reading and vectorizing documents with LLMs and Tika via Ollama. | ||
|
||
## Running the application | ||
|
||
The application relies on Ollama for providing LLMs. You can either run Ollama locally on your laptop, or rely on the Testcontainers support in Spring Boot to spin up an Ollama service automatically. | ||
|
||
### Ollama as a native application | ||
|
||
First, make sure you have [Ollama](https://ollama.ai) installed on your laptop. | ||
Then, use Ollama to run the _mistral_ large language model. | ||
|
||
```shell | ||
ollama run mistral | ||
``` | ||
|
||
Finally, run the Spring Boot application. | ||
|
||
```shell | ||
./gradlew bootRun | ||
``` | ||
|
||
### Ollama as a dev service with Testcontainers | ||
|
||
The application relies on the native Testcontainers support in Spring Boot to spin up an Ollama service with a _mistral_ model at startup time. | ||
|
||
```shell | ||
./gradlew bootTestRun | ||
``` | ||
|
||
## Calling the application | ||
|
||
You can now call the application, which uses Ollama and _mistral_ to load the documents as embeddings and to answer your questions based on those documents (the RAG pattern). | ||
This example uses [httpie](https://httpie.io) to send HTTP requests. | ||
|
||
```shell | ||
http --raw "What is Iorek's biggest dream?" :8080/chat/doc | ||
``` | ||
|
||
```shell | ||
http --raw "Who is Lucio?" :8080/chat/doc | ||
``` |
42 changes: 42 additions & 0 deletions
42
07-data-ingestion/document-readers-tika-ollama/build.gradle
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
// Build script for the Tika + Ollama document reader example.
// NOTE(review): plugin versions and `springAiVersion` are not declared in this file;
// presumably they are managed by the root build / gradle.properties - confirm.
plugins {
    id 'java'
    id 'org.springframework.boot'
    id 'io.spring.dependency-management'
    id 'org.graalvm.buildtools.native'
}

group = 'com.thomasvitale'
version = '0.0.1-SNAPSHOT'

// Compile and run with a Java 23 toolchain.
java {
    toolchain {
        languageVersion = JavaLanguageVersion.of(23)
    }
}

repositories {
    mavenCentral()
    // Spring milestone and snapshot repositories are needed in addition to Maven Central.
    maven { url 'https://repo.spring.io/milestone' }
    maven { url 'https://repo.spring.io/snapshot' }
}

dependencies {
    // The Spring AI BOM aligns the versions of all Spring AI modules below.
    implementation platform("org.springframework.ai:spring-ai-bom:${springAiVersion}")

    // Application: web endpoint, Ollama integration, Tika-based document reader.
    implementation 'org.springframework.boot:spring-boot-starter-web'
    implementation 'org.springframework.ai:spring-ai-ollama-spring-boot-starter'
    implementation 'org.springframework.ai:spring-ai-tika-document-reader'

    // Developer tooling, available to tests and development-time runs only.
    testAndDevelopmentOnly 'org.springframework.boot:spring-boot-devtools'

    // Testing, including Testcontainers support for an Ollama service.
    testImplementation 'org.springframework.boot:spring-boot-starter-test'
    testImplementation 'org.springframework.boot:spring-boot-starter-webflux'
    testImplementation 'org.springframework.boot:spring-boot-testcontainers'
    testImplementation 'org.springframework.ai:spring-ai-spring-boot-testcontainers'
    testImplementation 'org.testcontainers:junit-jupiter'
    testImplementation 'org.testcontainers:ollama'
}

// Run tests on the JUnit Platform.
tasks.named('test') { useJUnitPlatform() }
21 changes: 21 additions & 0 deletions
21
...document-readers-tika-ollama/src/main/java/com/thomasvitale/ai/spring/ChatController.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
package com.thomasvitale.ai.spring; | ||
|
||
import org.springframework.web.bind.annotation.PostMapping; | ||
import org.springframework.web.bind.annotation.RequestBody; | ||
import org.springframework.web.bind.annotation.RestController; | ||
|
||
@RestController | ||
class ChatController { | ||
|
||
private final ChatService chatService; | ||
|
||
ChatController(ChatService chatService) { | ||
this.chatService = chatService; | ||
} | ||
|
||
@PostMapping("/chat/doc") | ||
String chatWithDocument(@RequestBody String input) { | ||
return chatService.chatWithDocument(input); | ||
} | ||
|
||
} |
47 changes: 47 additions & 0 deletions
47
...on/document-readers-tika-ollama/src/main/java/com/thomasvitale/ai/spring/ChatService.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
package com.thomasvitale.ai.spring; | ||
|
||
import org.springframework.ai.chat.client.ChatClient; | ||
import org.springframework.ai.document.Document; | ||
import org.springframework.ai.vectorstore.SearchRequest; | ||
import org.springframework.ai.vectorstore.VectorStore; | ||
import org.springframework.stereotype.Service; | ||
|
||
import java.util.List; | ||
import java.util.stream.Collectors; | ||
|
||
@Service | ||
class ChatService { | ||
|
||
private final ChatClient chatClient; | ||
private final VectorStore vectorStore; | ||
|
||
ChatService(ChatClient.Builder chatClientBuilder, VectorStore vectorStore) { | ||
this.chatClient = chatClientBuilder.build(); | ||
this.vectorStore = vectorStore; | ||
} | ||
|
||
String chatWithDocument(String message) { | ||
var systemPromptTemplate = """ | ||
You are a helpful assistant, conversing with a user about the subjects contained in a set of documents. | ||
Use the information from the DOCUMENTS section to provide accurate answers. If unsure or if the answer | ||
isn't found in the DOCUMENTS section, simply state that you don't know the answer and do not mention | ||
the DOCUMENTS section. | ||
DOCUMENTS: | ||
{documents} | ||
"""; | ||
|
||
List<Document> similarDocuments = vectorStore.similaritySearch(SearchRequest.query(message).withTopK(5)); | ||
String content = similarDocuments.stream().map(Document::getContent).collect(Collectors.joining(System.lineSeparator())); | ||
|
||
return chatClient.prompt() | ||
.system(systemSpec -> systemSpec | ||
.text(systemPromptTemplate) | ||
.param("documents", content) | ||
) | ||
.user(message) | ||
.call() | ||
.content(); | ||
} | ||
|
||
} |
47 changes: 47 additions & 0 deletions
47
...ent-readers-tika-ollama/src/main/java/com/thomasvitale/ai/spring/DocumentEtlPipeline.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
package com.thomasvitale.ai.spring; | ||
|
||
import jakarta.annotation.PostConstruct; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
import org.springframework.ai.document.Document; | ||
import org.springframework.ai.reader.tika.TikaDocumentReader; | ||
import org.springframework.ai.vectorstore.VectorStore; | ||
import org.springframework.beans.factory.annotation.Value; | ||
import org.springframework.core.io.Resource; | ||
import org.springframework.stereotype.Component; | ||
|
||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
@Component | ||
public class DocumentEtlPipeline { | ||
|
||
private static final Logger logger = LoggerFactory.getLogger(DocumentEtlPipeline.class); | ||
private final VectorStore vectorStore; | ||
|
||
@Value("classpath:documents/story1.md") | ||
Resource file1; | ||
|
||
@Value("classpath:documents/story2.pdf") | ||
Resource file2; | ||
|
||
public DocumentEtlPipeline(VectorStore vectorStore) { | ||
this.vectorStore = vectorStore; | ||
} | ||
|
||
@PostConstruct | ||
public void run() { | ||
List<Document> documents = new ArrayList<>(); | ||
|
||
logger.info("Loading files as Documents"); | ||
var tikaReader1 = new TikaDocumentReader(file1); | ||
documents.addAll(tikaReader1.get()); | ||
|
||
var tikaReader2 = new TikaDocumentReader(file2); | ||
documents.addAll(tikaReader2.get()); | ||
|
||
logger.info("Creating and storing Embeddings from Documents"); | ||
vectorStore.add(documents); | ||
} | ||
|
||
} |
22 changes: 22 additions & 0 deletions
22
...ollama/src/main/java/com/thomasvitale/ai/spring/DocumentReadersTikaOllamaApplication.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
package com.thomasvitale.ai.spring; | ||
|
||
import org.springframework.ai.embedding.EmbeddingModel; | ||
import org.springframework.ai.vectorstore.SimpleVectorStore; | ||
import org.springframework.ai.vectorstore.VectorStore; | ||
import org.springframework.boot.SpringApplication; | ||
import org.springframework.boot.autoconfigure.SpringBootApplication; | ||
import org.springframework.context.annotation.Bean; | ||
|
||
@SpringBootApplication | ||
public class DocumentReadersTikaOllamaApplication { | ||
|
||
public static void main(String[] args) { | ||
SpringApplication.run(DocumentReadersTikaOllamaApplication.class, args); | ||
} | ||
|
||
@Bean | ||
VectorStore vectorStore(EmbeddingModel embeddingModel) { | ||
return new SimpleVectorStore(embeddingModel); | ||
} | ||
|
||
} |
9 changes: 9 additions & 0 deletions
9
07-data-ingestion/document-readers-tika-ollama/src/main/resources/application.yml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
# Ollama settings: the chat model and the embedding model are both "mistral".
spring:
  ai:
    ollama:
      chat:
        options:
          model: mistral
      embedding:
        options:
          model: mistral
42 changes: 42 additions & 0 deletions
42
...a-ingestion/document-readers-tika-ollama/src/main/resources/documents/story1.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
# The Adventures of Iorek and Pingu | ||
|
||
Iorek was a little polar bear who lived in the Arctic circle. He loved to explore the snowy landscape and | ||
dreamt of one day going on an adventure around the North Pole. One day, he met a penguin named Pingu who | ||
was on a similar quest. They quickly became friends and decided to embark on their journey together. | ||
|
||
Iorek and Pingu set off early in the morning, eager to cover as much ground as possible before nightfall. | ||
The air was crisp and cold, and the snow crunched under their paws as they walked. They chatted excitedly | ||
about their dreams and aspirations, and Iorek told Pingu about his desire to see the Northern Lights. | ||
|
||
As they journeyed onward, they encountered a group of playful seals who were sliding and jumping in the | ||
snow. Iorek and Pingu watched in delight as the seals frolicked and splashed in the water. They even tried | ||
to join in, but their paws kept slipping and they ended up sliding on their stomachs instead. | ||
|
||
After a few hours of walking, Iorek and Pingu came across a cave hidden behind a wall of snow. They | ||
cautiously entered the darkness, their eyes adjusting to the dim light inside. The cave was filled with | ||
glittering ice formations that sparkled like diamonds in the flickering torchlight. | ||
|
||
As they continued their journey, Iorek and Pingu encountered a group of walruses who were lounging on the | ||
ice. They watched in amazement as the walruses lazily rolled over and exposed their tusks for a good | ||
scratch. Pingu even tried to imitate them, but ended up looking more like a clumsy seal than a walrus. | ||
|
||
As the sun began to set, Iorek and Pingu found themselves at the edge of a vast, frozen lake. They gazed | ||
out across the glassy surface, mesmerized by the way the ice glinted in the fading light. They could see | ||
the faint outline of a creature moving beneath the surface, and their hearts raced with excitement. | ||
|
||
Suddenly, a massive narwhal burst through the ice and into the air, its ivory tusk glistening in the | ||
sunset. Iorek and Pingu watched in awe as it soared overhead, its cries echoing across the lake. They felt | ||
as though they were witnessing a magical moment, one that would stay with them forever. | ||
|
||
As the night drew in, Iorek and Pingu settled down to rest in their makeshift camp. They huddled together | ||
for warmth, gazing up at the starry sky above. They chatted about all they had seen and experienced during | ||
their adventure, and Iorek couldn't help but feel grateful for the new friend he had made. | ||
|
||
The next morning, Iorek and Pingu set off once again, determined to explore every inch of the North Pole. | ||
They stumbled upon a hidden cave filled with glittering crystals that sparkled like diamonds in the | ||
sunlight. They marveled at their beauty before continuing on their way. | ||
|
||
As they journeyed onward, Iorek and Pingu encountered many more wonders and adventures. They met a group | ||
of playful reindeer who showed them how to pull sledges across the snow, and even caught a glimpse of the | ||
mythical Loch Ness Monster lurking beneath the icy waters. In the end, their adventure around the North | ||
Pole had been an unforgettable experience, one that they would treasure forever. |
Binary file added
BIN
+21.3 KB
07-data-ingestion/document-readers-tika-ollama/src/main/resources/documents/story2.pdf
Binary file not shown.
15 changes: 15 additions & 0 deletions
15
...a/src/test/java/com/thomasvitale/ai/spring/DocumentReadersTikaOllamaApplicationTests.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
package com.thomasvitale.ai.spring; | ||
|
||
import org.junit.jupiter.api.Disabled; | ||
import org.junit.jupiter.api.Test; | ||
import org.springframework.boot.test.context.SpringBootTest; | ||
|
||
@SpringBootTest | ||
@Disabled | ||
class DocumentReadersTikaOllamaApplicationTests { | ||
|
||
@Test | ||
void contextLoads() { | ||
} | ||
|
||
} |
26 changes: 26 additions & 0 deletions
26
...ma/src/test/java/com/thomasvitale/ai/spring/TestDocumentReadersTikaOllamaApplication.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
package com.thomasvitale.ai.spring; | ||
|
||
import org.springframework.boot.SpringApplication; | ||
import org.springframework.boot.devtools.restart.RestartScope; | ||
import org.springframework.boot.test.context.TestConfiguration; | ||
import org.springframework.boot.testcontainers.service.connection.ServiceConnection; | ||
import org.springframework.context.annotation.Bean; | ||
import org.testcontainers.ollama.OllamaContainer; | ||
import org.testcontainers.utility.DockerImageName; | ||
|
||
@TestConfiguration(proxyBeanMethods = false) | ||
public class TestDocumentReadersTikaOllamaApplication { | ||
|
||
@Bean | ||
@RestartScope | ||
@ServiceConnection | ||
OllamaContainer ollama() { | ||
return new OllamaContainer(DockerImageName.parse("ghcr.io/thomasvitale/ollama-mistral") | ||
.asCompatibleSubstituteFor("ollama/ollama")); | ||
} | ||
|
||
public static void main(String[] args) { | ||
SpringApplication.from(DocumentReadersTikaOllamaApplication::main).with(TestDocumentReadersTikaOllamaApplication.class).run(args); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters