Skip to content

Commit

Permalink
feat: es init config with client
Browse files Browse the repository at this point in the history
  • Loading branch information
N3kox committed Jan 20, 2025
1 parent 3b3e0ec commit bb62376
Show file tree
Hide file tree
Showing 23 changed files with 12,798 additions and 51 deletions.
10,631 changes: 10,631 additions & 0 deletions components/indexer/es8/examples/embeddings.json

Large diffs are not rendered by default.

150 changes: 150 additions & 0 deletions components/indexer/es8/examples/indexer/add_documents.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
package main

import (
"context"
"encoding/json"
"fmt"
"os"
"strconv"
"strings"

"github.com/cloudwego/eino-ext/components/indexer/es8"
"github.com/cloudwego/eino/components/embedding"
"github.com/cloudwego/eino/schema"
"github.com/elastic/go-elasticsearch/v8"
)

// Names shared by the indexer configuration in main and the index mapping
// built by createIndex.
const (
// indexName is the Elasticsearch index this example creates and writes to.
indexName = "eino_example"
// fieldContent is the index field that receives the document text.
fieldContent = "content"
// fieldContentVector is the dense_vector index field that receives the
// embedding of the document text (used as the EmbedKey of fieldContent).
fieldContentVector = "content_vector"
// fieldExtraLocation is the index field that receives the document's location.
fieldExtraLocation = "location"
// docExtraLocation is the schema.Document MetaData key under which the
// location is kept before indexing.
docExtraLocation = "location"
)

// main connects to a local Elasticsearch node, creates the example index,
// and stores the sample documents through the es8 indexer component. Dense
// vectors are served from a local file by mockEmbedding rather than computed
// by a real embedding model. Any failure aborts the program via panic.
func main() {
	ctx := context.Background()

	// Credentials and the CA certificate location are taken from the
	// environment; es supports other ways to connect as well.
	esUser := os.Getenv("ES_USERNAME")
	esPass := os.Getenv("ES_PASSWORD")
	caCertPath := os.Getenv("ES_HTTP_CA_CERT_PATH")

	caCert, err := os.ReadFile(caCertPath)
	if err != nil {
		panic(err)
	}

	esConfig := elasticsearch.Config{
		Addresses: []string{"https://localhost:9200"},
		Username:  esUser,
		Password:  esPass,
		CACert:    caCert,
	}
	client, err := elasticsearch.NewClient(esConfig)
	if err != nil {
		panic(err)
	}

	// Create the index if needed.
	// Comment this call out once the index has been created.
	if err := createIndex(ctx, client); err != nil {
		panic(err)
	}

	// Load the precomputed embeddings from the local file.
	localEmb, err := prepareEmbeddings()
	if err != nil {
		panic(err)
	}

	// Build the sample documents.
	sampleDocs := prepareDocs()

	// toFields maps a document onto the index fields written for it.
	toFields := func(ctx context.Context, doc *schema.Document) (map[string]es8.FieldValue, error) {
		fields := map[string]es8.FieldValue{
			fieldContent: {
				Value: doc.Content,
				// Vectorize the content and save the vector to field "content_vector".
				EmbedKey: fieldContentVector,
			},
			fieldExtraLocation: {
				Value: doc.MetaData[docExtraLocation],
			},
		}
		return fields, nil
	}

	// Create the es indexer component.
	indexerConfig := &es8.IndexerConfig{
		Client:           client,
		Index:            indexName,
		BatchSize:        10,
		DocumentToFields: toFields,
		Embedding:        &mockEmbedding{dense: localEmb.Dense}, // replace it with a real embedding component
	}
	indexer, err := es8.NewIndexer(ctx, indexerConfig)
	if err != nil {
		panic(err)
	}

	storedIDs, err := indexer.Store(ctx, sampleDocs)
	if err != nil {
		panic(err)
	}

	fmt.Println(storedIDs) // [1 2 3 4 5 6 7 8 9 10]
}

// localEmbeddings is the decoded form of the local embeddings JSON file.
type localEmbeddings struct {
	// Dense holds the dense vectors read from the "dense" key.
	Dense [][]float64 `json:"dense"`
	// Sparse holds the sparse vectors read from the "sparse" key, each one a
	// map from integer index to weight.
	Sparse []map[int]float64 `json:"sparse"`
}

// prepareEmbeddings reads ./examples/embeddings.json (resolved against the
// current working directory) and decodes it into a localEmbeddings value.
//
// It returns a nil result together with a wrapped error when the file cannot
// be read or does not contain valid JSON; the underlying error remains
// reachable through errors.Is / errors.As.
func prepareEmbeddings() (*localEmbeddings, error) {
	const embeddingsPath = "./examples/embeddings.json"

	raw, err := os.ReadFile(embeddingsPath)
	if err != nil {
		// The *PathError from os.ReadFile already names the path.
		return nil, fmt.Errorf("read embeddings file: %w", err)
	}

	le := &localEmbeddings{}
	if err = json.Unmarshal(raw, le); err != nil {
		// JSON errors carry no file name, so add it here.
		return nil, fmt.Errorf("decode embeddings file %q: %w", embeddingsPath, err)
	}

	return le, nil
}

// prepareDocs builds the sample documents for the example: one document per
// line of the landmark list below. IDs are the 1-based line numbers rendered
// as decimal strings, and each document carries the location at the same
// index of the locations list in its metadata under docExtraLocation.
func prepareDocs() []*schema.Document {
	const landmarks = `1. Eiffel Tower: Located in Paris, France, it is one of the most famous landmarks in the world, designed by Gustave Eiffel and built in 1889.
2. The Great Wall: Located in China, it is one of the Seven Wonders of the World, built from the Qin Dynasty to the Ming Dynasty, with a total length of over 20000 kilometers.
3. Grand Canyon National Park: Located in Arizona, USA, it is famous for its deep canyons and magnificent scenery, which are cut by the Colorado River.
4. The Colosseum: Located in Rome, Italy, built between 70-80 AD, it was the largest circular arena in the ancient Roman Empire.
5. Taj Mahal: Located in Agra, India, it was completed by Mughal Emperor Shah Jahan in 1653 to commemorate his wife and is one of the New Seven Wonders of the World.
6. Sydney Opera House: Located in Sydney Harbour, Australia, it is one of the most iconic buildings of the 20th century, renowned for its unique sailboat design.
7. Louvre Museum: Located in Paris, France, it is one of the largest museums in the world with a rich collection, including Leonardo da Vinci's Mona Lisa and Greece's Venus de Milo.
8. Niagara Falls: located at the border of the United States and Canada, consisting of three main waterfalls, its spectacular scenery attracts millions of tourists every year.
9. St. Sophia Cathedral: located in Istanbul, Türkiye, originally built in 537 A.D., it used to be an Orthodox cathedral and mosque, and now it is a museum.
10. Machu Picchu: an ancient Inca site located on the plateau of the Andes Mountains in Peru, one of the New Seven Wonders of the World, with an altitude of over 2400 meters.`

	// places is index-aligned with the lines of landmarks.
	places := []string{"France", "China", "USA", "Italy", "India", "Australia", "France", "Border of the United States and Canada", "Turkey", "Peru"}

	lines := strings.Split(landmarks, "\n")
	result := make([]*schema.Document, 0, len(lines))
	for idx := range lines {
		meta := map[string]any{
			docExtraLocation: places[idx],
		}
		result = append(result, &schema.Document{
			ID:       strconv.Itoa(idx + 1),
			Content:  lines[idx],
			MetaData: meta,
		})
	}

	return result
}

// of returns a pointer to a newly allocated copy of v. It is a convenience
// for filling pointer-typed struct fields from literals.
func of[T any](v T) *T {
	ptr := new(T)
	*ptr = v
	return ptr
}

// mockEmbedding is a stand-in embedding component for the example: it serves
// a fixed set of precomputed dense vectors (the example data is expected to
// have 1024 dimensions, matching the index mapping).
type mockEmbedding struct {
	dense [][]float64
}

// EmbedStrings ignores ctx, texts and opts and always returns the full set of
// stored vectors with a nil error.
func (e mockEmbedding) EmbedStrings(ctx context.Context, texts []string, opts ...embedding.Option) ([][]float64, error) {
	vectors := e.dense
	return vectors, nil
}
28 changes: 28 additions & 0 deletions components/indexer/es8/examples/indexer/create_index.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package main

import (
"context"

"github.com/elastic/go-elasticsearch/v8"
"github.com/elastic/go-elasticsearch/v8/typedapi/indices/create"
"github.com/elastic/go-elasticsearch/v8/typedapi/types"
)

// createIndex creates the index used by the example in add_documents.go.
// The mapping declares the content and location fields as text and the
// content vector field as an indexed dense vector with cosine similarity.
// It returns whatever error the create request produces.
func createIndex(ctx context.Context, client *elasticsearch.Client) error {
	vectorProperty := &types.DenseVectorProperty{
		Dims:       of(1024), // same as embedding dimensions
		Index:      of(true),
		Similarity: of("cosine"),
	}

	mapping := &types.TypeMapping{
		Properties: map[string]types.Property{
			fieldContent:       types.NewTextProperty(),
			fieldExtraLocation: types.NewTextProperty(),
			fieldContentVector: vectorProperty,
		},
	}

	request := &create.Request{Mappings: mapping}
	if _, err := create.NewCreateFunc(client)(indexName).Request(request).Do(ctx); err != nil {
		return err
	}

	return nil
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
package main

import (
"context"
"encoding/json"
"fmt"
"os"
"strconv"
"strings"

"github.com/cloudwego/eino-ext/components/indexer/es8"
"github.com/cloudwego/eino/components/embedding"
"github.com/cloudwego/eino/schema"
"github.com/elastic/go-elasticsearch/v8"
)

// Names shared by the indexer configuration in main and the index mapping
// built by createIndex.
const (
// indexName is the Elasticsearch index this example creates and writes to.
indexName = "eino_example_sparse"
// fieldContent is the index field that receives the document text.
fieldContent = "content"
// fieldContentDenseVector is the dense_vector index field that receives the
// embedding of the document text (used as the EmbedKey of fieldContent).
fieldContentDenseVector = "content_dense_vector"
// fieldContentSparseVector is the sparse_vector index field that receives
// the document's sparse vector.
fieldContentSparseVector = "content_sparse_vector"
// fieldExtraLocation is the index field that receives the document's location.
fieldExtraLocation = "location"
// docExtraLocation is the schema.Document MetaData key under which the
// location is kept before indexing.
docExtraLocation = "location"
)

// main connects to a local Elasticsearch node, creates the example index,
// and stores the sample documents through the es8 indexer component. Each
// document is written with its text, a dense vector served from a local file
// by mockEmbedding, the sparse vector attached to the document, and its
// location. Any failure aborts the program via panic.
func main() {
	ctx := context.Background()

	// Credentials and the CA certificate location are taken from the
	// environment; es supports other ways to connect as well.
	esUser := os.Getenv("ES_USERNAME")
	esPass := os.Getenv("ES_PASSWORD")
	caCertPath := os.Getenv("ES_HTTP_CA_CERT_PATH")

	caCert, err := os.ReadFile(caCertPath)
	if err != nil {
		panic(err)
	}

	esConfig := elasticsearch.Config{
		Addresses: []string{"https://localhost:9200"},
		Username:  esUser,
		Password:  esPass,
		CACert:    caCert,
	}
	client, err := elasticsearch.NewClient(esConfig)
	if err != nil {
		panic(err)
	}

	// Create the index if needed.
	// Comment this call out once the index has been created.
	if err := createIndex(ctx, client); err != nil {
		panic(err)
	}

	// Load the precomputed embeddings from the local file.
	localEmb, err := prepareEmbeddings()
	if err != nil {
		panic(err)
	}

	// Build the sample documents and attach their sparse vectors.
	sampleDocs := prepareDocs(localEmb)

	// toFields maps a document onto the index fields written for it.
	toFields := func(ctx context.Context, doc *schema.Document) (map[string]es8.FieldValue, error) {
		fields := map[string]es8.FieldValue{
			fieldContent: {
				Value: doc.Content,
				// Vectorize the content and save the vector to field "content_dense_vector".
				EmbedKey: fieldContentDenseVector,
			},
			fieldContentSparseVector: {
				// The sparse vector was attached to the document by prepareDocs.
				Value: doc.SparseVector(),
			},
			fieldExtraLocation: {
				Value: doc.MetaData[docExtraLocation],
			},
		}
		return fields, nil
	}

	// Create the es indexer component.
	indexerConfig := &es8.IndexerConfig{
		Client:           client,
		Index:            indexName,
		BatchSize:        10,
		DocumentToFields: toFields,
		Embedding:        &mockEmbedding{dense: localEmb.Dense}, // replace it with a real embedding component
	}
	indexer, err := es8.NewIndexer(ctx, indexerConfig)
	if err != nil {
		panic(err)
	}

	storedIDs, err := indexer.Store(ctx, sampleDocs)
	if err != nil {
		panic(err)
	}

	fmt.Println(storedIDs) // [1 2 3 4 5 6 7 8 9 10]
}

// localEmbeddings is the decoded form of the local embeddings JSON file.
type localEmbeddings struct {
	// Dense holds the dense vectors read from the "dense" key.
	Dense [][]float64 `json:"dense"`
	// Sparse holds the sparse vectors read from the "sparse" key, each one a
	// map from integer index to weight.
	Sparse []map[int]float64 `json:"sparse"`
}

// prepareEmbeddings reads ./examples/embeddings.json (resolved against the
// current working directory) and decodes it into a localEmbeddings value.
//
// It returns a nil result together with a wrapped error when the file cannot
// be read or does not contain valid JSON; the underlying error remains
// reachable through errors.Is / errors.As.
func prepareEmbeddings() (*localEmbeddings, error) {
	const embeddingsPath = "./examples/embeddings.json"

	raw, err := os.ReadFile(embeddingsPath)
	if err != nil {
		// The *PathError from os.ReadFile already names the path.
		return nil, fmt.Errorf("read embeddings file: %w", err)
	}

	le := &localEmbeddings{}
	if err = json.Unmarshal(raw, le); err != nil {
		// JSON errors carry no file name, so add it here.
		return nil, fmt.Errorf("decode embeddings file %q: %w", embeddingsPath, err)
	}

	return le, nil
}

// prepareDocs builds the sample documents for the example: one document per
// line of the landmark list below. IDs are the 1-based line numbers rendered
// as decimal strings, each document carries the location at the same index
// of the locations list in its metadata under docExtraLocation, and the
// sparse vector at the same index of emb.Sparse is attached to it.
func prepareDocs(emb *localEmbeddings) []*schema.Document {
	const landmarks = `1. Eiffel Tower: Located in Paris, France, it is one of the most famous landmarks in the world, designed by Gustave Eiffel and built in 1889.
2. The Great Wall: Located in China, it is one of the Seven Wonders of the World, built from the Qin Dynasty to the Ming Dynasty, with a total length of over 20000 kilometers.
3. Grand Canyon National Park: Located in Arizona, USA, it is famous for its deep canyons and magnificent scenery, which are cut by the Colorado River.
4. The Colosseum: Located in Rome, Italy, built between 70-80 AD, it was the largest circular arena in the ancient Roman Empire.
5. Taj Mahal: Located in Agra, India, it was completed by Mughal Emperor Shah Jahan in 1653 to commemorate his wife and is one of the New Seven Wonders of the World.
6. Sydney Opera House: Located in Sydney Harbour, Australia, it is one of the most iconic buildings of the 20th century, renowned for its unique sailboat design.
7. Louvre Museum: Located in Paris, France, it is one of the largest museums in the world with a rich collection, including Leonardo da Vinci's Mona Lisa and Greece's Venus de Milo.
8. Niagara Falls: located at the border of the United States and Canada, consisting of three main waterfalls, its spectacular scenery attracts millions of tourists every year.
9. St. Sophia Cathedral: located in Istanbul, Türkiye, originally built in 537 A.D., it used to be an Orthodox cathedral and mosque, and now it is a museum.
10. Machu Picchu: an ancient Inca site located on the plateau of the Andes Mountains in Peru, one of the New Seven Wonders of the World, with an altitude of over 2400 meters.`

	// places is index-aligned with the lines of landmarks.
	places := []string{"France", "China", "USA", "Italy", "India", "Australia", "France", "Border of the United States and Canada", "Turkey", "Peru"}

	lines := strings.Split(landmarks, "\n")
	result := make([]*schema.Document, 0, len(lines))
	for idx := range lines {
		meta := map[string]any{
			docExtraLocation: places[idx],
		}
		entry := &schema.Document{
			ID:       strconv.Itoa(idx + 1),
			Content:  lines[idx],
			MetaData: meta,
		}
		// Attach the precomputed sparse vector for this line to the document.
		entry.WithSparseVector(emb.Sparse[idx])
		result = append(result, entry)
	}

	return result
}

// of returns a pointer to a newly allocated copy of v. It is a convenience
// for filling pointer-typed struct fields from literals.
func of[T any](v T) *T {
	ptr := new(T)
	*ptr = v
	return ptr
}

// mockEmbedding is a stand-in embedding component for the example: it serves
// a fixed set of precomputed dense vectors (the example data is expected to
// have 1024 dimensions, matching the index mapping).
type mockEmbedding struct {
	dense [][]float64
}

// EmbedStrings ignores ctx, texts and opts and always returns the full set of
// stored vectors with a nil error.
func (e mockEmbedding) EmbedStrings(ctx context.Context, texts []string, opts ...embedding.Option) ([][]float64, error) {
	vectors := e.dense
	return vectors, nil
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package main

import (
"context"

"github.com/elastic/go-elasticsearch/v8"
"github.com/elastic/go-elasticsearch/v8/typedapi/indices/create"
"github.com/elastic/go-elasticsearch/v8/typedapi/types"
)

// createIndex creates the index used by the example in add_documents.go.
// The mapping declares the content and location fields as text, the dense
// vector field as an indexed dense vector with cosine similarity, and the
// sparse vector field as a sparse vector. It returns whatever error the
// create request produces.
func createIndex(ctx context.Context, client *elasticsearch.Client) error {
	denseProperty := &types.DenseVectorProperty{
		Dims:       of(1024), // same as embedding dimensions
		Index:      of(true),
		Similarity: of("cosine"),
	}

	mapping := &types.TypeMapping{
		Properties: map[string]types.Property{
			fieldContent:             types.NewTextProperty(),
			fieldExtraLocation:       types.NewTextProperty(),
			fieldContentDenseVector:  denseProperty,
			fieldContentSparseVector: &types.SparseVectorProperty{},
		},
	}

	request := &create.Request{Mappings: mapping}
	if _, err := create.NewCreateFunc(client)(indexName).Request(request).Do(ctx); err != nil {
		return err
	}

	return nil
}
Loading

0 comments on commit bb62376

Please sign in to comment.