generated from cloudwego/.github
-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
23 changed files
with
12,798 additions
and
51 deletions.
There are no files selected for viewing
10,631 changes: 10,631 additions & 0 deletions
10,631
components/indexer/es8/examples/embeddings.json
Large diffs are not rendered by default.
Oops, something went wrong.
150 changes: 150 additions & 0 deletions
150
components/indexer/es8/examples/indexer/add_documents.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
package main | ||
|
||
import ( | ||
"context" | ||
"encoding/json" | ||
"fmt" | ||
"os" | ||
"strconv" | ||
"strings" | ||
|
||
"github.com/cloudwego/eino-ext/components/indexer/es8" | ||
"github.com/cloudwego/eino/components/embedding" | ||
"github.com/cloudwego/eino/schema" | ||
"github.com/elastic/go-elasticsearch/v8" | ||
) | ||
|
||
// Names used throughout the example: the target index, the Elasticsearch
// fields written for every document, and the document metadata key read.
const (
	// indexName is the Elasticsearch index the example writes to.
	indexName = "eino_example"
	// fieldContent is the field that stores the document text.
	fieldContent = "content"
	// fieldContentVector is the field that stores the embedding of the document text.
	fieldContentVector = "content_vector"
	// fieldExtraLocation is the field that stores the document location.
	fieldExtraLocation = "location"
	// docExtraLocation is the document metadata key the location is read from.
	docExtraLocation = "location"
)
|
||
func main() { | ||
ctx := context.Background() | ||
|
||
// es supports multiple ways to connect | ||
username := os.Getenv("ES_USERNAME") | ||
password := os.Getenv("ES_PASSWORD") | ||
httpCACertPath := os.Getenv("ES_HTTP_CA_CERT_PATH") | ||
|
||
cert, err := os.ReadFile(httpCACertPath) | ||
if err != nil { | ||
panic(err) | ||
} | ||
|
||
client, err := elasticsearch.NewClient(elasticsearch.Config{ | ||
Addresses: []string{"https://localhost:9200"}, | ||
Username: username, | ||
Password: password, | ||
CACert: cert, | ||
}) | ||
if err != nil { | ||
panic(err) | ||
} | ||
|
||
// create index if needed. | ||
// comment out the code if index has been created. | ||
if err = createIndex(ctx, client); err != nil { | ||
panic(err) | ||
} | ||
|
||
// load embeddings from local | ||
emb, err := prepareEmbeddings() | ||
if err != nil { | ||
panic(err) | ||
} | ||
|
||
// load docs | ||
docs := prepareDocs() | ||
|
||
// create es indexer component | ||
indexer, err := es8.NewIndexer(ctx, &es8.IndexerConfig{ | ||
Client: client, | ||
Index: indexName, | ||
BatchSize: 10, | ||
DocumentToFields: func(ctx context.Context, doc *schema.Document) (field2Value map[string]es8.FieldValue, err error) { | ||
return map[string]es8.FieldValue{ | ||
fieldContent: { | ||
Value: doc.Content, | ||
EmbedKey: fieldContentVector, // vectorize doc content and save vector to field "content_vector" | ||
}, | ||
fieldExtraLocation: { | ||
Value: doc.MetaData[docExtraLocation], | ||
}, | ||
}, nil | ||
}, | ||
Embedding: &mockEmbedding{emb.Dense}, // replace it with real embedding component | ||
}) | ||
if err != nil { | ||
panic(err) | ||
} | ||
|
||
ids, err := indexer.Store(ctx, docs) | ||
if err != nil { | ||
panic(err) | ||
} | ||
|
||
fmt.Println(ids) // [1 2 3 4 5 6 7 8 9 10] | ||
} | ||
|
||
// localEmbeddings mirrors the layout of the example's embeddings.json file:
// the "dense" key holds a list of dense vectors and the "sparse" key holds a
// list of sparse vectors keyed by integer index.
type localEmbeddings struct {
	Dense  [][]float64       `json:"dense"`
	Sparse []map[int]float64 `json:"sparse"`
}

// prepareEmbeddings reads ./examples/embeddings.json (resolved against the
// current working directory) and decodes it into a localEmbeddings value.
//
// It returns a wrapped error when the file cannot be read or is not valid
// JSON for the expected layout; the underlying error stays reachable through
// errors.Is / errors.As.
func prepareEmbeddings() (*localEmbeddings, error) {
	const path = "./examples/embeddings.json"

	raw, err := os.ReadFile(path)
	if err != nil {
		return nil, fmt.Errorf("read embeddings file: %w", err)
	}

	var le localEmbeddings
	if err := json.Unmarshal(raw, &le); err != nil {
		// json errors carry no file name, so add it here.
		return nil, fmt.Errorf("decode embeddings file %q: %w", path, err)
	}

	return &le, nil
}
|
||
func prepareDocs() []*schema.Document { | ||
var docs []*schema.Document | ||
contents := `1. Eiffel Tower: Located in Paris, France, it is one of the most famous landmarks in the world, designed by Gustave Eiffel and built in 1889. | ||
2. The Great Wall: Located in China, it is one of the Seven Wonders of the World, built from the Qin Dynasty to the Ming Dynasty, with a total length of over 20000 kilometers. | ||
3. Grand Canyon National Park: Located in Arizona, USA, it is famous for its deep canyons and magnificent scenery, which are cut by the Colorado River. | ||
4. The Colosseum: Located in Rome, Italy, built between 70-80 AD, it was the largest circular arena in the ancient Roman Empire. | ||
5. Taj Mahal: Located in Agra, India, it was completed by Mughal Emperor Shah Jahan in 1653 to commemorate his wife and is one of the New Seven Wonders of the World. | ||
6. Sydney Opera House: Located in Sydney Harbour, Australia, it is one of the most iconic buildings of the 20th century, renowned for its unique sailboat design. | ||
7. Louvre Museum: Located in Paris, France, it is one of the largest museums in the world with a rich collection, including Leonardo da Vinci's Mona Lisa and Greece's Venus de Milo. | ||
8. Niagara Falls: located at the border of the United States and Canada, consisting of three main waterfalls, its spectacular scenery attracts millions of tourists every year. | ||
9. St. Sophia Cathedral: located in Istanbul, Türkiye, originally built in 537 A.D., it used to be an Orthodox cathedral and mosque, and now it is a museum. | ||
10. Machu Picchu: an ancient Inca site located on the plateau of the Andes Mountains in Peru, one of the New Seven Wonders of the World, with an altitude of over 2400 meters.` | ||
locations := []string{"France", "China", "USA", "Italy", "India", "Australia", "France", "Border of the United States and Canada", "Turkey", "Peru"} | ||
|
||
for i, content := range strings.Split(contents, "\n") { | ||
docs = append(docs, &schema.Document{ | ||
ID: strconv.FormatInt(int64(i+1), 10), | ||
Content: content, | ||
MetaData: map[string]any{ | ||
docExtraLocation: locations[i], | ||
}, | ||
}) | ||
} | ||
|
||
return docs | ||
} | ||
|
||
// of returns a pointer to a freshly allocated copy of v. It is handy for
// filling optional pointer fields from literals.
func of[T any](v T) *T {
	ptr := new(T)
	*ptr = v
	return ptr
}
|
||
// mockEmbedding returns embeddings with 1024 dimensions | ||
type mockEmbedding struct { | ||
dense [][]float64 | ||
} | ||
|
||
func (m mockEmbedding) EmbedStrings(ctx context.Context, texts []string, opts ...embedding.Option) ([][]float64, error) { | ||
return m.dense, nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
package main | ||
|
||
import ( | ||
"context" | ||
|
||
"github.com/elastic/go-elasticsearch/v8" | ||
"github.com/elastic/go-elasticsearch/v8/typedapi/indices/create" | ||
"github.com/elastic/go-elasticsearch/v8/typedapi/types" | ||
) | ||
|
||
// createIndex create index for example in add_documents.go. | ||
func createIndex(ctx context.Context, client *elasticsearch.Client) error { | ||
_, err := create.NewCreateFunc(client)(indexName).Request(&create.Request{ | ||
Mappings: &types.TypeMapping{ | ||
Properties: map[string]types.Property{ | ||
fieldContent: types.NewTextProperty(), | ||
fieldExtraLocation: types.NewTextProperty(), | ||
fieldContentVector: &types.DenseVectorProperty{ | ||
Dims: of(1024), // same as embedding dimensions | ||
Index: of(true), | ||
Similarity: of("cosine"), | ||
}, | ||
}, | ||
}, | ||
}).Do(ctx) | ||
|
||
return err | ||
} |
156 changes: 156 additions & 0 deletions
156
components/indexer/es8/examples/indexer_with_sparse_vector/add_documents.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,156 @@ | ||
package main | ||
|
||
import ( | ||
"context" | ||
"encoding/json" | ||
"fmt" | ||
"os" | ||
"strconv" | ||
"strings" | ||
|
||
"github.com/cloudwego/eino-ext/components/indexer/es8" | ||
"github.com/cloudwego/eino/components/embedding" | ||
"github.com/cloudwego/eino/schema" | ||
"github.com/elastic/go-elasticsearch/v8" | ||
) | ||
|
||
// Names used throughout the sparse-vector example: the target index, the
// Elasticsearch fields written for every document, and the document metadata
// key read.
const (
	// indexName is the Elasticsearch index the example writes to.
	indexName = "eino_example_sparse"
	// fieldContent is the field that stores the document text.
	fieldContent = "content"
	// fieldContentDenseVector is the field that stores the dense embedding of the document text.
	fieldContentDenseVector = "content_dense_vector"
	// fieldContentSparseVector is the field that stores the document's sparse vector.
	fieldContentSparseVector = "content_sparse_vector"
	// fieldExtraLocation is the field that stores the document location.
	fieldExtraLocation = "location"
	// docExtraLocation is the document metadata key the location is read from.
	docExtraLocation = "location"
)
|
||
func main() { | ||
ctx := context.Background() | ||
|
||
// es supports multiple ways to connect | ||
username := os.Getenv("ES_USERNAME") | ||
password := os.Getenv("ES_PASSWORD") | ||
httpCACertPath := os.Getenv("ES_HTTP_CA_CERT_PATH") | ||
|
||
cert, err := os.ReadFile(httpCACertPath) | ||
if err != nil { | ||
panic(err) | ||
} | ||
|
||
client, err := elasticsearch.NewClient(elasticsearch.Config{ | ||
Addresses: []string{"https://localhost:9200"}, | ||
Username: username, | ||
Password: password, | ||
CACert: cert, | ||
}) | ||
if err != nil { | ||
panic(err) | ||
} | ||
|
||
// create index if needed. | ||
// comment out the code if index has been created. | ||
if err = createIndex(ctx, client); err != nil { | ||
panic(err) | ||
} | ||
|
||
// load embeddings from local | ||
emb, err := prepareEmbeddings() | ||
if err != nil { | ||
panic(err) | ||
} | ||
|
||
// load docs, set sparse vector | ||
docs := prepareDocs(emb) | ||
|
||
// create es indexer component | ||
indexer, err := es8.NewIndexer(ctx, &es8.IndexerConfig{ | ||
Client: client, | ||
Index: indexName, | ||
BatchSize: 10, | ||
DocumentToFields: func(ctx context.Context, doc *schema.Document) (field2Value map[string]es8.FieldValue, err error) { | ||
return map[string]es8.FieldValue{ | ||
fieldContent: { | ||
Value: doc.Content, | ||
EmbedKey: fieldContentDenseVector, // vectorize doc content and save vector to field "content_vector" | ||
}, | ||
fieldContentSparseVector: { | ||
Value: doc.SparseVector(), // load sparse vector from doc metadata | ||
}, | ||
fieldExtraLocation: { | ||
Value: doc.MetaData[docExtraLocation], | ||
}, | ||
}, nil | ||
}, | ||
Embedding: &mockEmbedding{emb.Dense}, // replace it with real embedding component | ||
}) | ||
if err != nil { | ||
panic(err) | ||
} | ||
|
||
ids, err := indexer.Store(ctx, docs) | ||
if err != nil { | ||
panic(err) | ||
} | ||
|
||
fmt.Println(ids) // [1 2 3 4 5 6 7 8 9 10] | ||
} | ||
|
||
// localEmbeddings mirrors the layout of the example's embeddings.json file:
// the "dense" key holds a list of dense vectors and the "sparse" key holds a
// list of sparse vectors keyed by integer index.
type localEmbeddings struct {
	Dense  [][]float64       `json:"dense"`
	Sparse []map[int]float64 `json:"sparse"`
}

// prepareEmbeddings reads ./examples/embeddings.json (resolved against the
// current working directory) and decodes it into a localEmbeddings value.
//
// It returns a wrapped error when the file cannot be read or is not valid
// JSON for the expected layout; the underlying error stays reachable through
// errors.Is / errors.As.
func prepareEmbeddings() (*localEmbeddings, error) {
	const path = "./examples/embeddings.json"

	raw, err := os.ReadFile(path)
	if err != nil {
		return nil, fmt.Errorf("read embeddings file: %w", err)
	}

	var le localEmbeddings
	if err := json.Unmarshal(raw, &le); err != nil {
		// json errors carry no file name, so add it here.
		return nil, fmt.Errorf("decode embeddings file %q: %w", path, err)
	}

	return &le, nil
}
|
||
func prepareDocs(emb *localEmbeddings) []*schema.Document { | ||
var docs []*schema.Document | ||
contents := `1. Eiffel Tower: Located in Paris, France, it is one of the most famous landmarks in the world, designed by Gustave Eiffel and built in 1889. | ||
2. The Great Wall: Located in China, it is one of the Seven Wonders of the World, built from the Qin Dynasty to the Ming Dynasty, with a total length of over 20000 kilometers. | ||
3. Grand Canyon National Park: Located in Arizona, USA, it is famous for its deep canyons and magnificent scenery, which are cut by the Colorado River. | ||
4. The Colosseum: Located in Rome, Italy, built between 70-80 AD, it was the largest circular arena in the ancient Roman Empire. | ||
5. Taj Mahal: Located in Agra, India, it was completed by Mughal Emperor Shah Jahan in 1653 to commemorate his wife and is one of the New Seven Wonders of the World. | ||
6. Sydney Opera House: Located in Sydney Harbour, Australia, it is one of the most iconic buildings of the 20th century, renowned for its unique sailboat design. | ||
7. Louvre Museum: Located in Paris, France, it is one of the largest museums in the world with a rich collection, including Leonardo da Vinci's Mona Lisa and Greece's Venus de Milo. | ||
8. Niagara Falls: located at the border of the United States and Canada, consisting of three main waterfalls, its spectacular scenery attracts millions of tourists every year. | ||
9. St. Sophia Cathedral: located in Istanbul, Türkiye, originally built in 537 A.D., it used to be an Orthodox cathedral and mosque, and now it is a museum. | ||
10. Machu Picchu: an ancient Inca site located on the plateau of the Andes Mountains in Peru, one of the New Seven Wonders of the World, with an altitude of over 2400 meters.` | ||
locations := []string{"France", "China", "USA", "Italy", "India", "Australia", "France", "Border of the United States and Canada", "Turkey", "Peru"} | ||
|
||
for i, content := range strings.Split(contents, "\n") { | ||
doc := &schema.Document{ | ||
ID: strconv.FormatInt(int64(i+1), 10), | ||
Content: content, | ||
MetaData: map[string]any{ | ||
docExtraLocation: locations[i], | ||
}, | ||
} | ||
doc.WithSparseVector(emb.Sparse[i]) | ||
docs = append(docs, doc) | ||
} | ||
|
||
return docs | ||
} | ||
|
||
// of returns a pointer to a freshly allocated copy of v. It is handy for
// filling optional pointer fields from literals.
func of[T any](v T) *T {
	ptr := new(T)
	*ptr = v
	return ptr
}
|
||
// mockEmbedding returns embeddings with 1024 dimensions | ||
type mockEmbedding struct { | ||
dense [][]float64 | ||
} | ||
|
||
func (m mockEmbedding) EmbedStrings(ctx context.Context, texts []string, opts ...embedding.Option) ([][]float64, error) { | ||
return m.dense, nil | ||
} |
29 changes: 29 additions & 0 deletions
29
components/indexer/es8/examples/indexer_with_sparse_vector/create_index.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
package main | ||
|
||
import ( | ||
"context" | ||
|
||
"github.com/elastic/go-elasticsearch/v8" | ||
"github.com/elastic/go-elasticsearch/v8/typedapi/indices/create" | ||
"github.com/elastic/go-elasticsearch/v8/typedapi/types" | ||
) | ||
|
||
// createIndex create index for example in add_documents.go. | ||
func createIndex(ctx context.Context, client *elasticsearch.Client) error { | ||
_, err := create.NewCreateFunc(client)(indexName).Request(&create.Request{ | ||
Mappings: &types.TypeMapping{ | ||
Properties: map[string]types.Property{ | ||
fieldContent: types.NewTextProperty(), | ||
fieldExtraLocation: types.NewTextProperty(), | ||
fieldContentDenseVector: &types.DenseVectorProperty{ | ||
Dims: of(1024), // same as embedding dimensions | ||
Index: of(true), | ||
Similarity: of("cosine"), | ||
}, | ||
fieldContentSparseVector: &types.SparseVectorProperty{}, | ||
}, | ||
}, | ||
}).Do(ctx) | ||
|
||
return err | ||
} |
Oops, something went wrong.