Skip to content

Commit

Permalink
【RAG】文档上传支持图表解析、表格深度解析 && 切片接口增加知识库ID参数 (#703)
Browse files Browse the repository at this point in the history
  • Loading branch information
userpj authored Jan 2, 2025
1 parent f16d544 commit 0c9957d
Show file tree
Hide file tree
Showing 14 changed files with 289 additions and 66 deletions.
59 changes: 34 additions & 25 deletions docs/BasisModule/Platform/KnowledgeBase/knowledgebase.md
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ class DocumentProcessOption(BaseModel):
description="模板类型,ppt:模版配置—ppt幻灯片, resume:模版配置—简历文档, paper:模版配置—论文文档, custom:自定义配置—自定义切片, default:自定义配置—默认切分",
enum=["ppt", "paper", "qaPair", "resume", " custom", "default"],
)
parser: Optional[DocumentChoices] = Field(None, description="解析方法(文字提取默认启动,参数不体现,layoutAnalysis版面分析,ocr按需增加)")
parser: Optional[DocumentChoices] = Field(None, description="解析方法(文字提取默认启动,参数不体现,layoutAnalysis版面分析,ocr光学字符识别,pageImageAnalysis文档图片解析,chartAnalysis图表解析,tableAnalysis表格深度解析,按需增加)")
knowledgeAugmentation: Optional[DocumentChoices] = Field(
None, description="知识增强,faq、spokenQuery、spo、shortSummary按需增加。问题生成:faq、spokenQuery,段落摘要:shortSummary,三元组知识抽取:spo"
)
Expand Down Expand Up @@ -662,10 +662,11 @@ for message in doc_list:

#### 方法参数

| 参数名称 | 参数类型 | 是否必传 | 描述 | 示例值 |
| ---------- | -------- | -------- | -------- | -------------- |
| documentId | string || 文档ID | "正确的文档ID" |
| content | string || 切片内容 | "内容" |
| 参数名称 | 参数类型 | 是否必传 | 描述 | 示例值 |
| --------------- | -------- | -------- | -------- | -------------- |
| knowledgeBaseId | string || 知识库ID | |
| documentId | string || 文档ID | "正确的文档ID" |
| content | string || 切片内容 | "内容" |

#### 方法返回值

Expand All @@ -686,7 +687,7 @@ os.environ["APPBUILDER_TOKEN"] = "your_appbuilder_token"
my_knowledge_base_id = "your_knowledge_base_id"
my_knowledge = appbuilder.KnowledgeBase(my_knowledge_base_id)
print("知识库ID: ", my_knowledge.knowledge_id)
resp = my_knowledge.create_chunk("your_document_id", "content")
resp = my_knowledge.create_chunk("your_document_id", "content", knowledgebase_id=my_knowledge_base_id)
print("切片ID: ", resp.id)
chunk_id = resp.id
```
Expand All @@ -697,6 +698,7 @@ chunk_id = resp.id

| 参数名称 | 参数类型 | 是否必传 | 描述 | 示例值 |
| ---------- | -------- | ------------ | -------------- | -------------- |
| knowledgeBaseId | string || 知识库ID | |
| chunkId | string || 切片ID | "正确的切片ID" |
| content | string || 切片内容 | "内容" |
| enable | bool || 是否用该切片 | True |
Expand All @@ -711,16 +713,17 @@ os.environ["APPBUILDER_TOKEN"] = "your_appbuilder_token"
my_knowledge_base_id = "your_knowledge_base_id"
my_knowledge = appbuilder.KnowledgeBase(my_knowledge_base_id)
print("知识库ID: ", my_knowledge.knowledge_id)
my_knowledge.modify_chunk("your_chunk_id", "content", True)
my_knowledge.modify_chunk("your_chunk_id", "content", True, knowledgebase_id=my_knowledge_base_id)
```

### 16. 删除切片`delete_chunk(chunkId: str)`

#### 方法参数

| 参数名称 | 参数类型 | 是否必传 | 描述 | 示例值 |
| -------- | -------- | -------- | ------ | -------------- |
| chunkId | string || 文档ID | "正确的切片ID" |
| 参数名称 | 参数类型 | 是否必传 | 描述 | 示例值 |
| --------------- | -------- | -------- | -------- | -------------- |
| knowledgeBaseId | string || 知识库ID | |
| chunkId | string || 切片ID | "正确的切片ID" |

#### 方法示例

Expand All @@ -732,16 +735,17 @@ os.environ["APPBUILDER_TOKEN"] = "your_appbuilder_token"
my_knowledge_base_id = "your_knowledge_base_id"
my_knowledge = appbuilder.KnowledgeBase(my_knowledge_base_id)
print("知识库ID: ", my_knowledge.knowledge_id)
my_knowledge.delete_chunk("your_chunk_id")
my_knowledge.delete_chunk("your_chunk_id", knowledgebase_id=my_knowledge_base_id)
```

### 17. 获取切片信息`describe_chunk(chunkId: str)`

#### 方法参数

| 参数名称 | 参数类型 | 是否必传 | 描述 | 示例值 |
| -------- | -------- | -------- | ------ | -------------- |
| chunkId | string || 文档ID | "正确的切片ID" |
| 参数名称 | 参数类型 | 是否必传 | 描述 | 示例值 |
| --------------- | -------- | -------- | -------- | -------------- |
| knowledgeBaseId | string || 知识库ID | |
| chunkId | string || 切片ID | "正确的切片ID" |

#### 方法返回值

Expand Down Expand Up @@ -774,7 +778,7 @@ os.environ["APPBUILDER_TOKEN"] = "your_appbuilder_token"
my_knowledge_base_id = "your_knowledge_base_id"
my_knowledge = appbuilder.KnowledgeBase(my_knowledge_base_id)
print("知识库ID: ", my_knowledge.knowledge_id)
resp = my_knowledge.describe_chunk("your_chunk_id")
resp = my_knowledge.describe_chunk("your_chunk_id", knowledgebase_id=my_knowledge_base_id)
print("切片详情:")
print(resp)
```
Expand All @@ -783,12 +787,13 @@ print(resp)

#### 方法参数

| 参数名称 | 参数类型 | 是否必传 | 描述 | 示例值 |
| ---------- | -------- | -------- | ------------------------------------------------------------ | -------------- |
| documentId | string || 文档ID | "正确的文档ID" |
| marker | string || 起始位置,切片ID | "正确的切片ID" |
| maxKeys | string || 返回文档数量大小,默认10,最大值100 | 10 |
| type | string || 根据类型获取切片列表(RAW、NEW、COPY),RAW:原文切片,NEW:新增切片,COPY:复制切片 | "RAW" |
| 参数名称 | 参数类型 | 是否必传 | 描述 | 示例值 |
| --------------- | -------- | -------- | ------------------------------------------------------------ | -------------- |
| knowledgeBaseId | string || 知识库ID | |
| documentId | string || 文档ID | "正确的文档ID" |
| marker | string || 起始位置,切片ID | "正确的切片ID" |
| maxKeys | string || 返回文档数量大小,默认10,最大值100 | 10 |
| type | string || 根据类型获取切片列表(RAW、NEW、COPY),RAW:原文切片,NEW:新增切片,COPY:复制切片 | "RAW" |

#### 方法返回值

Expand Down Expand Up @@ -834,7 +839,7 @@ os.environ["APPBUILDER_TOKEN"] = "your_appbuilder_token"
my_knowledge_base_id = "your_knowledge_base_id"
my_knowledge = appbuilder.KnowledgeBase(my_knowledge_base_id)
print("知识库ID: ", my_knowledge.knowledge_id)
resp = my_knowledge.describe_chunks("your_document_id")
resp = my_knowledge.describe_chunks("your_document_id", knowledgebase_id=my_knowledge_base_id)
print("切片列表:")
print(resp)
```
Expand Down Expand Up @@ -964,7 +969,11 @@ public class KnowledgebaseTest {
@Test
public void testCreateChunk() throws IOException, AppBuilderServerException {
String documentId = "";
Knowledgebase knowledgebase = new Knowledgebase();
// 知识库ID
String knowledgeBaseId = "";
// Appbuilder Token
String secretKey = "";
Knowledgebase knowledgebase = new Knowledgebase(knowledgeBaseId, secretKey);
// 创建切片
String chunkId = knowledgebase.createChunk(documentId, "test");
// 修改切片
Expand Down Expand Up @@ -1189,13 +1198,13 @@ func TestChunk(t *testing.T) {
os.Setenv("APPBUILDER_LOGLEVEL", "DEBUG")
os.Setenv("APPBUILDER_TOKEN", "")
documentID := ""

knowledgeBaseID := ""
config, err := NewSDKConfig("", "")
if err != nil {
t.Fatalf("new http client config failed: %v", err)
}

client, err := NewKnowledgeBase(config)
client, err := NewKnowledgeBaseWithKnowledgeBaseID(knowledgeBaseID, config)
if err != nil {
t.Fatalf("new Knowledge base instance failed")
}
Expand Down
36 changes: 34 additions & 2 deletions go/appbuilder/knowledge_base.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,21 @@ func NewKnowledgeBase(config *SDKConfig) (*KnowledgeBase, error) {
return &KnowledgeBase{sdkConfig: config, client: client}, nil
}

// NewKnowledgeBaseWithKnowledgeBaseID creates a KnowledgeBase client that
// remembers knowledgeBaseID alongside the SDK configuration.
//
// A nil config is rejected with an "invalid config" error. When the config
// carries no HTTP client, a default http.Client with a 60-second timeout is
// used instead.
func NewKnowledgeBaseWithKnowledgeBaseID(knowledgeBaseID string, config *SDKConfig) (*KnowledgeBase, error) {
	if config == nil {
		return nil, errors.New("invalid config")
	}

	// Prefer the caller-supplied HTTP client; fall back to a default one.
	httpClient := config.HTTPClient
	if httpClient == nil {
		httpClient = &http.Client{Timeout: 60 * time.Second}
	}

	kb := &KnowledgeBase{
		knowledgeBaseID: knowledgeBaseID,
		sdkConfig:       config,
		client:          httpClient,
	}
	return kb, nil
}

type KnowledgeBase struct {
sdkConfig *SDKConfig
client HTTPClient
knowledgeBaseID string
sdkConfig *SDKConfig
client HTTPClient
}

func (t *KnowledgeBase) CreateDocument(req CreateDocumentRequest) (CreateDocumentResponse, error) {
Expand Down Expand Up @@ -618,6 +630,10 @@ func (t *KnowledgeBase) CreateChunk(req CreateChunkRequest) (string, error) {
if req.ClientToken == "" {
req.ClientToken = uuid.New().String()
}

if req.KnowledgeBaseID == "" && t.knowledgeBaseID != "" {
req.KnowledgeBaseID = t.knowledgeBaseID
}
serviceURL, err := t.sdkConfig.ServiceURLV2("/knowledgeBase?Action=CreateChunk&clientToken=" + req.ClientToken)
if err != nil {
return "", err
Expand Down Expand Up @@ -657,6 +673,10 @@ func (t *KnowledgeBase) ModifyChunk(req ModifyChunkRequest) error {
if req.ClientToken == "" {
req.ClientToken = uuid.New().String()
}

if req.KnowledgeBaseID == "" && t.knowledgeBaseID != "" {
req.KnowledgeBaseID = t.knowledgeBaseID
}
serviceURL, err := t.sdkConfig.ServiceURLV2("/knowledgeBase?Action=ModifyChunk&clientToken=" + req.ClientToken)
if err != nil {
return err
Expand Down Expand Up @@ -715,6 +735,10 @@ func (t *KnowledgeBase) deleteChunk(chunkID string, clientToken string) error {
req := DeleteChunkRequest{
ChunkID: chunkID,
}

if t.knowledgeBaseID != "" {
req.KnowledgeBaseID = t.knowledgeBaseID
}
data, _ := json.Marshal(req)
request.Body = NopCloser(bytes.NewReader(data))
t.sdkConfig.BuildCurlCommand(&request)
Expand Down Expand Up @@ -754,6 +778,10 @@ func (t *KnowledgeBase) DescribeChunk(chunkID string) (DescribeChunkResponse, er
req := DescribeChunkRequest{
ChunkID: chunkID,
}

if t.knowledgeBaseID != "" {
req.KnowledgeBaseID = t.knowledgeBaseID
}
data, _ := json.Marshal(req)
request.Body = NopCloser(bytes.NewReader(data))
t.sdkConfig.BuildCurlCommand(&request)
Expand Down Expand Up @@ -786,6 +814,10 @@ func (t *KnowledgeBase) DescribeChunks(req DescribeChunksRequest) (DescribeChunk
if err != nil {
return DescribeChunksResponse{}, err
}

if req.KnowledgeBaseID == "" && t.knowledgeBaseID != "" {
req.KnowledgeBaseID = t.knowledgeBaseID
}
request.URL = serviceURL
request.Method = "POST"
header.Set("Content-Type", "application/json")
Expand Down
33 changes: 19 additions & 14 deletions go/appbuilder/knowledge_base_data.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,29 +190,33 @@ type UploadDocumentsResponse struct {
}

type CreateChunkRequest struct {
DocumentID string `json:"documentId"`
Content string `json:"content"`
ClientToken string `json:"client_token,omitempty"`
KnowledgeBaseID string `json:"knowledgeBaseId"`
DocumentID string `json:"documentId"`
Content string `json:"content"`
ClientToken string `json:"client_token,omitempty"`
}

type CreateChunkResponse struct {
ID string `json:"id"`
}

type ModifyChunkRequest struct {
ChunkID string `json:"chunkId"`
Content string `json:"content"`
Enable bool `json:"enable"`
ClientToken string `json:"client_token,omitempty"`
KnowledgeBaseID string `json:"knowledgeBaseId"`
ChunkID string `json:"chunkId"`
Content string `json:"content"`
Enable bool `json:"enable"`
ClientToken string `json:"client_token,omitempty"`
}

type DeleteChunkRequest struct {
ChunkID string `json:"chunkId"`
ClientToken string `json:"client_token,omitempty"`
KnowledgeBaseID string `json:"knowledgeBaseId"`
ChunkID string `json:"chunkId"`
ClientToken string `json:"client_token,omitempty"`
}

type DescribeChunkRequest struct {
ChunkID string `json:"chunkId"`
KnowledgeBaseID string `json:"knowledgeBaseId"`
ChunkID string `json:"chunkId"`
}

type DescribeChunkResponse struct {
Expand All @@ -232,10 +236,11 @@ type DescribeChunkResponse struct {
}

type DescribeChunksRequest struct {
DocumnetID string `json:"documentId"`
Marker string `json:"marker,omitempty"`
MaxKeys int `json:"maxKeys,omitempty"`
Type string `json:"type,omitempty"`
KnowledgeBaseID string `json:"knowledgeBaseId"`
DocumnetID string `json:"documentId"`
Marker string `json:"marker,omitempty"`
MaxKeys int `json:"maxKeys,omitempty"`
Type string `json:"type,omitempty"`
}

type DescribeChunksResponse struct {
Expand Down
51 changes: 37 additions & 14 deletions go/appbuilder/knowledge_base_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -828,18 +828,28 @@ func TestChunkError(t *testing.T) {
t.Parallel() // 并发运行
os.Setenv("APPBUILDER_LOGLEVEL", "DEBUG")

documentID := os.Getenv(DocumentIDV3)
config, err := NewSDKConfig("", os.Getenv(SecretKeyV3))
knowledgeBaseID := os.Getenv(DatasetID)
config, err := NewSDKConfig("", os.Getenv(SecretKey))
if err != nil {
t.Logf("%s========== FAIL: %s ==========%s", "\033[31m", t.Name(), "\033[0m")
t.Fatalf("new http client config failed: %v", err)
}

client, err := NewKnowledgeBase(config)
client, err := NewKnowledgeBaseWithKnowledgeBaseID(knowledgeBaseID, config)
if err != nil {
t.Logf("%s========== FAIL: %s ==========%s", "\033[31m", t.Name(), "\033[0m")
t.Fatalf("new Knowledge base instance failed")
}

documentsRes, err := client.GetDocumentList(GetDocumentListRequest{
KnowledgeBaseID: knowledgeBaseID,
})
if err != nil {
t.Logf("%s========== FAIL: %s ==========%s", "\033[31m", t.Name(), "\033[0m")
t.Fatalf("get document list failed: %v", err)
}
documentID := documentsRes.Data[0].ID

var clientT = client.client
var GatewayURL = client.sdkConfig.GatewayURLV2

Expand Down Expand Up @@ -1416,22 +1426,33 @@ func TestChunk(t *testing.T) {
fmt.Fprintf(&logBuffer, format+"\n", args...)
}

documentID := os.Getenv(DocumentIDV3)
config, err := NewSDKConfig("", os.Getenv(SecretKeyV3))
knowledgeBaseID := os.Getenv(DatasetID)
config, err := NewSDKConfig("", os.Getenv(SecretKey))
if err != nil {
t.Logf("%s========== FAIL: %s ==========%s", "\033[31m", t.Name(), "\033[0m")
t.Fatalf("new http client config failed: %v", err)
}

client, err := NewKnowledgeBase(config)
client, err := NewKnowledgeBaseWithKnowledgeBaseID(knowledgeBaseID, config)
if err != nil {
t.Logf("%s========== FAIL: %s ==========%s", "\033[31m", t.Name(), "\033[0m")
t.Fatalf("new Knowledge base instance failed")
}

documentsRes, err := client.GetDocumentList(GetDocumentListRequest{
KnowledgeBaseID: knowledgeBaseID,
})
if err != nil {
t.Logf("%s========== FAIL: %s ==========%s", "\033[31m", t.Name(), "\033[0m")
t.Fatalf("get document list failed: %v", err)
}
log("Documents retrieved: %+v", documentsRes)
documentID := documentsRes.Data[0].ID
// 创建切片
chunkID, err := client.CreateChunk(CreateChunkRequest{
DocumentID: documentID,
Content: "test",
KnowledgeBaseID: knowledgeBaseID,
DocumentID: documentID,
Content: "test",
})
if err != nil {
t.Logf("%s========== FAIL: %s ==========%s", "\033[31m", t.Name(), "\033[0m")
Expand All @@ -1441,9 +1462,10 @@ func TestChunk(t *testing.T) {

// 修改切片
err = client.ModifyChunk(ModifyChunkRequest{
ChunkID: chunkID,
Content: "new test",
Enable: true,
KnowledgeBaseID: knowledgeBaseID,
ChunkID: chunkID,
Content: "new test",
Enable: true,
})
if err != nil {
t.Logf("%s========== FAIL: %s ==========%s", "\033[31m", t.Name(), "\033[0m")
Expand All @@ -1461,9 +1483,10 @@ func TestChunk(t *testing.T) {

// 获取切片列表
describeChunksRes, err := client.DescribeChunks(DescribeChunksRequest{
DocumnetID: documentID,
Marker: chunkID,
MaxKeys: 10,
KnowledgeBaseID: knowledgeBaseID,
DocumnetID: documentID,
Marker: chunkID,
MaxKeys: 10,
})
if err != nil {
t.Logf("%s========== FAIL: %s ==========%s", "\033[31m", t.Name(), "\033[0m")
Expand Down
Loading

0 comments on commit 0c9957d

Please sign in to comment.