Skip to content

Commit

Permalink
feat: Add Knowledge API to fetch document list
Browse files Browse the repository at this point in the history
  • Loading branch information
chyroc committed Sep 27, 2024
1 parent dce53be commit ff604db
Show file tree
Hide file tree
Showing 5 changed files with 252 additions and 1 deletion.
12 changes: 12 additions & 0 deletions cozepy/coze.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from .chat import ChatClient
from .files import FilesClient
from .workflows import WorkflowsClient
from .knowledge import KnowledgeClient

Check warning on line 14 in cozepy/coze.py

View check run for this annotation

Codecov / codecov/patch

cozepy/coze.py#L14

Added line #L14 was not covered by tests


class Coze(object):
Expand All @@ -31,6 +32,7 @@ def __init__(
self._chat = None
self._files = None
self._workflows = None
self._knowledge = None

@property
def bots(self) -> "BotsClient":
Expand Down Expand Up @@ -85,3 +87,13 @@ def workflows(self) -> "WorkflowsClient":
self._base_url, self._auth, self._requester
)
return self._workflows

@property
def knowledge(self) -> "KnowledgeClient":
if not self._knowledge:
from .knowledge import KnowledgeClient

Check warning on line 94 in cozepy/coze.py

View check run for this annotation

Codecov / codecov/patch

cozepy/coze.py#L93-L94

Added lines #L93 - L94 were not covered by tests

self._knowledge = KnowledgeClient(

Check warning on line 96 in cozepy/coze.py

View check run for this annotation

Codecov / codecov/patch

cozepy/coze.py#L96

Added line #L96 was not covered by tests
self._base_url, self._auth, self._requester
)
return self._knowledge

Check warning on line 99 in cozepy/coze.py

View check run for this annotation

Codecov / codecov/patch

cozepy/coze.py#L99

Added line #L99 was not covered by tests
19 changes: 19 additions & 0 deletions cozepy/knowledge/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from cozepy.auth import Auth
from cozepy.request import Requester
from .documents import DocumentsClient


class KnowledgeClient(object):
    """Entry point for the Knowledge API; hands out the sub-clients lazily."""

    def __init__(self, base_url: str, auth: Auth, requester: Requester):
        """Remember the connection settings that sub-clients are built from."""
        self._base_url, self._auth, self._requester = base_url, auth, requester
        # Created on first access of ``documents`` and reused afterwards.
        self._documents = None

    @property
    def documents(self) -> DocumentsClient:
        """Return the documents sub-client, constructing it on first use."""
        client = self._documents
        if client is not None:
            return client

        client = DocumentsClient(
            base_url=self._base_url,
            auth=self._auth,
            requester=self._requester,
        )
        self._documents = client
        return client
211 changes: 211 additions & 0 deletions cozepy/knowledge/documents/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
from enum import IntEnum
from typing import Optional

from cozepy import NumberPaged
from cozepy.auth import Auth
from cozepy.model import CozeModel
from cozepy.request import Requester


class DocumentChunkStrategy(CozeModel):
    """Chunking (segmentation) rules attached to a knowledge-base document.

    Only ``chunk_type`` is always meaningful. Every other field describes a
    custom strategy and is documented as required/effective only when
    ``chunk_type == 1``, so those fields default to ``None`` rather than being
    mandatory. Otherwise a payload for automatic segmentation
    (``chunk_type == 0``) that leaves them out could not be loaded.

    NOTE(review): this assumes CozeModel treats annotations without a default
    as required fields (pydantic-style) -- confirm against cozepy.model.
    """

    # Segmentation setting. Values include:
    # 0: Automatic segmentation and cleaning. Use preset rules for data
    #    segmentation and processing.
    # 1: Custom. In this case the segmentation rule details are specified
    #    through separator, max_tokens, remove_extra_spaces, and
    #    remove_urls_emails.
    chunk_type: int

    # Maximum segment length, with a range of 100 to 2000.
    # Required when chunk_type=1; None when not provided.
    max_tokens: Optional[int] = None

    # Whether to automatically filter continuous spaces, line breaks, and tabs.
    # true: Automatically filter
    # false: (Documented default) Do not automatically filter
    # Effective when chunk_type=1; None when not provided.
    remove_extra_spaces: Optional[bool] = None

    # Whether to automatically filter all URLs and email addresses.
    # true: Automatically filter
    # false: (Documented default) Do not automatically filter
    # Effective when chunk_type=1; None when not provided.
    remove_urls_emails: Optional[bool] = None

    # Segmentation identifier (the separator the content is split on).
    # Required when chunk_type=1; None when not provided.
    separator: Optional[str] = None


class DocumentFormatType(IntEnum):
    """Format category of a knowledge-base file, as an integer code."""

    # Document type, such as txt, pdf, online web pages, etc.
    document = 0

    # Spreadsheet type, such as xls spreadsheets, etc.
    spreadsheet = 1

    # Photo type, such as png images, etc.
    image = 2


class DocumentSourceType(IntEnum):
    """How a knowledge-base file was uploaded, as an integer code."""

    # Upload local files.
    local_file = 0

    # Upload online web pages.
    online_web = 1


class DocumentStatus(IntEnum):
    """Processing status of a knowledge-base file, as an integer code."""

    # Processing
    processing = 0

    # Completed
    completed = 1

    # Processing failed, it is recommended to re-upload.
    # Note the code jumps from 1 to 9; values in between are not defined here.
    failed = 9


class DocumentUpdateType(IntEnum):
    """Whether an online web page document is automatically updated."""

    # NOTE(review): members here are UPPER_CASE while the other enums in this
    # module use lower_case names; renaming would break callers, so the
    # inconsistency is only flagged, not changed.

    # Do not automatically update
    NO_AUTO_UPDATE = 0

    # Automatically update
    AUTO_UPDATE = 1


class Document(CozeModel):
    """Metadata describing one file stored in a knowledge base."""

    # The ID of the file.
    # NOTE(review): the original marker was a bare "TODO: fixme" with no
    # explanation; possibly the ID should be typed as str rather than int --
    # confirm against the actual API response before changing.
    document_id: int  # TODO: fixme

    # The total character count of the file content.
    char_count: int

    # The chunking rules. For detailed instructions, refer to the
    # DocumentChunkStrategy model.
    chunk_strategy: DocumentChunkStrategy

    # The upload time of the file, in the format of a 10-digit Unix timestamp.
    create_time: int

    # The last modified time of the file, in the format of a 10-digit Unix
    # timestamp.
    update_time: int

    # The type of file format. Values include:
    # 0: Document type, such as txt, pdf, online web pages, etc.
    # 1: Spreadsheet type, such as xls spreadsheets, etc.
    # 2: Photo type, such as png images, etc.
    format_type: DocumentFormatType

    # The number of times the file has been hit in conversations.
    hit_count: int

    # The name of the file.
    name: str

    # The size of the file in bytes.
    size: int

    # The number of slices the file has been divided into.
    slice_count: int

    # The method of uploading the file. Values include:
    # 0: Upload local files.
    # 1: Upload online web pages.
    source_type: DocumentSourceType

    # The processing status of the file. Values include:
    # 0: Processing
    # 1: Completed
    # 9: Processing failed, it is recommended to re-upload
    status: DocumentStatus

    # The format of the local file, i.e., the file extension, such as "txt".
    # Supported formats include PDF, TXT, DOC, DOCX.
    type: str

    # The frequency of automatic updates for online web pages, in hours.
    update_interval: int

    # Whether the online web page is automatically updated. Values include:
    # 0: Do not automatically update
    # 1: Automatically update
    update_type: DocumentUpdateType


class DocumentsClient(object):
    """Client for the knowledge-base document endpoints."""

    def __init__(self, base_url: str, auth: Auth, requester: Requester):
        """Keep the base URL, auth object and requester for later calls."""
        self._base_url, self._auth, self._requester = base_url, auth, requester

    def list(
        self,
        *,
        dataset_id: str,
        page_num: int = 1,
        page_size: int = 10,
    ) -> NumberPaged[Document]:
        """
        Fetch one page of the files (documents, spreadsheets or images) that
        belong to the given knowledge base.

        docs en: https://www.coze.com/docs/developer_guides/list_knowledge_files
        docs zh: https://www.coze.cn/docs/developer_guides/list_knowledge_files

        :param dataset_id: The ID of the knowledge base.
        :param page_num: Page to request; defaults to 1, i.e. the first page.
        :param page_size: Entries per page; defaults to 10.
        :return: a NumberPaged wrapping the returned Document items together
            with the requested page number/size and the reported total.
        """
        endpoint = f"{self._base_url}/open_api/knowledge/document/list"
        # The API names the paging parameters "page" and "size".
        payload = self._requester.request(
            "get",
            endpoint,
            self._PrivateListDocumentsV1Data,
            params={
                "dataset_id": dataset_id,
                "page": page_num,
                "size": page_size,
            },
        )
        return NumberPaged(
            items=payload.document_infos,
            page_num=page_num,
            page_size=page_size,
            total=payload.total,
        )

    class _PrivateListDocumentsV1Data(CozeModel):
        """Model the list response is parsed into before repackaging."""

        # Documents contained in the returned page.
        document_infos: list[Document]
        # Total count reported by the response; passed through to NumberPaged.
        total: int
1 change: 0 additions & 1 deletion cozepy/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

@lru_cache(maxsize=1)
def user_agent():
print("调用了")
python_version = ".".join(map(str, sys.version_info[:2]))

os_name = platform.system().lower()
Expand Down
10 changes: 10 additions & 0 deletions tests/test_knowledge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import unittest

from cozepy import COZE_CN_BASE_URL, Coze
from tests.config import fixed_token_auth


@unittest.skip("not available in not cn")
def test_knowledge_documents_list():
    """Smoke-call the document list API on the CN endpoint (currently skipped)."""
    client = Coze(auth=fixed_token_auth, base_url=COZE_CN_BASE_URL)
    listing = client.knowledge.documents.list(dataset_id="")
    print(listing)

Check warning on line 10 in tests/test_knowledge.py

View check run for this annotation

Codecov / codecov/patch

tests/test_knowledge.py#L9-L10

Added lines #L9 - L10 were not covered by tests

0 comments on commit ff604db

Please sign in to comment.