Commit
Merge pull request #860 from arc53/Fix-ingestion-grouping
Fixing ingestion metadata grouping
dartpain authored Feb 26, 2024
2 parents 4216671 + c8d8a8d commit 8d36f88
Showing 4 changed files with 24 additions and 13 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -172,3 +172,4 @@ application/vectors/
 node_modules/
 .vscode/settings.json
 models/
+model/
18 changes: 15 additions & 3 deletions application/parser/file/bulk.py
@@ -147,12 +147,24 @@ def load_data(self, concatenate: bool = False) -> List[Document]:
                 # do standard read
                 with open(input_file, "r", errors=self.errors) as f:
                     data = f.read()
+            # Prepare metadata for this file
+            if self.file_metadata is not None:
+                file_metadata = self.file_metadata(str(input_file))
+            else:
+                # Provide a default empty metadata
+                file_metadata = {'title': '', 'store': ''}
+            # TODO: Find a case with no metadata and check if breaks anything
+
             if isinstance(data, List):
-                data_list.extend(data)
+                # Extend data_list with each item in the data list
+                data_list.extend([str(d) for d in data])
+                # For each item in the data list, add the file's metadata to metadata_list
+                metadata_list.extend([file_metadata for _ in data])
             else:
+                # Add the single piece of data to data_list
                 data_list.append(str(data))
-            if self.file_metadata is not None:
-                metadata_list.append(self.file_metadata(str(input_file)))
+                # Add the file's metadata to metadata_list
+                metadata_list.append(file_metadata)
 
         if concatenate:
             return [Document("\n".join(data_list))]
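For reference, a minimal sketch of what the loader does after this change, reduced to a standalone function. The name `collect_texts` and the `read_file` helper are illustrative stand-ins (the real logic lives in the reader class's `load_data`): metadata is resolved once per file, with an empty default, and is appended once per extracted chunk, so `data_list` and `metadata_list` stay the same length.

```python
from typing import Callable, Dict, List, Optional, Union


def collect_texts(
    input_files: List[str],
    read_file: Callable[[str], Union[str, List[str]]],  # hypothetical helper standing in for the per-file parsers
    file_metadata: Optional[Callable[[str], Dict]] = None,
):
    """Pair every extracted text chunk with its source file's metadata."""
    data_list: List[str] = []
    metadata_list: List[Dict] = []
    for input_file in input_files:
        data = read_file(input_file)  # a parser may return one string or a list of chunks
        # Metadata is resolved once per file, with an empty default,
        # so every chunk from this file carries the same metadata.
        meta = file_metadata(str(input_file)) if file_metadata else {"title": "", "store": ""}
        if isinstance(data, list):
            data_list.extend(str(d) for d in data)
            metadata_list.extend(meta for _ in data)
        else:
            data_list.append(str(data))
            metadata_list.append(meta)
    return data_list, metadata_list
```

Keeping the two lists aligned is what lets the grouping step below compare metadata between adjacent chunks.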
15 changes: 7 additions & 8 deletions application/parser/token_func.py
@@ -21,16 +21,15 @@ def group_documents(documents: List[Document], min_tokens: int, max_tokens: int)
     for doc in documents:
         doc_len = len(tiktoken.get_encoding("cl100k_base").encode(doc.text))
 
-        if current_group is None:
-            current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
-                                     extra_info=doc.extra_info)
-        elif len(tiktoken.get_encoding("cl100k_base").encode(
-                current_group.text)) + doc_len < max_tokens and doc_len < min_tokens:
-            current_group.text += " " + doc.text
+        # Check if current group is empty or if the document can be added based on token count and matching metadata
+        if current_group is None or (len(tiktoken.get_encoding("cl100k_base").encode(current_group.text)) + doc_len < max_tokens and doc_len < min_tokens and current_group.extra_info == doc.extra_info):
+            if current_group is None:
+                current_group = doc  # Use the document directly to retain its metadata
+            else:
+                current_group.text += " " + doc.text  # Append text to the current group
         else:
             docs.append(current_group)
-            current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
-                                     extra_info=doc.extra_info)
+            current_group = doc  # Start a new group with the current document
 
     if current_group is not None:
         docs.append(current_group)
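The grouping rule can be sketched in isolation. This is a rough approximation: the lightweight `Doc` dataclass and the `min_tokens`/`max_tokens` defaults are stand-ins for the project's `Document` objects and real settings. A document is merged into the current group only while the combined text stays under `max_tokens`, the incoming document is shorter than `min_tokens`, and its `extra_info` matches the group's, so chunks from different source files are never merged together.

```python
from dataclasses import dataclass
from typing import List, Optional

import tiktoken


@dataclass
class Doc:
    text: str
    extra_info: dict


enc = tiktoken.get_encoding("cl100k_base")


def group_docs(documents: List[Doc], min_tokens: int = 150, max_tokens: int = 2000) -> List[Doc]:
    groups: List[Doc] = []
    current: Optional[Doc] = None
    for doc in documents:
        doc_len = len(enc.encode(doc.text))
        can_merge = (
            current is not None
            and len(enc.encode(current.text)) + doc_len < max_tokens
            and doc_len < min_tokens
            and current.extra_info == doc.extra_info  # only merge chunks with identical metadata
        )
        if current is None:
            current = doc  # first group: reuse the document so its metadata is kept
        elif can_merge:
            current.text += " " + doc.text
        else:
            groups.append(current)
            current = doc  # limits hit or metadata changed: start a new group
    if current is not None:
        groups.append(current)
    return groups
```

Reusing the first document as the group, rather than constructing a fresh one, is what preserves its metadata on the merged chunk.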
3 changes: 1 addition & 2 deletions frontend/src/conversation/Conversation.tsx
@@ -201,8 +201,7 @@ export default function Conversation() {
         )}
       </div>
       <p className="text-gray-595959 dark:text-bright-gray bg-white dark:bg-raisin-black w-[100vw] self-center bg-transparent p-5 text-center text-xs md:w-full">
-        This is a chatbot that uses the GPT-3, Faiss and LangChain to answer
-        questions.
+        DocsGPT uses GenAI, please review critical information using sources.
       </p>
     </div>
   </div>
