-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcleanup_database.py
138 lines (112 loc) · 3.38 KB
/
cleanup_database.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/usr/bin/env python3
"""
Database cleanup utility for removing problematic documents and short content.
"""
import argparse
import logging
from typing import List, Optional

from redis.commands.search.query import Query

from cache import r, source_index_name
from tools import (
    delete_document_from_db,
    error_messages,
    get_vector_store,
)
# Logging setup: INFO and above, formatted as "<time> - <level> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
# Module-level logger for this script.
logger = logging.getLogger(__name__)
def remove_urls(urls: List[str], db) -> None:
    """
    Delete every listed URL from the database.

    Args:
        urls: URLs to delete, processed one at a time in list order
        db: Vector store database instance passed through to the delete helper
    """
    # Each URL is handed to delete_document_from_db together with the vector
    # store and the shared `r` handle imported from `cache`.
    for doomed_url in urls:
        delete_document_from_db(doomed_url, db, r)
def find_error_documents() -> List[dict]:
    """
    Search for documents containing known error messages.

    Every key of ``error_messages`` is searched as a quoted phrase in the
    ``page_content`` field, restricted to documents whose ``page_length`` lies
    in ``[0, 10000]``. A document that matches several error messages is
    returned only once, so ``len()`` of the result is the number of distinct
    affected documents.

    Returns:
        List of document records containing errors (the objects produced by
        the search client, with only the ``source`` field requested), in
        first-match order
    """
    # Same mapping the script has always applied before embedding the message
    # in the query: ":", "," and "." each become "?".
    punctuation_map = str.maketrans({":": "?", ",": "?", ".": "?"})
    max_results = 10000

    error_docs = []
    seen_ids = set()
    for error_message, response_message in error_messages.items():
        # Sanitize error message for search
        sanitized_message = error_message.translate(punctuation_map)

        # The two clauses are separated by an explicit space so the phrase
        # filter and the numeric filter are clearly distinct query terms.
        query = (
            Query(f'@page_content: "{sanitized_message}" @page_length:[0 10000]')
            .dialect(2)
            .return_fields("source")
            .paging(0, max_results)
            .timeout(5000)
        )
        results = r.ft(source_index_name).search(query).docs
        if not results:
            continue

        logger.info(
            f"Found {len(results)} documents with error: {response_message}"
        )
        # Keep only the first occurrence of each document so one that matches
        # several error messages is not counted more than once.
        for doc in results:
            if doc.id in seen_ids:
                continue
            seen_ids.add(doc.id)
            error_docs.append(doc)
    return error_docs
def find_short_documents(min_length: int = 400) -> List[dict]:
    """
    Find documents shorter than specified length.

    Args:
        min_length: Minimum acceptable document length in characters.
            Documents whose ``page_length`` is strictly below this value are
            returned; a document of exactly ``min_length`` is acceptable.

    Returns:
        List of document records that are too short (at most 10000 of them,
        with only the ``source`` field requested). Empty when ``min_length``
        is zero or negative, without querying the index.
    """
    if min_length <= 0:
        # The range [0, min_length) is empty, so there is nothing to look up.
        return []

    max_results = 10000
    query = (
        # "(" makes the upper bound exclusive: "shorter than", not
        # "shorter than or equal to".
        Query(f"@page_length:[0 ({min_length}]")
        .dialect(2)
        .return_fields("source")
        # Without explicit paging the client requests only the first 10 hits,
        # which would silently cap both the result and the reported count.
        .paging(0, max_results)
    )
    return r.ft(source_index_name).search(query).docs
def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """
    Parse command line arguments.

    Args:
        argv: Argument strings to parse. When omitted (``None``), argparse
            reads the process command line, which is the behaviour callers
            that pass nothing have always had.

    Returns:
        Namespace with ``remove_urls`` (list of URL strings, empty when the
        option is absent) and ``min_length`` (int, 400 when the option is
        absent)
    """
    parser = argparse.ArgumentParser(
        description="Clean up problematic documents from the database"
    )
    parser.add_argument(
        "--remove-urls",
        nargs="+",
        default=[],
        help="URL(s) to remove from the database",
    )
    parser.add_argument(
        "--min-length",
        type=int,
        default=400,
        help="Minimum document length (default: 400 characters)",
    )
    return parser.parse_args(argv)
def main() -> None:
    """
    Script entry point.

    Parses the command line, obtains the vector store, deletes any URLs given
    via ``--remove-urls``, then scans for error documents and for documents
    below ``--min-length``. The two scans are report-only: their results are
    counted and logged here, and nothing in this function deletes them.
    """
    options = parse_args()
    vector_store = get_vector_store()

    # Explicitly requested deletions
    urls_to_drop = options.remove_urls
    if urls_to_drop:
        remove_urls(urls_to_drop, vector_store)

    # Report documents that contain a known error message
    flagged = find_error_documents()
    if flagged:
        flagged_count = len(flagged)
        logger.info(f"Found {flagged_count} total documents with errors")

    # Report documents below the length threshold
    threshold = options.min_length
    undersized = find_short_documents(threshold)
    if undersized:
        undersized_count = len(undersized)
        logger.info(
            f"Found {undersized_count} documents shorter than {threshold} characters"
        )


# Run the cleanup only when executed as a script, not when imported.
if __name__ == "__main__":
    main()