import wikipediaapi
import json
from tqdm import tqdm
import time
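
# Note: `wikipediaapi` is provided by the third-party "Wikipedia-API" package
# (pip install wikipedia-api); tqdm is imported for progress bars but is not
# used elsewhere in this script.
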
def get_wiki_pages(categories=["Azərbaycan tarixi", "Azərbaycan mədəniyyəti",
"Azərbaycan ədəbiyyatı", "Azərbaycan coğrafiyası"],
min_length=500, max_pages=1000):
"""
Recursively collect substantial Azerbaijani Wikipedia pages from multiple categories
"""
wiki = wikipediaapi.Wikipedia(
language='az',
extract_format=wikipediaapi.ExtractFormat.WIKI,
user_agent='AzGPTDataCollector/1.0'
)
collected_pages = {}
visited_pages = set()
def collect_pages(category_title):
if len(collected_pages) >= max_pages:
return
category = wiki.page(f"Kateqoriya:{category_title}")
if not category.exists():
print(f"Category not found: {category_title}")
return
# First, process all articles in this category
for member in category.categorymembers.values():
if len(collected_pages) >= max_pages:
return
if member.title in visited_pages:
continue
visited_pages.add(member.title)
# Skip if it's a category or template page
if member.title.startswith('Kateqoriya:') or member.title.startswith('Şablon:'):
continue
# Skip if content is too short
if len(member.text) < min_length:
continue
collected_pages[member.title] = {
'title': member.title,
'text': member.text,
'url': member.fullurl,
'length': len(member.text)
}
print(f"Collected: {member.title} ({len(member.text)} chars)")
# Delay to avoid hitting API limits
time.sleep(0.1)
# Then process subcategories
for subcategory in category.categorymembers.values():
if subcategory.title.startswith('Kateqoriya:'):
collect_pages(subcategory.title.replace('Kateqoriya:', ''))
# Start collection from each category
for category in categories:
print(f"\nStarting collection from category: {category}")
collect_pages(category)
return collected_pages
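

# Usage sketch (hypothetical values, not executed here): a smaller crawl of a
# single category can be requested by overriding the defaults, e.g.
#
#     history_pages = get_wiki_pages(categories=["Azərbaycan tarixi"],
#                                    min_length=500, max_pages=10)
#     print(f"{len(history_pages)} pages collected")
#
# The returned dict maps each page title to a record with 'title', 'text',
# 'url' and 'length' keys.
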
def preprocess_text(text):
"""
Enhanced text preprocessing for Azerbaijani text
"""
    # Collapse runs of whitespace into single spaces
    text = ' '.join(text.split())
    # Ensure a space follows terminal punctuation and closing brackets,
    # then collapse any double spaces this introduces
    for punct in '.!?;:)]}»':
        text = text.replace(punct, punct + ' ')
    text = ' '.join(text.split())
    # Fix common OCR/encoding artifacts. A blanket 'i' -> 'ı' substitution is
    # deliberately avoided: both letters are valid in Azerbaijani, and a global
    # replace would corrupt every dotted i.
    replacements = {
        'І': 'I',    # Cyrillic І mistaken for Latin I
        '...': '…',
    }
    for old, new in replacements.items():
        text = text.replace(old, new)
    return text
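

# Illustrative example (assuming the cleanup rules above): repeated whitespace
# is collapsed and missing spaces after sentence punctuation are restored, e.g.
#
#     preprocess_text("Bakı  Azərbaycanın paytaxtıdır.Şəhər Xəzər sahilindədir.")
#     # -> "Bakı Azərbaycanın paytaxtıdır. Şəhər Xəzər sahilindədir."
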
def save_dataset(pages, output_file='az_wiki_data.json'):
"""
Save collected pages to a JSON file
"""
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(pages, f, ensure_ascii=False, indent=2)
print(f"Saved {len(pages)} pages to {output_file}")
def main():
# Collect pages with minimum length requirement
print("Starting data collection...")
pages = get_wiki_pages(min_length=500, max_pages=100) # 500 chars minimum length
# Preprocess and save
print("\nPreprocessing and saving data...")
for title in pages:
pages[title]['text'] = preprocess_text(pages[title]['text'])
save_dataset(pages)
    # Print statistics
    if pages:
        total_chars = sum(page['length'] for page in pages.values())
        print("\nCollection complete!")
        print(f"Total pages: {len(pages)}")
        print(f"Total characters: {total_chars}")
        print(f"Average page length: {total_chars / len(pages):.2f} characters")
# Print some titles as examples
print("\nSample of collected articles:")
for title in list(pages.keys())[:5]:
print(f"- {title} ({pages[title]['length']} chars)")
if __name__ == "__main__":
main()