forked from paracrawl/cirrus-scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
domain-size.py
executable file
·50 lines (37 loc) · 1.27 KB
/
domain-size.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#!/usr/bin/env python3
import sys
import gzip
import re
import os
from io import TextIOWrapper
from collections import Counter
from multiprocessing.pool import Pool
# Optional scheme ("http:" / "https:", matched case-insensitively because URL
# schemes are case-insensitive), optional "//", then everything up to the
# first "/" is captured as the domain.
_DOMAIN_RE = re.compile(r'^(https?:)?(//)?(?P<domain>[^/]+)', re.IGNORECASE)


def domain(url):
    """Return the domain (authority) part of *url*.

    Surrounding whitespace is removed first: callers pass lines read from a
    file, and without stripping, a URL with no path ("http://example.com\\n")
    would yield a domain that still ends in a newline and would be counted
    separately from "example.com".

    An optional ``http:``/``https:`` scheme and an optional ``//`` are
    skipped; the domain is everything up to the first ``/``. If nothing can
    be extracted (for example an empty string or a string starting with a
    single ``/``), the stripped input is returned unchanged.
    """
    url = url.strip()
    match = _DOMAIN_RE.match(url)
    return match.group('domain') if match else url
def batch_size_per_domain(batch):
    """Sum the sentence sizes in one batch directory, grouped by domain.

    Reads ``url.gz`` and ``sentences.gz`` from *batch* in lockstep: the n-th
    line of one file is paired with the n-th line of the other, and pairing
    stops at the end of the shorter file. Each sentence line contributes its
    length in bytes (as read from the gzip stream, line terminator included)
    to the domain of its URL.

    Returns a ``Counter`` mapping domain -> total bytes.
    """
    url_path = os.path.join(batch, 'url.gz')
    text_path = os.path.join(batch, 'sentences.gz')
    sizes = Counter()
    # gzip.open defaults to binary mode, so both files yield bytes lines.
    with gzip.open(url_path) as url_lines, gzip.open(text_path) as text_lines:
        for raw_url, raw_text in zip(url_lines, text_lines):
            host = domain(raw_url.decode())
            sizes[host] += len(raw_text)
    return sizes
def shard_batches(shard):
    """Yield the path of every entry in *shard* whose name is all digits.

    Entries are yielded in the (arbitrary) order ``os.scandir`` produces
    them. The scandir iterator is used as a context manager so its directory
    handle is released even when the generator is not run to exhaustion.
    """
    with os.scandir(shard) as entries:
        for entry in entries:
            if entry.name.isdigit():
                yield entry.path
def shard_size_per_domain(shard, pool):
    """Combine the per-domain sizes of every batch in *shard*.

    Each batch path produced by ``shard_batches(shard)`` is handed to
    ``batch_size_per_domain`` through ``pool.imap_unordered``, and the
    resulting counters are merged as they arrive.

    Returns a single ``Counter`` mapping domain -> total size for the shard.
    """
    per_batch = pool.imap_unordered(batch_size_per_domain, shard_batches(shard))
    merged = Counter()
    # In-place addition keeps one accumulator instead of building a new
    # Counter per batch, which is what sum(..., start=Counter()) would do.
    for batch_counter in per_batch:
        merged += batch_counter
    return merged
def main(shards=None, processes=8):
    """Write a ``sizes.gz`` summary into each shard directory.

    For every shard (by default the command-line arguments), the per-domain
    sizes are computed with a pool of *processes* workers and written to
    ``<shard>/sizes.gz`` as ``domain<TAB>size`` lines, largest size first.

    The loop variables are deliberately local and not called ``domain``:
    at module level that name would rebind the ``domain`` function that the
    worker processes rely on.
    """
    if shards is None:
        shards = sys.argv[1:]

    # The context manager shuts the workers down once all shards are done.
    with Pool(processes) as pool:
        for shard in shards:
            totals = shard_size_per_domain(shard, pool)
            out_path = os.path.join(shard, 'sizes.gz')
            # URLs are decoded as UTF-8 on input, so write UTF-8 explicitly
            # instead of depending on the locale's default encoding.
            with gzip.open(out_path, 'wt', encoding='utf-8') as fout:
                for name, size in totals.most_common():
                    print("{}\t{}".format(name, size), file=fout)


# Guard the entry point: under the "spawn"/"forkserver" start methods the
# worker processes re-import this module, and must not create pools again.
if __name__ == '__main__':
    main()