-
Notifications
You must be signed in to change notification settings - Fork 3
/
valid_index.py
28 lines (21 loc) · 1.02 KB
/
valid_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
#!/usr/bin/env python
"""Generate the pairwise "valid id" index.

Reads an input CSV that has an ``id`` column and a CSV of missing ids
(column ``x``), then writes an HDF5 file holding one dataset,
``valid_index``.  The dataset has one uint16 entry per unordered pair of
distinct ids, taken in ascending id order and enumerated in the same order
as ``itertools.combinations``: (0, 1), (0, 2), ..., (1, 2), ...  An entry is
1 when neither id of the pair is listed as missing, and 0 otherwise.

NOTE: an earlier comment in this script described a Counter-based count of
common entries (e.g. 3 "Tsinghua" vs. 2 gives 2).  This script does not
compute that; it only flags which id pairs are usable.
"""
import argparse

import numpy as np


def build_parser():
    """Return the command-line parser for this script."""
    # The text must go in ``description``: the first positional parameter of
    # ArgumentParser is ``prog``, which would put it in the usage line.
    psr = argparse.ArgumentParser(description="generate valid id pairs index")
    # Required: without it the script would only fail when opening the
    # output file, after all the work has been done.
    psr.add_argument("-o", dest='opt', required=True, help="output")
    psr.add_argument('ipt', help="input")
    # Kept so existing invocations keep working; the validity index does not
    # depend on it.
    psr.add_argument('--field', default='org', help="the field to count common entries in")
    psr.add_argument('--missing', default='data/missing_id.csv',
                     help="CSV whose column 'x' lists the missing ids")
    return psr


def valid_flags_by_id(ids, missing):
    """Return one boolean per distinct id, in ascending id order.

    Parameters
    ----------
    ids : pandas.Series
        The ``id`` column of the input table; duplicates are allowed and
        null ids are ignored.
    missing : array-like
        Ids that must be flagged as invalid.

    Returns
    -------
    numpy.ndarray of bool
        ``True`` where the id does not occur in ``missing``.
    """
    distinct = ids.dropna().drop_duplicates().sort_values()
    # isin() does a hashed lookup instead of scanning ``missing`` once per row.
    return (~distinct.isin(missing)).to_numpy(dtype=bool)


def pair_valid_index(valid):
    """Return ``valid[i] and valid[j]`` for every pair ``i < j`` as uint16.

    Parameters
    ----------
    valid : sequence of bool
        One flag per id.

    Returns
    -------
    numpy.ndarray of uint16, length ``n * (n - 1) // 2``
        Pairs are ordered as ``itertools.combinations(range(n), 2)``.
        The result is empty when fewer than two flags are given.
    """
    flags = np.asarray(valid, dtype=bool)
    n = flags.size
    out = np.zeros(n * (n - 1) // 2, dtype='u2')
    start = 0
    for i in range(n - 1):
        width = n - 1 - i
        # Row i holds the pairs (i, i+1) ... (i, n-1).  If id i is invalid
        # the whole row stays 0; otherwise it copies the flags of the
        # partner ids.
        if flags[i]:
            out[start:start + width] = flags[i + 1:]
        start += width
    return out


def main(argv=None):
    """Parse ``argv`` (default: ``sys.argv[1:]``) and write the index file."""
    args = build_parser().parse_args(argv)
    # Imported after argument parsing, as the original script did, so usage
    # errors and --help are reported without loading these packages.
    import h5py
    import pandas as pd

    au = pd.read_csv(args.ipt)
    missing = pd.read_csv(args.missing)['x'].values
    x = pair_valid_index(valid_flags_by_id(au['id'], missing))
    with h5py.File(args.opt, 'w') as opt:
        opt.create_dataset('valid_index', data=x, compression="gzip", shuffle=True)


if __name__ == "__main__":
    main()