-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsubstance_dedup.py
36 lines (29 loc) · 1.17 KB
/
substance_dedup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#
# dedup some substance id's not previously deduplictated and append varying names
#
from dbconnect import getConnection
import psycopg2 as psql
conn = getConnection()
dups = "select substance_id from (select count(substance_id) as count, substance_id from reaxys_temp.substance group by substance_id)a where a.count >1;"
getdup = 'select ctid,* from reaxys_temp.substance where substance_id = %s order by name asc;'
update = 'update reaxys_temp.substance set name = %s where ctid = %s;'
delete = 'delete from reaxys_temp.substance where ctid = %s;'
cur = conn.cursor()
cur.execute(dups)
for line in cur.fetchall():
with conn.cursor() as c2:
c2.execute(getdup, line)
main=c2.fetchone()
firstname=str(main[3])
firstctid=main[0]
for l in c2.fetchall():
newname = str(l[3])
ctid=l[0]
if newname != 'None':
firstname=firstname+ '|' + str(l[3])
print(c2.mogrify(update, (firstname, firstctid)))
print(c2.mogrify(delete, (ctid,)))
c2.execute(update, (firstname, firstctid))
c2.execute(delete, (ctid,))
conn.commit()
conn.close()