forked from dalejn/cleanBib
-
Notifications
You must be signed in to change notification settings - Fork 0
/
queries.py
665 lines (574 loc) · 37.8 KB
/
queries.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
import numpy as np
import pandas as pd
import pickle
import tqdm as tqdm
import preprocessing
import re
import string
from ethnicolr import pred_fl_reg_name
from urllib.parse import quote
from urllib.request import urlopen
import json
import matplotlib.pyplot as plt
import seaborn as sns
def namesFromXref(cr, doi, title, authorPos):
    '''Use DOI and article titles to query Crossref for author list.

    :param cr: Crossref client (e.g. habanero.Crossref) exposing .works()
    :param doi: DOI string; '' to fall back to a title search
    :param title: article title; '' if unavailable
    :param authorPos: 'first' or 'last' -- which author to return
    :return: 'Lastname, Firstname' or '' when the name cannot be resolved
    :raises ValueError: if authorPos is not 'first' or 'last'
    '''
    if authorPos == 'first':
        idx = 0
    elif authorPos == 'last':
        idx = -1
    else:
        # previously fell through with idx unbound -> NameError at use
        raise ValueError("authorPos must be 'first' or 'last'")
    # get cross ref data
    authors = ['']
    name = ''
    # first try DOI
    if doi != "":
        works = cr.works(query=title, select=["DOI", "author"], limit=1, filter={'doi': doi})
        if works['message']['total-results'] > 0:
            authors = works['message']['items'][0]['author']
    elif title != '':
        works = cr.works(query=f'title:"{title}"', select=["title", "author"], limit=10)
        items = works['message']['items']
        # check that you grabbed the proper paper (and that any result came back)
        if items and items[0]['title'][0].lower() == title.lower():
            authors = items[0]['author']
    entry = authors[idx]
    # check that all fields are available before building the name
    if isinstance(entry, dict) and entry.get('given') and entry.get('family'):
        # trim initials down to the first token, but keep the last name
        given_tokens = entry['given'].replace('.', ' ').split()
        family_tokens = entry['family'].replace('.', ' ').split()
        if given_tokens and family_tokens:
            name = family_tokens[0] + ", " + given_tokens[0]
    return name
# def get_gender_base(homedir):
# """
# for unknown gender, fill with base rates
# you will never / can't run this (that file is too big to share)
# """
# with open(homedir + 'data/gender_base' + '.pkl', 'rb') as f:
# gender_base = pickle.load(f)
# return gender_base
def get_pred_demos(authors, homedir, bibfile, gender_key, unused_keys=None, font='Palatino', method='florida',
                   identity_threshold = 0.7, no_credits_left = False):
    """
    Predict gender and race/ethnicity of the first and last author of every
    cited paper listed in ``homedir + 'cleanedBib.csv'``, skipping
    self-citations, single-author papers, and citation-diversity-statement
    papers.

    :param authors: space-separated 'FAlast FAfirst LAlast LAfirst' for the
        citing paper's own first/last authors (used to drop self-citations)
    :param homedir: directory containing 'cleanedBib.csv'
    :param bibfile: parsed bibliography; only used here to report query counts
    :param gender_key: gender-api.com key, used when more than 100 unique
        names are needed or no_credits_left is True (genderize.io otherwise)
    :param unused_keys: optional collection of citation keys to skip entirely
    :param font: unused here; kept for interface compatibility
    :param method: unused here; kept for interface compatibility
    :param identity_threshold: minimum probability to accept a predicted label
    :param no_credits_left: force genderAPI instead of genderize.io
    :return: (mm, wm, mw, ww, WW, pw, wp, pp, citation_matrix, paper_df) --
        mean gender-pair and race-pair percentages across papers, the summed
        4x4 intersectional citation matrix, and a per-author prediction table
    """
    authors = authors.split(' ')
    print('first author is %s %s ' % (authors[1], authors[0]))
    print('last author is %s %s ' % (authors[3], authors[2]))
    print("we don't count these, but check the predictions file to ensure your names did not slip through!")
    # citation_matrix = np.zeros((8, 8)) # 4race cats
    citation_matrix = np.zeros((4, 4))
    print('looping through your references, predicting gender and race')
    columns = ['CitationKey', 'Author', 'Gender', 'W', 'PoC', 'GendCat', 'RaceCat']
    paper_df = pd.DataFrame(columns=columns)
    gender = []
    race = []
    idx = 0
    # skip self-citations when counting how many names must be queried
    authors_full_list = pd.read_csv(homedir + 'cleanedBib.csv')
    authors_list_length = authors_full_list.loc[authors_full_list['SelfCite'] == 'N']
    total_names_needed = authors_list_length.FA.nunique() + authors_list_length.LA.nunique()
    use_gender_api = total_names_needed > 100 or no_credits_left
    if use_gender_api:
        print("using genderAPI for gender inference due to bibliography size")
    else:
        print("using genderize.io for gender inference")
    # skip citation diversity statement papers
    diversity_bib_titles = ['The extent and drivers of gender imbalance in neuroscience reference lists',
                            'The gender citation gap in international relations',
                            'Gendered citation patterns in international relations journals',
                            'Quantitative evaluation of gender bias in astronomical publications from citation counts',
                            '\# CommunicationSoWhite',
                            '{Just Ideas? The Status and Future of Publication Ethics in Philosophy: A White Paper}',
                            'Gendered citation patterns across political science and social science methodology fields',
                            'Gender Diversity Statement and Code Notebook v1.0',
                            'Racial and ethnic imbalance in neuroscience reference lists and intersections with gender',
                            'Gender Diversity Statement and Code Notebook v1.1',
                            'Gendered citation practices in the field of communication',
                            'Gender disparity in citations in high- impact journal articles',
                            'Gender Disparity in Citations in High-Impact Journal Articles',
                            'Gender (im)balance in citation practices in cognitive neuroscience',
                            'Gender (Im)balance in Citation Practices in Cognitive Neuroscience',
                            'Name-ethnicity classification from open sources',
                            'Predicting race and ethnicity from the sequence of characters in a name']
    # caches so we never query the same name twice
    full_name_data = {}   # (lname, fname) -> (race dict, [P(white), P(PoC)])
    first_name_data = {}  # fname -> (gender dict, [P(man), P(woman)])
    n_gen_queries = 0
    n_race_queries = 0
    for index, paper in tqdm.tqdm(authors_full_list.iterrows(), total = len(authors_full_list.index)):
        if paper.loc["SelfCite"] == "Y":
            continue
        if paper.loc["Title"] in diversity_bib_titles:
            continue
        if paper.loc["FA"] == paper.loc["LA"]:  # skip if single author
            continue
        if unused_keys and paper.loc["CitationKey"] in unused_keys:
            continue
        fa = paper.loc["FA"]
        fa_lname = fa.split(", ")[0]
        la = paper.loc["LA"]
        la_lname = la.split(", ")[0]
        try:
            fa_fname = fa.split(", ")[1]
        except IndexError:
            fa_fname = fa.split(", ")[0]  # for mononyms, e.g. Plato
        try:
            la_fname = la.split(", ")[1]
        except IndexError:
            # FIX: previously copied from fa (copy-paste bug), mislabeling the
            # last author's first name with the first author's last name
            la_fname = la.split(", ")[0]  # for mononyms, e.g. Plato
        # double check for self cites again: drop the paper if either the
        # first or last cited author matches either of our own authors
        own_names = [(authors[0].lower().strip(), authors[1].lower().strip()),
                     (authors[2].lower().strip(), authors[3].lower().strip())]
        if (fa_lname.lower().strip(), fa_fname.lower().strip()) in own_names:
            continue
        if (la_lname.lower().strip(), la_fname.lower().strip()) in own_names:
            continue
        # race/ethnicity predictions (cached per full name)
        if (fa_lname, fa_fname) in full_name_data:
            # FIX: cache now holds (race dict, probabilities); previously only
            # the dict was cached and fa_r leaked from the prior iteration
            fa_race, fa_r = full_name_data[(fa_lname, fa_fname)]
        else:
            fa_df = pd.DataFrame([{'lname': fa_lname, 'fname': fa_fname}], columns=['fname', 'lname'])
            n_race_queries = n_race_queries + 1
            fa_race, fa_r = ethnicolr_query(fa_df, identity_threshold)
            full_name_data[(fa_lname, fa_fname)] = (fa_race, fa_r)
        if (la_lname, la_fname) in full_name_data:
            la_race, la_r = full_name_data[(la_lname, la_fname)]
        else:
            la_df = pd.DataFrame([{'lname': la_lname, 'fname': la_fname}], columns=['fname', 'lname'])
            n_race_queries = n_race_queries + 1
            la_race, la_r = ethnicolr_query(la_df, identity_threshold)
            full_name_data[(la_lname, la_fname)] = (la_race, la_r)
        # gender predictions (cached per first name)
        if fa_fname in first_name_data:
            fa_gender, fa_g = first_name_data[fa_fname]
        else:
            if use_gender_api:
                fa_gender, fa_g = gen_api_query(gender_key, fa_fname, identity_threshold)
            else:
                fa_gender, fa_g = genderize_query(fa_fname, identity_threshold)
            n_gen_queries = n_gen_queries + 1
            first_name_data[fa_fname] = (fa_gender, fa_g)
        if la_fname in first_name_data:
            la_gender, la_g = first_name_data[la_fname]
        else:
            if use_gender_api:
                la_gender, la_g = gen_api_query(gender_key, la_fname, identity_threshold)
            else:
                la_gender, la_g = genderize_query(la_fname, identity_threshold)
            n_gen_queries = n_gen_queries + 1
            first_name_data[la_fname] = (la_gender, la_g)  # storing so we don't query duplicates
        # columns = ['CitationKey', 'Author', 'Gender', 'W', 'PoC', 'GendCat', 'RaceCat']
        # FIX: store the citation key, not the whole pandas row, in CitationKey
        cite_key = paper.loc["CitationKey"]
        if use_gender_api:  # genderAPI reports 'accuracy' as a percentage
            fa_conf = fa_gender['accuracy']
            la_conf = la_gender['accuracy']
        else:               # genderize.io reports 'probability' as a proportion
            fa_conf = fa_gender['probability'] * 100
            la_conf = la_gender['probability'] * 100
        fa_data = np.array(
            [cite_key, '%s,%s' % (fa_fname, fa_lname), '%s,%s' % (fa_gender['gender'], fa_conf), fa_r[0],
             fa_r[1], '', ''], dtype = "object").reshape(1, 7)
        la_data = np.array(
            [cite_key, '%s,%s' % (la_fname, la_lname), '%s,%s' % (la_gender['gender'], la_conf), la_r[0],
             la_r[1], '%s%s' % (fa_gender['gender'], la_gender['gender']), '%s%s' % (fa_race['race'], la_race['race'])], dtype = "object").reshape(1, 7)
        paper_df = pd.concat([paper_df, pd.DataFrame(fa_data, columns=columns)], ignore_index=True)  # append deprecated in pandas
        paper_df = pd.concat([paper_df, pd.DataFrame(la_data, columns=columns)], ignore_index=True)
        mm = fa_g[0] * la_g[0]  # g = [P(man), P(woman)]
        wm = fa_g[1] * la_g[0]
        mw = fa_g[0] * la_g[1]
        ww = fa_g[1] * la_g[1]
        # normalize the joint probabilities; NOTE(review): if both g vectors
        # summed to zero this would divide by zero, but every branch of the
        # query helpers returns probabilities summing to > 0
        mm, wm, mw, ww = [mm, wm, mw, ww] / np.sum([mm, wm, mw, ww])
        gender.append([mm, wm, mw, ww])
        ww = fa_r[0] * la_r[0]  # NOTE: deliberately reuses the name ww for the race pairing
        pw = fa_r[1] * la_r[0]
        wp = fa_r[0] * la_r[1]
        pp = fa_r[1] * la_r[1]
        race.append([ww, pw, wp, pp])
        # we may want to change this to white vs PoC depending on sample size
        # paper_matrix = np.zeros((2, 8)) # 4race cats, two lists of length 8 (for intersectional identities)
        paper_matrix = np.zeros((2, 4))
        paper_matrix[0] = np.outer(fa_g, fa_r).flatten()  # intersect gender x race per author position
        paper_matrix[1] = np.outer(la_g, la_r).flatten()
        paper_matrix = np.outer(paper_matrix[0], paper_matrix[1])  # cross first-author x last-author identities
        citation_matrix = citation_matrix + paper_matrix  # accumulate identities over papers
        idx = idx + 1
    # report how many live queries were actually needed vs. cache hits
    print(f"Queried gender api {n_gen_queries} times out of {len(bibfile.entries)*2} entries")
    print(f"Queried race/ethnicity api {n_race_queries} times out of {len(bibfile.entries)*2} entries")
    mm, wm, mw, ww = np.mean(gender, axis=0) * 100
    WW, pw, wp, pp = np.mean(race, axis=0) * 100
    print("MM: " + str(mm) + " WM: " + str(wm) + " MW: " + str(mw) + " WW: " + str(ww))
    print("WhWh: " + str(WW) + " PoCWh: " + str(pw) + " WhPoC: " + str(wp) + " PoCPoC: " + str(pp))
    return mm, wm, mw, ww, WW, pw, wp, pp, citation_matrix, paper_df
# this uses genderAPI; genderize.io is free for up to 100 names a day with no API key
def gen_api_query(gender_key, name, gender_threshold):
    """
    Query gender-api.com for the predicted gender of a first name.

    :param gender_key: gender-api.com API key
    :param name: first name to query
    :param gender_threshold: minimum proportion (0-1) to accept the label
    :return: (gender, g) -- the parsed JSON response (with 'gender' reset to
        'unknown' when below threshold) and g = [P(man), P(woman)]
    """
    url = "https://gender-api.com/get?key=" + gender_key + "&name=%s" % (quote(name))
    response = urlopen(url)
    decoded = response.read().decode('utf-8')
    gender = json.loads(decoded)
    if gender.get('gender') == 'female':
        if gender['accuracy']/100. >= gender_threshold:  # accuracy is a percentage; convert to proportion
            g = [0, gender['accuracy']/100.]
        else:  # below threshold
            g = [gender['accuracy']/100., 1 - gender["accuracy"]/100.]
            gender['gender'] = "unknown"  # reset to unknown
    elif gender.get('gender') == 'male':
        if gender['accuracy']/100. >= gender_threshold:
            g = [gender['accuracy']/100., 0]
        else:  # below threshold
            g = [1 - gender['accuracy']/100., gender['accuracy']/100.]
            gender['gender'] = "unknown"  # reset to unknown
    else:
        # 'unknown', missing, or any unexpected label: fall back to 50/50.
        # FIX: previously only the literal 'unknown' was handled, so any other
        # response left g unbound and raised NameError on return.
        # TODO: could instead fill with field base rates by publication year
        gender['gender'] = "unknown"
        g = [0.5, 0.5]
    return gender, g
def genderize_query(name, gender_threshold):
    """
    Query genderize.io (free, no key, ~100 names/day) for a first name's gender.

    :param name: first name to query
    :param gender_threshold: minimum probability (0-1) to accept the label
    :return: (gender, g) -- the parsed JSON response (with 'gender' reset to
        'unknown' when below threshold) and g = [P(man), P(woman)]
    """
    url = "https://api.genderize.io?name=" + name
    response = urlopen(url)
    decoded = response.read().decode('utf-8')
    gender = json.loads(decoded)
    if gender.get('gender') == "female":
        if gender['probability'] >= gender_threshold:
            g = [0, gender['probability']]
        else:  # below threshold
            g = [gender['probability'], 1 - gender["probability"]]
            gender['gender'] = "unknown"  # reset to unknown
    elif gender.get('gender') == "male":
        if gender['probability'] >= gender_threshold:
            g = [gender['probability'], 0]
        else:  # below threshold
            g = [1 - gender['probability'], gender['probability']]
            gender['gender'] = "unknown"  # reset to unknown
    else:
        # None, missing, or any unexpected label: fall back to 50/50.
        # FIX: previously only `is None` was handled before the female/male
        # branches, so any other unexpected label left g unbound (NameError).
        gender['gender'] = "unknown"
        g = [0.5, 0.5]
    return gender, g
def ethnicolr_query(author_df, race_threshold):
    """
    Predict white vs. person-of-color for a single name via ethnicolr's
    Florida-voter-registration model.

    :param author_df: one-row DataFrame with 'fname' and 'lname' columns
    :param race_threshold: minimum probability (0-1) to accept either label
    :return: (race, r) -- a dict with 'race' in {'nh_white', 'poc', 'unknown'}
        plus its 'probability', and r = [P(white), P(PoC)] to mirror the
        gender helpers' output shape
    """
    predictions = pred_fl_reg_name(author_df, 'lname', 'fname')  # the actual model query
    p_white = predictions['nh_white'][0]
    if p_white > race_threshold:
        # confidently white
        race = {'race': 'nh_white', 'probability': p_white}
    elif p_white < 1 - race_threshold:
        # confidently a person of color
        race = {'race': 'poc', 'probability': 1 - p_white}
    else:
        # neither label clears the threshold; probability defaults to P(white)
        race = {'race': 'unknown', 'probability': p_white}
    r = [p_white, 1 - p_white]
    return race, r
def print_statements(mm, wm, mw, ww, WW, aw, wa, aa):
    """
    Build the citation diversity statement in plain text and LaTeX, filling the
    MM/WM/MW/ww gender and WW/AW/WA/AA race placeholders with percentages.

    :param mm, wm, mw, ww: gender-pair percentages (plain floats)
    :param WW, aw, wa, aa: race-pair percentages as one-element pandas objects
        (read via .values[0]) -- TODO confirm against caller
    :return: (statement, statementLatex)
    """
    statement = ("Recent work in several fields of science has identified a bias in citation practices such that papers from women and other minority scholars "
                 "are under-cited relative to the number of such papers in the field (1-9). Here we sought to proactively consider choosing references that reflect the "
                 "diversity of the field in thought, form of contribution, gender, race, ethnicity, and other factors. First, we obtained the predicted gender of the first "
                 "and last author of each reference by using databases that store the probability of a first name being carried by a woman (5, 10). By this measure "
                 "(and excluding self-citations to the first and last authors of our current paper), our references contain ww% woman(first)/woman(last), "
                 "MW% man/woman, WM% woman/man, and MM% man/man. This method is limited in that a) names, pronouns, and social media profiles used to construct the "
                 "databases may not, in every case, be indicative of gender identity and b) it cannot account for intersex, non-binary, or transgender people. "
                 "Second, we obtained predicted racial/ethnic category of the first and last author of each reference by databases that store the probability of a "
                 "first and last name being carried by an author of color (11, 12). By this measure (and excluding self-citations), our references contain AA% author of "
                 "color (first)/author of color(last), WA% white author/author of color, AW% author of color/white author, and WW% white author/white author. This method "
                 "is limited in that a) names and Florida Voter Data to make the predictions may not be indicative of racial/ethnic identity, and b) "
                 "it cannot account for Indigenous and mixed-race authors, or those who may face differential biases due to the ambiguous racialization or ethnicization of their names. "
                 "We look forward to future work that could help us to better understand how to support equitable practices in science.")
    # NOTE: replacement order matters -- each placeholder occurs exactly once
    # and none is a substring of the surrounding prose.
    statement = statement.replace('MM', str(np.around(mm, 2)))
    statement = statement.replace('WM', str(np.around(wm, 2)))
    statement = statement.replace('MW', str(np.around(mw, 2)))
    statement = statement.replace('ww', str(np.around(ww, 2)))
    statement = statement.replace('WW', np.array2string(WW.values[0], formatter={'float_kind':lambda x: "%.2f" % x}))
    statement = statement.replace('AW', np.array2string(aw.values[0], formatter={'float_kind':lambda x: "%.2f" % x}))
    statement = statement.replace('WA', np.array2string(wa.values[0], formatter={'float_kind':lambda x: "%.2f" % x}))
    # FIX: was str(np.around(aa, 2)), which embeds a multi-line Series repr for
    # pandas input; now matches the other race placeholders (and the LaTeX path)
    statement = statement.replace('AA', np.array2string(aa.values[0], formatter={'float_kind':lambda x: "%.2f" % x}))
    statementLatex = ("Recent work in several fields of science has identified a bias in citation practices such that papers from women and other minority scholars "
                      "are under-cited relative to the number of such papers in the field \cite{mitchell2013gendered,dion2018gendered,caplar2017quantitative, maliniak2013gender, Dworkin2020.01.03.894378, bertolero2021racial, wang2021gendered, chatterjee2021gender, fulvio2021imbalance}. Here we sought to proactively consider choosing references that reflect the "
                      "diversity of the field in thought, form of contribution, gender, race, ethnicity, and other factors. First, we obtained the predicted gender of the first "
                      "and last author of each reference by using databases that store the probability of a first name being carried by a woman \cite{Dworkin2020.01.03.894378,zhou_dale_2020_3672110}. By this measure "
                      "(and excluding self-citations to the first and last authors of our current paper), our references contain ww\% woman(first)/woman(last), "
                      "MW\% man/woman, WM\% woman/man, and MM\% man/man. This method is limited in that a) names, pronouns, and social media profiles used to construct the "
                      "databases may not, in every case, be indicative of gender identity and b) it cannot account for intersex, non-binary, or transgender people. "
                      "Second, we obtained predicted racial/ethnic category of the first and last author of each reference by databases that store the probability of a "
                      "first and last name being carried by an author of color \cite{ambekar2009name, sood2018predicting}. By this measure (and excluding self-citations), our references contain AA\% author of "
                      "color (first)/author of color(last), WA\% white author/author of color, AW\% author of color/white author, and WW\% white author/white author. This method "
                      "is limited in that a) names and Florida Voter Data to make the predictions may not be indicative of racial/ethnic identity, and b) "
                      "it cannot account for Indigenous and mixed-race authors, or those who may face differential biases due to the ambiguous racialization or ethnicization of their names. "
                      "We look forward to future work that could help us to better understand how to support equitable practices in science.")
    statementLatex = statementLatex.replace('MM', str(np.around(mm, 2)))
    statementLatex = statementLatex.replace('WM', str(np.around(wm, 2)))
    statementLatex = statementLatex.replace('MW', str(np.around(mw, 2)))
    statementLatex = statementLatex.replace('ww', str(np.around(ww, 2)))
    statementLatex = statementLatex.replace('WW', np.array2string(WW.values[0], formatter={'float_kind':lambda x: "%.2f" % x}))
    statementLatex = statementLatex.replace('AW', np.array2string(aw.values[0], formatter={'float_kind':lambda x: "%.2f" % x}))
    statementLatex = statementLatex.replace('WA', np.array2string(wa.values[0], formatter={'float_kind':lambda x: "%.2f" % x}))
    statementLatex = statementLatex.replace('AA', np.array2string(aa.values[0], formatter={'float_kind':lambda x: "%.2f" % x}))
    return statement, statementLatex
def plot_heatmaps(citation_matrix, homedir):
    """
    Plot two side-by-side heatmaps of the summed 4x4 intersectional citation
    matrix (rows: first-author identity, columns: last-author identity) and
    save the figure to 'race_gender_citations.pdf' in the working directory.

    Left panel: each cell as a percentage of all citations.
    Right panel: percent over/under-citation relative to hard-coded expected
    base rates.

    :param citation_matrix: 4x4 array from get_pred_demos, categories ordered
        white_m / poc_m / white_w / poc_w
    :param homedir: unused here (the expected-matrix load that used it is
        commented out); kept for interface compatibility
    """
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    # names = ['white_m','api_m','hispanic_m','black_m','white_w','api_w','hispanic_w','black_w'] # 4race cat
    names = ['white_m','poc_m','white_w','poc_w']
    plt.close()
    sns.set(style='white')
    fig, axes = plt.subplots(ncols=2,nrows=1,figsize=(7.5,4))
    axes = axes.flatten()
    plt.sca(axes[0])
    # left panel: cell values as percentage of total citations
    heat = sns.heatmap(np.around((citation_matrix/citation_matrix.sum())*100,2),annot=True,ax=axes[0],annot_kws={"size": 8},cmap=cmap,vmax=1,vmin=0)
    axes[0].set_ylabel('first author',labelpad=0)
    heat.set_yticklabels(names,rotation=0)
    axes[0].set_xlabel('last author',labelpad=1)
    heat.set_xticklabels(names,rotation=90)
    heat.set_title('percentage of citations')
    # normalize to proportions for comparison against the expected base rates
    citation_matrix_sum = citation_matrix / np.sum(citation_matrix)
    # expected = np.load('/%s/data/expected_matrix_florida.npy'%(homedir)) # this is where we need to change to our expected matrix,
    # # there's is 8x8 and not fractions
    # expected = expected/np.sum(expected)
    # using global north values
    # 'white_m','poc_m','white_w','poc_w'
    # FA - LA
    #[[WM-WM, WM-PM, WM-WW, WM-PW],
    # [PM-WM, PM-PM, PM-WW, PM-PW],
    # [WW-WM, WW-PM, WW-WW, WW-PW],
    # [PW-WM, PW-PM, PW-WW, PW-PW]] # check that this is right matrix setup
    # NOTE(review): provenance of these hard-coded expected proportions is not
    # documented here -- confirm against the source they were derived from
    expected = [[0.12, 0.11, 0.04, 0.03],
                [0.16, 0.14, 0.05, 0.03],
                [0.08, 0.07, 0.03, 0.02],
                [0.05, 0.05, 0.02, 0.01]]
    # percent deviation from expected, rounded up to the next integer
    percent_overunder = np.ceil( ((citation_matrix_sum - expected) / expected)*100)
    plt.sca(axes[1])
    # right panel: over/under-citation relative to expected rates
    heat = sns.heatmap(np.around(percent_overunder,2),annot=True,ax=axes[1],fmt='g',annot_kws={"size": 8},vmax=50,vmin=-50,cmap=cmap)
    axes[1].set_ylabel('',labelpad=0)
    heat.set_yticklabels('')
    axes[1].set_xlabel('last author',labelpad=1)
    heat.set_xticklabels(names,rotation=90)
    heat.set_title('percentage over/under-citations')
    plt.tight_layout()
    plt.savefig('race_gender_citations.pdf')
def plot_gender_histograms():
    """
    Plot gender-pairing histograms from 'predictions.csv' (written by the
    prediction step) and save them to 'gender_results.pdf'.

    Left panel: paper counts per inferred gender category (MM/WM/MW/WW).
    Right panel: percent over/under-citation relative to hard-coded base rates.
    Unknown-gender categories are dropped by restricting to the four known
    pairings.
    """
    # Plot a histogram #
    names = pd.read_csv('predictions.csv')
    #total_citations = names.CitationKey.nunique() # if some are unknown this will be wrong
    # NOTE: 'female' must be replaced before 'male' because 'female' contains
    # the substring 'male'
    names.GendCat = names.GendCat.str.replace('female', 'W', regex=False)
    names.GendCat = names.GendCat.str.replace('male', 'M', regex=False)
    names.GendCat = names.GendCat.str.replace('unknown', 'U', regex=False) # 'U' marks below-threshold predictions
    gend_cats = names['GendCat'].dropna().unique() # get a vector of all the gender categories in your paper
    # we'd really rather drop unknowns I think & single author papers to match our manuscript
    # dropped single authors in get_names above, need to deal with unknowns here
    # Create a data frame that will be used to plot the histogram. This will have the gender category (e.g., WW, MM) in the first column and the percentage (e.g., number of WW citations divided by total number of citations * 100) in the second column #
    dat_for_plot = names.groupby('GendCat').size().reset_index()
    all_cats = ['WW', 'MW', 'WM', 'MM'] # restricting to these four drops unknowns
    # start from an all-zero frame so every category appears even with no papers
    empty_dat_for_plot = pd.DataFrame(0, index=np.arange(4), columns=['GendCat', 0])
    empty_dat_for_plot['GendCat'] = all_cats
    #set(dat_for_plot['GendCat']).intersection(empty_dat_for_plot['GendCat']) # can drop unknowns if don't allow them in all_cats maybe
    for i in set(dat_for_plot['GendCat']).intersection(empty_dat_for_plot['GendCat']):
        empty_dat_for_plot.loc[empty_dat_for_plot['GendCat'] == i, 0] = dat_for_plot.loc[dat_for_plot['GendCat']== i, 0].values
    # filling in rows with paper counts
    dat_for_plot = empty_dat_for_plot
    dat_for_plot.rename(columns={0:'count'}, inplace=True)
    total_citations = dat_for_plot['count'].sum()
    dat_for_plot = dat_for_plot.assign(percentage=dat_for_plot['count']/total_citations*100)
    # no unknown genders right now so can't even check this
    # Create a data frame with only the WW, MW, WM, MM categories and their base rates - to plot percent citations relative to benchmarks
    dat_for_baserate_plot = dat_for_plot.loc[(dat_for_plot.GendCat == 'MM') |
                                             (dat_for_plot.GendCat == 'WM') |
                                             (dat_for_plot.GendCat == 'MW') |
                                             (dat_for_plot.GendCat == 'WW'),:]
    # MM,MW,WM,WW
    # 58.4% for man/man, 9.4% for man/woman, 25.5% for woman/man, and 6.7% for woman/woman
    #baserate = [6.7, 9.4, 25.5, 58.4]
    # NOTE(review): order here is WW, MW, WM, MM (matching all_cats above) --
    # base rates for IDD from JCT, switched order
    baserate = [7.1, 13.3, 18.8, 60.8]
    # NOTE(review): assigning into a .loc slice can raise SettingWithCopyWarning
    dat_for_baserate_plot['baserate'] = baserate
    dat_for_baserate_plot = dat_for_baserate_plot.assign(citation_rel_to_baserate = dat_for_baserate_plot.percentage - dat_for_baserate_plot.baserate,
                                                         over_under_perc = ((dat_for_baserate_plot.percentage/dat_for_baserate_plot.baserate) - 1)*100)
    # is it ok over/under is calculated from percentages here and not total citations? I think so bc 100s would cancel
    print(dat_for_baserate_plot)
    # plot
    plt.figure()
    fig, ax = plt.subplots(1, 2)
    # left panel: raw paper counts per gender category
    p1 = sns.barplot(data=dat_for_baserate_plot, x='GendCat', y='count', order=['MM','WM','MW','WW'], hue='GendCat', dodge = False, ax = ax[0])
    ax[0].set(xlabel='Inferred gender category', ylabel='Number of papers')
    #plt.yaxis.set_major_locator(ticker.MultipleLocator(2)) # left off here, trying to set y axis to integer breaks
    # force integer y-axis ticks in steps of 2
    p1.set_yticks(range(0, max(dat_for_baserate_plot['count']) + 1, 2))
    p1.set_yticklabels(range(0, max(dat_for_baserate_plot['count']) + 1, 2))
    #plt.tight_layout()
    #plt.legend([],[], frameon=False)
    #plt.figure()
    # right panel: percent over/under-citation vs. base rates
    p2 = sns.barplot(data=dat_for_baserate_plot, x='GendCat', y='over_under_perc', order=['MM','WM','MW','WW'], hue='GendCat', dodge = False, ax = ax[1])
    p2.axhline(0, color="k", clip_on=False)
    ax[1].set(xlabel='Inferred gender category', ylabel='% over/under citation')
    #plt.tight_layout()
    #plt.legend([],[], frameon=False)
    #p1.get_legend().remove()
    #p2.get_legend().remove()
    fig.tight_layout()
    fig.show()
    plt.savefig('gender_results.pdf')
def plot_race_histograms():
    """
    Plot race-pairing histograms from 'predictions.csv' and save them to
    'race_results.pdf'.

    Papers are collapsed into two groups: WW (both authors predicted white)
    and 'PorP' (at least one author predicted a person of color).
    Left panel: paper counts; right panel: percent over/under-citation
    relative to hard-coded base rates.
    """
    # Plot a histogram #
    names = pd.read_csv('predictions.csv')
    #total_citations = names.CitationKey.nunique() # if unknowns this will be wrong
    names.RaceCat = names.RaceCat.str.replace('nh_white', 'W', regex=False)
    names.RaceCat = names.RaceCat.str.replace('poc', 'P', regex=False)
    names.RaceCat = names.RaceCat.str.replace('unknown', 'U', regex=False) # 'U' marks below-threshold predictions
    race_cats = names['RaceCat'].dropna().unique()
    # Create a data frame that will be used to plot the histogram. This will have the race category (e.g., WP, PP) in the first column and the percentage (e.g., number of WW citations divided by total number of citations * 100) in the second column #
    dat_for_plot = names.groupby('RaceCat').size().reset_index()
    all_cats = ['PP', 'WP', 'PW', 'WW'] # restricting to these four drops unknowns
    # start from an all-zero frame so every category appears even with no papers
    empty_dat_for_plot = pd.DataFrame(0, index=np.arange(4), columns=['RaceCat', 0])
    empty_dat_for_plot['RaceCat'] = all_cats
    for i in set(dat_for_plot['RaceCat']).intersection(empty_dat_for_plot['RaceCat']):
        empty_dat_for_plot.loc[empty_dat_for_plot['RaceCat'] == i, 0] = dat_for_plot.loc[dat_for_plot['RaceCat']== i, 0].values
    dat_for_plot = empty_dat_for_plot
    dat_for_plot.rename(columns={0:'count'}, inplace=True)
    total_citations = dat_for_plot['count'].sum()
    dat_for_plot = dat_for_plot.assign(percentage=dat_for_plot['count']/total_citations*100)
    # Create a data frame with only the WW, MW, WM, MM categories and their base rates - to plot percent citations relative to benchmarks
    dat_for_baserate_plot = dat_for_plot.loc[dat_for_plot.RaceCat == 'WW',:] # we only want this one bc we will collapse all articles with PoC authors
    # row index 3 is 'WW' in all_cats, so 'PorP' = everything that is not WW
    addition = pd.DataFrame(columns = dat_for_baserate_plot.columns,
                            data = [['PorP',
                                     int(total_citations - dat_for_plot.loc[3, 'count']),
                                     100 - dat_for_plot.loc[3, 'percentage']]])
    dat_for_baserate_plot = pd.concat([dat_for_baserate_plot, addition], axis = 0)
    print(dat_for_baserate_plot)
    dat_for_baserate_plot = dat_for_baserate_plot.astype({"count": int})
    baserate = [70.7, 29.3] # base rates for IDD from JCT (WW, PorP)
    dat_for_baserate_plot['baserate'] = baserate
    dat_for_baserate_plot = dat_for_baserate_plot.assign(citation_rel_to_baserate=dat_for_baserate_plot.percentage - dat_for_baserate_plot.baserate,
                                                         over_under_perc = ((dat_for_baserate_plot.percentage/dat_for_baserate_plot.baserate) - 1)*100)
    print(dat_for_baserate_plot)
    # plot
    plt.figure()
    fig, ax = plt.subplots(1, 2)
    # left panel: raw paper counts per race category
    p1 = sns.barplot(data=dat_for_baserate_plot, x='RaceCat', y='count', order=['WW','PorP'], hue='RaceCat', dodge = False, ax = ax[0])
    ax[0].set(xlabel='Inferred race category', ylabel='Number of papers')
    #plt.yaxis.set_major_locator(ticker.MultipleLocator(2)) # left off here, trying to set y axis to integer breaks
    # force integer y-axis ticks in steps of 2
    p1.set_yticks(range(0, max(dat_for_baserate_plot['count']) + 1, 2))
    p1.set_yticklabels(range(0, max(dat_for_baserate_plot['count']) + 1, 2))
    #plt.tight_layout()
    #plt.legend([],[], frameon=False)
    #plt.figure()
    # right panel: percent over/under-citation vs. base rates
    p2 = sns.barplot(data=dat_for_baserate_plot, x='RaceCat', y='over_under_perc', order=['WW','PorP'], hue='RaceCat', dodge = False, ax = ax[1])
    p2.axhline(0, color="k", clip_on=False)
    ax[1].set(xlabel='Inferred race category', ylabel='% over/under citation rate')
    #plt.tight_layout()
    #plt.legend([],[], frameon=False)
    #p1.get_legend().remove()
    #p2.get_legend().remove()
    fig.tight_layout()
    fig.show()
    plt.savefig('race_results.pdf')
def check_genderAPI_balance(genderAPI_key, homedir):
    """Report remaining Gender-API credits and estimate this run's usage.

    Reads ``cleanedBib.csv`` from *homedir* (assumed to end with a path
    separator), drops self-citations, queries the Gender-API stats
    endpoint, and prints (a) the remaining credits on the account and
    (b) how many credits this notebook will need at most -- one per
    unique first-author and last-author name -- versus the naive
    one-per-row cost.

    Parameters
    ----------
    genderAPI_key : str
        Gender-API account key, appended to the stats URL.
    homedir : str
        Directory containing 'cleanedBib.csv'.
    """
    bib = pd.read_csv(homedir + 'cleanedBib.csv')
    bib = bib.loc[bib['SelfCite'] == 'N']  # self-cites are never queried

    # One small JSON request tells us how many credits remain.
    url = "https://gender-api.com/get-stats?key=" + genderAPI_key
    stats = json.loads(urlopen(url).read().decode('utf-8'))
    print('Remaining credits: %s'%stats["remaining_requests"])

    # Unique names are queried once and cached; duplicates are free.
    unique_names = bib.FA.nunique() + bib.LA.nunique()
    total_names = bib.FA.count() + bib.LA.count()
    print('This should use (at most) %d credits, '%unique_names +
          'saving you approx %d'%(total_names - unique_names) +
          ' credit(s) by storing queries.')
def colorful_latex(paper_df, homedir, tex_file, bib_data):
    r"""Write `<tex_file>_gendercolor.tex` with every \cite{...} wrapped in a
    \textcolor{} whose color encodes the citation's author-gender category.

    Parameters
    ----------
    paper_df : pandas.DataFrame
        Citation table; presumably rows alternate and the odd rows
        (``[1::2]``) carry 'CitationKey'/'GendCat' -- TODO confirm against
        the caller.
    homedir : str
        Directory containing 'cleanedBib.csv' (assumed to end with a path
        separator, as elsewhere in this file).
    tex_file : str
        Path to the manuscript .tex file; must end in '.tex' (the output
        name is built by slicing off the last 4 characters).
    bib_data : parsed bibliography (pybtex-style)
        Used to locate citation-diversity-statement entries by title.

    Side effects: reads ``homedir + 'cleanedBib.csv'`` and ``tex_file``;
    writes ``tex_file[:-4] + '_gendercolor.tex'``.
    """
    # Odd rows of paper_df hold the gender categories; recode the long labels
    # to single letters so a pair reads e.g. 'MW'. 'female' is replaced
    # before 'male' so the substring 'male' inside 'female' isn't clobbered.
    cite_gender = paper_df[1::2]
    cite_gender.GendCat = cite_gender.GendCat.str.replace('female', 'W', regex=False)
    cite_gender.GendCat = cite_gender.GendCat.str.replace('male', 'M', regex=False)
    cite_gender.GendCat = cite_gender.GendCat.str.replace('unknown', 'U', regex=False)
    cite_gender['Color'] = '' # what color to make each gender category
    # add back in self-cites as UU (they were excluded upstream)
    authors_full_list = pd.read_csv(homedir + 'cleanedBib.csv')
    add_selfCites = list(authors_full_list.loc[authors_full_list['SelfCite'] == 'Y']['CitationKey'])
    add_selfCites = pd.DataFrame(add_selfCites)
    try:
        add_selfCites.columns = ['CitationKey']
    except:  # an empty frame has zero columns to rename; create the column instead
        add_selfCites['CitationKey'] = ''
    add_selfCites['GendCat'] = 'UU'
    cite_gender = pd.concat([cite_gender,add_selfCites])
    # add back in diversity statement citations as UU, matched by exact
    # bibliography title against this known list.
    diversity_statement_entries = []
    diversity_bib_titles = ['The extent and drivers of gender imbalance in neuroscience reference lists',
                            'The gender citation gap in international relations',
                            'Gendered citation patterns in international relations journals',
                            'Quantitative evaluation of gender bias in astronomical publications from citation counts',
                            '\# CommunicationSoWhite',
                            '{Just Ideas? The Status and Future of Publication Ethics in Philosophy: A White Paper}',
                            'Gendered citation patterns across political science and social science methodology fields',
                            'Gender Diversity Statement and Code Notebook v1.0',
                            'Racial and ethnic imbalance in neuroscience reference lists and intersections with gender',
                            'Gender Diversity Statement and Code Notebook v1.1',
                            'Gendered citation practices in the field of communication',
                            'Gender disparity in citations in high- impact journal articles',
                            'Gender Disparity in Citations in High-Impact Journal Articles',
                            'Gender (im)balance in citation practices in cognitive neuroscience',
                            'Gender (Im)balance in Citation Practices in Cognitive Neuroscience',
                            'Name-ethnicity classification from open sources',
                            'Predicting race and ethnicity from the sequence of characters in a name']
    for paper in bib_data.entries:
        if bib_data.entries[paper].fields['title'] in diversity_bib_titles:
            diversity_statement_entries.append(paper)
    add_selfCites = pd.DataFrame(diversity_statement_entries)
    try:
        add_selfCites.columns = ['CitationKey']
    except:  # same empty-frame fallback as above
        add_selfCites['CitationKey'] = ''
    add_selfCites['GendCat'] = 'UU'
    cite_gender = pd.concat([cite_gender,add_selfCites])
    # color citations (self-citations and citation diversity statement entries set as black)
    cite_gender.index = cite_gender.CitationKey  # index by key for O(1) .loc lookup below
    colors = {'MM':'red','MW':'blue','WW':'green','WM':'magenta','UU':'black',
              'MU':'black','UM':'black','UW':'black','WU':'black'}
    for idx in cite_gender.index: # loop through each citation key and set color
        cite_gender.loc[idx,'Color'] = colors[cite_gender.loc[idx,'GendCat']]
    # Rewrite the .tex line by line, replacing each \cite{k1,k2} with
    # per-key colored \cite{} commands joined by superscript commas.
    fin = open(tex_file)
    texdoc=fin.readlines()
    with open(tex_file[:-4]+'_gendercolor.tex','w') as fout:
        for i in range(len(texdoc)):
            s = texdoc[i]
            # Full \cite{...} occurrences and their comma-separated key lists.
            cite_instances = re.findall('\\\\cite\{.*?\}',s)
            cite_keys = re.findall('\\\\cite\{(.*?)\}',s)
            cite_keys = [x.split(',') for x in cite_keys]
            cite_keys_sub = [['\\textcolor{' + cite_gender.loc[x.strip(),'Color'] + '}{\\cite{'+x.strip()+'}}' for x in cite_instance] for cite_instance in cite_keys]
            cite_keys_sub = ['\\textsuperscript{,}'.join(x) for x in cite_keys_sub]
            for idx,cite_instance in enumerate(cite_instances):
                s = s.replace(cite_instances[idx],cite_keys_sub[idx])
            fout.write(s)
            # place color key legend right after the Introduction heading
            if '\\section*{Introduction}\n' in s:
                l = ['\\textcolor{' + colors[k] + '}{'+k+'}' for k in colors.keys()]
                fout.write('\tKey: '+ ', '.join(l)+'.\n')