Skip to content

Commit

Permalink
Metadata overhaul #50, #62, #78, #79, #97
Browse files Browse the repository at this point in the history
  • Loading branch information
mattfullerton committed Jun 16, 2017
1 parent d02a307 commit 1698f32
Show file tree
Hide file tree
Showing 5 changed files with 88 additions and 15 deletions.
2 changes: 1 addition & 1 deletion harvesters/rgi/scripts/assessment_list.json

Large diffs are not rendered by default.

70 changes: 62 additions & 8 deletions harvesters/rgi/scripts/get_pdfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,22 +14,32 @@
"SSD":u"South Sudan"
}

complete_metadata = {}
question_categories = {}
with open('./questions.csv', 'r') as f:
question_lp = {}
question_scoring = {}
#Read output of import_questions.py
with open('./questions_out.csv', 'r') as f:
reader = csv.DictReader(f)
for row in reader:
question_categories[row['Q'].zfill(3)] = row['Component']
question_categories[row['Q'].zfill(3)] = row['Subcomponent']
question_lp[row['Q'].zfill(3)] = row['LawOrPractice']
question_scoring[row['Q'].zfill(3)] = row['Scoring']

with open('./assessments.csv', 'r') as f:
assessments = [l.strip().replace('"','').replace('"','') for l in f.readlines()]

datasets = {}

all_removals = set()
pdfs = 0
dropped_pdfs = 0
duplicates = []

#Turn a string into a slug: lower-case it, replace each space with a hyphen and remove colons.
#No other characters are changed.
def urlify(s):
return s.lower().replace(' ', '-').replace(':', '')

for assessment in assessments:
complete_metadata[assessment] = []
#Used for skipping/testing
#if "CIV" not in assessment:
# continue
Expand All @@ -39,15 +49,34 @@ def urlify(s):
print '%s has %s pdfs' % (assessment, len([d for d in docs if d['mime_type'] == "application/pdf"]))

for d in docs:
complete_metadata[assessment].append(d)
if (d['mime_type'] == 'application/pdf'):
pdfs += 1
category = ''
questions = []

try:
category = question_categories[d['answers'][0][-3:]]
questions = [q[-3:] for q in d['answers']]
except:
law_practice_question = set()
scoring_question = set()

questions_raw = [q[-3:] for q in d['answers']]
questions = set(questions_raw)
removals = []
for question in questions:
if question not in question_categories:
print "Warning, question " + question + " not in list of valid questions"
all_removals.add(question)
removals.append(question)
for removal in removals:
questions.remove(removal)
if len(questions) == 0:
print "Warning, PDF not associated with any valid questions, dropping"
dropped_pdfs += 1
continue
questions = list(questions)
category = question_categories[questions[0]]
for question in questions:
law_practice_question.add(question_lp[question])
scoring_question.add(question_scoring[question])

assessment_type_abbr = assessment[-2:]
if assessment_type_abbr == "HY":
Expand All @@ -57,7 +86,7 @@ def urlify(s):
else:
assessment_type = "Unknown"

datasets[urlify(d['title'])] = {
new_dataset = {
'type': 'document',
'title': d['title'] + " (" + assessment_type + ", " + iso3[assessment[0:3]] + ", " + assessment[4:8] + ")",
'name': urlify(d['title']),
Expand All @@ -72,6 +101,8 @@ def urlify(s):
'year': assessment[4:8],
'url': API_ENDPOINT + assessment,
'category': category,
'law_practice_question': list(law_practice_question).sort(), #Alphabetic - law before practice, see display snippet in CKAN extension, this is important :-)
'scoring_question': list(scoring_question),
'question': questions,
'extras': [
{'key': 'spatial_text', 'value': iso3[assessment[0:3]]},
Expand All @@ -88,6 +119,29 @@ def urlify(s):
}
]
}

if urlify(d['title']) in datasets:
print "Warning, dataset already exists..."
print "This:"
print new_dataset
print "That:"
print datasets[urlify(d['title'])]
duplicates.append(new_dataset['resources'][0]['url'] + "," + datasets[urlify(d['title'])]['resources'][0]['url'])
else:
datasets[urlify(d['title'])] = new_dataset

#Summary of the run: invalid question ids, dropped PDFs and duplicate dataset names
print "The following questions are invalid:"
all_removals_list = list(all_removals)
#Order the ids numerically rather than as strings; int() raises ValueError if an id is not numeric
all_removals_list.sort(key=lambda x: int(x))
print all_removals_list
print "This led to " + str(dropped_pdfs) + " PDFs being dropped out of a total of " + str(pdfs) + " PDFs"
print "There were " + str(len(duplicates)) + " duplicates:"
#Each entry is "<url of the rejected dataset>,<url of the dataset already stored under that name>"
for duplicate in duplicates:
print duplicate
print "Writing out " + str(len(datasets)) + " datasets for CKAN"

#Datasets keyed by urlify(title); only the first dataset seen for each key is kept
with open('./datasets2.json', 'w') as f:
json.dump(datasets, f, indent=4, separators=(',', ': '))

#Every document returned for each assessment, appended before the PDF mime-type filter
with open('./complete.json', 'w') as f:
json.dump(complete_metadata, f, indent=4)
27 changes: 22 additions & 5 deletions harvesters/rgi/scripts/import_questions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
import json

questions = []
cset = set()

print("The list of sub-components (categories in CKAN) will be shown for cross-checking")

with open("questions_new.csv", "rb") as qfile:
csvreader = csv.reader(qfile)
Expand All @@ -13,20 +16,33 @@
currentComponent = row[3]
elif row[0] == "SUB-COMPONENT":
currentSubComponent = row[3]
cset.add(row[3])
elif row[0] == "INDICATOR":
currentIndicator = row[3]
elif row[0] in ("QUESTION", "NON_SCORING"):
lawOrPractice = row[1]
if (row[0] == "NON_SCORING"):
scoring = "non-scoring"
else:
scoring = "scoring"
if (row[1] == "Law_Q"):
lp = "law"
elif (row[1] == "Practice_Q"):
lp = "practice"
else:
lp = "neither"
ref = row[2]
elName = row[3]
qName = row[4]
questions.append([ref, currentComponent, qName, currentSubComponent, currentIndicator, lawOrPractice, elName])
questions.append([ref, currentComponent, qName, currentSubComponent, currentIndicator, lp, elName, scoring])

questions = sorted(questions, key=lambda k: int(k[0]))

for item in cset:
print(item)

with open("questions_out.csv", "wb") as outfile:
csvwriter = csv.writer(outfile)
csvwriter.writerow(["Q", "Component", "Question", "Subcomponent", "Indicator", "LawOrPractice", "Element name"])
csvwriter.writerow(["Q", "Component", "Question", "Subcomponent", "Indicator", "LawOrPractice", "Element name", "Scoring"])

for question in questions:
csvwriter.writerow(question)
Expand All @@ -36,6 +52,7 @@
jdata['choices'] = []

for question in questions:
jdata['choices'].append({"value": question[0], "label": str(question[0]) + ": " + question[2]})
q_as_str = "%03d" % int(question[0])
jdata['choices'].append({"value": q_as_str, "label": q_as_str + ": " + question[2]})

outfile.write(json.dumps(jdata));
outfile.write(json.dumps(jdata, sort_keys=True, indent=4,));
4 changes: 3 additions & 1 deletion harvesters/rgi/scripts/rgi_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import unicodedata
import re
import uuid
import os

failed_states = []

Expand Down Expand Up @@ -191,7 +192,8 @@ def unicodeToNumbers(name):

qtext = "Question "
qset = set()
#Avoid duplicate question entries and throw out questions not in the schema. Not ideal.
#Avoid duplicate question entries and throw out questions not in the schema.
#Neither should happen as we catch these things in get_pdfs.py
for question in datasets[d]['question']: #temp, already checked
if question in qchoices:
qset.add(question)
Expand Down
Empty file modified harvesters/rgi/scripts/update_data.sh
100644 → 100755
Empty file.

0 comments on commit 1698f32

Please sign in to comment.