Metadata overhaul #50, #62, #78, #79, #97

NRGI · Jun 16, 2017 · 1698f32 · 1698f32
1 parent d02a307
commit 1698f32
Show file tree

Hide file tree

Showing 5 changed files with 88 additions and 15 deletions.
diff --git a/harvesters/rgi/scripts/assessment_list.json b/harvesters/rgi/scripts/assessment_list.json
diff --git a/harvesters/rgi/scripts/get_pdfs.py b/harvesters/rgi/scripts/get_pdfs.py
@@ -14,22 +14,32 @@
 "SSD":u"South Sudan"
 }
 
+complete_metadata = {}
 question_categories = {}
-with open('./questions.csv', 'r') as f:
+question_lp = {}
+question_scoring = {}
+#Read output of import_questions.py
+with open('./questions_out.csv', 'r') as f:
     reader = csv.DictReader(f)
     for row in reader:
-        question_categories[row['Q'].zfill(3)] = row['Component']
+        question_categories[row['Q'].zfill(3)] = row['Subcomponent']
+        question_lp[row['Q'].zfill(3)] = row['LawOrPractice']
+        question_scoring[row['Q'].zfill(3)] = row['Scoring']
 
 with open('./assessments.csv', 'r') as f:
     assessments = [l.strip().replace('"','').replace('"','') for l in f.readlines()]
 
 datasets = {}
-
+all_removals = set()
+pdfs = 0
+dropped_pdfs = 0
+duplicates = []
 
 def urlify(s):
     """Return a slug form of s: lowercased, spaces turned into hyphens, colons removed.

     Only spaces and colons are rewritten; any other characters pass through
     unchanged. The result is used below as the dataset 'name' and as the key
     into the datasets dict.
     """
     return s.lower().replace(' ', '-').replace(':', '')
 
 for assessment in assessments:
+    complete_metadata[assessment] = []
     #Used for skipping/testing
     #if "CIV" not in assessment:
     #    continue
@@ -39,15 +49,34 @@ def urlify(s):
     print '%s has %s pdfs' % (assessment, len([d for d in docs if d['mime_type'] == "application/pdf"]))
 
     for d in docs:
+        complete_metadata[assessment].append(d)
         if (d['mime_type'] == 'application/pdf'):
+            pdfs += 1
             category = ''
             questions = []
 
-            try:
-                category = question_categories[d['answers'][0][-3:]]
-                questions = [q[-3:] for q in d['answers']]
-            except:
+            law_practice_question = set()
+            scoring_question = set()
+
+            questions_raw = [q[-3:] for q in d['answers']]
+            questions = set(questions_raw)
+            removals = []
+            for question in questions:
+                if question not in question_categories:
+                    print "Warning, question " + question + " not in list of valid questions"
+                    all_removals.add(question)
+                    removals.append(question)
+            for removal in removals:
+                questions.remove(removal)
+            if len(questions) == 0:
+                print "Warning, PDF not associated with any valid questions, dropping"
+                dropped_pdfs += 1
                 continue
+            questions = list(questions)
+            category = question_categories[questions[0]]
+            for question in questions:
+                law_practice_question.add(question_lp[question])
+                scoring_question.add(question_scoring[question])
 
             assessment_type_abbr = assessment[-2:]
             if assessment_type_abbr == "HY":
@@ -57,7 +86,7 @@ def urlify(s):
             else:
                 assessment_type = "Unknown"
 
-            datasets[urlify(d['title'])] = {
+            new_dataset = {
                 'type': 'document',
                 'title': d['title'] + " (" + assessment_type + ", " + iso3[assessment[0:3]] + ", " + assessment[4:8] + ")",
                 'name': urlify(d['title']),
@@ -72,6 +101,8 @@ def urlify(s):
                 'year': assessment[4:8],
                 'url': API_ENDPOINT + assessment,
                 'category': category,
+                'law_practice_question': list(law_practice_question).sort(), #Alphabetic - law before practice, see display snippet in CKAN extension, this is important :-)
+                'scoring_question': list(scoring_question),
                 'question': questions,
                 'extras': [
                     {'key': 'spatial_text', 'value': iso3[assessment[0:3]]},
@@ -88,6 +119,29 @@ def urlify(s):
                     }
                 ]
             }
+
+            if urlify(d['title']) in datasets:
+                print "Warning, dataset already exists..."
+                print "This:"
+                print new_dataset
+                print "That:"
+                print datasets[urlify(d['title'])]
+                duplicates.append(new_dataset['resources'][0]['url'] + "," + datasets[urlify(d['title'])]['resources'][0]['url'])
+            else:
+                datasets[urlify(d['title'])] = new_dataset
+
+print "The following questions are invalid:"
+all_removals_list = list(all_removals)
+all_removals_list.sort(key=lambda x: int(x))
+print all_removals_list
+print "This led to " + str(dropped_pdfs) + " PDFs being dropped out of a total of " + str(pdfs) + " PDFs"
+print "There were " + str(len(duplicates)) + " duplicates:"
+for duplicate in duplicates:
+    print duplicate
+print "Writing out " + str(len(datasets)) + " datasets for CKAN"
 
 with open('./datasets2.json', 'w') as f:
     json.dump(datasets, f, indent=4, separators=(',', ': '))
+
+with open('./complete.json', 'w') as f:
+    json.dump(complete_metadata, f, indent=4)
diff --git a/harvesters/rgi/scripts/import_questions.py b/harvesters/rgi/scripts/import_questions.py
@@ -2,6 +2,9 @@
 import json
 
 questions = []
+cset = set()
+
+print("The list of sub-components (categories in CKAN) will be shown for cross-checking")
 
 with open("questions_new.csv", "rb") as qfile:
     csvreader = csv.reader(qfile)
@@ -13,20 +16,33 @@
             currentComponent = row[3]
         elif row[0] == "SUB-COMPONENT":
             currentSubComponent = row[3]
+            cset.add(row[3])
         elif row[0] == "INDICATOR":
             currentIndicator = row[3]
         elif row[0] in ("QUESTION", "NON_SCORING"):
-            lawOrPractice = row[1]
+            if (row[0] == "NON_SCORING"):
+                scoring = "non-scoring"
+            else:
+                scoring = "scoring"
+            if (row[1] == "Law_Q"):
+                lp = "law"
+            elif (row[1] == "Practice_Q"):
+                lp = "practice"
+            else:
+                lp = "neither"
             ref = row[2]
             elName = row[3]
             qName = row[4]
-            questions.append([ref, currentComponent, qName, currentSubComponent, currentIndicator, lawOrPractice, elName])
+            questions.append([ref, currentComponent, qName, currentSubComponent, currentIndicator, lp, elName, scoring])
 
 questions = sorted(questions, key=lambda k: int(k[0])) 
+
+for item in cset:
+    print(item)
 
 with open("questions_out.csv", "wb") as outfile:
     csvwriter = csv.writer(outfile)
-    csvwriter.writerow(["Q", "Component", "Question", "Subcomponent", "Indicator", "LawOrPractice", "Element name"])
+    csvwriter.writerow(["Q", "Component", "Question", "Subcomponent", "Indicator", "LawOrPractice", "Element name", "Scoring"])
 
     for question in questions:
         csvwriter.writerow(question)
@@ -36,6 +52,7 @@
     jdata['choices'] = []
 
     for question in questions:
-        jdata['choices'].append({"value": question[0], "label": str(question[0]) + ": " + question[2]})
+        q_as_str = "%03d" % int(question[0])
+        jdata['choices'].append({"value": q_as_str, "label": q_as_str + ": " + question[2]})
 
-    outfile.write(json.dumps(jdata));
+    outfile.write(json.dumps(jdata, sort_keys=True, indent=4,));
diff --git a/harvesters/rgi/scripts/rgi_import.py b/harvesters/rgi/scripts/rgi_import.py
@@ -6,6 +6,7 @@
 import unicodedata
 import re
 import uuid
+import os
 
 failed_states = []
 
@@ -191,7 +192,8 @@ def unicodeToNumbers(name):
 
     qtext = "Question "
     qset = set()
-    #Avoid duplicate question entries and throw out questions not in the schema. Not ideal.
+    #Avoid duplicate question entries and throw out questions not in the schema.
+    #Neither should happen as we catch these things in get_pdfs.py
     for question in datasets[d]['question']: #temp, already checked
         if question in qchoices:
             qset.add(question)

diff --git a/harvesters/rgi/scripts/update_data.sh b/harvesters/rgi/scripts/update_data.sh