revise the way surveys are read

Starting to read the surveys via XML instead of the spreadsheet, for better support across languages.
e-mission · shankari · May 8, 2024 · Mar 13, 2024 · Mar 15, 2024 · Mar 20, 2024
commit c4cfdfaa0266a88532f0dcbbbf7426b65d660f26
diff --git a/viz_scripts/survey_responses.ipynb b/viz_scripts/survey_responses.ipynb
@@ -1,15 +1,5 @@
 {
  "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a7fa9a20",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# %conda install openpyxl"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -19,14 +9,43 @@
    "source": [
     "year = None\n",
     "month = None\n",
-    "program = \"washingtoncommons\"\n",
+    "program = \"dfc-fermata\"\n",
     "study_type = \"study\"\n",
     "mode_of_interest = None\n",
     "include_test_users = False\n",
     "dynamic_labels = {  }\n",
     "use_imperial = False"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ce0dcc9f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#TODO: survey_info is hardcoded for now; it will probably end up being passed in\n",
+    "survey_info =  {\n",
+    "    \"surveys\": {\n",
+    "      \"UserProfileSurvey\": {\n",
+    "        \"formPath\": \"https://raw.githubusercontent.com/JGreenlee/nrel-openpath-deploy-configs/fermata-demo/survey_resources/dfc-fermata/fermata-onboarding-v0.xml\",\n",
+    "        \"version\": 1,\n",
+    "        \"compatibleWith\": 1,\n",
+    "        \"dataKey\": \"manual/demographic_survey\",\n",
+    "        \"labelTemplate\": { \"en\": \"Answered\" }\n",
+    "      },\n",
+    "      \"TripConfirmSurvey\": {\n",
+    "        \"formPath\": \"https://raw.githubusercontent.com/e-mission/nrel-openpath-deploy-configs/main/survey_resources/dfc-fermata/fermata-ev-return-trip-v0.xml\",\n",
+    "        \"version\": 1,\n",
+    "        \"compatibleWith\": 1,\n",
+    "        \"dataKey\": \"manual/trip_user_input\",\n",
+    "        \"labelTemplate\": { \"en\": \"Answered\" }\n",
+    "      }\n",
+    "    },\n",
+    "    \"trip-labels\": \"ENKETO\"\n",
+    "  }"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -35,11 +54,10 @@
    "outputs": [],
    "source": [
     "from collections import defaultdict\n",
-    "\n",
     "import urllib.request\n",
-    "\n",
     "import numpy as np\n",
     "import pandas as pd\n",
+    "from xml.dom import minidom\n",
     "\n",
     "from plots import *\n",
     "import scaffolding\n",
@@ -55,52 +73,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "9a85ca35",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#probably going to end up passing this in\n",
-    "survey_info = {\n",
-    "      \"surveys\": {\n",
-    "        \"UserProfileSurvey\": {\n",
-    "          \"formPath\": \"https://raw.githubusercontent.com/e-mission/nrel-openpath-deploy-configs/main/survey_resources/washingtoncommons/washingtoncommons-onboarding-survey-v4.xml\",\n",
-    "          \"version\": 1.3,\n",
-    "          \"compatibleWith\": 1,\n",
-    "          \"dataKey\": \"manual/demographic_survey\",\n",
-    "          \"labelTemplate\": {\n",
-    "            \"en\": \"Answered\",\n",
-    "            \"es\": \"Contestada\"\n",
-    "          }\n",
-    "        },\n",
-    "        \"TripConfirmSurvey\": {\n",
-    "            \"formPath\": \"https://raw.githubusercontent.com/e-mission/nrel-openpath-deploy-configs/main/survey_resources/washingtoncommons/washingtoncommons-trip-survey-v2.json\",\n",
-    "            \"version\": 1.2,\n",
-    "            \"compatibleWith\": 1,\n",
-    "            \"dataKey\": \"manual/trip_user_input\",\n",
-    "            \"labelVars\": {\n",
-    "              \"modes\": {\n",
-    "                \"key\": \"What_was_the_main_pu_f_this_trip_trip_leg\",\n",
-    "                \"type\": \"length\"\n",
-    "              },\n",
-    "              \"purposes\": {\n",
-    "                \"key\": \"_2_What_was_the_mode_of_transp\",\n",
-    "                \"type\": \"length\"\n",
-    "              }\n",
-    "            },\n",
-    "            \"labelTemplate\": {\n",
-    "              \"en\": \"{ purposes, plural, =0 {No purposes} one {1 purpose} other {# purposes} }, { modes, plural, =0 {No modes} one {1 mode} other {# modes} }\",\n",
-    "              \"es\": \"{ purposes, plural, =0 {No propósitos} one {1 propósito} other {# propósitos} }, { modes, plural, =0 {No modos} one {1 modo} other {# modos} }\"\n",
-    "            }\n",
-    "          }\n",
-    "      },\n",
-    "      \"trip-labels\": \"ENKETO\"\n",
-    "    }"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "bf0b2f08",
+   "id": "b18bc854",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -111,9 +84,9 @@
     "    sheet_list = []\n",
     "    for name in survey_list:\n",
     "        form_path = survey_info['surveys'][name]['formPath']\n",
-    "        #THIS ASSUMES THE FILENAME IS THE SAME AS THE FORM PATH BUT WITH XLSX FILE TYPE\n",
+    "        #NOTE: assumes the XML form is at the same path as formPath, with only the file extension replaced by 'xml'\n",
     "        l_path = form_path.split('.')\n",
-    "        l_path[-1] = 'xlsx'\n",
+    "        l_path[-1] = 'xml'\n",
     "        s = '.'\n",
     "        sheet_path = s.join(l_path)\n",
     "        sheet_list.append(sheet_path)\n",
@@ -127,29 +100,61 @@
     "    quest_dict = {}\n",
     "    \n",
     "    for url in sheet_list:\n",
-    "        result = urllib.request.urlopen(url).read()\n",
-    "        xls = pd.ExcelFile(result)\n",
-    "        questions = pd.read_excel(xls, 'survey')\n",
-    "        options = pd.read_excel(xls, 'choices')\n",
+    "        result = urllib.request.urlopen(url)\n",
+    "        doc = minidom.parse(result) \n",
     "\n",
-    "        opt_dict.update(dict(zip(options.name, options.label)))\n",
-    "        quest_dict.update(dict(zip(questions.name, questions.label)))\n",
+    "        labels = doc.getElementsByTagName(\"label\") \n",
+    "        for label in labels:\n",
+    "            if(bool(label.parentNode.getAttribute(\"ref\"))):\n",
+    "                print(label.parentNode.getAttribute(\"ref\").split('/')[-1])\n",
+    "                print(label.firstChild.data)\n",
+    "                \n",
+    "                quest_dict[str(label.parentNode.getAttribute(\"ref\").split('/')[-1])] = label.firstChild.data\n",
+    "    \n",
+    "    return opt_dict, quest_dict\n",
+    "\n",
+    "def build_dictionaries(url_list):\n",
+    "    opt_dict = {}\n",
+    "    quest_dict = {}\n",
+    "    \n",
+    "    for url in sheet_list:\n",
+    "        result = urllib.request.urlopen(url)\n",
+    "        tree = ET.parse(result)\n",
+    "        root = tree.getroot()\n",
+    "        \n",
+    "        print(root.findall(\".\"))\n",
+    "    \n",
+    "        for child in root:\n",
+    "            print(child.tag, child.attrib)\n",
+    "            \n",
+    "        print(root.findall(\".//label\"))\n",
     "    \n",
     "    return opt_dict, quest_dict\n",
     "\n",
+    "\n",
     "#input: dataframe containing all trips that have non-blank user_input\n",
     "#output: dataframe with questions in the columns and answers in the rows\n",
     "#for loop will run n survey responses times (this could get big!)\n",
     "def create_dataframe(df_trips_w_surveys):\n",
     "    df = df_trips_w_surveys.reset_index()\n",
     "    rows = []\n",
     "    for i in range(len(df)):\n",
-    "        row = pd.json_normalize(df.loc[i].user_input['trip_user_input']['data']['jsonDocResponse']['data'])\n",
+    "        data_key = list(df.loc[i].user_input['trip_user_input']['data']['jsonDocResponse'].keys())[0]\n",
+    "        row = pd.json_normalize(df.loc[i].user_input['trip_user_input']['data']['jsonDocResponse'][data_key])\n",
     "        rows.append(row)\n",
-    "\n",
     "    df = pd.concat(rows)\n",
+    "#     print(df.head())\n",
+    "    \n",
+    "    rename_nests = {}\n",
+    "    for col in df.columns:\n",
+    "        rename_nests[col] = col.split('.')[-1]\n",
+    "    \n",
+    "    print(rename_nests)\n",
+    "    df = df.rename(columns=rename_nests)\n",
+    "    \n",
     "    #drop the non-question columns, should leave behind all the questions\n",
-    "    df = df.drop(columns = ['end', 'start', 'attrid', 'attrxmlns:orx', 'attrxmlns:orx', 'attrxmlns:jr', 'meta.instanceID', 'meta.deprecatedID'])\n",
+    "    #need to do this better, won't always be the same\n",
+    "    df = df.drop(columns = ['end', 'start', 'attrid', 'attrxmlns:orx', 'attrxmlns:orx', 'attrxmlns:jr', 'instanceID'])\n",
     "\n",
     "    return df\n",
     "\n",
@@ -182,7 +187,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "4b9db890",
+   "id": "3b355efd",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -197,7 +202,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "caeb880b",
+   "id": "854b3070",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -206,7 +211,6 @@
     "all_confirmed_trips = scaffolding.load_all_confirmed_trips(tq)\n",
     "#remove blank inputs\n",
     "survey_trips = all_confirmed_trips[all_confirmed_trips['user_input'] != {}]\n",
-    "print(len(survey_trips))\n",
     "\n",
     "#survey counts df\n",
     "survey_trips = survey_trips.reset_index()\n",
@@ -223,46 +227,51 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "c00da0a7",
+   "id": "7fe65f88",
    "metadata": {},
    "outputs": [],
    "source": [
+    "\n",
+    "\n",
     "#create translation dictionaries\n",
     "opt_dict, quest_dict = build_dictionaries(sheet_list)\n",
+    "print(opt_dict)\n",
+    "print(quest_dict)\n",
     "\n",
     "#format survey trips into responses dataframe\n",
-    "df_responses = create_dataframe(survey_trips)\n",
-    "\n",
-    "#replace questions\n",
-    "df_responses = df_responses.rename(columns = quest_dict)"
+    "df_responses = create_dataframe(survey_trips)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "08d04b39",
+   "id": "7bcebeef",
    "metadata": {
     "scrolled": false
    },
    "outputs": [],
    "source": [
+    "file_suffix = scaffolding.get_file_suffix(year, month, program)\n",
+    "\n",
     "#create one plot per question\n",
     "for col in df_responses.columns:\n",
+    "    filename = col + file_suffix\n",
+    "    print(filename)\n",
+    "    \n",
     "    quest_frame = df_responses.copy()\n",
     "    quest_frame = quest_frame[quest_frame[col] != \"\"] #could have blank responses for non-mandatory ?s\n",
-    "    labels = traslate_options(quest_frame[col].value_counts(dropna=True).keys().tolist(), opt_dict)\n",
+    "    labels = quest_frame[col].value_counts(dropna=True).keys().tolist()\n",
+    "#     labels = traslate_options(quest_frame[col].value_counts(dropna=True).keys().tolist(), opt_dict)\n",
     "    values = quest_frame[col].value_counts(dropna=True).tolist()\n",
+    "    \n",
+    "    try:\n",
+    "        label = quest_dict[col]\n",
+    "    except:\n",
+    "        label = col\n",
     "     \n",
-    "    pie_chart_purpose(col+'\\n'+qual_text, labels, values, \"howdy\")"
+    "    #if other is 0 don't display it :)\n",
+    "    pie_chart_purpose(label+'\\n'+qual_text, labels, values, filename)"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "132e00a1",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {