diff --git a/.DS_Store b/.DS_Store index 1746666..7cc41a7 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/predicting-poverty-education-replication/.DS_Store b/predicting-poverty-education-replication/.DS_Store new file mode 100644 index 0000000..e3223d1 Binary files /dev/null and b/predicting-poverty-education-replication/.DS_Store differ diff --git a/predicting-poverty-education-replication/LICENSE b/predicting-poverty-education-replication/LICENSE new file mode 100644 index 0000000..fbad6f9 --- /dev/null +++ b/predicting-poverty-education-replication/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 Jatin Mathur + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/predicting-poverty-education-replication/activation_maps/guided_backprop.py b/predicting-poverty-education-replication/activation_maps/guided_backprop.py new file mode 100644 index 0000000..dd808f3 --- /dev/null +++ b/predicting-poverty-education-replication/activation_maps/guided_backprop.py @@ -0,0 +1,73 @@ +""" +Taken directly from https://github.com/utkuozbulak/pytorch-cnn-visualizations +""" + +import torch +from torch.nn import ReLU + +class GuidedBackprop(): + """ + Produces gradients generated with guided back propagation from the given image + """ + def __init__(self, model): + self.model = model + self.gradients = None + self.forward_relu_outputs = [] + # Put model in evaluation mode + self.model.eval() + self.update_relus() + self.hook_layers() + + def hook_layers(self): + def hook_function(module, grad_in, grad_out): + self.gradients = grad_in[0] + # Register hook to the first layer + first_layer = list(self.model.features._modules.items())[0][1] + first_layer.register_backward_hook(hook_function) + + def update_relus(self): + """ + Updates relu activation functions so that + 1- stores output in forward pass + 2- imputes zero for gradient values that are less than zero + """ + def relu_backward_hook_function(module, grad_in, grad_out): + """ + If there is a negative gradient, change it to zero + """ + # Get last forward output + corresponding_forward_output = self.forward_relu_outputs[-1] + corresponding_forward_output[corresponding_forward_output > 0] = 1 + modified_grad_out = corresponding_forward_output * torch.clamp(grad_in[0], min=0.0) + del self.forward_relu_outputs[-1] # Remove last forward output + return (modified_grad_out,) + + def relu_forward_hook_function(module, ten_in, ten_out): + """ + Store results of forward pass + """ + self.forward_relu_outputs.append(ten_out) + + # Loop through layers, hook up ReLUs + for pos, module in self.model.features._modules.items(): + if isinstance(module, ReLU): + 
module.register_backward_hook(relu_backward_hook_function) + module.register_forward_hook(relu_forward_hook_function) + + def generate_gradients(self, input_image, target_class): + self.model.zero_grad() + + # Forward pass + model_output = self.model(input_image) + # Zero gradients + self.model.zero_grad() + # Target for backprop + one_hot_output = torch.FloatTensor(1, model_output.size()[-1]).zero_() + one_hot_output[0][target_class] = 1 + # Backward pass + model_output.backward(gradient=one_hot_output) + # Convert Pytorch variable to numpy array + # [0] to get rid of the first channel (1,3,224,224) + gradients_as_arr = self.gradients.data.numpy()[0] + return gradients_as_arr + \ No newline at end of file diff --git a/predicting-poverty-education-replication/activation_maps/images/bad_image.png b/predicting-poverty-education-replication/activation_maps/images/bad_image.png new file mode 100644 index 0000000..ab96ee7 Binary files /dev/null and b/predicting-poverty-education-replication/activation_maps/images/bad_image.png differ diff --git a/predicting-poverty-education-replication/activation_maps/images/roads.png b/predicting-poverty-education-replication/activation_maps/images/roads.png new file mode 100644 index 0000000..2baa43d Binary files /dev/null and b/predicting-poverty-education-replication/activation_maps/images/roads.png differ diff --git a/predicting-poverty-education-replication/activation_maps/images/water.png b/predicting-poverty-education-replication/activation_maps/images/water.png new file mode 100644 index 0000000..0d4bdb4 Binary files /dev/null and b/predicting-poverty-education-replication/activation_maps/images/water.png differ diff --git a/predicting-poverty-education-replication/activation_maps/visualization_utils.py b/predicting-poverty-education-replication/activation_maps/visualization_utils.py new file mode 100644 index 0000000..72c49e0 --- /dev/null +++ b/predicting-poverty-education-replication/activation_maps/visualization_utils.py 
@@ -0,0 +1,66 @@ +""" +Taken directly from https://github.com/utkuozbulak/pytorch-cnn-visualizations +""" + +import os +import copy +import numpy as np +from PIL import Image +import matplotlib.cm as mpl_color_map + +import torch +from torch.autograd import Variable +from torchvision import models + + +def preprocess_image(pil_im, resize_im=True): + """ + Processes image for CNNs + + Args: + PIL_img (PIL_img): Image to process + resize_im (bool): Resize to 224 or not + returns: + im_as_var (torch variable): Variable that contains processed float tensor + """ + # mean and std list for channels (Imagenet) + mean = [0.485, 0.456, 0.406] + std = [0.229, 0.224, 0.225] + # Resize image + if resize_im: + pil_im.thumbnail((224, 224)) + im_as_arr = np.float32(pil_im) + im_as_arr = im_as_arr.transpose(2, 0, 1) # Convert array to D,W,H + # Normalize the channels + for channel, _ in enumerate(im_as_arr): + im_as_arr[channel] /= 255 + im_as_arr[channel] -= mean[channel] + im_as_arr[channel] /= std[channel] + # Convert to float tensor + im_as_ten = torch.from_numpy(im_as_arr).float() + # Add one more channel to the beginning. 
Tensor shape = 1,3,224,224 + im_as_ten.unsqueeze_(0) + # Convert to Pytorch variable + im_as_var = Variable(im_as_ten, requires_grad=True) + return im_as_var + + +def convert_to_grayscale(im_as_arr): + """ + Converts 3d image to grayscale + + Args: + im_as_arr (numpy arr): RGB image with shape (D,W,H) + + returns: + grayscale_im (numpy_arr): Grayscale image with shape (1,W,D) + """ + grayscale_im = np.sum(np.abs(im_as_arr), axis=0) + im_max = np.percentile(grayscale_im, 99) + im_min = np.min(grayscale_im) + grayscale_im = (np.clip((grayscale_im - im_min) / (im_max - im_min), 0, 1)) + grayscale_im = np.expand_dims(grayscale_im, axis=0) + return grayscale_im + + + diff --git a/predicting-poverty-education-replication/activation_maps/visualize_cnn.ipynb b/predicting-poverty-education-replication/activation_maps/visualize_cnn.ipynb new file mode 100644 index 0000000..65e56f7 --- /dev/null +++ b/predicting-poverty-education-replication/activation_maps/visualize_cnn.ipynb @@ -0,0 +1,199 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you would like to investigate the CNN in different ways, use https://github.com/utkuozbulak/pytorch-cnn-visualizations as a reference." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from PIL import Image\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import os\n", + "\n", + "from visualization_utils import preprocess_image, convert_to_grayscale\n", + "from guided_backprop import GuidedBackprop" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "BASE_DIR = '..'\n", + "COUNTRY = 'malawi_2016'\n", + "CNN_SAVE_DIR = os.path.join(BASE_DIR, 'models', 'trained_model.pt')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "model = torch.load(CNN_SAVE_DIR, map_location=torch.device('cpu'))\n", + "model = model.eval()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['water.png',\n", + " 'bad_image.png',\n", + " '2',\n", + " '1',\n", + " '.ipynb_checkpoints',\n", + " 'roads.png',\n", + " '3']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "os.listdir('images/')" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "choice = 'roads.png'\n", + "image_dir = f'images/{choice}'\n", + "image = Image.open(image_dir).convert('RGB')\n", + "image" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "prediction: tensor([[ 1.7050, 1.5137, -3.3119]], grad_fn=)\n" + ] + } + ], + "source": [ + "proc_image = preprocess_image(image)\n", + "\n", + "# prediction by model\n", + "preds = model(proc_image)\n", + "print('prediction:', preds)\n", 
+ "target = torch.argmax(preds).item()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "GBP = GuidedBackprop(model)\n", + "guided_grads = GBP.generate_gradients(proc_image, target)\n", + "grayscale_guided_grads = convert_to_grayscale(guided_grads)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "img = np.squeeze(grayscale_guided_grads.transpose(1, 2, 0))\n", + "fig = plt.figure(frameon=False)\n", + "ax = plt.Axes(fig, [0., 0., 1., 1.])\n", + "ax.set_axis_off()\n", + "fig.add_axes(ax)\n", + "ax.imshow(img, cmap='gray', vmin=0, vmax=1)\n", + "fig.savefig('out.png', bbox_inches='tight', pad_inches=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "environment": { + "name": "pytorch-gpu.1-4.m46", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-4:m46" + }, + "kernelspec": { + "display_name": "predicting-poverty-replication", + "language": "python", + "name": "predicting-poverty-replication" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/predicting-poverty-education-replication/figures/activations1.png b/predicting-poverty-education-replication/figures/activations1.png new file mode 100644 index 0000000..3d31627 Binary files /dev/null and b/predicting-poverty-education-replication/figures/activations1.png differ diff --git a/predicting-poverty-education-replication/figures/activations2.png b/predicting-poverty-education-replication/figures/activations2.png new file mode 100644 index 0000000..582e161 Binary files /dev/null and b/predicting-poverty-education-replication/figures/activations2.png differ diff --git a/predicting-poverty-education-replication/figures/activations3.png b/predicting-poverty-education-replication/figures/activations3.png 
new file mode 100644 index 0000000..7d10d6c Binary files /dev/null and b/predicting-poverty-education-replication/figures/activations3.png differ diff --git a/predicting-poverty-education-replication/figures/ethiopia_results.png b/predicting-poverty-education-replication/figures/ethiopia_results.png new file mode 100644 index 0000000..03c4398 Binary files /dev/null and b/predicting-poverty-education-replication/figures/ethiopia_results.png differ diff --git a/predicting-poverty-education-replication/figures/img1.png b/predicting-poverty-education-replication/figures/img1.png new file mode 100644 index 0000000..2baa43d Binary files /dev/null and b/predicting-poverty-education-replication/figures/img1.png differ diff --git a/predicting-poverty-education-replication/figures/img2.png b/predicting-poverty-education-replication/figures/img2.png new file mode 100644 index 0000000..0d4bdb4 Binary files /dev/null and b/predicting-poverty-education-replication/figures/img2.png differ diff --git a/predicting-poverty-education-replication/figures/img3.png b/predicting-poverty-education-replication/figures/img3.png new file mode 100644 index 0000000..ab96ee7 Binary files /dev/null and b/predicting-poverty-education-replication/figures/img3.png differ diff --git a/predicting-poverty-education-replication/figures/malawi_results.png b/predicting-poverty-education-replication/figures/malawi_results.png new file mode 100644 index 0000000..95cdaa7 Binary files /dev/null and b/predicting-poverty-education-replication/figures/malawi_results.png differ diff --git a/predicting-poverty-education-replication/figures/nigeria_results.png b/predicting-poverty-education-replication/figures/nigeria_results.png new file mode 100644 index 0000000..0217dc8 Binary files /dev/null and b/predicting-poverty-education-replication/figures/nigeria_results.png differ diff --git a/predicting-poverty-education-replication/gold_standard/remote_features_survey_model.ipynb 
b/predicting-poverty-education-replication/gold_standard/remote_features_survey_model.ipynb new file mode 100644 index 0000000..9009215 --- /dev/null +++ b/predicting-poverty-education-replication/gold_standard/remote_features_survey_model.ipynb @@ -0,0 +1,1191 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This file grabs everything from the LSMS survey that I think an image could possibly recognize and uses those features to predict consumption. This serves as a \"gold standard\" for any image-based model. Of course, this is no indication of an \"upper bound\" on CNN performance, but rather offers some means of comparison to a model that was built using only survey data.\n", + "\n", + "I only implement this gold standard for Malawi, but it could be done for the other countries." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "BASE_DIR = '..'\n", + "\n", + "RANDOM_SEED = 7 # for reproducibility\n", + "COUNTRIES_DIR = os.path.join(BASE_DIR, 'data', 'countries')\n", + "RESULTS_DIR = os.path.join(BASE_DIR, 'results')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(BASE_DIR)\n", + "from utils import merge_on_lat_lon, assign_groups, run_randomized_cv, run_spatial_cv" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def process_malawi():\n", + " np.random.seed(RANDOM_SEED)\n", + " lsms_dir = os.path.join(COUNTRIES_DIR, 'malawi_2016', 'LSMS')\n", + " consumption_file = 'IHS4 Consumption Aggregate.csv'\n", + " consumption_ph_col = 'rexpagg' # per household\n", + " hhsize_col = 'hhsize' # people in household\n", + "\n", + " geovariables_file 
= 'HouseholdGeovariables_csv/HouseholdGeovariablesIHS4.csv'\n", + " lat_col = 'lat_modified'\n", + " lon_col = 'lon_modified'\n", + "\n", + " # purchasing power parity for malawi in 2016 (https://data.worldbank.org/indicator/PA.NUS.PRVT.PP?locations=MW)\n", + " ppp = 215.182\n", + " \n", + " df_geo = pd.read_csv(os.path.join(lsms_dir, geovariables_file))\n", + " df_hhf = pd.read_csv(os.path.join(lsms_dir, 'hh_mod_f.csv'))\n", + " df_plot = pd.read_csv(os.path.join(lsms_dir, 'plotgeovariablesihs4.csv'))\n", + " df_com = pd.read_csv(os.path.join(lsms_dir, 'com_cd.csv'))\n", + " df_com2 = pd.read_csv(os.path.join(lsms_dir, 'com_cf1.csv'))\n", + " df_tie = pd.read_csv(os.path.join(lsms_dir, consumption_file))[['case_id', 'ea_id']]\n", + "\n", + " hhf_input = df_hhf[['case_id', 'hh_f10', 'hh_f08']]\n", + " com_input = df_com[['ea_id', 'com_cd01', 'com_cd16', 'com_cd18a', 'com_cd20a', 'com_cd22a', 'com_cd24a',\n", + " 'com_cd27a', 'com_cd36a', 'com_cd40a', 'com_cd49a', 'com_cd51a', 'com_cd60a', 'com_cd67a',\n", + " 'com_cd69a']]\n", + "\n", + " com2_input = df_com2[['ea_id', 'com_cf08a']]\n", + "\n", + " geo_input = df_geo[['case_id', 'dist_admarc', 'dist_agmrkt', 'dist_auction', 'dist_boma', 'dist_borderpost',\n", + " 'dist_popcenter', 'dist_road', 'af_bio_1', 'af_bio_8', 'af_bio_12', 'af_bio_13', 'af_bio_16', \n", + " 'lat_modified', 'lon_modified']]\n", + " geo_input.rename(columns={'lat_modified': 'cluster_lat', 'lon_modified': 'cluster_lon'}, inplace=True)\n", + " geo_input.dropna(inplace=True)\n", + "\n", + " plot_input = df_plot[['case_id', 'dist_hh']]\n", + " \n", + " df_cons = pd.read_csv(os.path.join(COUNTRIES_DIR, 'malawi_2016', 'processed', 'clusters.csv'))\n", + " df_merge = merge_on_lat_lon(df_cons, geo_input)\n", + " df_merge = pd.merge(df_merge, hhf_input, on='case_id', how='left')\n", + " df_merge = pd.merge(df_merge, df_tie, on='case_id', how='left')\n", + " df_merge = pd.merge(df_merge, com_input, on='ea_id', how='left')\n", + " df_merge = 
pd.merge(df_merge, com2_input, on='ea_id', how='left')\n", + " df_merge = pd.merge(df_merge, plot_input, on='case_id', how='left')\n", + " return df_merge.drop(['case_id', 'ea_id'], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/envs/predicting-poverty-replication/lib/python3.7/site-packages/pandas/core/frame.py:4133: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " errors=errors,\n", + "/home/jupyter/.local/lib/python3.7/site-packages/ipykernel_launcher.py:33: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n" + ] + } + ], + "source": [ + "df_mw = process_malawi()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countrycluster_latcluster_loncons_pcnightlightsdist_admarcdist_agmrktdist_auctiondist_bomadist_borderpost...com_cd27acom_cd36acom_cd40acom_cd49acom_cd51acom_cd60acom_cd67acom_cd69acom_cf08adist_hh
0mw-17.0951535.2172131.4232390.0252061.021.0145.021.04.0...NaNNaNNaNNaNNaNNaNNaNNaNNaN1.2
1mw-17.0951535.2172131.4232390.0252062.020.0145.020.04.0...NaNNaNNaNNaNNaNNaNNaNNaNNaN1.0
2mw-17.0951535.2172131.4232390.0252062.020.0145.020.04.0...NaNNaNNaNNaNNaNNaNNaNNaNNaN1.7
3mw-17.0951535.2172131.4232390.0252062.020.0145.020.04.0...NaNNaNNaNNaNNaNNaNNaNNaNNaN1.7
4mw-17.0951535.2172131.4232390.0252062.020.0145.020.05.0...NaNNaNNaNNaNNaNNaNNaNNaNNaN0.9
\n", + "

5 rows × 35 columns

\n", + "
" + ], + "text/plain": [ + " country cluster_lat cluster_lon cons_pc nightlights dist_admarc \\\n", + "0 mw -17.09515 35.217213 1.423239 0.025206 1.0 \n", + "1 mw -17.09515 35.217213 1.423239 0.025206 2.0 \n", + "2 mw -17.09515 35.217213 1.423239 0.025206 2.0 \n", + "3 mw -17.09515 35.217213 1.423239 0.025206 2.0 \n", + "4 mw -17.09515 35.217213 1.423239 0.025206 2.0 \n", + "\n", + " dist_agmrkt dist_auction dist_boma dist_borderpost ... com_cd27a \\\n", + "0 21.0 145.0 21.0 4.0 ... NaN \n", + "1 20.0 145.0 20.0 4.0 ... NaN \n", + "2 20.0 145.0 20.0 4.0 ... NaN \n", + "3 20.0 145.0 20.0 4.0 ... NaN \n", + "4 20.0 145.0 20.0 5.0 ... NaN \n", + "\n", + " com_cd36a com_cd40a com_cd49a com_cd51a com_cd60a com_cd67a \\\n", + "0 NaN NaN NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN NaN NaN \n", + "\n", + " com_cd69a com_cf08a dist_hh \n", + "0 NaN NaN 1.2 \n", + "1 NaN NaN 1.0 \n", + "2 NaN NaN 1.7 \n", + "3 NaN NaN 1.7 \n", + "4 NaN NaN 0.9 \n", + "\n", + "[5 rows x 35 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_mw.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(19865, 35)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_mw.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "df_use = pd.get_dummies(df_mw.drop(['country'], axis=1))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "clusters = df_use.groupby(['cluster_lat', 'cluster_lon'])" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "cluster_df = clusters.mean().reset_index()" + ] + }, + { + 
"cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cluster_latcluster_loncons_pcnightlightsdist_admarcdist_agmrktdist_auctiondist_bomadist_borderpostdist_popcenter...com_cd27acom_cd36acom_cd40acom_cd49acom_cd51acom_cd60acom_cd67acom_cd69acom_cf08adist_hh
0-17.09515035.2172131.4232390.0252061.50000020.125000145.00000020.1250004.12500020.125000...NaNNaNNaNNaNNaNNaNNaNNaNNaN1.510000
1-17.09235135.1146431.2662040.0000008.10526325.578947146.36842125.57894710.10526325.578947...4.06.02.0NaN6.06.045.045.06.00.492308
2-17.01669835.0796291.5668700.00000015.76190523.047619134.85714323.04761921.52381023.047619...0.060.030.0NaN15.060.060.060.045.00.311765
3-16.97724335.2057061.6692450.0082666.97058811.764706135.76470611.76470613.50000011.764706...1.015.03.0NaN3.03.015.015.015.02.594118
4-16.95638535.1689671.0898910.00229513.00000013.681818130.18181813.68181820.63636413.681818...500.040.015.0NaN15.015.040.040.040.00.122222
..................................................................
775-9.59137833.0574501.4099320.0000007.66666726.222222235.27777826.2222225.944444103.666667...1.03.03.02.0NaN3.030.030.05.00.022222
776-9.55039733.2915581.2428010.00000010.18518518.370370228.74074118.37037017.48148182.481481...0.021.00.0NaN7.021.021.021.021.00.107692
777-9.51923033.1391931.8041220.0035575.05714324.971429238.40000024.97142917.42857198.828571...0.041.08.0NaN8.08.041.0NaN8.01.681818
778-9.50753833.2596491.7917250.0000004.46511621.604651234.20930221.60465118.44186090.000000...0.05.05.050.0NaN50.050.050.020.00.319512
779-9.42966733.0221181.5347020.00044818.78947441.789474253.52631641.7894746.210526116.473684...200.07.07.07.0NaN5.030.030.07.02.988235
\n", + "

780 rows × 34 columns

\n", + "
" + ], + "text/plain": [ + " cluster_lat cluster_lon cons_pc nightlights dist_admarc \\\n", + "0 -17.095150 35.217213 1.423239 0.025206 1.500000 \n", + "1 -17.092351 35.114643 1.266204 0.000000 8.105263 \n", + "2 -17.016698 35.079629 1.566870 0.000000 15.761905 \n", + "3 -16.977243 35.205706 1.669245 0.008266 6.970588 \n", + "4 -16.956385 35.168967 1.089891 0.002295 13.000000 \n", + ".. ... ... ... ... ... \n", + "775 -9.591378 33.057450 1.409932 0.000000 7.666667 \n", + "776 -9.550397 33.291558 1.242801 0.000000 10.185185 \n", + "777 -9.519230 33.139193 1.804122 0.003557 5.057143 \n", + "778 -9.507538 33.259649 1.791725 0.000000 4.465116 \n", + "779 -9.429667 33.022118 1.534702 0.000448 18.789474 \n", + "\n", + " dist_agmrkt dist_auction dist_boma dist_borderpost dist_popcenter \\\n", + "0 20.125000 145.000000 20.125000 4.125000 20.125000 \n", + "1 25.578947 146.368421 25.578947 10.105263 25.578947 \n", + "2 23.047619 134.857143 23.047619 21.523810 23.047619 \n", + "3 11.764706 135.764706 11.764706 13.500000 11.764706 \n", + "4 13.681818 130.181818 13.681818 20.636364 13.681818 \n", + ".. ... ... ... ... ... \n", + "775 26.222222 235.277778 26.222222 5.944444 103.666667 \n", + "776 18.370370 228.740741 18.370370 17.481481 82.481481 \n", + "777 24.971429 238.400000 24.971429 17.428571 98.828571 \n", + "778 21.604651 234.209302 21.604651 18.441860 90.000000 \n", + "779 41.789474 253.526316 41.789474 6.210526 116.473684 \n", + "\n", + " ... com_cd27a com_cd36a com_cd40a com_cd49a com_cd51a com_cd60a \\\n", + "0 ... NaN NaN NaN NaN NaN NaN \n", + "1 ... 4.0 6.0 2.0 NaN 6.0 6.0 \n", + "2 ... 0.0 60.0 30.0 NaN 15.0 60.0 \n", + "3 ... 1.0 15.0 3.0 NaN 3.0 3.0 \n", + "4 ... 500.0 40.0 15.0 NaN 15.0 15.0 \n", + ".. ... ... ... ... ... ... ... \n", + "775 ... 1.0 3.0 3.0 2.0 NaN 3.0 \n", + "776 ... 0.0 21.0 0.0 NaN 7.0 21.0 \n", + "777 ... 0.0 41.0 8.0 NaN 8.0 8.0 \n", + "778 ... 0.0 5.0 5.0 50.0 NaN 50.0 \n", + "779 ... 
200.0 7.0 7.0 7.0 NaN 5.0 \n", + "\n", + " com_cd67a com_cd69a com_cf08a dist_hh \n", + "0 NaN NaN NaN 1.510000 \n", + "1 45.0 45.0 6.0 0.492308 \n", + "2 60.0 60.0 45.0 0.311765 \n", + "3 15.0 15.0 15.0 2.594118 \n", + "4 40.0 40.0 40.0 0.122222 \n", + ".. ... ... ... ... \n", + "775 30.0 30.0 5.0 0.022222 \n", + "776 21.0 21.0 21.0 0.107692 \n", + "777 41.0 NaN 8.0 1.681818 \n", + "778 50.0 50.0 20.0 0.319512 \n", + "779 30.0 30.0 7.0 2.988235 \n", + "\n", + "[780 rows x 34 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cluster_df" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "com_cd01 0.032051\n", + "com_cd16 0.379487\n", + "com_cd18a 0.350000\n", + "com_cd20a 0.257692\n", + "com_cd22a 0.151282\n", + "com_cd24a 0.191026\n", + "com_cd27a 0.032051\n", + "com_cd36a 0.032051\n", + "com_cd40a 0.032051\n", + "com_cd49a 0.603846\n", + "com_cd51a 0.288462\n", + "com_cd60a 0.032051\n", + "com_cd67a 0.078205\n", + "com_cd69a 0.214103\n", + "com_cf08a 0.434615\n", + "dist_hh 0.042308\n", + "dtype: float64" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# a few columns have a high percentage of NA\n", + "nas = cluster_df.isna().sum() / len(cluster_df)\n", + "nas[nas > 0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Modeling" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "def nan_handler(df):\n", + " nas = df.isna().sum()\n", + " for c in df:\n", + " if nas[c] > 0:\n", + " df[c] = df[c].fillna(df[c].median())\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "cleaned_df = nan_handler(cluster_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 
15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cluster_latcluster_loncons_pcnightlightsdist_admarcdist_agmrktdist_auctiondist_bomadist_borderpostdist_popcenter...com_cd27acom_cd36acom_cd40acom_cd49acom_cd51acom_cd60acom_cd67acom_cd69acom_cf08adist_hh
0-17.09515035.2172131.4232390.0252061.50000020.125000145.00000020.1250004.12500020.125000...2.012.05.010.06.010.020.025.07.01.510000
1-17.09235135.1146431.2662040.0000008.10526325.578947146.36842125.57894710.10526325.578947...4.06.02.010.06.06.045.045.06.00.492308
2-17.01669835.0796291.5668700.00000015.76190523.047619134.85714323.04761921.52381023.047619...0.060.030.010.015.060.060.060.045.00.311765
3-16.97724335.2057061.6692450.0082666.97058811.764706135.76470611.76470613.50000011.764706...1.015.03.010.03.03.015.015.015.02.594118
4-16.95638535.1689671.0898910.00229513.00000013.681818130.18181813.68181820.63636413.681818...500.040.015.010.015.015.040.040.040.00.122222
\n", + "

5 rows × 34 columns

\n", + "
" + ], + "text/plain": [ + " cluster_lat cluster_lon cons_pc nightlights dist_admarc dist_agmrkt \\\n", + "0 -17.095150 35.217213 1.423239 0.025206 1.500000 20.125000 \n", + "1 -17.092351 35.114643 1.266204 0.000000 8.105263 25.578947 \n", + "2 -17.016698 35.079629 1.566870 0.000000 15.761905 23.047619 \n", + "3 -16.977243 35.205706 1.669245 0.008266 6.970588 11.764706 \n", + "4 -16.956385 35.168967 1.089891 0.002295 13.000000 13.681818 \n", + "\n", + " dist_auction dist_boma dist_borderpost dist_popcenter ... com_cd27a \\\n", + "0 145.000000 20.125000 4.125000 20.125000 ... 2.0 \n", + "1 146.368421 25.578947 10.105263 25.578947 ... 4.0 \n", + "2 134.857143 23.047619 21.523810 23.047619 ... 0.0 \n", + "3 135.764706 11.764706 13.500000 11.764706 ... 1.0 \n", + "4 130.181818 13.681818 20.636364 13.681818 ... 500.0 \n", + "\n", + " com_cd36a com_cd40a com_cd49a com_cd51a com_cd60a com_cd67a \\\n", + "0 12.0 5.0 10.0 6.0 10.0 20.0 \n", + "1 6.0 2.0 10.0 6.0 6.0 45.0 \n", + "2 60.0 30.0 10.0 15.0 60.0 60.0 \n", + "3 15.0 3.0 10.0 3.0 3.0 15.0 \n", + "4 40.0 15.0 10.0 15.0 15.0 40.0 \n", + "\n", + " com_cd69a com_cf08a dist_hh \n", + "0 25.0 7.0 1.510000 \n", + "1 45.0 6.0 0.492308 \n", + "2 60.0 45.0 0.311765 \n", + "3 15.0 15.0 2.594118 \n", + "4 40.0 40.0 0.122222 \n", + "\n", + "[5 rows x 34 columns]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cleaned_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "y = cleaned_df['cons_pc'].values\n", + "\n", + "to_drop = ['cluster_lat', 'cluster_lon', 'cons_pc', 'nightlights']\n", + "x = cleaned_df.drop(to_drop, axis=1).values" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "r2, _ = run_randomized_cv(x, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/plain": [ + "0.08803164394491252" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "r2" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "r2, _ = run_randomized_cv(x, np.log(y))" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.46636052584664556" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "r2" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "groups, _ = assign_groups(cleaned_df, 5)\n", + "r2, _ = run_spatial_cv(x, y, groups)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "-1.2040690308579176" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "r2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "environment": { + "name": "pytorch-gpu.1-4.m46", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-4:m46" + }, + "kernelspec": { + "display_name": "predicting-poverty-replication", + "language": "python", + "name": "predicting-poverty-replication" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/predicting-poverty-education-replication/papers/.DS_Store b/predicting-poverty-education-replication/papers/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and 
b/predicting-poverty-education-replication/papers/.DS_Store differ diff --git a/predicting-poverty-education-replication/papers/Satellite imagery and ML to predict poverty.pdf b/predicting-poverty-education-replication/papers/Satellite imagery and ML to predict poverty.pdf new file mode 100644 index 0000000..c6ef14e Binary files /dev/null and b/predicting-poverty-education-replication/papers/Satellite imagery and ML to predict poverty.pdf differ diff --git a/predicting-poverty-education-replication/papers/Supplementary Info Paper.pdf b/predicting-poverty-education-replication/papers/Supplementary Info Paper.pdf new file mode 100644 index 0000000..b2a05b4 Binary files /dev/null and b/predicting-poverty-education-replication/papers/Supplementary Info Paper.pdf differ diff --git a/predicting-poverty-education-replication/papers/Transfer Learning Paper.pdf b/predicting-poverty-education-replication/papers/Transfer Learning Paper.pdf new file mode 100644 index 0000000..fee5682 Binary files /dev/null and b/predicting-poverty-education-replication/papers/Transfer Learning Paper.pdf differ diff --git a/predicting-poverty-education-replication/papers/aaai16.pdf b/predicting-poverty-education-replication/papers/aaai16.pdf new file mode 100644 index 0000000..fee5682 Binary files /dev/null and b/predicting-poverty-education-replication/papers/aaai16.pdf differ diff --git a/predicting-poverty-education-replication/papers/jean_et_al.pdf b/predicting-poverty-education-replication/papers/jean_et_al.pdf new file mode 100644 index 0000000..0fa7284 Binary files /dev/null and b/predicting-poverty-education-replication/papers/jean_et_al.pdf differ diff --git a/predicting-poverty-education-replication/requirements.txt b/predicting-poverty-education-replication/requirements.txt new file mode 100644 index 0000000..fc8142c --- /dev/null +++ b/predicting-poverty-education-replication/requirements.txt @@ -0,0 +1,8 @@ +numpy +pandas +requests +scikit-learn +geoio +Pillow==6.2.1 +matplotlib 
+tqdm diff --git a/predicting-poverty-education-replication/scripts/.DS_Store b/predicting-poverty-education-replication/scripts/.DS_Store new file mode 100644 index 0000000..05a5f92 Binary files /dev/null and b/predicting-poverty-education-replication/scripts/.DS_Store differ diff --git a/predicting-poverty-education-replication/scripts/.ipynb_checkpoints/feature_extract-checkpoint.ipynb b/predicting-poverty-education-replication/scripts/.ipynb_checkpoints/feature_extract-checkpoint.ipynb new file mode 100644 index 0000000..5878a9d --- /dev/null +++ b/predicting-poverty-education-replication/scripts/.ipynb_checkpoints/feature_extract-checkpoint.ipynb @@ -0,0 +1,744 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using the images marked as valid per cluster, we pass them through the CNN and extract their feature vectors. the results are stored at a per-country basis. For example, all Malawi feature extractions will go into results/malawi_2016/cnn." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import shutil\n", + "import numpy as np\n", + "import pandas as pd\n", + "from tqdm.notebook import tqdm\n", + "import pickle\n", + "\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.optim as optim\n", + "import numpy as np\n", + "import torchvision\n", + "from torchvision import datasets, models, transforms\n", + "import matplotlib.pyplot as plt\n", + "import time\n", + "import copy" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "BASE_DIR = '..'\n", + "COUNTRIES_DIR = os.path.join(BASE_DIR, 'data', 'countries')\n", + "PROCESSED_DIR = os.path.join(BASE_DIR, 'data', 'processed')\n", + "RESULTS_DIR = os.path.join(BASE_DIR, 'results')\n", + "CNN_TRAIN_IMAGE_DIR = os.path.join(BASE_DIR, 'data', 'cnn_images')\n", + "CNN_DIR = os.path.join(BASE_DIR, 'models', 'trained_model.pt')" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "os.makedirs(RESULTS_DIR, exist_ok=True)\n", + "for country in ['malawi_2016', 'ethiopia_2015', 'nigeria_2015']:\n", + " os.makedirs(os.path.join(RESULTS_DIR, country), exist_ok=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Feature extract with CNN\n", + "If you have run this step before, you can skip it and run the commented out code in the next section to quick-start." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "df_images = pd.read_csv(os.path.join(PROCESSED_DIR, 'image_download_actual.csv'))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
image_nameimage_latimage_loncluster_latcluster_loncons_pcnightlightscountrynightlights_binis_train
04.31578611574_6.223837135554024_4.31578611574_...4.3157866.2238374.3157866.2687534.3177170.123354ng1True
14.330758037141992_6.223837135554024_4.31578611...4.3307586.2238374.3157866.2687534.3177170.123354ng1True
24.285842272936016_6.238809056956016_4.31578611...4.2858426.2388094.3157866.2687534.3177170.123354ng1True
34.270870351534024_6.253780978358008_4.31578611...4.2708706.2537814.3157866.2687534.3177170.123354ng1True
44.345729958543984_6.253780978358008_4.31578611...4.3457306.2537814.3157866.2687534.3177170.123354ng1True
\n", + "
" + ], + "text/plain": [ + " image_name image_lat image_lon \\\n", + "0 4.31578611574_6.223837135554024_4.31578611574_... 4.315786 6.223837 \n", + "1 4.330758037141992_6.223837135554024_4.31578611... 4.330758 6.223837 \n", + "2 4.285842272936016_6.238809056956016_4.31578611... 4.285842 6.238809 \n", + "3 4.270870351534024_6.253780978358008_4.31578611... 4.270870 6.253781 \n", + "4 4.345729958543984_6.253780978358008_4.31578611... 4.345730 6.253781 \n", + "\n", + " cluster_lat cluster_lon cons_pc nightlights country nightlights_bin \\\n", + "0 4.315786 6.268753 4.317717 0.123354 ng 1 \n", + "1 4.315786 6.268753 4.317717 0.123354 ng 1 \n", + "2 4.315786 6.268753 4.317717 0.123354 ng 1 \n", + "3 4.315786 6.268753 4.317717 0.123354 ng 1 \n", + "4 4.315786 6.268753 4.317717 0.123354 ng 1 \n", + "\n", + " is_train \n", + "0 True \n", + "1 True \n", + "2 True \n", + "3 True \n", + "4 True " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_images.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using cpu as backend\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/serialization.py:493: SourceChangeWarning: source code of class 'torch.nn.modules.container.Sequential' has changed. 
you can retrieve the original source code by accessing the object's source attribute or set `torch.nn.Module.dump_patches = True` and use the patch tool to revert the changes.\n", + " warnings.warn(msg, SourceChangeWarning)\n" + ] + } + ], + "source": [ + "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", + "print(f'Using {device} as backend')\n", + "model = torch.load(CNN_DIR, map_location=device)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Sequential(\n", + " (0): Linear(in_features=25088, out_features=4096, bias=True)\n", + " (1): ReLU(inplace=True)\n", + " (2): Dropout(p=0.5, inplace=False)\n", + " (3): Linear(in_features=4096, out_features=4096, bias=True)\n", + " (4): ReLU(inplace=True)\n", + " (5): Dropout(p=0.5, inplace=False)\n", + " (6): Linear(in_features=4096, out_features=3, bias=True)\n", + ")" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# rip off the final layers\n", + "model.classifier = model.classifier[:4]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Sequential(\n", + " (0): Linear(in_features=25088, out_features=4096, bias=True)\n", + " (1): ReLU(inplace=True)\n", + " (2): Dropout(p=0.5, inplace=False)\n", + " (3): Linear(in_features=4096, out_features=4096, bias=True)\n", + ")" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1712e3d5767847a9b3fef1fe5f4ab51b", + "version_major": 2, + "version_minor": 0 + }, + 
"text/plain": [ + " 0%| | 0/154 [00:00\n data = [self.dataset[idx] for idx in possibly_batched_index]\n File \"\", line 22, in __getitem__\n X = self.filename_to_im_tensor(self.image_dir + '/' + image_name)\n File \"\", line 28, in filename_to_im_tensor\n im = plt.imread(file)[:,:,:3]\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/matplotlib/pyplot.py\", line 2407, in imread\n return matplotlib.image.imread(fname, format)\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/matplotlib/image.py\", line 1501, in imread\n with img_open(fname) as image:\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/PIL/ImageFile.py\", line 95, in __init__\n self.fp = open(fp, \"rb\")\nFileNotFoundError: [Errno 2] No such file or directory: '/Users/TinotendaMatsika/Documents/predicting-poverty-education-replication/data/cnn_images/valid/0/11.965055592105976_8.717694503334023_11.9201398279_8.76261026754.png'\n", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 43\u001b[0m \u001b[0mimage_order\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mdataset\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mimage_list\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 44\u001b[0m \u001b[0;31m# forward pass for this class\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 45\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataloader\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 46\u001b[0m \u001b[0minputs\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0minputs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 47\u001b[0m \u001b[0moutputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/tqdm/notebook.py\u001b[0m in \u001b[0;36m__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 252\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__iter__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 253\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 254\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mobj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtqdm_notebook\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__iter__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 255\u001b[0m \u001b[0;31m# return super(tqdm...) 
will not catch exception\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 256\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/tqdm/std.py\u001b[0m in \u001b[0;36m__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1176\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1177\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1178\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mobj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0miterable\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1179\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1180\u001b[0m \u001b[0;31m# Update and possibly print the progressbar.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/utils/data/dataloader.py\u001b[0m in \u001b[0;36m__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 817\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 818\u001b[0m \u001b[0;32mdel\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_task_info\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 819\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_process_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 820\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 821\u001b[0m \u001b[0mnext\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0m__next__\u001b[0m \u001b[0;31m# Python 2 compatibility\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/utils/data/dataloader.py\u001b[0m in \u001b[0;36m_process_data\u001b[0;34m(self, data)\u001b[0m\n\u001b[1;32m 844\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_try_put_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 845\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mExceptionWrapper\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 846\u001b[0;31m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreraise\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 847\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 848\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/_utils.py\u001b[0m in \u001b[0;36mreraise\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 383\u001b[0m \u001b[0;31m# (https://bugs.python.org/issue2651), so we work around it.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 384\u001b[0m \u001b[0mmsg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mKeyErrorMessage\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 385\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexc_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + 
"\u001b[0;31mFileNotFoundError\u001b[0m: Caught FileNotFoundError in DataLoader worker process 0.\nOriginal Traceback (most recent call last):\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py\", line 178, in _worker_loop\n data = fetcher.fetch(index)\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py\", line 44, in fetch\n data = [self.dataset[idx] for idx in possibly_batched_index]\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py\", line 44, in \n data = [self.dataset[idx] for idx in possibly_batched_index]\n File \"\", line 22, in __getitem__\n X = self.filename_to_im_tensor(self.image_dir + '/' + image_name)\n File \"\", line 28, in filename_to_im_tensor\n im = plt.imread(file)[:,:,:3]\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/matplotlib/pyplot.py\", line 2407, in imread\n return matplotlib.image.imread(fname, format)\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/matplotlib/image.py\", line 1501, in imread\n with img_open(fname) as image:\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/PIL/ImageFile.py\", line 95, in __init__\n self.fp = open(fp, \"rb\")\nFileNotFoundError: [Errno 2] No such file or directory: '/Users/TinotendaMatsika/Documents/predicting-poverty-education-replication/data/cnn_images/valid/0/11.965055592105976_8.717694503334023_11.9201398279_8.76261026754.png'\n" + ] + } + ], + "source": [ + "transformer = transforms.Compose([\n", + " transforms.ToTensor(),\n", + " transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])\n", + " ])\n", + "\n", + "# custom dataset for fast image loading and processing\n", + "# does not follow the usual style of folder -> folder for each class -> image\n", + "# we just want one folder with images\n", + "class ForwardPassDataset(torch.utils.data.Dataset):\n", + " def __init__(self, image_dir, 
transformer):\n", + " self.image_dir = image_dir\n", + " self.image_list = os.listdir(self.image_dir)\n", + " self.transformer = transformer\n", + "\n", + " def __len__(self):\n", + " return len(self.image_list)\n", + "\n", + " def __getitem__(self, index):\n", + " image_name = self.image_list[index]\n", + "\n", + " # Load image\n", + " X = self.filename_to_im_tensor(self.image_dir + '/' + image_name)\n", + " \n", + " # dataloaders need to return a label, but for the forward pass we don't really care\n", + " return X, -1\n", + " \n", + " def filename_to_im_tensor(self, file):\n", + " im = plt.imread(file)[:,:,:3]\n", + " im = self.transformer(im)\n", + " return im\n", + "\n", + "model.eval() \n", + "classes = [0, 1, 2]\n", + "# shape of final array will be (num_validation_images, 4096)\n", + "# we also want to record the image each index represents\n", + "feats = np.zeros(((~df_images['is_train']).sum(), 4096))\n", + "image_order = []\n", + "i = 0\n", + "for c in classes:\n", + " # use the validation images to do the forward pass\n", + " dataset = ForwardPassDataset(os.path.join(CNN_TRAIN_IMAGE_DIR, 'valid', str(c)), transformer)\n", + " dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=False, num_workers=4)\n", + " image_order += dataset.image_list\n", + " # forward pass for this class\n", + " for inputs, _ in tqdm(dataloader):\n", + " inputs = inputs.to(device)\n", + " outputs = model(inputs)\n", + " feats[i:i+len(inputs),:] = outputs.cpu().detach().numpy()\n", + " i += len(inputs)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[-1.26415765, -0.28140783, -0.29993755, ..., 0.33739716,\n", + " -0.96456331, -0.95310527],\n", + " [ 0.55027246, -0.06091447, 0.11403629, ..., -0.08996978,\n", + " -0.62236136, -0.96085918],\n", + " [ 0.52193987, -0.29220241, -0.45371717, ..., 0.34175205,\n", + " -1.1439786 , -0.85960728],\n", + " ...,\n", + " [-0.50936353, 
0.39209121, -0.29870456, ..., 0.0661362 ,\n", + " 0.43009469, -0.34069228],\n", + " [ 0.24428365, 0.07818466, -0.89307284, ..., 0.29522306,\n", + " -0.72958505, -1.24356151],\n", + " [-0.30123377, 0.6785413 , -0.19940855, ..., 0.14395328,\n", + " 0.52420121, -1.16047859]])" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "feats" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
image_namefeat_index
010.513181862198008_39.768191057994024_10.52815...0
1-14.632534157196016_34.981995235794024_-14.662...1
2-14.526346764205977_35.593520078598004_-14.481...2
37.4290196173360155_7.26950147266_7.45896346014...3
4-10.405547698244352_34.14279535535209_-10.4038...4
\n", + "
" + ], + "text/plain": [ + " image_name feat_index\n", + "0 10.513181862198008_39.768191057994024_10.52815... 0\n", + "1 -14.632534157196016_34.981995235794024_-14.662... 1\n", + "2 -14.526346764205977_35.593520078598004_-14.481... 2\n", + "3 7.4290196173360155_7.26950147266_7.45896346014... 3\n", + "4 -10.405547698244352_34.14279535535209_-10.4038... 4" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "forward_pass_df = pd.DataFrame.from_dict({'image_name': image_order, 'feat_index': np.arange(len(image_order))})\n", + "forward_pass_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "df_consumption = pd.merge(left=df_images, right=forward_pass_df, on='image_name')" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "# have we maintained all validation images?\n", + "assert len(df_consumption) == (~df_images['is_train']).sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
image_nameimage_latimage_loncluster_latcluster_loncons_pcnightlightscountrynightlights_binis_trainfeat_index
0-17.125093842803985_35.18726915719602_-17.0951...-17.12509435.187269-17.09515035.2172131.4232390.025206mw0False318
1-17.140065764205975_35.232184921401995_-17.095...-17.14006635.232185-17.09515035.2172131.4232390.025206mw0False1861
2-17.065206157196016_35.262128764205976_-17.095...-17.06520635.262129-17.09515035.2172131.4232390.025206mw0False836
3-17.07737907859801_35.069727235794026_-17.0923...-17.07737935.069727-17.09235135.1146431.2662040.000000mw0False18
4-17.137266764205975_35.08469915719602_-17.0923...-17.13726735.084699-17.09235135.1146431.2662040.000000mw0False1051
\n", + "
" + ], + "text/plain": [ + " image_name image_lat image_lon \\\n", + "0 -17.125093842803985_35.18726915719602_-17.0951... -17.125094 35.187269 \n", + "1 -17.140065764205975_35.232184921401995_-17.095... -17.140066 35.232185 \n", + "2 -17.065206157196016_35.262128764205976_-17.095... -17.065206 35.262129 \n", + "3 -17.07737907859801_35.069727235794026_-17.0923... -17.077379 35.069727 \n", + "4 -17.137266764205975_35.08469915719602_-17.0923... -17.137267 35.084699 \n", + "\n", + " cluster_lat cluster_lon cons_pc nightlights country nightlights_bin \\\n", + "0 -17.095150 35.217213 1.423239 0.025206 mw 0 \n", + "1 -17.095150 35.217213 1.423239 0.025206 mw 0 \n", + "2 -17.095150 35.217213 1.423239 0.025206 mw 0 \n", + "3 -17.092351 35.114643 1.266204 0.000000 mw 0 \n", + "4 -17.092351 35.114643 1.266204 0.000000 mw 0 \n", + "\n", + " is_train feat_index \n", + "0 False 318 \n", + "1 False 1861 \n", + "2 False 836 \n", + "3 False 18 \n", + "4 False 1051 " + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_consumption.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Aggregate Features\n", + "For each country, we aggregate the image features per cluster and save them to results/country/cnn" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "country_abbrv = ['mw', 'eth', 'ng']\n", + "country_dir = ['malawi_2016', 'ethiopia_2015', 'nigeria_2015']\n", + "\n", + "for ca, cd in zip(country_abbrv, country_dir):\n", + " df_c = df_consumption[df_consumption['country'] == ca]\n", + " group = df_c.groupby(['cluster_lat', 'cluster_lon'])\n", + " x = np.zeros((len(group), 4096))\n", + " cluster_list = [] # the corresponding clusters (lat, lon) to the x aggregate feature array\n", + " for i, g in enumerate(group):\n", + " lat, lon = g[0]\n", + " im_sub = df_consumption[(df_consumption['cluster_lat'] == lat) & 
(df_consumption['cluster_lon'] == lon)].reset_index(drop=True)\n", + " agg_feats = np.zeros((len(im_sub), 4096))\n", + " for j, d in im_sub.iterrows():\n", + " agg_feats[j,:] = feats[d.feat_index]\n", + " agg_feats = agg_feats.mean(axis=0) # averages the features across all images in the cluster\n", + "\n", + " x[i,:] = agg_feats\n", + " cluster_list.append([lat, lon])\n", + " # save to the correct directory\n", + " save_dir = os.path.join(RESULTS_DIR, cd, 'cnn')\n", + " os.makedirs(save_dir, exist_ok=True)\n", + " np.save(os.path.join(save_dir, 'cluster_feats.npy'), x)\n", + " pickle.dump(cluster_list, open(os.path.join(save_dir, 'cluster_order.pkl'), 'wb')) \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "environment": { + "name": "pytorch-gpu.1-4.m46", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-4:m46" + }, + "kernelspec": { + "display_name": "testenv", + "language": "python", + "name": "testenv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/predicting-poverty-education-replication/scripts/.ipynb_checkpoints/train_cnn-checkpoint.ipynb b/predicting-poverty-education-replication/scripts/.ipynb_checkpoints/train_cnn-checkpoint.ipynb new file mode 100644 index 0000000..c87e48d --- /dev/null +++ b/predicting-poverty-education-replication/scripts/.ipynb_checkpoints/train_cnn-checkpoint.ipynb @@ -0,0 +1,1190 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import shutil\n", + "import numpy as np\n", + "import pandas as pd\n", + "from tqdm.notebook import tqdm" + ] + }, + { + 
"cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "BASE_DIR = '..'\n", + "RANDOM_SEED = 7 # for reproducibility\n", + "COUNTRIES_DIR = os.path.join(BASE_DIR, 'data', 'countries')\n", + "PROCESSED_DIR = os.path.join(BASE_DIR, 'data', 'processed')\n", + "\n", + "# these relate to training the CNN to predict nightlights\n", + "CNN_TRAIN_IMAGE_DIR = os.path.join(BASE_DIR, 'data', 'cnn_images')\n", + "CNN_SAVE_DIR = os.path.join(BASE_DIR, 'models')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "os.makedirs(CNN_TRAIN_IMAGE_DIR, exist_ok=True)\n", + "os.makedirs(CNN_SAVE_DIR, exist_ok=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Preprocess\n", + "After doing this once, you can skip to the training if the script broke" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "actually downloaded: 25246, expected: 14500\n" + ] + } + ], + "source": [ + "df_download = pd.read_csv(os.path.join(PROCESSED_DIR, 'image_download_locs.csv'))\n", + "downloaded = os.listdir(os.path.join(COUNTRIES_DIR, 'malawi_2016', 'images')) + \\\n", + " os.listdir(os.path.join(COUNTRIES_DIR, 'ethiopia_2015', 'images')) + \\\n", + " os.listdir(os.path.join(COUNTRIES_DIR, 'nigeria_2015', 'images'))\n", + "\n", + "print(f\"actually downloaded: {len(downloaded)}, expected: {len(df_download)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
image_nameimage_latimage_loncluster_latcluster_loncons_pcnightlightscountrynightlights_bin
04.31578611574_6.223837135554024_4.31578611574_...4.3157866.2238374.3157866.2687534.3177170.123354ng1
14.330758037141992_6.223837135554024_4.31578611...4.3307586.2238374.3157866.2687534.3177170.123354ng1
24.285842272936016_6.238809056956016_4.31578611...4.2858426.2388094.3157866.2687534.3177170.123354ng1
34.270870351534024_6.253780978358008_4.31578611...4.2708706.2537814.3157866.2687534.3177170.123354ng1
44.345729958543984_6.253780978358008_4.31578611...4.3457306.2537814.3157866.2687534.3177170.123354ng1
\n", + "
" + ], + "text/plain": [ + " image_name image_lat image_lon \\\n", + "0 4.31578611574_6.223837135554024_4.31578611574_... 4.315786 6.223837 \n", + "1 4.330758037141992_6.223837135554024_4.31578611... 4.330758 6.223837 \n", + "2 4.285842272936016_6.238809056956016_4.31578611... 4.285842 6.238809 \n", + "3 4.270870351534024_6.253780978358008_4.31578611... 4.270870 6.253781 \n", + "4 4.345729958543984_6.253780978358008_4.31578611... 4.345730 6.253781 \n", + "\n", + " cluster_lat cluster_lon cons_pc nightlights country nightlights_bin \n", + "0 4.315786 6.268753 4.317717 0.123354 ng 1 \n", + "1 4.315786 6.268753 4.317717 0.123354 ng 1 \n", + "2 4.315786 6.268753 4.317717 0.123354 ng 1 \n", + "3 4.315786 6.268753 4.317717 0.123354 ng 1 \n", + "4 4.315786 6.268753 4.317717 0.123354 ng 1 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_download.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "df_download['row'] = np.arange(len(df_download))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "\"['-13.642146764205977_34.846897078598005_-13.597231_34.861869.png'\\n '-15.84688_35.071951921402_-15.84688_35.05698.png'\\n '-11.884834921401993_34.132379_-11.869863_34.132379.png' ...\\n '-11.305565235794024_33.48830884280398_-11.350481_33.458365.png'\\n '.DS_Store' '.DS_Store'] not found in axis\"", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0midx_not_download\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mdf_download\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'image_name'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdownloaded\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'row'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtolist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mdf_download\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0midx_not_download\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36mdrop\u001b[0;34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[0m\n\u001b[1;32m 4313\u001b[0m \u001b[0mlevel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlevel\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4314\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minplace\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4315\u001b[0;31m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4316\u001b[0m )\n\u001b[1;32m 4317\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36mdrop\u001b[0;34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[0m\n\u001b[1;32m 4151\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mlabels\u001b[0m \u001b[0;32min\u001b[0m \u001b[0maxes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4152\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlabels\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4153\u001b[0;31m \u001b[0mobj\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_drop_axis\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlevel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlevel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4154\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4155\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m_drop_axis\u001b[0;34m(self, labels, axis, level, errors)\u001b[0m\n\u001b[1;32m 4186\u001b[0m \u001b[0mnew_axis\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlevel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlevel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4187\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4188\u001b[0;31m \u001b[0mnew_axis\u001b[0m 
\u001b[0;34m=\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4189\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreindex\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0maxis_name\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mnew_axis\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4190\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mdrop\u001b[0;34m(self, labels, errors)\u001b[0m\n\u001b[1;32m 5589\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmask\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0many\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5590\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0merrors\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;34m\"ignore\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5591\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"{labels[mask]} not found in axis\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5592\u001b[0m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mindexer\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m~\u001b[0m\u001b[0mmask\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5593\u001b[0m \u001b[0;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdelete\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: \"['-13.642146764205977_34.846897078598005_-13.597231_34.861869.png'\\n '-15.84688_35.071951921402_-15.84688_35.05698.png'\\n '-11.884834921401993_34.132379_-11.869863_34.132379.png' ...\\n '-11.305565235794024_33.48830884280398_-11.350481_33.458365.png'\\n '.DS_Store' '.DS_Store'] not found in axis\"" + ] + } + ], + "source": [ + "idx_not_download = df_download.set_index('image_name').drop(downloaded)['row'].values.tolist()\n", + "df_download.drop(idx_not_download, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "df_download.drop('row', axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(0.41379310344827586, 0.3586206896551724, 0.22758620689655173)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# the distribution\n", + "(df_download['nightlights_bin']==0).mean(), (df_download['nightlights_bin']==1).mean(), (df_download['nightlights_bin']==2).mean()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Split images into train/valid.\n", + "Each cluster will contribute 80% of images for training, and 20% for validation." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "df_download.reset_index(drop=True, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
image_nameimage_latimage_loncluster_latcluster_loncons_pcnightlightscountrynightlights_bin
04.31578611574_6.223837135554024_4.31578611574_...4.3157866.2238374.3157866.2687534.3177170.123354ng1
14.330758037141992_6.223837135554024_4.31578611...4.3307586.2238374.3157866.2687534.3177170.123354ng1
24.285842272936016_6.238809056956016_4.31578611...4.2858426.2388094.3157866.2687534.3177170.123354ng1
34.270870351534024_6.253780978358008_4.31578611...4.2708706.2537814.3157866.2687534.3177170.123354ng1
44.345729958543984_6.253780978358008_4.31578611...4.3457306.2537814.3157866.2687534.3177170.123354ng1
\n", + "
" + ], + "text/plain": [ + " image_name image_lat image_lon \\\n", + "0 4.31578611574_6.223837135554024_4.31578611574_... 4.315786 6.223837 \n", + "1 4.330758037141992_6.223837135554024_4.31578611... 4.330758 6.223837 \n", + "2 4.285842272936016_6.238809056956016_4.31578611... 4.285842 6.238809 \n", + "3 4.270870351534024_6.253780978358008_4.31578611... 4.270870 6.253781 \n", + "4 4.345729958543984_6.253780978358008_4.31578611... 4.345730 6.253781 \n", + "\n", + " cluster_lat cluster_lon cons_pc nightlights country nightlights_bin \n", + "0 4.315786 6.268753 4.317717 0.123354 ng 1 \n", + "1 4.315786 6.268753 4.317717 0.123354 ng 1 \n", + "2 4.315786 6.268753 4.317717 0.123354 ng 1 \n", + "3 4.315786 6.268753 4.317717 0.123354 ng 1 \n", + "4 4.315786 6.268753 4.317717 0.123354 ng 1 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_download.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "df_download['is_train'] = True" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/pandas/core/indexing.py:1637: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " self._setitem_single_block(indexer, value, name)\n" + ] + } + ], + "source": [ + "np.random.seed(RANDOM_SEED)\n", + "groups = df_download.groupby(['cluster_lat', 'cluster_lon'])\n", + "for _, g in groups:\n", + " n_ims = len(g)\n", + " n_train = int(0.8 * n_ims)\n", + " n_valid = n_ims - n_train\n", + " valid_choices = np.random.choice(np.arange(n_ims), replace=False, size=n_valid).tolist()\n", + " current_index = 
g.index\n", + " idx_valid = current_index[valid_choices]\n", + " df_download['is_train'].loc[idx_valid] = False" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.7978620689655173" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_download['is_train'].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# save this new dataframe\n", + "df_download.to_csv(os.path.join(PROCESSED_DIR, 'image_download_actual.csv'), index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "os.makedirs(os.path.join(CNN_TRAIN_IMAGE_DIR, 'train'), exist_ok=False)\n", + "os.makedirs(os.path.join(CNN_TRAIN_IMAGE_DIR, 'valid'), exist_ok=False)\n", + "\n", + "labels = ['0', '1', '2']\n", + "for l in labels:\n", + " os.makedirs(os.path.join(CNN_TRAIN_IMAGE_DIR, 'train', l), exist_ok=False)\n", + " os.makedirs(os.path.join(CNN_TRAIN_IMAGE_DIR, 'valid', l), exist_ok=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "t = df_download[df_download['is_train']]\n", + "v = df_download[~df_download['is_train']]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(11569, 2931)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(t), len(v)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "copying train images\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c73db387267d48fc8812f72932669dda", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| 
| 0/11569 [00:00 10:\n", + " # fine tune whole model\n", + " for param in model_ft.parameters():\n", + " param.requires_grad = True\n", + " optimizer = optim.SGD(model_ft.parameters(), lr=1e-4, momentum=0.9)\n", + "\n", + " # Each epoch has a training and validation phase\n", + " for phase in ['train', 'valid']:\n", + " if phase == 'train':\n", + " model.train() # Set model to training mode\n", + " else:\n", + " model.eval() # Set model to evaluate mode\n", + "\n", + " running_loss = 0.0\n", + " running_corrects = 0\n", + "\n", + " # Iterate over data.\n", + " for inputs, labels in tqdm(dataloaders[phase]):\n", + " inputs = inputs.to(device)\n", + " labels = labels.to(device)\n", + "\n", + " # zero the parameter gradients\n", + " optimizer.zero_grad()\n", + "\n", + " # forward\n", + " # track history if only in train\n", + " with torch.set_grad_enabled(phase == 'train'):\n", + " outputs = model(inputs)\n", + " loss = criterion(outputs, labels)\n", + "\n", + " _, preds = torch.max(outputs, 1)\n", + "\n", + " # backward + optimize only if in training phase\n", + " if phase == 'train':\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " # statistics\n", + " running_loss += loss.item() * inputs.size(0)\n", + " running_corrects += torch.sum(preds == labels.data)\n", + "\n", + " epoch_loss = running_loss / len(dataloaders[phase].dataset)\n", + " epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)\n", + "\n", + " print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))\n", + "\n", + " # deep copy the model\n", + " if phase == 'valid' and epoch_acc > best_acc:\n", + " best_acc = epoch_acc\n", + " best_model_wts = copy.deepcopy(model.state_dict())\n", + " if phase == 'valid':\n", + " val_acc_history.append(epoch_acc)\n", + " \n", + " print()\n", + "\n", + " time_elapsed = time.time() - since\n", + " print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))\n", + " print('Best val Acc: 
{:4f}'.format(best_acc))\n", + "\n", + " # load best model weights\n", + " model.load_state_dict(best_model_wts)\n", + " return model, val_acc_history" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 0/19\n", + "----------\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "98513fb13ff84100b70ad72b4ebc3a18", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1447 [00:00\n data = [self.dataset[idx] for idx in possibly_batched_index]\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torchvision/datasets/folder.py\", line 138, in __getitem__\n sample = self.loader(path)\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torchvision/datasets/folder.py\", line 174, in default_loader\n return pil_loader(path)\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torchvision/datasets/folder.py\", line 155, in pil_loader\n with open(path, 'rb') as f:\nFileNotFoundError: [Errno 2] No such file or directory: '/Users/TinotendaMatsika/Documents/predicting-poverty-education-replication/data/cnn_images/train/1/5.577058854163986_5.771977164594023_5.547115011360002_5.8168929288.png'\n", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;31m# Train and evaluate\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mmodel_ft\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhist\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mtrain_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel_ft\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdataloaders_dict\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcriterion\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moptimizer_ft\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnum_epochs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnum_epochs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in \u001b[0;36mtrain_model\u001b[0;34m(model, dataloaders, criterion, optimizer, num_epochs)\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[0;31m# Iterate over data.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 29\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataloaders\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mphase\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 30\u001b[0m \u001b[0minputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[0mlabels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlabels\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/tqdm/notebook.py\u001b[0m in \u001b[0;36m__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 252\u001b[0m \u001b[0;32mdef\u001b[0m 
\u001b[0m__iter__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 253\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 254\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mobj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtqdm_notebook\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__iter__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 255\u001b[0m \u001b[0;31m# return super(tqdm...) will not catch exception\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 256\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/tqdm/std.py\u001b[0m in \u001b[0;36m__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1176\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1177\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1178\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mobj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0miterable\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1179\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1180\u001b[0m \u001b[0;31m# Update and possibly print the progressbar.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/utils/data/dataloader.py\u001b[0m in 
\u001b[0;36m__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 817\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 818\u001b[0m \u001b[0;32mdel\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_task_info\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 819\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_process_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 820\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 821\u001b[0m \u001b[0mnext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m__next__\u001b[0m \u001b[0;31m# Python 2 compatibility\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/utils/data/dataloader.py\u001b[0m in \u001b[0;36m_process_data\u001b[0;34m(self, data)\u001b[0m\n\u001b[1;32m 844\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_try_put_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 845\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mExceptionWrapper\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 846\u001b[0;31m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreraise\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 847\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 848\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + 
"\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/_utils.py\u001b[0m in \u001b[0;36mreraise\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 383\u001b[0m \u001b[0;31m# (https://bugs.python.org/issue2651), so we work around it.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 384\u001b[0m \u001b[0mmsg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mKeyErrorMessage\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 385\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexc_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m: Caught FileNotFoundError in DataLoader worker process 0.\nOriginal Traceback (most recent call last):\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py\", line 178, in _worker_loop\n data = fetcher.fetch(index)\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py\", line 44, in fetch\n data = [self.dataset[idx] for idx in possibly_batched_index]\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py\", line 44, in \n data = [self.dataset[idx] for idx in possibly_batched_index]\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torchvision/datasets/folder.py\", line 138, in __getitem__\n sample = self.loader(path)\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torchvision/datasets/folder.py\", line 174, in default_loader\n return pil_loader(path)\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torchvision/datasets/folder.py\", line 155, in pil_loader\n with open(path, 'rb') as f:\nFileNotFoundError: [Errno 2] No such file or directory: 
'/Users/TinotendaMatsika/Documents/predicting-poverty-education-replication/data/cnn_images/train/1/5.577058854163986_5.771977164594023_5.547115011360002_5.8168929288.png'\n" + ] + } + ], + "source": [ + "# Setup the loss fxn\n", + "criterion = nn.CrossEntropyLoss()\n", + "\n", + "# Train and evaluate\n", + "model_ft, hist = train_model(model_ft, dataloaders_dict, criterion, optimizer_ft, num_epochs=num_epochs)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "A model is already saved at this location\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "None", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mpath\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mCNN_SAVE_DIR\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'trained_model.pt'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misfile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'A model is already saved at this location'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf'Saving model to 
{path}'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel_ft\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mAssertionError\u001b[0m: None" + ] + } + ], + "source": [ + "path = os.path.join(CNN_SAVE_DIR, 'trained_model.pt')\n", + "assert not os.path.isfile(path), print('A model is already saved at this location')\n", + "print(f'Saving model to {path}')\n", + "torch.save(model_ft, path)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2fe8181ce2654062ba50450b26a2f647", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1447 [00:00\n data = [self.dataset[idx] for idx in possibly_batched_index]\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torchvision/datasets/folder.py\", line 138, in __getitem__\n sample = self.loader(path)\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torchvision/datasets/folder.py\", line 174, in default_loader\n return pil_loader(path)\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torchvision/datasets/folder.py\", line 155, in pil_loader\n with open(path, 'rb') as f:\nFileNotFoundError: [Errno 2] No such file or directory: '/Users/TinotendaMatsika/Documents/predicting-poverty-education-replication/data/cnn_images/train/0/10.271252764598009_9.682578659135977_10.286224686_9.637662894930001.png'\n", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in 
\u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;31m# Iterate over data.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataloaders_dict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'train'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0minputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0mlabels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlabels\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/tqdm/notebook.py\u001b[0m in \u001b[0;36m__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 252\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__iter__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 253\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 254\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mobj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtqdm_notebook\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__iter__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 255\u001b[0m \u001b[0;31m# return super(tqdm...) will not catch exception\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 256\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/tqdm/std.py\u001b[0m in \u001b[0;36m__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1176\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1177\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1178\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mobj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0miterable\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1179\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1180\u001b[0m \u001b[0;31m# Update and possibly print the progressbar.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/utils/data/dataloader.py\u001b[0m in \u001b[0;36m__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 817\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 818\u001b[0m \u001b[0;32mdel\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_task_info\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 819\u001b[0;31m \u001b[0;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_process_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 820\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 821\u001b[0m \u001b[0mnext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m__next__\u001b[0m \u001b[0;31m# Python 2 compatibility\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/utils/data/dataloader.py\u001b[0m in \u001b[0;36m_process_data\u001b[0;34m(self, data)\u001b[0m\n\u001b[1;32m 844\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_try_put_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 845\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mExceptionWrapper\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 846\u001b[0;31m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreraise\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 847\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 848\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/_utils.py\u001b[0m in \u001b[0;36mreraise\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 383\u001b[0m \u001b[0;31m# (https://bugs.python.org/issue2651), so we work around it.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 384\u001b[0m \u001b[0mmsg\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mKeyErrorMessage\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 385\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexc_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m: Caught FileNotFoundError in DataLoader worker process 0.\nOriginal Traceback (most recent call last):\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py\", line 178, in _worker_loop\n data = fetcher.fetch(index)\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py\", line 44, in fetch\n data = [self.dataset[idx] for idx in possibly_batched_index]\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py\", line 44, in \n data = [self.dataset[idx] for idx in possibly_batched_index]\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torchvision/datasets/folder.py\", line 138, in __getitem__\n sample = self.loader(path)\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torchvision/datasets/folder.py\", line 174, in default_loader\n return pil_loader(path)\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torchvision/datasets/folder.py\", line 155, in pil_loader\n with open(path, 'rb') as f:\nFileNotFoundError: [Errno 2] No such file or directory: '/Users/TinotendaMatsika/Documents/predicting-poverty-education-replication/data/cnn_images/train/0/10.271252764598009_9.682578659135977_10.286224686_9.637662894930001.png'\n" + ] + } + ], + "source": [ + "# you can run below if you want to see the final accuracy on nightlights over the train set\n", + "model_ft.eval() # Set model to evaluate mode\n", + "\n", + "criterion = nn.CrossEntropyLoss()\n", + 
"running_loss = 0.0\n", + "running_corrects = 0\n", + "total = 0\n", + "\n", + "# Iterate over data.\n", + "for inputs, labels in tqdm(dataloaders_dict['train']):\n", + " inputs = inputs.to(device)\n", + " labels = labels.to(device)\n", + "\n", + " # forward\n", + " # track history if only in train\n", + " with torch.set_grad_enabled(False):\n", + " outputs = model_ft(inputs)\n", + " loss = criterion(outputs, labels)\n", + "\n", + " _, preds = torch.max(outputs, 1)\n", + "\n", + " # statistics\n", + " running_loss += loss.item() * inputs.size(0)\n", + " running_corrects += torch.sum(preds == labels.data)\n", + " \n", + " total += len(preds)\n", + " \n", + "print(running_corrects.double()/total)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "environment": { + "name": "pytorch-gpu.1-4.m46", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-4:m46" + }, + "kernelspec": { + "display_name": "testenv", + "language": "python", + "name": "testenv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/predicting-poverty-education-replication/scripts/download_images.ipynb b/predicting-poverty-education-replication/scripts/download_images.ipynb new file mode 100644 index 0000000..577509c --- /dev/null +++ b/predicting-poverty-education-replication/scripts/download_images.ipynb @@ -0,0 +1,3074 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": 
"download_images.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "environment": { + "name": "pytorch-gpu.1-4.m46", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-4:m46" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "eb03e01e458a4797a88152b874f376e3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7c0a93896bc04018beac3d9d8634fa5e", + "IPY_MODEL_516f34ee82664b1d8d9b385efa35eaa7" + ], + "layout": "IPY_MODEL_8f50de1322db49869a25e53ad103cdfe" + } + }, + "7c0a93896bc04018beac3d9d8634fa5e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "100%", + "description_tooltip": null, + "layout": "IPY_MODEL_e8a9e60dfc8341a18d60d9aa25fcf4c4", + "max": 14500, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_bec8e5c2800b433394e2c0856dd0d526", + "value": 14500 + } + }, + "516f34ee82664b1d8d9b385efa35eaa7": { + "model_module": 
"@jupyter-widgets/controls", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_466113287a364f27bdc04158b27555bf", + "placeholder": "​", + "style": "IPY_MODEL_dee1210a038945c9aa01296d060eaa6b", + "value": " 14500/14500 [4:31:10<00:00, 1.12s/it]" + } + }, + "8f50de1322db49869a25e53ad103cdfe": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e8a9e60dfc8341a18d60d9aa25fcf4c4": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": 
"1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bec8e5c2800b433394e2c0856dd0d526": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "initial" + } + }, + "466113287a364f27bdc04158b27555bf": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + 
"grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "dee1210a038945c9aa01296d060eaa6b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Aj2mm397iqS-" + }, + "source": [ + "Skip the first two steps if you've already ran them and simply need to continue downloading images" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "6MlpN1c5irpO", + "outputId": "41e6525a-594b-400e-f02b-c143946d3913" + }, + "source": [ + "from google.colab import drive\n", + "drive.mount('/content/gdrive')" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Mounted at /content/gdrive\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "1KpqrxtQiqTD" + }, + "source": [ + "import math\n", + "import random\n", + "import os\n", + "import numpy as np\n", + "import pandas as pd\n", + "from osgeo import gdal, osr\n", + 
"from tqdm.notebook import tqdm\n", + "import requests\n", + "import matplotlib.pyplot as plt\n", + "from io import BytesIO\n", + "import logging\n", + "import time" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "e9KaUKaXiqTE" + }, + "source": [ + "BASE_DIR = 'gdrive/MyDrive/geo'\n", + "COUNTRIES_DIR = os.path.join(BASE_DIR, 'data', 'countries')\n", + "PROCESSED_DIR = os.path.join(BASE_DIR, 'data', 'processed')\n", + "# can try using the google downloader, in which case change this to be your google api token\n", + "ACCESS_TOKEN_DIR = os.path.join(BASE_DIR, 'planet_api_key.txt')\n", + "\n", + "RANDOM_SEED = 7 # for reproducibility\n", + "\n", + "# each cluster must have AT LEAST this many images after doing nightlights processing\n", + "MIN_IMAGES_PER_CLUSTER = 10" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Uyb7qkkhiqTF" + }, + "source": [ + "# from ctypes.util import find_library\n", + "# find_library('geos_c')\n", + "import sys\n", + "sys.path.append(BASE_DIR)\n", + "from utils import create_space" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iRIf2ixRiqTG" + }, + "source": [ + "# Generate Download Locations" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "DrRhGn1kiqTG" + }, + "source": [ + "# # df_mw = pd.read_csv(os.path.join(COUNTRIES_DIR, 'malawi_2016', 'processed/clusters.csv'))\n", + "# df_eth = pd.read_csv(os.path.join(COUNTRIES_DIR, 'ethiopia_2015', 'processed/clusters.csv'))\n", + "df_ng = pd.read_csv(os.path.join(COUNTRIES_DIR, 'nigeria_2015', 'processed/clusters.csv'))" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 300 + }, + "id": "YDMyNVxriqTG", + "outputId": "ec6a6a5d-bf9e-41ff-e986-322f93d972bc" + }, + "source": [ + "for country in 
['malawi_2016', 'ethiopia_2015', 'nigeria_2015']:\n", + " os.makedirs(os.path.join(COUNTRIES_DIR, country, 'images'), exist_ok=False)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "error", + "ename": "FileExistsError", + "evalue": "ignored", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileExistsError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mcountry\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m'malawi_2016'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'ethiopia_2015'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'nigeria_2015'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmakedirs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mCOUNTRIES_DIR\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcountry\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'images'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexist_ok\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/usr/lib/python3.7/os.py\u001b[0m in \u001b[0;36mmakedirs\u001b[0;34m(name, mode, exist_ok)\u001b[0m\n\u001b[1;32m 221\u001b[0m \u001b[0;32mreturn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 222\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 223\u001b[0;31m \u001b[0mmkdir\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mmode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 224\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mOSError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 225\u001b[0m \u001b[0;31m# Cannot rely on checking for EEXIST, since the operating system\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mFileExistsError\u001b[0m: [Errno 17] File exists: 'gdrive/MyDrive/geo/data/countries/malawi_2016/images'" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "QlxLFk0tiqTH" + }, + "source": [ + "def generate_download_locations(df, ipc=50):\n", + " '''\n", + " Takes a dataframe with columns cluster_lat, cluster_lon\n", + " Generates a 10km x 10km bounding box around the cluster and samples \n", + " ipc images per cluster. First samples in a grid fashion, then any \n", + " remaining points are randomly (uniformly) chosen\n", + " '''\n", + " np.random.seed(RANDOM_SEED) # for reproducability\n", + " df_download = {'image_name': [], 'image_lat': [], 'image_lon': [], 'cluster_lat': [], \n", + " 'cluster_lon': [], 'cons_pc': [], 'nightlights': [] }\n", + " \n", + " # side length of square for uniform distribution\n", + " edge_num = math.floor(math.sqrt(ipc))\n", + " for _, r in df.iterrows():\n", + " min_lat, min_lon, max_lat, max_lon = create_space(r.cluster_lat, r.cluster_lon)\n", + " lats = np.linspace(min_lat, max_lat, edge_num).tolist()\n", + " lons = np.linspace(min_lon, max_lon, edge_num).tolist()\n", + "\n", + " # performs cartesian product\n", + " uniform_points = np.transpose([np.tile(lats, len(lons)), np.repeat(lons, len(lats))])\n", + " \n", + " lats = uniform_points[:,0].tolist()\n", + " lons = uniform_points[:,1].tolist()\n", + " \n", + " # fills the remainder with random points\n", + " for _ in range(ipc - edge_num * edge_num):\n", + " lat = random.uniform(min_lat, max_lat)\n", + " lon = 
random.uniform(min_lon, max_lon)\n", + " lats.append(lat)\n", + " lons.append(lon)\n", + " \n", + " # add to dict\n", + " for lat, lon in zip(lats, lons):\n", + " # image name is going to be image_lat_image_lon_cluster_lat_cluster_lon.png\n", + " image_name = str(lat) + '_' + str(lon) + '_' + str(r.cluster_lat) + '_' + str(r.cluster_lon) + '.png'\n", + " df_download['image_name'].append(image_name)\n", + " df_download['image_lat'].append(lat)\n", + " df_download['image_lon'].append(lon)\n", + " df_download['cluster_lat'].append(r.cluster_lat)\n", + " df_download['cluster_lon'].append(r.cluster_lon)\n", + " df_download['cons_pc'].append(r.cons_pc)\n", + " df_download['nightlights'].append(r.nightlights)\n", + " \n", + " return pd.DataFrame.from_dict(df_download)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "l1BRpf9yiqTI" + }, + "source": [ + "# # df_mw_download = generate_download_locations(df_mw)\n", + "# df_eth_download = generate_download_locations(df_eth)\n", + "df_ng_download = generate_download_locations(df_ng)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "y5r_X3L2iqTK", + "outputId": "cbbf8934-ddb5-48b4-c614-1d66a28275df" + }, + "source": [ + "# df_mw_download.shape, df_eth_download.shape, df_ng_download.shape\n", + "df_ng_download.shape" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(33200, 7)" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 8 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Nhd481lriqTN" + }, + "source": [ + "# df_mw_download.head()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 204 + }, + "id": "SqvHNzWciqTO", + "outputId": 
"fb1b5833-4e0b-4809-ae6a-c73611d69391" + }, + "source": [ + "# df_eth_download.head()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
image_nameimage_latimage_loncluster_latcluster_loncons_pcnightlights
03.410784802784024_39.47107816189403_3.45570056...3.41078539.4710783.45570139.51599414.8546340.0
13.4257567241860163_39.47107816189403_3.4557005...3.42575739.4710783.45570139.51599414.8546340.0
23.440728645588008_39.47107816189403_3.45570056...3.44072939.4710783.45570139.51599414.8546340.0
33.45570056699_39.47107816189403_3.45570056699_...3.45570139.4710783.45570139.51599414.8546340.0
43.470672488391992_39.47107816189403_3.45570056...3.47067239.4710783.45570139.51599414.8546340.0
\n", + "
" + ], + "text/plain": [ + " image_name ... nightlights\n", + "0 3.410784802784024_39.47107816189403_3.45570056... ... 0.0\n", + "1 3.4257567241860163_39.47107816189403_3.4557005... ... 0.0\n", + "2 3.440728645588008_39.47107816189403_3.45570056... ... 0.0\n", + "3 3.45570056699_39.47107816189403_3.45570056699_... ... 0.0\n", + "4 3.470672488391992_39.47107816189403_3.45570056... ... 0.0\n", + "\n", + "[5 rows x 7 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 11 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 204 + }, + "id": "unWw1QyTiqTO", + "outputId": "98877468-1022-418f-95c9-3bc26c2bb0a1" + }, + "source": [ + "df_ng_download.head()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
image_nameimage_latimage_loncluster_latcluster_loncons_pcnightlights
04.270870351534024_6.223837135554024_4.31578611...4.2708706.2238374.3157866.2687534.3177170.123354
14.285842272936016_6.223837135554024_4.31578611...4.2858426.2238374.3157866.2687534.3177170.123354
24.300814194338008_6.223837135554024_4.31578611...4.3008146.2238374.3157866.2687534.3177170.123354
34.31578611574_6.223837135554024_4.31578611574_...4.3157866.2238374.3157866.2687534.3177170.123354
44.330758037141992_6.223837135554024_4.31578611...4.3307586.2238374.3157866.2687534.3177170.123354
\n", + "
" + ], + "text/plain": [ + " image_name ... nightlights\n", + "0 4.270870351534024_6.223837135554024_4.31578611... ... 0.123354\n", + "1 4.285842272936016_6.223837135554024_4.31578611... ... 0.123354\n", + "2 4.300814194338008_6.223837135554024_4.31578611... ... 0.123354\n", + "3 4.31578611574_6.223837135554024_4.31578611574_... ... 0.123354\n", + "4 4.330758037141992_6.223837135554024_4.31578611... ... 0.123354\n", + "\n", + "[5 rows x 7 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 9 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "xf6dj5P8iqTP" + }, + "source": [ + "# # df_mw_download['country'] = 'mw'\n", + "# df_eth_download['country'] = 'eth'\n", + "df_ng_download['country'] = 'ng'" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "LW49lKmliqTP" + }, + "source": [ + "# for image download purposes the country distinction is irreleveant\n", + "# df_potential_download = pd.concat([df_mw_download, df_eth_download, df_ng_download], axis=0)\n", + "df_potential_download = df_ng_download\n", + "df_potential_download.reset_index(drop=True, inplace=True)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 204 + }, + "id": "DkrzvDA3iqTP", + "outputId": "fce118b0-c801-4948-9ebe-f3782f2433d3" + }, + "source": [ + "df_potential_download.head()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
image_nameimage_latimage_loncluster_latcluster_loncons_pcnightlightscountry
04.270870351534024_6.223837135554024_4.31578611...4.2708706.2238374.3157866.2687534.3177170.123354ng
14.285842272936016_6.223837135554024_4.31578611...4.2858426.2238374.3157866.2687534.3177170.123354ng
24.300814194338008_6.223837135554024_4.31578611...4.3008146.2238374.3157866.2687534.3177170.123354ng
34.31578611574_6.223837135554024_4.31578611574_...4.3157866.2238374.3157866.2687534.3177170.123354ng
44.330758037141992_6.223837135554024_4.31578611...4.3307586.2238374.3157866.2687534.3177170.123354ng
\n", + "
" + ], + "text/plain": [ + " image_name ... country\n", + "0 4.270870351534024_6.223837135554024_4.31578611... ... ng\n", + "1 4.285842272936016_6.223837135554024_4.31578611... ... ng\n", + "2 4.300814194338008_6.223837135554024_4.31578611... ... ng\n", + "3 4.31578611574_6.223837135554024_4.31578611574_... ... ng\n", + "4 4.330758037141992_6.223837135554024_4.31578611... ... ng\n", + "\n", + "[5 rows x 8 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 12 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tkvD_SW7iqTQ" + }, + "source": [ + "# Filter Download Choices" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "g8TQCFGLiqTQ", + "outputId": "a2ccf8d4-f34a-432a-a3ca-68c53b7dcf56" + }, + "source": [ + "df_potential_download['nightlights'].max()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "67.03113555908203" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 13 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "4GnrymuoiqTQ", + "outputId": "4c7e5d49-97b9-44c1-8337-ed23fed5f93a" + }, + "source": [ + "# most nightlights are 0\n", + "# let's download images that have nonzero nightlights to induce variety into the model\n", + "print((df_potential_download['nightlights'] == 0).mean())\n", + "print((df_potential_download['nightlights'] <= 2).mean())" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "0.23343373493975902\n", + "0.786144578313253\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "6ibURIDaiqTR" + }, + "source": [ + "def drop_0s(df, fr=0.1):\n", + " \"\"\"\n", + " Solves for d:\n", + " (c_z - d)/(n - d) = fr\n", + " Where d = rows to drop, c_z = num rows with zero nightlights, n = num rows, fr = frac 
remaining\n", + " \n", + " Yields:\n", + " d = (c_z - n*fr) / (1 - fr)\n", + " \"\"\"\n", + " np.random.seed(RANDOM_SEED)\n", + " c_z = (df['nightlights']==0).sum()\n", + " n = len(df)\n", + " assert c_z / n > fr, print(f'Dataframe already has under {fr} zeros')\n", + " \n", + " d = (c_z - n * fr) / (1 - fr)\n", + " d = int(d)\n", + " print(f'dropping: {d}')\n", + " \n", + " zero_df = df[df['nightlights']==0]\n", + " zero_clusters = zero_df.groupby(['cluster_lat', 'cluster_lon'])\n", + " per_cluster_drop = int(d / len(zero_clusters))\n", + " print(f'Need to drop {per_cluster_drop} per cluster with 0 nightlights')\n", + " \n", + " drop_inds = []\n", + " for (cluster_lat, cluster_lon), group in zero_clusters:\n", + " z_inds = group.index\n", + " clust_drop = np.random.choice(z_inds, per_cluster_drop, replace=False)\n", + " assert len(group) - len(clust_drop) >= MIN_IMAGES_PER_CLUSTER, print(f'dropping too many in {cluster_lat}, {cluster_lon}')\n", + " drop_inds += clust_drop.tolist()\n", + " \n", + " # this is how you do it purely randomly but some clusters might get wiped out\n", + " # z_inds = np.argwhere(df['nightlights'].values == 0).reshape(-1)\n", + " # drop_inds = np.random.choice(z_inds, d, replace=False)\n", + " return df.drop(drop_inds).reset_index(drop=True)\n", + "\n", + "def drop_in_range(df, lower=0, upper=2, fr=0.25):\n", + " \"\"\"\n", + " Very similar to drop_0s calculation, but more generalized. 
Lower and upper are inclusive.\n", + " \"\"\"\n", + " np.random.seed(RANDOM_SEED)\n", + " boolean_idx = ((lower <= df['nightlights']) & (df['nightlights'] <= upper))\n", + " c_under = boolean_idx.sum()\n", + " n = len(df)\n", + " assert c_under / n > fr, print(f'Dataframe already has under {fr} rows in the given range')\n", + " \n", + " d = (c_under - n * fr) / (1 - fr)\n", + " d = int(d)\n", + " print(f'dropping: {d}')\n", + " \n", + " select_df = df[boolean_idx]\n", + " select_clusters = select_df.groupby(['cluster_lat', 'cluster_lon'])\n", + " per_cluster_drop = int(d / len(select_clusters))\n", + " print(f'Need to drop {per_cluster_drop} per cluster in the given range')\n", + " \n", + " drop_inds = []\n", + " for (cluster_lat, cluster_lon), group in select_clusters:\n", + " z_inds = group.index\n", + " clust_drop = np.random.choice(z_inds, per_cluster_drop, replace=False)\n", + " assert len(group) - len(clust_drop) >= MIN_IMAGES_PER_CLUSTER, print(f'dropping too many in {cluster_lat}, {cluster_lon}')\n", + " drop_inds += clust_drop.tolist()\n", + " \n", + " return df.drop(drop_inds).reset_index(drop=True)\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wkVCur9diqTS", + "outputId": "2b60cb24-9ca3-4f75-f091-ae9610d99893" + }, + "source": [ + "df_mod_download = drop_0s(df_potential_download, fr=0.1)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "dropping: 4922\n", + "Need to drop 31 per cluster with 0 nightlights\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "o5bCJVKtiqTT", + "outputId": "2dabd475-8549-425f-a657-dcc4dc02db3c" + }, + "source": [ + "(df_mod_download['nightlights'] == 0).mean()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + 
"text/plain": [ + "0.10371544285965839" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 17 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "i5lOnFRbiqTT", + "outputId": "e642dc2e-3afb-41b5-b557-7467eb850275" + }, + "source": [ + "df_mod_download = drop_in_range(df_mod_download, lower=0.001, upper=3, fr=0.4)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "dropping: 14153\n", + "Need to drop 35 per cluster in the given range\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "uljZ8WWGiqTT", + "outputId": "a5fc0874-f106-4219-f05b-31e4ed183dd3" + }, + "source": [ + "((0.001 <= df_mod_download['nightlights']) & (df_mod_download['nightlights'] <= 3)).mean()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0.4106896551724138" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 19 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "z70U-yLhiqTT", + "outputId": "a24e2172-f6a8-4060-9ab0-927ab98da944" + }, + "source": [ + "# this has gone up now though\n", + "(df_mod_download['nightlights'] == 0).mean()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0.20310344827586208" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 20 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "3gj6x123iqTU", + "outputId": "b14af133-13ee-4684-fd93-4c25d27deab6" + }, + "source": [ + "df_mod_download = drop_0s(df_mod_download, fr=0.2)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "dropping: 56\n", + "Need to drop 0 
per cluster with 0 nightlights\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "chGJ_9byiqTU" + }, + "source": [ + "At this point the low nightlight clusters (0 and under 3) have 11 and 12 images respectively, meaning very few more images can be dropped without going under 10." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "hxY4yTXjiqTU" + }, + "source": [ + "from sklearn.mixture import GaussianMixture as GMM\n", + "X = df_mod_download['nightlights'].values.reshape(-1,1)\n", + "gmm = GMM(n_components=3).fit(X)\n", + "labels = gmm.predict(df_mod_download['nightlights'].values.reshape(-1,1))" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "qSI-sY6tiqTU", + "outputId": "db7934df-3db8-4109-9e2a-906d57c79bf4" + }, + "source": [ + "(labels==0).mean(), (labels==1).mean(), (labels==2).mean()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(0.57, 0.017241379310344827, 0.4127586206896552)" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 23 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "CFymdGg8iqTU", + "outputId": "b7567870-f606-406c-cc79-449aefe29806" + }, + "source": [ + "# these are the cutoffs for the labels identified by the Gaussian Mixture Model\n", + "label0_max = df_mod_download['nightlights'][labels==0].max()\n", + "label1_max = df_mod_download['nightlights'][labels==1].max()\n", + "label2_max = df_mod_download['nightlights'][labels==2].max()\n", + "\n", + "label0_max, label1_max, label2_max" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(16.685375213623047, 67.03113555908203, 0.04805320128798485)" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count":
24 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Llb78MTviqTV" + }, + "source": [ + "# I am going to hand reassign these to have better representation among all three classes\n", + "# these are not ideal distributions obviously but the model should still be able to learn\n", + "# something like a quantile cut might work better and be less arbitrary, but for reproducibility \n", + "# purposes I'll stick to the GMM-based approach\n", + "label0_max = 0.05\n", + "label1_max = 5\n", + "label2_max = 70" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "VSZAzq5uiqTV", + "outputId": "2af58315-d0d2-4a36-9ada-a6a45ab25d70" + }, + "source": [ + "def query_df(df, lower, upper):\n", + " return df[((lower <= df['nightlights']) & (df['nightlights'] < upper))]\n", + "\n", + "print(len(query_df(df_mod_download, 0, label0_max)) / len(df_mod_download))\n", + "print(len(query_df(df_mod_download, label0_max, label1_max)) / len(df_mod_download))\n", + "print(len(query_df(df_mod_download, label1_max, label2_max)) / len(df_mod_download))" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "0.41379310344827586\n", + "0.3586206896551724\n", + "0.22758620689655173\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ErMOpsbjiqTV", + "outputId": "62f374b0-13ad-402d-f908-3f68f5a29b3b" + }, + "source": [ + "def create_nightlights_bin(df, cutoffs):\n", + " assert len(cutoffs) >= 2, print('need at least 2 bins')\n", + " cutoffs = sorted(cutoffs, reverse=True)\n", + " labels = list(range(len(cutoffs)))[::-1]\n", + " df['nightlights_bin'] = len(cutoffs)\n", + " for cutoff, label in zip(cutoffs, labels):\n", + " df['nightlights_bin'].loc[df['nightlights'] <= cutoff] = label\n", + "\n", + "df_download = df_mod_download.copy()\n", +
"create_nightlights_bin(df_download, cutoffs=[label0_max, label1_max, label2_max])" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py:670: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " iloc._setitem_with_indexer(indexer, value)\n", + "/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py:670: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " iloc._setitem_with_indexer(indexer, value)\n", + "/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py:670: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " iloc._setitem_with_indexer(indexer, value)\n" + ], + "name": "stderr" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "WPirq5YKiqTW", + "outputId": "02da8141-0df2-4257-9869-781cd5f0a065" + }, + "source": [ + "# these should match above\n", + "(df_download['nightlights_bin']==0).mean(), (df_download['nightlights_bin']==1).mean(), (df_download['nightlights_bin']==2).mean()\n" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(0.41379310344827586, 0.3586206896551724, 0.22758620689655173)" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 28 + } + ] + }, + { + "cell_type": 
"code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Ug1a-EotiqTW", + "outputId": "a24a3ff2-9057-4628-8a2b-30af7aaef499" + }, + "source": [ + "df_download.shape" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(14500, 9)" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 29 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "lvUEfdJiiqTW" + }, + "source": [ + "os.makedirs(PROCESSED_DIR, exist_ok=True)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "n0fk443siqTX" + }, + "source": [ + "df_download.to_csv(os.path.join(PROCESSED_DIR, 'image_download_locs.csv'), index=False)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "87oQFeHbiqTX" + }, + "source": [ + "# Download Images\n", + "If the script breaks, you can restart here by uncommenting the line below and running the below code again. It won't download images you have already downloaded." 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Z_hZTdiiiqTX" + }, + "source": [ + "# df_download = pd.read_csv(os.path.join(PROCESSED_DIR, 'image_download_locs.csv'))" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "LKqxVTZRiqTX" + }, + "source": [ + "# you can try the google downloader if you don't have the planet API key\n", + "# the tradeoff is that planet's data can be queried with time, but Google's images\n", + "# are higher res\n", + "from utils import PlanetDownloader" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 286 + }, + "id": "rPdAoDDeiqTY", + "outputId": "8a30ef75-9283-4d4a-c80e-fa052d92dafc" + }, + "source": [ + "# this demonstrates the API call\n", + "lat = 38.441332\n", + "lon = -105.234751\n", + "min_year = 2016\n", + "min_month = 1\n", + "max_year = 2016\n", + "max_month = 12\n", + "\n", + "access = open(ACCESS_TOKEN_DIR, 'r').readlines()[0].strip()\n", + "pd = PlanetDownloader(access)\n", + "plt.imshow(pd.download_image(lat, lon, min_year, min_month, max_year, max_month))" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 33 + }, + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [] + } + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "o0cMhBHdiqTY" + }, + "source": [ + "def download_images(df):\n", + " \"\"\"\n", + " Download images using a pandas DataFrame that has \"image_lat\", \"image_lon\", \"image_name\", \"country\" as columns\n", + " \n", + " Saves images to the corresponding country's images folder\n", + "\n", + " To use the Google Downloader, switch PlanetDownloader to GoogleDownloader and make imd.download_image only\n", + " provide lat and lon as arguments. Use zoom = 16.\n", + " \"\"\"\n", + " access = None\n", + " with open(ACCESS_TOKEN_DIR, 'r') as f:\n", + " access = f.readlines()[0]\n", + " imd = PlanetDownloader(access)\n", + "# imd = GoogleDownloader(access)\n", + " num_retries = 20\n", + " wait_time = 0.1 # seconds\n", + "\n", + " # drops what is already downloaded\n", + " already_downloaded = os.listdir(os.path.join(COUNTRIES_DIR, 'malawi_2016', 'images')) + \\\n", + " os.listdir(os.path.join(COUNTRIES_DIR, 'ethiopia_2015', 'images')) + \\\n", + " os.listdir(os.path.join(COUNTRIES_DIR, 'nigeria_2015', 'images'))\n", + " already_downloaded = list(set(already_downloaded).intersection(set(df['image_name'])))\n", + " print('Already downloaded ' + str(len(already_downloaded)))\n", + " df = df.set_index('image_name').drop(already_downloaded).reset_index()\n", + " print('Need to download ' + str(len(df)))\n", + " # use three years of images to find one that matches search critera\n", + " min_year = 2014\n", + " min_month = 1\n", + " max_year = 2016\n", + " max_month = 12\n", + " for _, r in tqdm(df.iterrows(), total=df.shape[0]):\n", + " lat = r.image_lat\n", + " lon = r.image_lon\n", + " zoom = 50\n", + " name = r.image_name\n", + " country_dir = None\n", + " if r.country == 'mw':\n", + " country_dir = 'malawi_2016'\n", + " elif r.country == 'eth':\n", + " country_dir = 'ethiopia_2015'\n", + " elif r.country == 'ng':\n", + " country_dir = 'nigeria_2015'\n", + " else:\n", + 
" print(f\"unrecognized country: {r.country}\")\n", + " raise ValueError()\n", + " image_save_path = os.path.join(COUNTRIES_DIR, country_dir, 'images', r.image_name)\n", + " try:\n", + " im = imd.download_image(lat, lon, min_year, min_month, max_year, max_month)\n", + " if (type(im) == str and im == 'RETRY') or im is None:\n", + " resolved = False\n", + " for _ in range(num_retries):\n", + " time.sleep(wait_time)\n", + " im = imd.download_image(lat, lon, min_year, min_month, max_year, max_month)\n", + " if (type(im) == str and im == 'RETRY') or im is None:\n", + " continue\n", + " else:\n", + " plt.imsave(image_save_path, im)\n", + " resolved = True\n", + " break\n", + " if not resolved:\n", + " print(f'Could not download {lat}, {lon} despite several retries and waiting')\n", + " continue\n", + " else:\n", + " pass\n", + " else:\n", + " # no issues, save according to naming convention\n", + " plt.imsave(image_save_path, im)\n", + "\n", + " except Exception as e:\n", + " logging.error(f\"Error-could not download {lat}, {lon}\", exc_info=True)\n", + " continue" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "background_save": true, + "base_uri": "https://localhost:8080/", + "height": 1000, + "referenced_widgets": [ + "eb03e01e458a4797a88152b874f376e3", + "7c0a93896bc04018beac3d9d8634fa5e", + "516f34ee82664b1d8d9b385efa35eaa7", + "8f50de1322db49869a25e53ad103cdfe", + "e8a9e60dfc8341a18d60d9aa25fcf4c4", + "bec8e5c2800b433394e2c0856dd0d526", + "466113287a364f27bdc04158b27555bf", + "dee1210a038945c9aa01296d060eaa6b" + ] + }, + "id": "5bQ9hanyiqTZ", + "outputId": "d51b54bd-146f-4092-cb08-becd932f7f66" + }, + "source": [ + "download_images(df_download)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Already downloaded 0\n", + "Need to download 14500\n" + ], + "name": "stdout" + }, + { + "output_type": "display_data", + "data": { + 
"application/vnd.jupyter.widget-view+json": { + "model_id": "eb03e01e458a4797a88152b874f376e3", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, max=14500.0), HTML(value='')))" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "stream", + "text": [ + "Could not download 4.31578611574, 6.223837135554024 despite several retries and waiting\n", + "Could not download 4.330758037141992, 6.223837135554024 despite several retries and waiting\n", + "Could not download 4.285842272936016, 6.238809056956016 despite several retries and waiting\n", + "Could not download 4.345729958543984, 6.253780978358008 despite several retries and waiting\n", + "Could not download 4.300814194338008, 6.26875289976 despite several retries and waiting\n", + "Could not download 4.330758037141992, 6.26875289976 despite several retries and waiting\n", + "Could not download 4.345729958543984, 6.26875289976 despite several retries and waiting\n", + "Could not download 4.345729958543984, 6.313668663965976 despite several retries and waiting\n", + "Could not download 4.283803377034025, 6.263256104344025 despite several retries and waiting\n", + "Could not download 4.283803377034025, 6.278228025746017 despite several retries and waiting\n", + "Could not download 4.3287191412400015, 6.278228025746017 despite several retries and waiting\n", + "Could not download 4.313747219838009, 6.3081718685500014 despite several retries and waiting\n", + "Could not download 4.343691062641994, 6.3231437899519936 despite several retries and waiting\n", + "Could not download 4.283803377034025, 6.338115711353986 despite several retries and waiting\n", + "Could not download 4.298775298436017, 6.338115711353986 despite several retries and waiting\n", + "Could not download 4.313747219838009, 6.353087632755978 despite several retries and waiting\n", + "Could not download 4.358662984043986, 6.353087632755978 despite several retries and waiting\n", + 
"Could not download 4.373634905445978, 6.353087632755978 despite several retries and waiting\n", + "Could not download 4.290142273407084, 6.296692183585628 despite several retries and waiting\n", + "Could not download 4.443342532885977, 7.139046639044024 despite several retries and waiting\n", + "Could not download 4.440163648431993, 7.122018824984023 despite several retries and waiting\n", + "Could not download 4.455135569833985, 7.122018824984023 despite several retries and waiting\n", + "Could not download 4.440163648431993, 7.136990746386015 despite several retries and waiting\n", + "Could not download 4.455135569833985, 7.136990746386015 despite several retries and waiting\n", + "Could not download 4.438511483134734, 7.12623269530718 despite several retries and waiting\n", + "Could not download 4.574461166034023, 7.640029737454023 despite several retries and waiting\n", + "Could not download 4.634348851641992, 7.640029737454023 despite several retries and waiting\n", + "Could not download 4.574461166034023, 7.6550016588560155 despite several retries and waiting\n", + "Could not download 4.634348851641992, 7.6550016588560155 despite several retries and waiting\n", + "Could not download 4.6044050088380075, 7.68494550166 despite several retries and waiting\n", + "Could not download 4.634348851641992, 7.714889344463984 despite several retries and waiting\n", + "Could not download 4.649320773043984, 7.714889344463984 despite several retries and waiting\n", + "Could not download 4.649320773043984, 7.729861265865976 despite several retries and waiting\n", + "Could not download 4.633468852209999, 7.656062334554024 despite several retries and waiting\n", + "Could not download 4.588553088004023, 7.671034255956016 despite several retries and waiting\n", + "Could not download 4.663412695013983, 7.686006177358008 despite several retries and waiting\n", + "Could not download 4.648440773611991, 7.715950020161992 despite several retries and waiting\n", + "Could not download 
4.6215271599577346, 7.657246030680976 despite several retries and waiting\n", + "Could not download 4.7087128449559765, 7.287448788724023 despite several retries and waiting\n", + "Could not download 4.643263203974024, 7.663839143824022 despite several retries and waiting\n", + "Could not download 4.658235125376016, 7.678811065226014 despite several retries and waiting\n", + "Could not download 4.68817896818, 7.678811065226014 despite several retries and waiting\n", + "Could not download 4.733094732385976, 7.708754908029999 despite several retries and waiting\n", + "Could not download 4.643263203974024, 7.723726829431991 despite several retries and waiting\n", + "Could not download 4.718122810983984, 7.723726829431991 despite several retries and waiting\n", + "Could not download 4.703150889581992, 7.753670672235975 despite several retries and waiting\n", + "Could not download 4.718122810983984, 7.753670672235975 despite several retries and waiting\n", + "Could not download 4.70728140817, 6.043187026464023 despite several retries and waiting\n", + "Could not download 4.722253329571992, 6.043187026464023 despite several retries and waiting\n", + "Could not download 4.722253329571992, 6.058158947866015 despite several retries and waiting\n", + "Could not download 4.752197172375976, 6.058158947866015 despite several retries and waiting\n", + "Could not download 4.737225250973984, 6.073130869268007 despite several retries and waiting\n", + "Could not download 4.70728140817, 6.088102790669999 despite several retries and waiting\n", + "Could not download 4.752197172375976, 6.088102790669999 despite several retries and waiting\n", + "Could not download 4.692309486768008, 6.103074712071991 despite several retries and waiting\n", + "Could not download 4.70728140817, 6.103074712071991 despite several retries and waiting\n", + "Could not download 4.722253329571992, 6.103074712071991 despite several retries and waiting\n", + "Could not download 4.737225250973984, 
6.103074712071991 despite several retries and waiting\n", + "Could not download 4.752197172375976, 6.118046633473983 despite several retries and waiting\n", + "Could not download 4.747722175680069, 6.044642939388144 despite several retries and waiting\n", + "Could not download 4.699336230828008, 7.212831141355976 despite several retries and waiting\n", + "Could not download 4.71430815223, 7.212831141355976 despite several retries and waiting\n", + "Could not download 4.699734956834828, 7.212242098215538 despite several retries and waiting\n", + "Could not download 4.74375535836, 6.822015620135977 despite several retries and waiting\n", + "Could not download 4.758727279761992, 6.822015620135977 despite several retries and waiting\n", + "Could not download 4.717062213806016, 7.273772634216016 despite several retries and waiting\n", + "Could not download 4.776949899413984, 7.273772634216016 despite several retries and waiting\n", + "Could not download 4.717062213806016, 7.288744555618008 despite several retries and waiting\n", + "Could not download 4.74700605661, 7.30371647702 despite several retries and waiting\n", + "Could not download 4.791921820815976, 7.318688398421992 despite several retries and waiting\n", + "Could not download 4.715844019424024, 7.209912888695976 despite several retries and waiting\n", + "Could not download 4.76758083369, 6.943311839634024 despite several retries and waiting\n", + "Could not download 4.782552755091992, 6.943311839634024 despite several retries and waiting\n", + "Could not download 4.7975246764939845, 6.943311839634024 despite several retries and waiting\n", + "Could not download 4.812496597895977, 6.973255682438008 despite several retries and waiting\n", + "Could not download 4.730980759024024, 6.8695305684639845 despite several retries and waiting\n", + "Could not download 4.730980759024024, 6.884502489865977 despite several retries and waiting\n", + "Could not download 4.813704083363985, 6.976052903154025 despite several 
retries and waiting\n", + "Could not download 4.828676004765977, 6.976052903154025 despite several retries and waiting\n", + "Could not download 4.774767226928008, 6.352828032855976 despite several retries and waiting\n", + "Could not download 4.78973914833, 6.352828032855976 despite several retries and waiting\n", + "Could not download 4.804711069731992, 6.352828032855976 despite several retries and waiting\n", + "Could not download 4.763027457364024, 6.928135920174024 despite several retries and waiting\n", + "Could not download 4.777999378766016, 6.928135920174024 despite several retries and waiting\n", + "Could not download 4.792971300168008, 6.928135920174024 despite several retries and waiting\n", + "Could not download 4.763027457364024, 6.943107841576016 despite several retries and waiting\n", + "Could not download 4.777999378766016, 6.943107841576016 despite several retries and waiting\n", + "Could not download 4.792971300168008, 6.943107841576016 despite several retries and waiting\n", + "Could not download 4.80794322157, 6.973051684380001 despite several retries and waiting\n", + "Could not download 4.8229151429719925, 6.973051684380001 despite several retries and waiting\n", + "Could not download 4.837887064373985, 6.973051684380001 despite several retries and waiting\n", + "Could not download 4.8113188547300005, 6.965991353474024 despite several retries and waiting\n", + "Could not download 4.826290776131993, 6.965991353474024 despite several retries and waiting\n", + "Could not download 4.841262697533985, 6.965991353474024 despite several retries and waiting\n", + "Could not download 4.8113188547300005, 6.980963274876016 despite several retries and waiting\n", + "Could not download 4.826290776131993, 6.980963274876016 despite several retries and waiting\n", + "Could not download 4.841262697533985, 6.980963274876016 despite several retries and waiting\n", + "Could not download 4.767380794814023, 8.161771710364023 despite several retries and waiting\n", 
+ "Could not download 4.797324637618007, 8.161771710364023 despite several retries and waiting\n", + "Could not download 4.812296559019999, 8.161771710364023 despite several retries and waiting\n", + "Could not download 4.782352716216015, 8.176743631766016 despite several retries and waiting\n", + "Could not download 4.797324637618007, 8.176743631766016 despite several retries and waiting\n" + ], + "name": "stdout" + }, + { + "output_type": "stream", + "text": [ + "ERROR:root:Error-could not download 4.812296559019999, 8.176743631766016\n", + "Traceback (most recent call last):\n", + " File \"\", line 46, in download_images\n", + " im = imd.download_image(lat, lon, min_year, min_month, max_year, max_month)\n", + " File \"gdrive/MyDrive/geo/utils/planet_downloader.py\", line 95, in download_image\n", + " res = json.loads(result.text)\n", + " File \"/usr/lib/python3.7/json/__init__.py\", line 348, in loads\n", + " return _default_decoder.decode(s)\n", + " File \"/usr/lib/python3.7/json/decoder.py\", line 337, in decode\n", + " obj, end = self.raw_decode(s, idx=_w(s, 0).end())\n", + " File \"/usr/lib/python3.7/json/decoder.py\", line 355, in raw_decode\n", + " raise JSONDecodeError(\"Expecting value\", s, err.value) from None\n", + "json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)\n" + ], + "name": "stderr" + }, + { + "output_type": "stream", + "text": [ + "Could not download 4.827268480421991, 8.176743631766016 despite several retries and waiting\n", + "Could not download 4.767380794814023, 8.20668747457 despite several retries and waiting\n", + "Could not download 4.782352716216015, 8.20668747457 despite several retries and waiting\n", + "Could not download 4.797324637618007, 8.20668747457 despite several retries and waiting\n", + "Could not download 4.797324637618007, 8.221659395971992 despite several retries and waiting\n", + "Could not download 4.782352716216015, 8.236631317373984 despite several retries and waiting\n", + "Could not 
download 4.797324637618007, 8.236631317373984 despite several retries and waiting\n", + "Could not download 4.782352716216015, 8.251603238775976 despite several retries and waiting\n", + "Could not download 4.782409637894023, 8.052724953836012 despite several retries and waiting\n", + "Could not download 4.8123534806980075, 8.082668796639997 despite several retries and waiting\n", + "Could not download 4.8273254021, 8.082668796639997 despite several retries and waiting\n", + "Could not download 4.842297323501992, 8.097640718041989 despite several retries and waiting\n", + "Could not download 4.842297323501992, 8.11261263944398 despite several retries and waiting\n", + "Could not download 4.857269244903984, 8.11261263944398 despite several retries and waiting\n", + "Could not download 4.842297323501992, 8.127584560845973 despite several retries and waiting\n", + "Could not download 4.822758349486196, 8.066377107545561 despite several retries and waiting\n", + "Could not download 4.858164785873984, 6.610742571134024 despite several retries and waiting\n", + "Could not download 4.813249021668008, 6.640686413938008 despite several retries and waiting\n", + "Could not download 4.82822094307, 6.640686413938008 despite several retries and waiting\n", + "Could not download 4.873136707275976, 6.700574099545976 despite several retries and waiting\n", + "Could not download 4.794197859974024, 6.932358914714023 despite several retries and waiting\n", + "Could not download 4.809169781376016, 6.97727467892 despite several retries and waiting\n", + "Could not download 4.824141702778008, 6.97727467892 despite several retries and waiting\n", + "Could not download 4.83911362418, 6.97727467892 despite several retries and waiting\n", + "Could not download 4.803255570534024, 6.942066615554024 despite several retries and waiting\n", + "Could not download 4.818227491936016, 6.9720104583580085 despite several retries and waiting\n", + "Could not download 4.833199413338008, 
6.9720104583580085 despite several retries and waiting\n", + "Could not download 4.84817133474, 6.9720104583580085 despite several retries and waiting\n", + "Could not download 4.818227491936016, 6.986982379760001 despite several retries and waiting\n", + "Could not download 4.833199413338008, 6.986982379760001 despite several retries and waiting\n", + "Could not download 4.84817133474, 6.986982379760001 despite several retries and waiting\n", + "Could not download 4.834944730758007, 7.326533303524024 despite several retries and waiting\n", + "Could not download 4.819972809356015, 7.416364831935977 despite several retries and waiting\n", + "Could not download 4.904933960635977, 7.043627224258009 despite several retries and waiting\n", + "Could not download 4.819735270454023, 7.834977320648008 despite several retries and waiting\n", + "Could not download 4.879622956061992, 7.894865006255976 despite several retries and waiting\n", + "Could not download 4.843150984816015, 6.867310670296015 despite several retries and waiting\n", + "Could not download 4.9030386704239834, 6.9421702773059755 despite several retries and waiting\n", + "Could not download 4.846246607926016, 6.786322199314023 despite several retries and waiting\n", + "Could not download 4.9061342935339844, 6.786322199314023 despite several retries and waiting\n", + "Could not download 4.861218529328008, 6.801294120716015 despite several retries and waiting\n", + "Could not download 4.831274686524024, 6.861181806323984 despite several retries and waiting\n", + "Could not download 4.8986575724, 7.034441365283985 despite several retries and waiting\n", + "Could not download 4.913629493801992, 7.034441365283985 despite several retries and waiting\n", + "Could not download 4.9286014152039845, 7.034441365283985 despite several retries and waiting\n", + "Could not download 4.8986575724, 7.049413286685977 despite several retries and waiting\n", + "Could not download 4.913629493801992, 7.049413286685977 despite 
several retries and waiting\n", + "Could not download 4.9286014152039845, 7.049413286685977 despite several retries and waiting\n", + "Could not download 4.903783503998008, 6.199261082994024 despite several retries and waiting\n", + "Could not download 4.888811582596016, 6.229204925798008 despite several retries and waiting\n", + "Could not download 4.903783503998008, 6.229204925798008 despite several retries and waiting\n", + "Could not download 4.894441895506016, 6.936036111738007 despite several retries and waiting\n", + "Could not download 4.909413816908008, 6.936036111738007 despite several retries and waiting\n", + "Could not download 4.969301502515976, 6.995923797345975 despite several retries and waiting\n", + "Could not download 4.964420059423983, 7.937385238586015 despite several retries and waiting\n", + "Could not download 4.979391980825975, 7.937385238586015 despite several retries and waiting\n", + "Could not download 4.93711593292, 6.394170511863984 despite several retries and waiting\n", + "Could not download 4.93711593292, 6.409142433265976 despite several retries and waiting\n", + "Could not download 4.982031697125977, 6.409142433265976 despite several retries and waiting\n", + "Could not download 4.977858413253984, 7.7583190461 despite several retries and waiting\n", + "Could not download 4.977858413253984, 7.773290967501992 despite several retries and waiting\n", + "Could not download 4.94791457045, 7.803234810305976 despite several retries and waiting\n", + "Could not download 4.96225342141, 8.297358519786016 despite several retries and waiting\n", + "Could not download 4.96225342141, 8.312330441188008 despite several retries and waiting\n", + "Could not download 4.96225342141, 8.342274283991992 despite several retries and waiting\n", + "Could not download 4.977225342811992, 8.342274283991992 despite several retries and waiting\n", + "Could not download 4.977225342811992, 8.357246205393984 despite several retries and waiting\n", + "Could not 
download 4.974769912887915, 8.361608173193538 despite several retries and waiting\n", + "Could not download 4.96425337683, 5.77154382849 despite several retries and waiting\n", + "Could not download 4.9792252982319924, 5.77154382849 despite several retries and waiting\n", + "Could not download 4.919337612624024, 5.786515749891992 despite several retries and waiting\n", + "Could not download 4.934309534026016, 5.786515749891992 despite several retries and waiting\n", + "Could not download 4.949281455428008, 5.786515749891992 despite several retries and waiting\n", + "Could not download 5.009169141035977, 5.786515749891992 despite several retries and waiting\n", + "Could not download 4.919337612624024, 5.801487671293984 despite several retries and waiting\n", + "Could not download 4.934309534026016, 5.801487671293984 despite several retries and waiting\n", + "Could not download 4.949281455428008, 5.801487671293984 despite several retries and waiting\n", + "Could not download 4.96425337683, 5.801487671293984 despite several retries and waiting\n", + "Could not download 4.9792252982319924, 5.801487671293984 despite several retries and waiting\n", + "Could not download 4.919337612624024, 5.816459592695976 despite several retries and waiting\n", + "Could not download 4.934309534026016, 5.816459592695976 despite several retries and waiting\n", + "Could not download 4.949281455428008, 5.816459592695976 despite several retries and waiting\n", + "Could not download 4.96425337683, 5.816459592695976 despite several retries and waiting\n", + "Could not download 4.9792252982319924, 5.816459592695976 despite several retries and waiting\n", + "Could not download 4.96466286575, 7.774760404396016 despite several retries and waiting\n", + "Could not download 4.979634787151992, 7.774760404396016 despite several retries and waiting\n", + "Could not download 4.96679274568, 6.352797664331992 despite several retries and waiting\n", + "Could not download 4.96679274568, 6.367769585733984 
despite several retries and waiting\n", + "Could not download 4.921876981474024, 6.382741507135976 despite several retries and waiting\n", + "Could not download 4.936848902876016, 6.382741507135976 despite several retries and waiting\n", + "Could not download 4.951820824278008, 6.382741507135976 despite several retries and waiting\n", + "Could not download 4.966145979146016, 8.312583253214024 despite several retries and waiting\n", + "Could not download 4.966145979146016, 8.342527096018008 despite several retries and waiting\n", + "Could not download 5.026033664753984, 8.387442860223985 despite several retries and waiting\n", + "Could not download 5.041005586155976, 8.387442860223985 despite several retries and waiting\n", + "Could not download 4.973558081616016, 7.94191212047 despite several retries and waiting\n", + "Could not download 4.966674708914024, 7.934654492188008 despite several retries and waiting\n", + "Could not download 4.966674708914024, 7.94962641359 despite several retries and waiting\n", + "Could not download 5.028007334971992, 7.850132108061992 despite several retries and waiting\n", + "Could not download 5.01303541357, 7.880075950865976 despite several retries and waiting\n", + "Could not download 5.028043145781992, 8.298575720736016 despite several retries and waiting\n", + "Could not download 4.968155460174024, 8.313547642138008 despite several retries and waiting\n", + "Could not download 5.043015067183984, 8.358463406343985 despite several retries and waiting\n", + "Could not download 5.028043145781992, 8.373435327745977 despite several retries and waiting\n", + "Could not download 5.044115686533985, 6.499834221374025 despite several retries and waiting\n", + "Could not download 5.035767191028008, 8.294498340284022 despite several retries and waiting\n", + "Could not download 5.065711033831993, 8.294498340284022 despite several retries and waiting\n", + "Could not download 5.0507391124300005, 8.309470261686014 despite several retries and 
waiting\n", + "Could not download 5.0507391124300005, 8.324442183088006 despite several retries and waiting\n", + "Could not download 5.080682955233985, 8.324442183088006 despite several retries and waiting\n", + "Could not download 5.0507391124300005, 8.384329868695975 despite several retries and waiting\n", + "Could not download 5.05398782837, 8.377504333514024 despite several retries and waiting\n", + "Could not download 5.098903592575977, 8.377504333514024 despite several retries and waiting\n", + "Could not download 5.05398782837, 8.392476254916016 despite several retries and waiting\n", + "Could not download 5.05398782837, 8.407448176318008 despite several retries and waiting\n", + "Could not download 5.009072064164024, 8.42242009772 despite several retries and waiting\n", + "Could not download 5.024043985566016, 8.42242009772 despite several retries and waiting\n", + "Could not download 5.024043985566016, 8.437392019121992 despite several retries and waiting\n", + "Could not download 5.009072064164024, 8.452363940523984 despite several retries and waiting\n", + "Could not download 5.0689597497719925, 8.467335861925976 despite several retries and waiting\n", + "Could not download 5.011549953964024, 7.873707481204024 despite several retries and waiting\n", + "Could not download 5.101381482375976, 7.91862324541 despite several retries and waiting\n", + "Could not download 5.125065485025976, 6.801879419218008 despite several retries and waiting\n", + "Could not download 5.125065485025976, 6.81685134062 despite several retries and waiting\n", + "Could not download 5.065177799418008, 6.8318232620219925 despite several retries and waiting\n", + "Could not download 5.08014972082, 6.8318232620219925 despite several retries and waiting\n", + "Could not download 5.110093563623984, 6.8318232620219925 despite several retries and waiting\n", + "Could not download 5.035233956614023, 6.846795183423985 despite several retries and waiting\n", + "Could not download 
5.095121642221992, 6.846795183423985 despite several retries and waiting\n", + "Could not download 5.0625741540540234, 8.227348541615976 despite several retries and waiting\n", + "Could not download 5.190932250975976, 6.571307695204023 despite several retries and waiting\n", + "Could not download 5.116072643966016, 6.61622345941 despite several retries and waiting\n", + "Could not download 5.131044565368008, 6.61622345941 despite several retries and waiting\n", + "Could not download 5.160988408171992, 6.61622345941 despite several retries and waiting\n", + "Could not download 5.116072643966016, 6.631195380811992 despite several retries and waiting\n", + "Could not download 5.131044565368008, 6.631195380811992 despite several retries and waiting\n", + "Could not download 5.14601648677, 6.646167302213984 despite several retries and waiting\n", + "Could not download 5.190932250975976, 6.646167302213984 despite several retries and waiting\n", + "Could not download 5.1011007225640235, 6.661139223615976 despite several retries and waiting\n", + "Could not download 5.116072643966016, 6.661139223615976 despite several retries and waiting\n", + "Could not download 5.131044565368008, 6.661139223615976 despite several retries and waiting\n", + "Could not download 5.175960329573984, 6.661139223615976 despite several retries and waiting\n", + "Could not download 5.162194178291992, 6.314337852258008 despite several retries and waiting\n", + "Could not download 5.226314816306016, 6.574498277034024 despite several retries and waiting\n", + "Could not download 5.241286737708008, 6.574498277034024 despite several retries and waiting\n", + "Could not download 5.3011744233159765, 6.574498277034024 despite several retries and waiting\n", + "Could not download 5.226314816306016, 6.5894701984360164 despite several retries and waiting\n", + "Could not download 5.241286737708008, 6.5894701984360164 despite several retries and waiting\n", + "Could not download 5.3011744233159765, 
6.5894701984360164 despite several retries and waiting\n", + "Could not download 5.226314816306016, 6.6044421198380086 despite several retries and waiting\n", + "Could not download 5.241286737708008, 6.6044421198380086 despite several retries and waiting\n", + "Could not download 5.25625865911, 6.6044421198380086 despite several retries and waiting\n", + "Could not download 5.211342894904024, 6.619414041240001 despite several retries and waiting\n", + "Could not download 5.271230580511992, 6.619414041240001 despite several retries and waiting\n", + "Could not download 5.286202501913984, 6.619414041240001 despite several retries and waiting\n", + "Could not download 5.211342894904024, 6.634385962641993 despite several retries and waiting\n", + "Could not download 5.271230580511992, 6.634385962641993 despite several retries and waiting\n", + "Could not download 5.286202501913984, 6.634385962641993 despite several retries and waiting\n", + "Could not download 5.211342894904024, 6.649357884043985 despite several retries and waiting\n", + "Could not download 5.25625865911, 6.649357884043985 despite several retries and waiting\n", + "Could not download 5.211342894904024, 6.664329805445977 despite several retries and waiting\n", + "Could not download 5.226314816306016, 6.664329805445977 despite several retries and waiting\n", + "Could not download 5.241286737708008, 6.664329805445977 despite several retries and waiting\n", + "Could not download 5.25625865911, 6.664329805445977 despite several retries and waiting\n", + "Could not download 5.239647461556015, 6.259141996204024 despite several retries and waiting\n", + "Could not download 5.2546193829580075, 6.289085839008008 despite several retries and waiting\n", + "Could not download 5.26959130436, 6.289085839008008 despite several retries and waiting\n", + "Could not download 5.284563225761992, 6.289085839008008 despite several retries and waiting\n", + "Could not download 5.291371142401992, 7.751515845553983 despite 
several retries and waiting\n", + "Could not download 5.306343063803984, 7.751515845553983 despite several retries and waiting\n", + "Could not download 5.280780263364023, 6.851805846504025 despite several retries and waiting\n", + "Could not download 5.280780263364023, 6.866777767906017 despite several retries and waiting\n", + "Could not download 5.280780263364023, 6.881749689308009 despite several retries and waiting\n", + "Could not download 5.370611791775976, 6.926665453513985 despite several retries and waiting\n", + "Could not download 5.370611791775976, 6.941637374915977 despite several retries and waiting\n", + "Could not download 5.366012679523984, 8.380754045945976 despite several retries and waiting\n", + "Could not download 5.380809528853985, 7.371945324386016 despite several retries and waiting\n", + "Could not download 5.420351476265976, 7.0723601283380075 despite several retries and waiting\n", + "Could not download 5.448626963423983, 5.712169139884024 despite several retries and waiting\n", + "Could not download 5.463598884825975, 5.727141061286016 despite several retries and waiting\n", + "Could not download 5.388739277816015, 5.772056825491992 despite several retries and waiting\n", + "Could not download 5.403711199218007, 5.787028746893984 despite several retries and waiting\n", + "Could not download 5.418683120619999, 5.802000668295976 despite several retries and waiting\n", + "Could not download 5.483399643465976, 6.168482320814023 despite several retries and waiting\n", + "Could not download 5.44315706398, 7.022593698656015 despite several retries and waiting\n", + "Could not download 5.4880728281859765, 7.022593698656015 despite several retries and waiting\n", + "Could not download 5.401324742004023, 6.981277471266015 despite several retries and waiting\n", + "Could not download 5.431268584808008, 6.996249392668007 despite several retries and waiting\n", + "Could not download 5.491156270415976, 7.026193235471991 despite several retries and 
waiting\n", + "Could not download 5.413769956224024, 7.192073491756017 despite several retries and waiting\n", + "Could not download 5.443713799028008, 7.222017334560001 despite several retries and waiting\n", + "Could not download 5.478065316348006, 6.1759533895240235 despite several retries and waiting\n", + "Could not download 5.537953001955975, 6.1759533895240235 despite several retries and waiting\n", + "Could not download 5.508009159151991, 6.190925310926016 despite several retries and waiting\n", + "Could not download 5.522981080553983, 6.205897232328008 despite several retries and waiting\n", + "Could not download 5.470082359546016, 7.231366137513985 despite several retries and waiting\n", + "Could not download 5.470082359546016, 7.246338058915977 despite several retries and waiting\n", + "Could not download 5.471188130634023, 5.663180244784024 despite several retries and waiting\n", + "Could not download 5.486160052036015, 5.663180244784024 despite several retries and waiting\n", + "Could not download 5.5011319734380075, 5.663180244784024 despite several retries and waiting\n", + "Could not download 5.51610389484, 5.663180244784024 despite several retries and waiting\n", + "Could not download 5.531075816241992, 5.663180244784024 despite several retries and waiting\n", + "Could not download 5.546047737643984, 5.663180244784024 despite several retries and waiting\n", + "Could not download 5.471188130634023, 5.678152166186016 despite several retries and waiting\n", + "Could not download 5.486160052036015, 5.678152166186016 despite several retries and waiting\n", + "Could not download 5.5011319734380075, 5.678152166186016 despite several retries and waiting\n", + "Could not download 5.51610389484, 5.678152166186016 despite several retries and waiting\n", + "Could not download 5.471188130634023, 5.693124087588008 despite several retries and waiting\n", + "Could not download 5.486160052036015, 5.693124087588008 despite several retries and waiting\n", + "Could 
not download 5.5011319734380075, 5.693124087588008 despite several retries and waiting\n", + "Could not download 5.51610389484, 5.693124087588008 despite several retries and waiting\n", + "Could not download 5.471188130634023, 5.70809600899 despite several retries and waiting\n", + "Could not download 5.486160052036015, 5.70809600899 despite several retries and waiting\n", + "Could not download 5.5011319734380075, 5.70809600899 despite several retries and waiting\n", + "Could not download 5.51610389484, 5.70809600899 despite several retries and waiting\n", + "Could not download 5.471188130634023, 5.723067930391992 despite several retries and waiting\n", + "Could not download 5.486160052036015, 5.738039851793984 despite several retries and waiting\n", + "Could not download 5.5011319734380075, 5.738039851793984 despite several retries and waiting\n", + "Could not download 5.486160052036015, 5.7530117731959765 despite several retries and waiting\n", + "Could not download 5.5011319734380075, 5.7530117731959765 despite several retries and waiting\n", + "Could not download 5.552325673303984, 7.0343431087580095 despite several retries and waiting\n", + "Could not download 5.5074099090980075, 7.049315030160002 despite several retries and waiting\n", + "Could not download 5.552325673303984, 7.049315030160002 despite several retries and waiting\n", + "Could not download 5.567297594705976, 7.049315030160002 despite several retries and waiting\n", + "Could not download 5.567297594705976, 7.064286951561994 despite several retries and waiting\n", + "Could not download 5.577058854163986, 5.771977164594023 despite several retries and waiting\n", + "Could not download 5.592030775565978, 5.771977164594023 despite several retries and waiting\n", + "Could not download 5.577058854163986, 5.7869490859960155 despite several retries and waiting\n", + "Could not download 5.592030775565978, 5.7869490859960155 despite several retries and waiting\n", + "Could not download 5.584606468777418, 
5.791342288128962 despite several retries and waiting\n", + "Could not download 5.5491711665860155, 7.034897209741992 despite several retries and waiting\n", + "Could not download 5.564143087988008, 7.034897209741992 despite several retries and waiting\n", + "Could not download 5.5491711665860155, 7.049869131143984 despite several retries and waiting\n", + "Could not download 5.686671033293984, 7.277837499354025 despite several retries and waiting\n", + "Could not download 5.686671033293984, 7.367669027765977 despite several retries and waiting\n", + "Could not download 5.709804141455977, 6.996414183744023 despite several retries and waiting\n", + "Could not download 5.649916455848008, 7.011386105146015 despite several retries and waiting\n", + "Could not download 5.619972613044024, 7.056301869351992 despite several retries and waiting\n", + "Could not download 5.619972613044024, 7.071273790753984 despite several retries and waiting\n", + "Could not download 5.634944534446016, 7.071273790753984 despite several retries and waiting\n", + "Could not download 5.679860298651993, 7.086245712155976 despite several retries and waiting\n", + "Could not download 5.634520199444024, 7.112446462513985 despite several retries and waiting\n", + "Could not download 5.634520199444024, 7.127418383915977 despite several retries and waiting\n", + "Could not download 5.725540957935976, 6.3656870288940235 despite several retries and waiting\n", + "Could not download 5.750183920463986, 7.256033670591991 despite several retries and waiting\n", + "Could not download 5.7463142697, 6.957473407466015 despite several retries and waiting\n", + "Could not download 5.716370426896016, 6.987417250269999 despite several retries and waiting\n", + "Could not download 5.716370426896016, 7.002389171671991 despite several retries and waiting\n", + "Could not download 5.776258112503984, 7.002389171671991 despite several retries and waiting\n", + "Could not download 5.716370426896016, 7.0323330144759755 
despite several retries and waiting\n", + "Could not download 5.731342348298008, 7.0323330144759755 despite several retries and waiting\n", + "Could not download 5.7463142697, 7.0323330144759755 despite several retries and waiting\n", + "Could not download 5.761286191101992, 7.0323330144759755 despite several retries and waiting\n", + "Could not download 5.779988858273984, 5.955662984224024 despite several retries and waiting\n", + "Could not download 5.779988858273984, 5.970634905626016 despite several retries and waiting\n", + "Could not download 5.779988858273984, 6.000578748430001 despite several retries and waiting\n", + "Could not download 5.794960779675976, 6.000578748430001 despite several retries and waiting\n", + "Could not download 5.779988858273984, 6.015550669831993 despite several retries and waiting\n", + "Could not download 5.705129251264023, 6.030522591233985 despite several retries and waiting\n", + "Could not download 5.75004501547, 6.045494512635977 despite several retries and waiting\n", + "Could not download 5.862283011205977, 8.07352598035801 despite several retries and waiting\n", + "Could not download 5.804250622696016, 7.002834653328009 despite several retries and waiting\n", + "Could not download 5.882288204475976, 7.167790600561992 despite several retries and waiting\n", + "Could not download 5.962678824495977, 7.117318050273984 despite several retries and waiting\n", + "Could not download 5.91892832509, 7.377582693104024 despite several retries and waiting\n", + "Could not download 5.888984482286016, 7.437470378711993 despite several retries and waiting\n", + "Could not download 5.965219560435978, 4.9311438196160156 despite several retries and waiting\n", + "Could not download 5.953399632633984, 7.520043095326017 despite several retries and waiting\n", + "Could not download 5.938427711231992, 7.579930780933985 despite several retries and waiting\n", + "Could not download 5.968417882185976, 6.220389998924024 despite several retries and 
waiting\n", + "Could not download 5.968417882185976, 6.235361920326016 despite several retries and waiting\n", + "Could not download 5.878586353774024, 6.2503338417280085 despite several retries and waiting\n", + "Could not download 5.893558275176016, 6.2503338417280085 despite several retries and waiting\n", + "Could not download 5.878586353774024, 6.265305763130001 despite several retries and waiting\n", + "Could not download 5.893558275176016, 6.265305763130001 despite several retries and waiting\n", + "Could not download 5.878586353774024, 6.280277684531993 despite several retries and waiting\n", + "Could not download 5.893558275176016, 6.280277684531993 despite several retries and waiting\n", + "Could not download 5.878586353774024, 6.295249605933985 despite several retries and waiting\n", + "Could not download 5.893558275176016, 6.295249605933985 despite several retries and waiting\n", + "Could not download 5.964289812223983, 8.117081926626016 despite several retries and waiting\n", + "Could not download 5.979261733625975, 8.161997690831992 despite several retries and waiting\n", + "Could not download 5.889430205214023, 8.176969612233984 despite several retries and waiting\n", + "Could not download 5.954723846056016, 6.873102267136016 despite several retries and waiting\n", + "Could not download 5.956927376628991, 7.0997860693502775 despite several retries and waiting\n", + "Could not download 6.042681742206015, 6.794410187834024 despite several retries and waiting\n", + "Could not download 6.042681742206015, 6.809382109236016 despite several retries and waiting\n", + "Could not download 6.085599016606015, 6.146675225616016 despite several retries and waiting\n", + "Could not download 6.145486702213984, 6.146675225616016 despite several retries and waiting\n", + "Could not download 6.160458623615976, 6.146675225616016 despite several retries and waiting\n", + "Could not download 6.11554285941, 6.161647147018008 despite several retries and waiting\n", + "Could 
not download 6.130514780811992, 6.161647147018008 despite several retries and waiting\n", + "Could not download 6.11554285941, 6.17661906842 despite several retries and waiting\n", + "Could not download 6.130514780811992, 6.191590989821992 despite several retries and waiting\n", + "Could not download 6.157142120393985, 6.150001140924024 despite several retries and waiting\n", + "Could not download 6.142170198991993, 6.164973062326016 despite several retries and waiting\n", + "Could not download 6.157142120393985, 6.164973062326016 despite several retries and waiting\n", + "Could not download 6.082282513384024, 6.179944983728008 despite several retries and waiting\n", + "Could not download 6.0972544347860165, 6.179944983728008 despite several retries and waiting\n", + "Could not download 6.082282513384024, 6.19491690513 despite several retries and waiting\n", + "Could not download 6.0972544347860165, 6.19491690513 despite several retries and waiting\n", + "Could not download 6.112226356188009, 6.19491690513 despite several retries and waiting\n", + "Could not download 6.127198277590001, 6.19491690513 despite several retries and waiting\n", + "Could not download 6.0972544347860165, 6.224860747933985 despite several retries and waiting\n", + "Could not download 6.112226356188009, 6.224860747933985 despite several retries and waiting\n", + "Could not download 6.112226356188009, 6.239832669335977 despite several retries and waiting\n", + "Could not download 6.175725221875976, 6.922742052756017 despite several retries and waiting\n", + "Could not download 6.090171543674024, 6.2944873011619915 despite several retries and waiting\n", + "Could not download 6.120115386478008, 6.2944873011619915 despite several retries and waiting\n", + "Could not download 6.090171543674024, 6.309459222563984 despite several retries and waiting\n", + "Could not download 6.105143465076016, 6.309459222563984 despite several retries and waiting\n", + "Could not download 6.13508730788, 
6.309459222563984 despite several retries and waiting\n", + "Could not download 6.090171543674024, 6.324431143965976 despite several retries and waiting\n", + "Could not download 6.105143465076016, 6.324431143965976 despite several retries and waiting\n", + "Could not download 6.120115386478008, 6.324431143965976 despite several retries and waiting\n", + "Could not download 6.13508730788, 6.324431143965976 despite several retries and waiting\n", + "Could not download 6.180003072085976, 6.324431143965976 despite several retries and waiting\n", + "Could not download 6.132053407918007, 8.262274875055978 despite several retries and waiting\n", + "Could not download 6.15062167012, 7.915096986886016 despite several retries and waiting\n", + "Could not download 6.195537434325976, 7.930068908288008 despite several retries and waiting\n", + "Could not download 6.151757315128008, 6.998163450564023 despite several retries and waiting\n", + "Could not download 6.152818369228009, 6.991717119328008 despite several retries and waiting\n", + "Could not download 6.242807500533984, 6.680889736486016 despite several retries and waiting\n", + "Could not download 6.249641592023983, 4.628069795666015 despite several retries and waiting\n", + "Could not download 6.234669670621991, 4.643041717068007 despite several retries and waiting\n", + "Could not download 6.234669670621991, 4.658013638469999 despite several retries and waiting\n", + "Could not download 6.249641592023983, 4.658013638469999 despite several retries and waiting\n", + "Could not download 6.264613513425975, 4.687957481273983 despite several retries and waiting\n", + "Could not download 6.273133818143984, 7.587237739806017 despite several retries and waiting\n", + "Could not download 6.198274211134024, 7.6171815826100016 despite several retries and waiting\n", + "Could not download 6.258161896741992, 7.632153504011994 despite several retries and waiting\n", + "Could not download 6.231460060404023, 5.611257657656016 despite 
several retries and waiting\n", + "Could not download 6.246431981806015, 5.611257657656016 despite several retries and waiting\n", + "Could not download 6.231460060404023, 5.626229579058008 despite several retries and waiting\n", + "Could not download 6.246431981806015, 5.626229579058008 despite several retries and waiting\n", + "Could not download 6.231460060404023, 5.64120150046 despite several retries and waiting\n", + "Could not download 6.246431981806015, 5.64120150046 despite several retries and waiting\n", + "Could not download 6.231460060404023, 5.656173421861992 despite several retries and waiting\n", + "Could not download 6.246431981806015, 5.656173421861992 despite several retries and waiting\n", + "Could not download 6.321291588815976, 5.656173421861992 despite several retries and waiting\n", + "Could not download 6.231460060404023, 5.6711453432639845 despite several retries and waiting\n", + "Could not download 6.246431981806015, 5.6711453432639845 despite several retries and waiting\n", + "Could not download 6.2614039032080075, 5.6711453432639845 despite several retries and waiting\n", + "Could not download 6.27637582461, 5.6711453432639845 despite several retries and waiting\n", + "Could not download 6.291347746011992, 5.6711453432639845 despite several retries and waiting\n", + "Could not download 6.231460060404023, 5.686117264665977 despite several retries and waiting\n", + "Could not download 6.246431981806015, 5.686117264665977 despite several retries and waiting\n", + "Could not download 6.2614039032080075, 5.686117264665977 despite several retries and waiting\n", + "Could not download 6.27637582461, 5.686117264665977 despite several retries and waiting\n", + "Could not download 6.291347746011992, 5.686117264665977 despite several retries and waiting\n", + "Could not download 6.2852450514400005, 7.360891214375976 despite several retries and waiting\n", + "Could not download 6.244031297734024, 9.120246218723985 despite several retries and 
waiting\n", + "Could not download 6.273975140538008, 9.120246218723985 despite several retries and waiting\n", + "Could not download 6.303918983341992, 9.120246218723985 despite several retries and waiting\n", + "Could not download 6.273975140538008, 9.135218140125977 despite several retries and waiting\n", + "Could not download 6.303918983341992, 9.135218140125977 despite several retries and waiting\n", + "Could not download 6.308970527785476, 9.123614431783546 despite several retries and waiting\n", + "Could not download 6.2948770108800005, 7.094112656263985 despite several retries and waiting\n", + "Could not download 6.309848932281993, 7.094112656263985 despite several retries and waiting\n", + "Could not download 6.280008938656017, 5.69852872786 despite several retries and waiting\n", + "Could not download 6.2949808600580095, 5.69852872786 despite several retries and waiting\n", + "Could not download 6.309952781460002, 5.69852872786 despite several retries and waiting\n", + "Could not download 6.324924702861994, 5.7284725706639845 despite several retries and waiting\n", + "Could not download 6.354868545665978, 5.7284725706639845 despite several retries and waiting\n", + "Could not download 6.324924702861994, 5.743444492065977 despite several retries and waiting\n", + "Could not download 6.31723421115, 5.657578470090001 despite several retries and waiting\n", + "Could not download 6.332206132551992, 5.657578470090001 despite several retries and waiting\n", + "Could not download 6.272318446944023, 5.672550391491993 despite several retries and waiting\n", + "Could not download 6.2872903683460155, 5.672550391491993 despite several retries and waiting\n", + "Could not download 6.272318446944023, 5.687522312893985 despite several retries and waiting\n", + "Could not download 6.2872903683460155, 5.687522312893985 despite several retries and waiting\n", + "Could not download 6.272318446944023, 5.702494234295977 despite several retries and waiting\n", + "Could not 
download 6.2872903683460155, 5.702494234295977 despite several retries and waiting\n", + "Could not download 6.302262289748008, 5.702494234295977 despite several retries and waiting\n", + "Could not download 6.365613656275976, 5.573604465104023 despite several retries and waiting\n", + "Could not download 6.32069789207, 5.648464072113984 despite several retries and waiting\n", + "Could not download 6.335669813471992, 5.648464072113984 despite several retries and waiting\n", + "Could not download 6.32069789207, 5.663435993515976 despite several retries and waiting\n", + "Could not download 6.335669813471992, 5.663435993515976 despite several retries and waiting\n", + "Could not download 6.32851769839, 7.893282336246014 despite several retries and waiting\n", + "Could not download 6.32851769839, 7.908254257648006 despite several retries and waiting\n", + "Could not download 6.381566444165975, 5.551828192544023 despite several retries and waiting\n", + "Could not download 6.366594522763983, 5.566800113946015 despite several retries and waiting\n", + "Could not download 6.324780391316015, 5.64703466044 despite several retries and waiting\n", + "Could not download 6.324780391316015, 5.662006581841992 despite several retries and waiting\n", + "Could not download 6.309808469914023, 5.6919504246459764 despite several retries and waiting\n", + "Could not download 6.326769733816016, 5.6497511196819925 despite several retries and waiting\n", + "Could not download 6.326769733816016, 5.664723041083985 despite several retries and waiting\n", + "Could not download 6.350126992688009, 4.745522215884024 despite several retries and waiting\n", + "Could not download 6.365098914090001, 4.760494137286016 despite several retries and waiting\n", + "Could not download 6.380070835491993, 4.760494137286016 despite several retries and waiting\n", + "Could not download 6.410014678295977, 4.775466058688008 despite several retries and waiting\n", + "Could not download 6.366917857000001, 
7.179579409854024 despite several retries and waiting\n", + "Could not download 6.3369740141960165, 7.239467095461992 despite several retries and waiting\n", + "Could not download 6.37205907464, 4.745537667764023 despite several retries and waiting\n", + "Could not download 6.416974838845976, 4.760509589166015 despite several retries and waiting\n", + "Could not download 6.416974838845976, 4.7754815105680075 despite several retries and waiting\n", + "Could not download 6.335870188714023, 4.757183470966016 despite several retries and waiting\n", + "Could not download 6.380785952919999, 4.757183470966016 despite several retries and waiting\n", + "Could not download 6.425701717125976, 4.772155392368008 despite several retries and waiting\n", + "Could not download 6.425701717125976, 4.78712731377 despite several retries and waiting\n", + "Could not download 6.353935121414023, 6.6136068491859765 despite several retries and waiting\n", + "Could not download 6.416355007558008, 2.849532898596016 despite several retries and waiting\n", + "Could not download 6.416355007558008, 2.864504819998008 despite several retries and waiting\n", + "Could not download 6.416355007558008, 2.8794767414 despite several retries and waiting\n", + "Could not download 6.446298850361992, 2.8794767414 despite several retries and waiting\n", + "Could not download 6.416355007558008, 2.894448662801992 despite several retries and waiting\n", + "Could not download 6.43132692896, 2.894448662801992 despite several retries and waiting\n", + "Could not download 6.416355007558008, 2.9094205842039838 despite several retries and waiting\n", + "Could not download 6.446298850361992, 2.9094205842039838 despite several retries and waiting\n", + "Could not download 6.417194824338008, 7.445259981854024 despite several retries and waiting\n", + "Could not download 6.477082509945976, 7.445259981854024 despite several retries and waiting\n", + "Could not download 6.417194824338008, 7.460231903256016 despite several 
retries and waiting\n", + "Could not download 6.43216674574, 7.460231903256016 despite several retries and waiting\n", + "Could not download 6.447138667141992, 7.460231903256016 despite several retries and waiting\n", + "Could not download 6.462110588543984, 7.460231903256016 despite several retries and waiting\n", + "Could not download 6.477082509945976, 7.460231903256016 despite several retries and waiting\n", + "Could not download 6.447138667141992, 7.475203824658008 despite several retries and waiting\n", + "Could not download 6.462110588543984, 7.475203824658008 despite several retries and waiting\n", + "Could not download 6.477082509945976, 7.475203824658008 despite several retries and waiting\n", + "Could not download 6.447138667141992, 7.49017574606 despite several retries and waiting\n", + "Could not download 6.462110588543984, 7.49017574606 despite several retries and waiting\n", + "Could not download 6.477082509945976, 7.49017574606 despite several retries and waiting\n", + "Could not download 6.387250981534024, 7.505147667461992 despite several retries and waiting\n", + "Could not download 6.402222902936016, 7.505147667461992 despite several retries and waiting\n", + "Could not download 6.458159639134412, 7.476152431879382 despite several retries and waiting\n", + "Could not download 6.447651523391992, 6.167314974286016 despite several retries and waiting\n", + "Could not download 6.402735759186016, 6.227202659893984 despite several retries and waiting\n", + "Could not download 6.431525767468008, 3.30557127872 despite several retries and waiting\n", + "Could not download 6.44649768887, 3.30557127872 despite several retries and waiting\n", + "Could not download 6.461469610271992, 3.30557127872 despite several retries and waiting\n", + "Could not download 6.476441531673984, 3.30557127872 despite several retries and waiting\n", + "Could not download 6.491413453075976, 3.30557127872 despite several retries and waiting\n", + "Could not download 
6.416553846066016, 3.320543200121992 despite several retries and waiting\n", + "Could not download 6.431525767468008, 3.320543200121992 despite several retries and waiting\n", + "Could not download 6.44649768887, 3.320543200121992 despite several retries and waiting\n", + "Could not download 6.461469610271992, 3.320543200121992 despite several retries and waiting\n", + "Could not download 6.476441531673984, 3.320543200121992 despite several retries and waiting\n", + "Could not download 6.416553846066016, 3.3355151215239838 despite several retries and waiting\n", + "Could not download 6.431525767468008, 3.3355151215239838 despite several retries and waiting\n", + "Could not download 6.44649768887, 3.3355151215239838 despite several retries and waiting\n", + "Could not download 6.461469610271992, 3.3355151215239838 despite several retries and waiting\n", + "Could not download 6.476441531673984, 3.3355151215239838 despite several retries and waiting\n", + "Could not download 6.416553846066016, 3.350487042925976 despite several retries and waiting\n", + "Could not download 6.491413453075976, 3.350487042925976 despite several retries and waiting\n", + "Could not download 6.430287915517978, 3.3056344489836555 despite several retries and waiting\n", + "Could not download 6.416696212006015, 7.439815488204023 despite several retries and waiting\n", + "Could not download 6.476583897613984, 7.439815488204023 despite several retries and waiting\n", + "Could not download 6.416696212006015, 7.454787409606015 despite several retries and waiting\n", + "Could not download 6.4316681334080075, 7.454787409606015 despite several retries and waiting\n", + "Could not download 6.44664005481, 7.454787409606015 despite several retries and waiting\n", + "Could not download 6.461611976211992, 7.454787409606015 despite several retries and waiting\n", + "Could not download 6.476583897613984, 7.454787409606015 despite several retries and waiting\n", + "Could not download 6.416696212006015, 
7.4697593310080075 despite several retries and waiting\n", + "Could not download 6.4316681334080075, 7.4697593310080075 despite several retries and waiting\n", + "Could not download 6.44664005481, 7.4697593310080075 despite several retries and waiting\n", + "Could not download 6.461611976211992, 7.4697593310080075 despite several retries and waiting\n", + "Could not download 6.476583897613984, 7.4697593310080075 despite several retries and waiting\n", + "Could not download 6.44664005481, 7.48473125241 despite several retries and waiting\n", + "Could not download 6.461611976211992, 7.48473125241 despite several retries and waiting\n", + "Could not download 6.476583897613984, 7.48473125241 despite several retries and waiting\n", + "Could not download 6.401724290604023, 7.499703173811992 despite several retries and waiting\n", + "Could not download 6.491555819015976, 7.499703173811992 despite several retries and waiting\n", + "Could not download 6.437106191682019, 7.466605388684359 despite several retries and waiting\n", + "Could not download 6.435760430148008, 3.2970155456660164 despite several retries and waiting\n", + "Could not download 6.45073235155, 3.2970155456660164 despite several retries and waiting\n", + "Could not download 6.465704272951992, 3.2970155456660164 despite several retries and waiting\n", + "Could not download 6.480676194353984, 3.2970155456660164 despite several retries and waiting\n", + "Could not download 6.4956481157559764, 3.2970155456660164 despite several retries and waiting\n", + "Could not download 6.435760430148008, 3.311987467068008 despite several retries and waiting\n", + "Could not download 6.45073235155, 3.311987467068008 despite several retries and waiting\n", + "Could not download 6.465704272951992, 3.311987467068008 despite several retries and waiting\n", + "Could not download 6.480676194353984, 3.311987467068008 despite several retries and waiting\n", + "Could not download 6.4956481157559764, 3.311987467068008 despite several 
retries and waiting\n", + "Could not download 6.405816587344024, 3.32695938847 despite several retries and waiting\n", + "Could not download 6.420788508746016, 3.32695938847 despite several retries and waiting\n", + "Could not download 6.435760430148008, 3.32695938847 despite several retries and waiting\n", + "Could not download 6.45073235155, 3.32695938847 despite several retries and waiting\n", + "Could not download 6.465704272951992, 3.32695938847 despite several retries and waiting\n", + "Could not download 6.480676194353984, 3.32695938847 despite several retries and waiting\n", + "Could not download 6.405816587344024, 3.3419313098719923 despite several retries and waiting\n", + "Could not download 6.420788508746016, 3.3419313098719923 despite several retries and waiting\n", + "Could not download 6.4956481157559764, 3.3419313098719923 despite several retries and waiting\n", + "Could not download 6.405816587344024, 3.356903231273984 despite several retries and waiting\n", + "Could not download 6.420788508746016, 3.356903231273984 despite several retries and waiting\n", + "Could not download 6.4956481157559764, 3.356903231273984 despite several retries and waiting\n", + "Could not download 6.45073235155, 3.371875152675976 despite several retries and waiting\n", + "Could not download 6.465704272951992, 3.371875152675976 despite several retries and waiting\n", + "Could not download 6.4956481157559764, 3.371875152675976 despite several retries and waiting\n", + "Could not download 6.45311631035674, 3.3694700512672733 despite several retries and waiting\n", + "Could not download 6.410586360204023, 7.434143106744024 despite several retries and waiting\n", + "Could not download 6.470474045811992, 7.434143106744024 despite several retries and waiting\n", + "Could not download 6.485445967213984, 7.434143106744024 despite several retries and waiting\n", + "Could not download 6.410586360204023, 7.449115028146016 despite several retries and waiting\n", + "Could not download 
6.4255582816060155, 7.449115028146016 despite several retries and waiting\n", + "Could not download 6.440530203008008, 7.449115028146016 despite several retries and waiting\n", + "Could not download 6.45550212441, 7.449115028146016 despite several retries and waiting\n", + "Could not download 6.470474045811992, 7.449115028146016 despite several retries and waiting\n", + "Could not download 6.485445967213984, 7.449115028146016 despite several retries and waiting\n", + "Could not download 6.410586360204023, 7.464086949548008 despite several retries and waiting\n", + "Could not download 6.4255582816060155, 7.464086949548008 despite several retries and waiting\n", + "Could not download 6.440530203008008, 7.464086949548008 despite several retries and waiting\n", + "Could not download 6.45550212441, 7.464086949548008 despite several retries and waiting\n", + "Could not download 6.470474045811992, 7.464086949548008 despite several retries and waiting\n", + "Could not download 6.485445967213984, 7.464086949548008 despite several retries and waiting\n", + "Could not download 6.45550212441, 7.47905887095 despite several retries and waiting\n", + "Could not download 6.470474045811992, 7.47905887095 despite several retries and waiting\n", + "Could not download 6.485445967213984, 7.47905887095 despite several retries and waiting\n", + "Could not download 6.500417888615976, 7.494030792351992 despite several retries and waiting\n", + "Could not download 6.500417888615976, 7.5090027137539845 despite several retries and waiting\n", + "Could not download 6.4562865888, 3.419484369444024 despite several retries and waiting\n", + "Could not download 6.471258510201992, 3.419484369444024 despite several retries and waiting\n", + "Could not download 6.4862304316039845, 3.419484369444024 despite several retries and waiting\n", + "Could not download 6.501202353005977, 3.419484369444024 despite several retries and waiting\n", + "Could not download 6.426342745996016, 3.434456290846016 despite 
several retries and waiting\n", + "Could not download 6.441314667398008, 3.434456290846016 despite several retries and waiting\n", + "Could not download 6.4562865888, 3.434456290846016 despite several retries and waiting\n", + "Could not download 6.471258510201992, 3.434456290846016 despite several retries and waiting\n", + "Could not download 6.4862304316039845, 3.434456290846016 despite several retries and waiting\n", + "Could not download 6.501202353005977, 3.434456290846016 despite several retries and waiting\n", + "Could not download 6.426342745996016, 3.4494282122480078 despite several retries and waiting\n", + "Could not download 6.441314667398008, 3.4494282122480078 despite several retries and waiting\n", + "Could not download 6.4562865888, 3.4494282122480078 despite several retries and waiting\n", + "Could not download 6.471258510201992, 3.4494282122480078 despite several retries and waiting\n", + "Could not download 6.4862304316039845, 3.4494282122480078 despite several retries and waiting\n", + "Could not download 6.501202353005977, 3.4494282122480078 despite several retries and waiting\n", + "Could not download 6.426342745996016, 3.46440013365 despite several retries and waiting\n", + "Could not download 6.441314667398008, 3.46440013365 despite several retries and waiting\n", + "Could not download 6.4562865888, 3.46440013365 despite several retries and waiting\n", + "Could not download 6.471258510201992, 3.46440013365 despite several retries and waiting\n", + "Could not download 6.4862304316039845, 3.46440013365 despite several retries and waiting\n", + "Could not download 6.4562865888, 3.479372055051992 despite several retries and waiting\n", + "Could not download 6.4562865888, 3.4943439764539836 despite several retries and waiting\n", + "Could not download 6.4562865888, 3.5093158978559758 despite several retries and waiting\n", + "Could not download 6.453208255828009, 3.5117014445740242 despite several retries and waiting\n", + "Could not download 
6.4382363344260165, 3.5266733659760163 despite several retries and waiting\n", + "Could not download 6.453208255828009, 3.571589130181992 despite several retries and waiting\n", + "Could not download 6.498124020033985, 3.586561051583984 despite several retries and waiting\n", + "Could not download 6.498124020033985, 3.601532972985976 despite several retries and waiting\n", + "Could not download 6.47041667546, 7.442715252614023 despite several retries and waiting\n", + "Could not download 6.485388596861992, 7.442715252614023 despite several retries and waiting\n", + "Could not download 6.425500911254024, 7.457687174016015 despite several retries and waiting\n", + "Could not download 6.440472832656016, 7.457687174016015 despite several retries and waiting\n", + "Could not download 6.455444754058008, 7.457687174016015 despite several retries and waiting\n" + ], + "name": "stdout" + }, + { + "output_type": "stream", + "text": [ + "ERROR:root:Error-could not download 6.485388596861992, 7.457687174016015\n", + "Traceback (most recent call last):\n", + " File \"\", line 46, in download_images\n", + " im = imd.download_image(lat, lon, min_year, min_month, max_year, max_month)\n", + " File \"gdrive/MyDrive/geo/utils/planet_downloader.py\", line 95, in download_image\n", + " res = json.loads(result.text)\n", + " File \"/usr/lib/python3.7/json/__init__.py\", line 348, in loads\n", + " return _default_decoder.decode(s)\n", + " File \"/usr/lib/python3.7/json/decoder.py\", line 337, in decode\n", + " obj, end = self.raw_decode(s, idx=_w(s, 0).end())\n", + " File \"/usr/lib/python3.7/json/decoder.py\", line 355, in raw_decode\n", + " raise JSONDecodeError(\"Expecting value\", s, err.value) from None\n", + "json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)\n" + ], + "name": "stderr" + }, + { + "output_type": "stream", + "text": [ + "Could not download 6.47041667546, 7.457687174016015 despite several retries and waiting\n", + "Could not download 
6.455444754058008, 7.472659095418007 despite several retries and waiting\n", + "Could not download 6.47041667546, 7.472659095418007 despite several retries and waiting\n", + "Could not download 6.485388596861992, 7.472659095418007 despite several retries and waiting\n", + "Could not download 6.455444754058008, 7.487631016819999 despite several retries and waiting\n", + "Could not download 6.47041667546, 7.487631016819999 despite several retries and waiting\n", + "Could not download 6.485388596861992, 7.487631016819999 despite several retries and waiting\n", + "Could not download 6.500360518263984, 7.502602938221991 despite several retries and waiting\n", + "Could not download 6.431778452264023, 3.305904738064024 despite several retries and waiting\n", + "Could not download 6.446750373666015, 3.305904738064024 despite several retries and waiting\n", + "Could not download 6.461722295068007, 3.305904738064024 despite several retries and waiting\n", + "Could not download 6.476694216469999, 3.305904738064024 despite several retries and waiting\n", + "Could not download 6.4916661378719915, 3.305904738064024 despite several retries and waiting\n", + "Could not download 6.506638059273984, 3.305904738064024 despite several retries and waiting\n", + "Could not download 6.431778452264023, 3.320876659466016 despite several retries and waiting\n", + "Could not download 6.446750373666015, 3.320876659466016 despite several retries and waiting\n", + "Could not download 6.461722295068007, 3.320876659466016 despite several retries and waiting\n", + "Could not download 6.476694216469999, 3.320876659466016 despite several retries and waiting\n", + "Could not download 6.431778452264023, 3.335848580868008 despite several retries and waiting\n", + "Could not download 6.446750373666015, 3.335848580868008 despite several retries and waiting\n", + "Could not download 6.461722295068007, 3.335848580868008 despite several retries and waiting\n", + "Could not download 6.476694216469999, 
3.335848580868008 despite several retries and waiting\n", + "Could not download 6.4916661378719915, 3.35082050227 despite several retries and waiting\n", + "Could not download 6.506638059273984, 3.35082050227 despite several retries and waiting\n", + "Could not download 6.446750373666015, 3.365792423671992 despite several retries and waiting\n", + "Could not download 6.461722295068007, 3.365792423671992 despite several retries and waiting\n", + "Could not download 6.4916661378719915, 3.365792423671992 despite several retries and waiting\n", + "Could not download 6.506638059273984, 3.365792423671992 despite several retries and waiting\n", + "Could not download 6.521609980675976, 3.365792423671992 despite several retries and waiting\n", + "Could not download 6.446750373666015, 3.3807643450739837 despite several retries and waiting\n", + "Could not download 6.461722295068007, 3.3807643450739837 despite several retries and waiting\n", + "Could not download 6.4916661378719915, 3.3807643450739837 despite several retries and waiting\n", + "Could not download 6.506638059273984, 3.3807643450739837 despite several retries and waiting\n", + "Could not download 6.521609980675976, 3.3807643450739837 despite several retries and waiting\n", + "Could not download 6.446750373666015, 3.395736266475976 despite several retries and waiting\n", + "Could not download 6.461722295068007, 3.395736266475976 despite several retries and waiting\n", + "Could not download 6.476694216469999, 3.395736266475976 despite several retries and waiting\n", + "Could not download 6.4916661378719915, 3.395736266475976 despite several retries and waiting\n", + "Could not download 6.506638059273984, 3.395736266475976 despite several retries and waiting\n", + "Could not download 6.521609980675976, 3.395736266475976 despite several retries and waiting\n", + "Could not download 6.464454916618007, 7.723408767874024 despite several retries and waiting\n", + "Could not download 6.5093706808239835, 7.738380689276016 
despite several retries and waiting\n", + "Could not download 6.479426838019999, 7.76832453208 despite several retries and waiting\n", + "Could not download 6.434511073814023, 7.783296453481992 despite several retries and waiting\n", + "Could not download 6.464454916618007, 7.783296453481992 despite several retries and waiting\n", + "Could not download 6.434511073814023, 7.813240296285977 despite several retries and waiting\n", + "Could not download 6.4440371855340235, 3.301675547338008 despite several retries and waiting\n", + "Could not download 6.459009106936016, 3.301675547338008 despite several retries and waiting\n", + "Could not download 6.473981028338008, 3.301675547338008 despite several retries and waiting\n", + "Could not download 6.48895294974, 3.301675547338008 despite several retries and waiting\n", + "Could not download 6.503924871141992, 3.301675547338008 despite several retries and waiting\n", + "Could not download 6.4440371855340235, 3.3166474687400003 despite several retries and waiting\n", + "Could not download 6.459009106936016, 3.3166474687400003 despite several retries and waiting\n", + "Could not download 6.473981028338008, 3.3166474687400003 despite several retries and waiting\n", + "Could not download 6.48895294974, 3.3166474687400003 despite several retries and waiting\n", + "Could not download 6.503924871141992, 3.3166474687400003 despite several retries and waiting\n", + "Could not download 6.4440371855340235, 3.3316193901419924 despite several retries and waiting\n", + "Could not download 6.459009106936016, 3.3316193901419924 despite several retries and waiting\n", + "Could not download 6.473981028338008, 3.3316193901419924 despite several retries and waiting\n", + "Could not download 6.48895294974, 3.3316193901419924 despite several retries and waiting\n", + "Could not download 6.503924871141992, 3.346591311543984 despite several retries and waiting\n", + "Could not download 6.533868713945976, 3.346591311543984 despite several retries 
and waiting\n", + "Could not download 6.503924871141992, 3.361563232945976 despite several retries and waiting\n", + "Could not download 6.533868713945976, 3.361563232945976 despite several retries and waiting\n", + "Could not download 6.487317196382653, 3.3173926006543253 despite several retries and waiting\n", + "Could not download 6.445481534634024, 3.305640659428008 despite several retries and waiting\n", + "Could not download 6.460453456036016, 3.305640659428008 despite several retries and waiting\n", + "Could not download 6.475425377438008, 3.305640659428008 despite several retries and waiting\n", + "Could not download 6.4903972988400005, 3.305640659428008 despite several retries and waiting\n", + "Could not download 6.505369220241993, 3.305640659428008 despite several retries and waiting\n", + "Could not download 6.445481534634024, 3.32061258083 despite several retries and waiting\n", + "Could not download 6.460453456036016, 3.32061258083 despite several retries and waiting\n", + "Could not download 6.475425377438008, 3.32061258083 despite several retries and waiting\n", + "Could not download 6.445481534634024, 3.3355845022319923 despite several retries and waiting\n", + "Could not download 6.460453456036016, 3.3355845022319923 despite several retries and waiting\n", + "Could not download 6.475425377438008, 3.3355845022319923 despite several retries and waiting\n", + "Could not download 6.4903972988400005, 3.350556423633984 despite several retries and waiting\n", + "Could not download 6.505369220241993, 3.350556423633984 despite several retries and waiting\n", + "Could not download 6.535313063045977, 3.350556423633984 despite several retries and waiting\n", + "Could not download 6.460453456036016, 3.365528345035976 despite several retries and waiting\n", + "Could not download 6.4903972988400005, 3.365528345035976 despite several retries and waiting\n", + "Could not download 6.505369220241993, 3.365528345035976 despite several retries and waiting\n", + "Could 
not download 6.520341141643985, 3.365528345035976 despite several retries and waiting\n", + "Could not download 6.535313063045977, 3.365528345035976 despite several retries and waiting\n", + "Could not download 6.461657705577472, 3.3164105510626967 despite several retries and waiting\n", + "Could not download 6.448046524714023, 3.303887952668008 despite several retries and waiting\n", + "Could not download 6.463018446116015, 3.303887952668008 despite several retries and waiting\n", + "Could not download 6.477990367518007, 3.303887952668008 despite several retries and waiting\n", + "Could not download 6.492962288919999, 3.303887952668008 despite several retries and waiting\n", + "Could not download 6.5079342103219915, 3.303887952668008 despite several retries and waiting\n", + "Could not download 6.448046524714023, 3.31885987407 despite several retries and waiting\n", + "Could not download 6.463018446116015, 3.31885987407 despite several retries and waiting\n", + "Could not download 6.477990367518007, 3.31885987407 despite several retries and waiting\n", + "Could not download 6.448046524714023, 3.3338317954719923 despite several retries and waiting\n", + "Could not download 6.463018446116015, 3.3338317954719923 despite several retries and waiting\n", + "Could not download 6.477990367518007, 3.3338317954719923 despite several retries and waiting\n", + "Could not download 6.492962288919999, 3.348803716873984 despite several retries and waiting\n", + "Could not download 6.5079342103219915, 3.348803716873984 despite several retries and waiting\n", + "Could not download 6.537878053125976, 3.348803716873984 despite several retries and waiting\n", + "Could not download 6.448046524714023, 3.363775638275976 despite several retries and waiting\n", + "Could not download 6.463018446116015, 3.363775638275976 despite several retries and waiting\n", + "Could not download 6.492962288919999, 3.363775638275976 despite several retries and waiting\n", + "Could not download 
6.5079342103219915, 3.363775638275976 despite several retries and waiting\n", + "Could not download 6.522906131723984, 3.363775638275976 despite several retries and waiting\n", + "Could not download 6.537878053125976, 3.363775638275976 despite several retries and waiting\n", + "Could not download 6.451864592804023, 3.301400714508008 despite several retries and waiting\n", + "Could not download 6.466836514206015, 3.301400714508008 despite several retries and waiting\n", + "Could not download 6.4818084356080075, 3.301400714508008 despite several retries and waiting\n", + "Could not download 6.49678035701, 3.301400714508008 despite several retries and waiting\n", + "Could not download 6.511752278411992, 3.301400714508008 despite several retries and waiting\n", + "Could not download 6.451864592804023, 3.31637263591 despite several retries and waiting\n", + "Could not download 6.466836514206015, 3.31637263591 despite several retries and waiting\n", + "Could not download 6.4818084356080075, 3.31637263591 despite several retries and waiting\n", + "Could not download 6.49678035701, 3.31637263591 despite several retries and waiting\n", + "Could not download 6.511752278411992, 3.31637263591 despite several retries and waiting\n", + "Could not download 6.451864592804023, 3.331344557311992 despite several retries and waiting\n", + "Could not download 6.466836514206015, 3.331344557311992 despite several retries and waiting\n", + "Could not download 6.4818084356080075, 3.331344557311992 despite several retries and waiting\n", + "Could not download 6.49678035701, 3.346316478713984 despite several retries and waiting\n", + "Could not download 6.511752278411992, 3.346316478713984 despite several retries and waiting\n", + "Could not download 6.541696121215976, 3.346316478713984 despite several retries and waiting\n", + "Could not download 6.49678035701, 3.361288400115976 despite several retries and waiting\n", + "Could not download 6.511752278411992, 3.361288400115976 despite 
several retries and waiting\n" + ], + "name": "stdout" + }, + { + "output_type": "stream", + "text": [ + "ERROR:root:Error-could not download 6.541696121215976, 3.361288400115976\n", + "Traceback (most recent call last):\n", + " File \"\", line 51, in download_images\n", + " im = imd.download_image(lat, lon, min_year, min_month, max_year, max_month)\n", + " File \"gdrive/MyDrive/geo/utils/planet_downloader.py\", line 95, in download_image\n", + " res = json.loads(result.text)\n", + " File \"/usr/lib/python3.7/json/__init__.py\", line 348, in loads\n", + " return _default_decoder.decode(s)\n", + " File \"/usr/lib/python3.7/json/decoder.py\", line 337, in decode\n", + " obj, end = self.raw_decode(s, idx=_w(s, 0).end())\n", + " File \"/usr/lib/python3.7/json/decoder.py\", line 355, in raw_decode\n", + " raise JSONDecodeError(\"Expecting value\", s, err.value) from None\n", + "json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)\n", + "ERROR:root:Error-could not download 6.50086128035502, 3.280458884913953\n", + "Traceback (most recent call last):\n", + " File \"\", line 46, in download_images\n", + " im = imd.download_image(lat, lon, min_year, min_month, max_year, max_month)\n", + " File \"gdrive/MyDrive/geo/utils/planet_downloader.py\", line 95, in download_image\n", + " res = json.loads(result.text)\n", + " File \"/usr/lib/python3.7/json/__init__.py\", line 348, in loads\n", + " return _default_decoder.decode(s)\n", + " File \"/usr/lib/python3.7/json/decoder.py\", line 337, in decode\n", + " obj, end = self.raw_decode(s, idx=_w(s, 0).end())\n", + " File \"/usr/lib/python3.7/json/decoder.py\", line 355, in raw_decode\n", + " raise JSONDecodeError(\"Expecting value\", s, err.value) from None\n", + "json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)\n" + ], + "name": "stderr" + }, + { + "output_type": "stream", + "text": [ + "Could not download 6.49837750849, 3.3441339482340244 despite several retries and waiting\n", + "Could not 
download 6.543293272695976, 3.3441339482340244 despite several retries and waiting\n", + "Could not download 6.49837750849, 3.3591058696360165 despite several retries and waiting\n", + "Could not download 6.543293272695976, 3.3591058696360165 despite several retries and waiting\n", + "Could not download 6.453461744284024, 3.374077791038008 despite several retries and waiting\n", + "Could not download 6.49837750849, 3.374077791038008 despite several retries and waiting\n", + "Could not download 6.513349429891992, 3.374077791038008 despite several retries and waiting\n", + "Could not download 6.528321351293984, 3.374077791038008 despite several retries and waiting\n", + "Could not download 6.543293272695976, 3.374077791038008 despite several retries and waiting\n", + "Could not download 6.453461744284024, 3.3890497124400003 despite several retries and waiting\n", + "Could not download 6.468433665686016, 3.3890497124400003 despite several retries and waiting\n", + "Could not download 6.483405587088008, 3.3890497124400003 despite several retries and waiting\n", + "Could not download 6.49837750849, 3.3890497124400003 despite several retries and waiting\n", + "Could not download 6.513349429891992, 3.3890497124400003 despite several retries and waiting\n", + "Could not download 6.528321351293984, 3.3890497124400003 despite several retries and waiting\n", + "Could not download 6.543293272695976, 3.3890497124400003 despite several retries and waiting\n", + "Could not download 6.453461744284024, 3.4040216338419924 despite several retries and waiting\n", + "Could not download 6.468433665686016, 3.4040216338419924 despite several retries and waiting\n", + "Could not download 6.483405587088008, 3.4040216338419924 despite several retries and waiting\n", + "Could not download 6.49837750849, 3.4040216338419924 despite several retries and waiting\n", + "Could not download 6.513349429891992, 3.4040216338419924 despite several retries and waiting\n", + "Could not download 
6.528321351293984, 3.4040216338419924 despite several retries and waiting\n", + "Could not download 6.543293272695976, 3.4040216338419924 despite several retries and waiting\n", + "Could not download 6.453461744284024, 3.418993555243984 despite several retries and waiting\n", + "Could not download 6.468433665686016, 3.418993555243984 despite several retries and waiting\n", + "Could not download 6.483405587088008, 3.418993555243984 despite several retries and waiting\n", + "Could not download 6.49837750849, 3.418993555243984 despite several retries and waiting\n", + "Could not download 6.513349429891992, 3.418993555243984 despite several retries and waiting\n", + "Could not download 6.528321351293984, 3.418993555243984 despite several retries and waiting\n", + "Could not download 6.543293272695976, 3.418993555243984 despite several retries and waiting\n", + "Could not download 6.453461744284024, 3.433965476645976 despite several retries and waiting\n", + "Could not download 6.468433665686016, 3.433965476645976 despite several retries and waiting\n", + "Could not download 6.483405587088008, 3.433965476645976 despite several retries and waiting\n", + "Could not download 6.49837750849, 3.433965476645976 despite several retries and waiting\n", + "Could not download 6.513349429891992, 3.433965476645976 despite several retries and waiting\n", + "Could not download 6.528321351293984, 3.433965476645976 despite several retries and waiting\n", + "Could not download 6.543293272695976, 3.433965476645976 despite several retries and waiting\n", + "Could not download 6.501125953552996, 3.3779899290701763 despite several retries and waiting\n", + "Could not download 6.50045244209, 3.59690898231 despite several retries and waiting\n", + "Could not download 6.456820426674025, 4.799983078813984 despite several retries and waiting\n", + "Could not download 6.459431334864023, 3.303253964316016 despite several retries and waiting\n", + "Could not download 6.474403256266015, 
3.303253964316016 despite several retries and waiting\n", + "Could not download 6.489375177668007, 3.303253964316016 despite several retries and waiting\n", + "Could not download 6.504347099069999, 3.303253964316016 despite several retries and waiting\n", + "Could not download 6.459431334864023, 3.318225885718008 despite several retries and waiting\n", + "Could not download 6.474403256266015, 3.318225885718008 despite several retries and waiting\n", + "Could not download 6.489375177668007, 3.318225885718008 despite several retries and waiting\n", + "Could not download 6.459431334864023, 3.33319780712 despite several retries and waiting\n", + "Could not download 6.474403256266015, 3.33319780712 despite several retries and waiting\n", + "Could not download 6.489375177668007, 3.33319780712 despite several retries and waiting\n", + "Could not download 6.504347099069999, 3.348169728521992 despite several retries and waiting\n", + "Could not download 6.534290941873984, 3.348169728521992 despite several retries and waiting\n", + "Could not download 6.549262863275976, 3.348169728521992 despite several retries and waiting\n", + "Could not download 6.459431334864023, 3.3631416499239837 despite several retries and waiting\n", + "Could not download 6.504347099069999, 3.3631416499239837 despite several retries and waiting\n", + "Could not download 6.5193190204719915, 3.3631416499239837 despite several retries and waiting\n", + "Could not download 6.534290941873984, 3.3631416499239837 despite several retries and waiting\n", + "Could not download 6.549262863275976, 3.3631416499239837 despite several retries and waiting\n", + "Could not download 6.459431334864023, 3.378113571325976 despite several retries and waiting\n", + "Could not download 6.504347099069999, 3.378113571325976 despite several retries and waiting\n", + "Could not download 6.5193190204719915, 3.378113571325976 despite several retries and waiting\n", + "Could not download 6.534290941873984, 3.378113571325976 
despite several retries and waiting\n", + "Could not download 6.549262863275976, 3.378113571325976 despite several retries and waiting\n", + "Could not download 6.475854565146016, 8.840218174156016 despite several retries and waiting\n", + "Could not download 6.50579840795, 8.840218174156016 despite several retries and waiting\n", + "Could not download 6.535742250753985, 8.855190095558008 despite several retries and waiting\n", + "Could not download 6.550714172155977, 8.87016201696 despite several retries and waiting\n", + "Could not download 6.50579840795, 8.885133938361992 despite several retries and waiting\n", + "Could not download 6.470551677024024, 3.3030515341760163 despite several retries and waiting\n", + "Could not download 6.485523598426016, 3.3030515341760163 despite several retries and waiting\n", + "Could not download 6.500495519828008, 3.3030515341760163 despite several retries and waiting\n", + "Could not download 6.470551677024024, 3.318023455578008 despite several retries and waiting\n", + "Could not download 6.485523598426016, 3.318023455578008 despite several retries and waiting\n", + "Could not download 6.470551677024024, 3.33299537698 despite several retries and waiting\n", + "Could not download 6.485523598426016, 3.33299537698 despite several retries and waiting\n", + "Could not download 6.500495519828008, 3.347967298381992 despite several retries and waiting\n", + "Could not download 6.545411284033984, 3.347967298381992 despite several retries and waiting\n", + "Could not download 6.5603832054359765, 3.347967298381992 despite several retries and waiting\n", + "Could not download 6.500495519828008, 3.362939219783984 despite several retries and waiting\n", + "Could not download 6.51546744123, 3.362939219783984 despite several retries and waiting\n", + "Could not download 6.530439362631992, 3.362939219783984 despite several retries and waiting\n", + "Could not download 6.545411284033984, 3.362939219783984 despite several retries and waiting\n", 
+ "Could not download 6.5603832054359765, 3.362939219783984 despite several retries and waiting\n", + "Could not download 6.500495519828008, 3.377911141185976 despite several retries and waiting\n", + "Could not download 6.51546744123, 3.377911141185976 despite several retries and waiting\n", + "Could not download 6.530439362631992, 3.377911141185976 despite several retries and waiting\n", + "Could not download 6.545411284033984, 3.377911141185976 despite several retries and waiting\n", + "Could not download 6.5603832054359765, 3.377911141185976 despite several retries and waiting\n", + "Could not download 6.546280651193984, 5.672829526930001 despite several retries and waiting\n", + "Could not download 6.504390547856016, 3.3426545327740245 despite several retries and waiting\n", + "Could not download 6.53433439066, 3.3426545327740245 despite several retries and waiting\n", + "Could not download 6.549306312061992, 3.3426545327740245 despite several retries and waiting\n", + "Could not download 6.564278233463984, 3.3426545327740245 despite several retries and waiting\n", + "Could not download 6.504390547856016, 3.3576264541760166 despite several retries and waiting\n", + "Could not download 6.53433439066, 3.3576264541760166 despite several retries and waiting\n", + "Could not download 6.549306312061992, 3.3576264541760166 despite several retries and waiting\n", + "Could not download 6.564278233463984, 3.3576264541760166 despite several retries and waiting\n", + "Could not download 6.504390547856016, 3.3725983755780082 despite several retries and waiting\n", + "Could not download 6.519362469258008, 3.3725983755780082 despite several retries and waiting\n" + ], + "name": "stdout" + }, + { + "output_type": "stream", + "text": [ + "ERROR:root:Error-could not download 6.53433439066, 3.3725983755780082\n", + "Traceback (most recent call last):\n", + " File \"\", line 51, in download_images\n", + " im = imd.download_image(lat, lon, min_year, min_month, max_year, 
max_month)\n", + " File \"gdrive/MyDrive/geo/utils/planet_downloader.py\", line 95, in download_image\n", + " res = json.loads(result.text)\n", + " File \"/usr/lib/python3.7/json/__init__.py\", line 348, in loads\n", + " return _default_decoder.decode(s)\n", + " File \"/usr/lib/python3.7/json/decoder.py\", line 337, in decode\n", + " obj, end = self.raw_decode(s, idx=_w(s, 0).end())\n", + " File \"/usr/lib/python3.7/json/decoder.py\", line 355, in raw_decode\n", + " raise JSONDecodeError(\"Expecting value\", s, err.value) from None\n", + "json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)\n" + ], + "name": "stderr" + }, + { + "output_type": "stream", + "text": [ + "Could not download 6.549306312061992, 3.3725983755780082 despite several retries and waiting\n", + "Could not download 6.564278233463984, 3.3725983755780082 despite several retries and waiting\n", + "Could not download 6.579250154865976, 3.3725983755780082 despite several retries and waiting\n" + ], + "name": "stdout" + }, + { + "output_type": "stream", + "text": [ + "ERROR:root:Error-could not download 6.4894186264540235, 3.3875702969800003\n", + "Traceback (most recent call last):\n", + " File \"\", line 51, in download_images\n", + " im = imd.download_image(lat, lon, min_year, min_month, max_year, max_month)\n", + " File \"gdrive/MyDrive/geo/utils/planet_downloader.py\", line 95, in download_image\n", + " res = json.loads(result.text)\n", + " File \"/usr/lib/python3.7/json/__init__.py\", line 348, in loads\n", + " return _default_decoder.decode(s)\n", + " File \"/usr/lib/python3.7/json/decoder.py\", line 337, in decode\n", + " obj, end = self.raw_decode(s, idx=_w(s, 0).end())\n", + " File \"/usr/lib/python3.7/json/decoder.py\", line 355, in raw_decode\n", + " raise JSONDecodeError(\"Expecting value\", s, err.value) from None\n", + "json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)\n" + ], + "name": "stderr" + }, + { + "output_type": "stream", + "text": [ + 
"Could not download 6.504390547856016, 3.3875702969800003 despite several retries and waiting\n", + "Could not download 6.519362469258008, 3.3875702969800003 despite several retries and waiting\n", + "Could not download 6.53433439066, 3.3875702969800003 despite several retries and waiting\n", + "Could not download 6.549306312061992, 3.3875702969800003 despite several retries and waiting\n", + "Could not download 6.564278233463984, 3.3875702969800003 despite several retries and waiting\n", + "Could not download 6.579250154865976, 3.3875702969800003 despite several retries and waiting\n", + "Could not download 6.4894186264540235, 3.4025422183819924 despite several retries and waiting\n", + "Could not download 6.504390547856016, 3.4025422183819924 despite several retries and waiting\n", + "Could not download 6.519362469258008, 3.4025422183819924 despite several retries and waiting\n", + "Could not download 6.53433439066, 3.4025422183819924 despite several retries and waiting\n", + "Could not download 6.549306312061992, 3.4025422183819924 despite several retries and waiting\n", + "Could not download 6.564278233463984, 3.4025422183819924 despite several retries and waiting\n", + "Could not download 6.579250154865976, 3.4025422183819924 despite several retries and waiting\n", + "Could not download 6.4894186264540235, 3.417514139783984 despite several retries and waiting\n", + "Could not download 6.504390547856016, 3.417514139783984 despite several retries and waiting\n", + "Could not download 6.519362469258008, 3.417514139783984 despite several retries and waiting\n", + "Could not download 6.53433439066, 3.417514139783984 despite several retries and waiting\n", + "Could not download 6.549306312061992, 3.417514139783984 despite several retries and waiting\n", + "Could not download 6.564278233463984, 3.417514139783984 despite several retries and waiting\n", + "Could not download 6.579250154865976, 3.417514139783984 despite several retries and waiting\n", + "Could not 
download 6.4894186264540235, 3.432486061185976 despite several retries and waiting\n", + "Could not download 6.504390547856016, 3.432486061185976 despite several retries and waiting\n", + "Could not download 6.519362469258008, 3.432486061185976 despite several retries and waiting\n", + "Could not download 6.53433439066, 3.432486061185976 despite several retries and waiting\n", + "Could not download 6.549306312061992, 3.432486061185976 despite several retries and waiting\n", + "Could not download 6.564278233463984, 3.432486061185976 despite several retries and waiting\n", + "Could not download 6.500612866060731, 3.3560429887079737 despite several retries and waiting\n", + "Could not download 6.499420652814024, 3.2970355889740244 despite several retries and waiting\n", + "Could not download 6.499420652814024, 3.3120075103760165 despite several retries and waiting\n", + "Could not download 6.499420652814024, 3.3419513531800003 despite several retries and waiting\n", + "Could not download 6.54433641702, 3.3419513531800003 despite several retries and waiting\n", + "Could not download 6.559308338421992, 3.3419513531800003 despite several retries and waiting\n", + "Could not download 6.5742802598239845, 3.3419513531800003 despite several retries and waiting\n", + "Could not download 6.499420652814024, 3.3569232745819924 despite several retries and waiting\n", + "Could not download 6.54433641702, 3.3569232745819924 despite several retries and waiting\n", + "Could not download 6.559308338421992, 3.3569232745819924 despite several retries and waiting\n", + "Could not download 6.5742802598239845, 3.3569232745819924 despite several retries and waiting\n", + "Could not download 6.499420652814024, 3.371895195983984 despite several retries and waiting\n", + "Could not download 6.514392574216016, 3.371895195983984 despite several retries and waiting\n", + "Could not download 6.529364495618008, 3.371895195983984 despite several retries and waiting\n", + "Could not download 
6.54433641702, 3.371895195983984 despite several retries and waiting\n", + "Could not download 6.559308338421992, 3.371895195983984 despite several retries and waiting\n", + "Could not download 6.5742802598239845, 3.371895195983984 despite several retries and waiting\n", + "Could not download 6.589252181225977, 3.371895195983984 despite several retries and waiting\n", + "Could not download 6.499420652814024, 3.386867117385976 despite several retries and waiting\n", + "Could not download 6.514392574216016, 3.386867117385976 despite several retries and waiting\n", + "Could not download 6.529364495618008, 3.386867117385976 despite several retries and waiting\n", + "Could not download 6.54433641702, 3.386867117385976 despite several retries and waiting\n", + "Could not download 6.559308338421992, 3.386867117385976 despite several retries and waiting\n", + "Could not download 6.5742802598239845, 3.386867117385976 despite several retries and waiting\n", + "Could not download 6.589252181225977, 3.386867117385976 despite several retries and waiting\n", + "Could not download 6.504260955087261, 3.3065040275603064 despite several retries and waiting\n", + "Could not download 6.536318459618008, 8.869393726864024 despite several retries and waiting\n", + "Could not download 6.55129038102, 8.869393726864024 despite several retries and waiting\n", + "Could not download 6.5812342238239845, 8.899337569668008 despite several retries and waiting\n", + "Could not download 6.566262302421992, 8.944253333873984 despite several retries and waiting\n", + "Could not download 6.631121206735976, 3.198581672726016 despite several retries and waiting\n", + "Could not download 6.543993772714023, 3.348758235476016 despite several retries and waiting\n", + "Could not download 6.558965694116015, 3.348758235476016 despite several retries and waiting\n", + "Could not download 6.573937615518007, 3.348758235476016 despite several retries and waiting\n", + "Could not download 6.543993772714023, 
3.363730156878008 despite several retries and waiting\n", + "Could not download 6.558965694116015, 3.363730156878008 despite several retries and waiting\n", + "Could not download 6.573937615518007, 3.363730156878008 despite several retries and waiting\n", + "Could not download 6.588909536919999, 3.363730156878008 despite several retries and waiting\n", + "Could not download 6.543993772714023, 3.37870207828 despite several retries and waiting\n", + "Could not download 6.558965694116015, 3.37870207828 despite several retries and waiting\n", + "Could not download 6.573937615518007, 3.37870207828 despite several retries and waiting\n", + "Could not download 6.588909536919999, 3.37870207828 despite several retries and waiting\n", + "Could not download 6.543993772714023, 3.393673999681992 despite several retries and waiting\n", + "Could not download 6.558965694116015, 3.393673999681992 despite several retries and waiting\n", + "Could not download 6.573937615518007, 3.393673999681992 despite several retries and waiting\n", + "Could not download 6.588909536919999, 3.393673999681992 despite several retries and waiting\n", + "Could not download 6.603881458321991, 3.393673999681992 despite several retries and waiting\n", + "Could not download 6.6188533797239835, 3.393673999681992 despite several retries and waiting\n", + "Could not download 6.543993772714023, 3.4086459210839837 despite several retries and waiting\n", + "Could not download 6.558965694116015, 3.4086459210839837 despite several retries and waiting\n", + "Could not download 6.573937615518007, 3.4086459210839837 despite several retries and waiting\n", + "Could not download 6.588909536919999, 3.4086459210839837 despite several retries and waiting\n", + "Could not download 6.543993772714023, 3.423617842485976 despite several retries and waiting\n", + "Could not download 6.558965694116015, 3.423617842485976 despite several retries and waiting\n", + "Could not download 6.573937615518007, 3.423617842485976 despite 
several retries and waiting\n", + "Could not download 6.588909536919999, 3.423617842485976 despite several retries and waiting\n", + "Could not download 6.553862450814022, 3.345283882305976 despite several retries and waiting\n", + "Could not download 6.568834372216014, 3.345283882305976 despite several retries and waiting\n", + "Could not download 6.554027420784024, 3.343271466031992 despite several retries and waiting\n", + "Could not download 6.568999342186016, 3.343271466031992 despite several retries and waiting\n", + "Could not download 6.554027420784024, 3.358243387433984 despite several retries and waiting\n", + "Could not download 6.568999342186016, 3.358243387433984 despite several retries and waiting\n", + "Could not download 6.554027420784024, 3.373215308835976 despite several retries and waiting\n", + "Could not download 6.568999342186016, 3.373215308835976 despite several retries and waiting\n", + "Could not download 6.583971263588008, 3.373215308835976 despite several retries and waiting\n", + "Could not download 6.59894318499, 3.373215308835976 despite several retries and waiting\n", + "Could not download 6.556147980314024, 3.3528454902359757 despite several retries and waiting\n", + "Could not download 6.571119901716016, 3.3528454902359757 despite several retries and waiting\n", + "Could not download 6.585449447434024, 3.3630790885759763 despite several retries and waiting\n", + "Could not download 6.624188540466016, 3.867525352093984 despite several retries and waiting\n", + "Could not download 6.639160461868008, 3.867525352093984 despite several retries and waiting\n", + "Could not download 6.624188540466016, 3.882497273495976 despite several retries and waiting\n", + "Could not download 6.709727838735977, 3.1194055995460164 despite several retries and waiting\n", + "Could not download 6.634868231726016, 3.194265206555976 despite several retries and waiting\n", + "Could not download 6.6674215866180075, 7.253603776614024 despite several retries 
and waiting\n", + "Could not download 6.652449665216015, 7.268575698016016 despite several retries and waiting\n", + "Could not download 6.6674215866180075, 7.268575698016016 despite several retries and waiting\n", + "Could not download 6.68239350802, 7.283547619418008 despite several retries and waiting\n", + "Could not download 6.637477743814023, 7.29851954082 despite several retries and waiting\n", + "Could not download 6.637477743814023, 7.313491462221992 despite several retries and waiting\n", + "Could not download 6.712337350823984, 7.313491462221992 despite several retries and waiting\n", + "Could not download 6.637477743814023, 7.328463383623984 despite several retries and waiting\n", + "Could not download 6.6426397093640235, 3.20068664011 despite several retries and waiting\n", + "Could not download 6.69118125694, 8.090186760154024 despite several retries and waiting\n", + "Could not download 6.721125099743984, 8.105158681556016 despite several retries and waiting\n", + "Could not download 6.664060555186017, 3.056837762044024 despite several retries and waiting\n", + "Could not download 6.738920162195978, 3.056837762044024 despite several retries and waiting\n", + "Could not download 6.679032476588009, 3.0718096834460162 despite several retries and waiting\n", + "Could not download 6.6940043979900015, 3.086781604848008 despite several retries and waiting\n", + "Could not download 6.723948240793986, 3.086781604848008 despite several retries and waiting\n", + "Could not download 6.708976319391994, 3.116725447651992 despite several retries and waiting\n", + "Could not download 6.738920162195978, 3.1316973690539838 despite several retries and waiting\n" + ], + "name": "stdout" + }, + { + "output_type": "stream", + "text": [ + "ERROR:root:Error-could not download 6.761198113495976, 8.902699118784021\n", + "Traceback (most recent call last):\n", + " File \"\", line 46, in download_images\n", + " im = imd.download_image(lat, lon, min_year, min_month, max_year, 
max_month)\n", + " File \"gdrive/MyDrive/geo/utils/planet_downloader.py\", line 95, in download_image\n", + " res = json.loads(result.text)\n", + " File \"/usr/lib/python3.7/json/__init__.py\", line 348, in loads\n", + " return _default_decoder.decode(s)\n", + " File \"/usr/lib/python3.7/json/decoder.py\", line 337, in decode\n", + " obj, end = self.raw_decode(s, idx=_w(s, 0).end())\n", + " File \"/usr/lib/python3.7/json/decoder.py\", line 355, in raw_decode\n", + " raise JSONDecodeError(\"Expecting value\", s, err.value) from None\n", + "json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)\n" + ], + "name": "stderr" + }, + { + "output_type": "stream", + "text": [ + "Could not download 6.694550227774024, 3.025148089914024 despite several retries and waiting\n", + "Could not download 6.724494070578008, 3.025148089914024 despite several retries and waiting\n", + "Could not download 6.7544379133819925, 3.025148089914024 despite several retries and waiting\n", + "Could not download 6.769409834783985, 3.025148089914024 despite several retries and waiting\n", + "Could not download 6.709522149176016, 3.040120011316016 despite several retries and waiting\n", + "Could not download 6.724494070578008, 3.040120011316016 despite several retries and waiting\n", + "Could not download 6.73946599198, 3.07006385412 despite several retries and waiting\n", + "Could not download 6.7544379133819925, 3.085035775521992 despite several retries and waiting\n", + "Could not download 6.784381756185977, 3.1149796183259757 despite several retries and waiting\n", + "Could not download 6.73947844291, 3.136252953026016 despite several retries and waiting\n", + "Could not download 6.723021555624024, 7.633186747338008 despite several retries and waiting\n", + "Could not download 6.818706899445977, 3.1823028681560164 despite several retries and waiting\n", + "Could not download 6.803734978043985, 3.197274789558008 despite several retries and waiting\n", + "Could not download 
6.818706899445977, 3.197274789558008 despite several retries and waiting\n", + "Could not download 6.810196777023983, 3.2035536425840245 despite several retries and waiting\n", + "Could not download 6.745360146454024, 4.4881754665 despite several retries and waiting\n", + "Could not download 6.760332067856016, 4.533091230705977 despite several retries and waiting\n", + "Could not download 6.82426073472, 3.174401844454024 despite several retries and waiting\n", + "Could not download 6.839232656121992, 3.174401844454024 despite several retries and waiting\n", + "Could not download 6.809288813318008, 3.189373765856016 despite several retries and waiting\n", + "Could not download 6.8691764989259765, 3.204345687258008 despite several retries and waiting\n", + "Could not download 6.839232656121992, 3.21931760866 despite several retries and waiting\n", + "Could not download 6.8691764989259765, 3.234289530061992 despite several retries and waiting\n", + "Could not download 6.849107800415081, 3.2092273528897355 despite several retries and waiting\n", + "Could not download 6.842401553481992, 7.418058468686015 despite several retries and waiting\n", + "Could not download 6.8573734748839845, 7.418058468686015 despite several retries and waiting\n", + "Could not download 6.872345396285977, 7.418058468686015 despite several retries and waiting\n", + "Could not download 6.842401553481992, 7.492918075695975 despite several retries and waiting\n", + "Could not download 6.880465844363984, 8.430195644648007 despite several retries and waiting\n", + "Could not download 6.85900318163, 8.885972186535977 despite several retries and waiting\n", + "Could not download 6.881680958474024, 7.487816047954024 despite several retries and waiting\n", + "Could not download 6.904864771504025, 7.448601936896016 despite several retries and waiting\n", + "Could not download 6.919836692906017, 7.463573858298008 despite several retries and waiting\n", + "Could not download 6.934808614308009, 
7.463573858298008 despite several retries and waiting\n", + "Could not download 6.917319070494024, 3.609559104614024 despite several retries and waiting\n", + "Could not download 6.917319070494024, 3.624531026016016 despite several retries and waiting\n", + "Could not download 6.932290991896016, 3.65447486882 despite several retries and waiting\n", + "Could not download 6.932290991896016, 3.669446790221992 despite several retries and waiting\n", + "Could not download 6.947262913298008, 3.669446790221992 despite several retries and waiting\n", + "Could not download 7.007150598905977, 3.669446790221992 despite several retries and waiting\n", + "Could not download 6.932290991896016, 3.6844187116239837 despite several retries and waiting\n", + "Could not download 6.9921786775039845, 3.6844187116239837 despite several retries and waiting\n", + "Could not download 7.009570499581993, 3.650173005834024 despite several retries and waiting\n", + "Could not download 6.994598578180001, 3.665144927236016 despite several retries and waiting\n", + "Could not download 6.964654735376016, 3.69508877004 despite several retries and waiting\n", + "Could not download 7.009570499581993, 3.69508877004 despite several retries and waiting\n", + "Could not download 6.949682813974024, 3.710060691441992 despite several retries and waiting\n", + "Could not download 6.994598578180001, 3.710060691441992 despite several retries and waiting\n", + "Could not download 7.039514342385977, 3.7250326128439837 despite several retries and waiting\n", + "Could not download 7.024542420983985, 3.740004534245976 despite several retries and waiting\n", + "Could not download 6.953261713234023, 3.607852866766016 despite several retries and waiting\n", + "Could not download 6.968233634636015, 3.622824788168008 despite several retries and waiting\n", + "Could not download 6.99817747744, 3.652768630971992 despite several retries and waiting\n", + "Could not download 7.013149398841992, 3.652768630971992 despite 
several retries and waiting\n", + "Could not download 6.99817747744, 3.6677405523739837 despite several retries and waiting\n", + "Could not download 7.013149398841992, 3.6677405523739837 despite several retries and waiting\n", + "Could not download 7.013149398841992, 3.6827124737759758 despite several retries and waiting\n", + "Could not download 7.043093241645976, 3.6827124737759758 despite several retries and waiting\n", + "Could not download 6.993899744958008, 3.759700670524025 despite several retries and waiting\n", + "Could not download 7.00887166636, 3.759700670524025 despite several retries and waiting\n", + "Could not download 7.023843587761992, 3.7896445133280086 despite several retries and waiting\n", + "Could not download 7.038815509163984, 3.7896445133280086 despite several retries and waiting\n", + "Could not download 7.038815509163984, 3.8495321989359765 despite several retries and waiting\n", + "Could not download 6.968371756184023, 3.603065300505976 despite several retries and waiting\n", + "Could not download 7.123776591733984, 4.778507805014024 despite several retries and waiting\n", + "Could not download 7.09383274893, 4.793479726416016 despite several retries and waiting\n", + "Could not download 7.09383274893, 4.808451647818008 despite several retries and waiting\n", + "Could not download 7.118890978694023, 3.2466539307940243 despite several retries and waiting\n", + "Could not download 7.133862900096015, 3.2616258521960164 despite several retries and waiting\n", + "Could not download 7.118890978694023, 3.276597773598008 despite several retries and waiting\n", + "Could not download 7.155518622468008, 3.365200087313984 despite several retries and waiting\n", + "Could not download 7.245357438293984, 4.902629393894023 despite several retries and waiting\n", + "Could not download 7.260329359695976, 4.917601315296015 despite several retries and waiting\n", + "Could not download 7.200441674088007, 4.9325732366980075 despite several retries and 
waiting\n", + "Could not download 7.245357438293984, 4.9325732366980075 despite several retries and waiting\n", + "Could not download 7.2303855168919915, 4.9475451581 despite several retries and waiting\n", + "Could not download 7.245357438293984, 4.9475451581 despite several retries and waiting\n", + "Could not download 7.260329359695976, 4.9475451581 despite several retries and waiting\n", + "Could not download 7.260329359695976, 4.962517079501992 despite several retries and waiting\n", + "Could not download 7.260329359695976, 4.977489000903984 despite several retries and waiting\n", + "Could not download 7.253882544021992, 7.191676208054024 despite several retries and waiting\n", + "Could not download 7.268854465423984, 7.206648129456016 despite several retries and waiting\n", + "Could not download 7.273432766373984, 4.88333742804 despite several retries and waiting\n", + "Could not download 7.258460844971992, 4.913281270843984 despite several retries and waiting\n", + "Could not download 7.213545080766016, 4.928253192245976 despite several retries and waiting\n", + "Could not download 7.258460844971992, 4.928253192245976 despite several retries and waiting\n", + "Could not download 7.273432766373984, 4.928253192245976 despite several retries and waiting\n", + "Could not download 7.2884046877759765, 4.928253192245976 despite several retries and waiting\n", + "Could not download 7.289811854895976, 4.876583980615976 despite several retries and waiting\n", + "Could not download 7.284933328458009, 5.140163299796016 despite several retries and waiting\n", + "Could not download 7.274171415134022, 4.939282913204024 despite several retries and waiting\n", + "Could not download 7.289143336536014, 4.939282913204024 despite several retries and waiting\n", + "Could not download 7.304115257938006, 4.939282913204024 despite several retries and waiting\n", + "Could not download 7.334059100741991, 4.939282913204024 despite several retries and waiting\n", + "Could not download 
7.289143336536014, 4.954254834606016 despite several retries and waiting\n", + "Could not download 7.334059100741991, 4.954254834606016 despite several retries and waiting\n", + "Could not download 7.334059100741991, 4.969226756008008 despite several retries and waiting\n", + "Could not download 7.349031022143983, 4.969226756008008 despite several retries and waiting\n", + "Could not download 7.364002943545975, 4.969226756008008 despite several retries and waiting\n", + "Could not download 7.364002943545975, 4.98419867741 despite several retries and waiting\n", + "Could not download 7.304115257938006, 4.999170598811992 despite several retries and waiting\n", + "Could not download 7.364002943545975, 4.999170598811992 despite several retries and waiting\n", + "Could not download 7.3190871793399985, 5.014142520213984 despite several retries and waiting\n", + "Could not download 7.334059100741991, 5.014142520213984 despite several retries and waiting\n", + "Could not download 7.4162054131839845, 3.843102910034024 despite several retries and waiting\n", + "Could not download 7.410451191728009, 4.537162084498008 despite several retries and waiting\n", + "Could not download 7.413233856424024, 3.8326875529240247 despite several retries and waiting\n", + "Could not download 7.419940563274025, 5.459334583103984 despite several retries and waiting\n", + "Could not download 7.499920944693983, 4.581471422471992 despite several retries and waiting\n", + "Could not download 7.467449884376016, 4.492781585124024 despite several retries and waiting\n", + "Could not download 7.527337569983985, 4.492781585124024 despite several retries and waiting\n", + "Could not download 7.501940224864023, 3.5242385814380084 despite several retries and waiting\n", + "Could not download 7.576799831873983, 3.5242385814380084 despite several retries and waiting\n", + "Could not download 7.609928345694024, 4.701428455964023 despite several retries and waiting\n", + "Could not download 7.6548441099, 
4.716400377366015 despite several retries and waiting\n", + "Could not download 7.707767585184023, 4.583969018963984 despite several retries and waiting\n", + "Could not download 7.722739506586015, 4.583969018963984 despite several retries and waiting\n", + "Could not download 7.737711427988007, 4.583969018963984 despite several retries and waiting\n", + "Could not download 7.752683349389999, 4.583969018963984 despite several retries and waiting\n", + "Could not download 7.707767585184023, 4.598940940365976 despite several retries and waiting\n", + "Could not download 7.722739506586015, 4.598940940365976 despite several retries and waiting\n", + "Could not download 7.737711427988007, 4.598940940365976 despite several retries and waiting\n", + "Could not download 7.752683349389999, 4.598940940365976 despite several retries and waiting\n", + "Could not download 7.767655270791991, 4.598940940365976 despite several retries and waiting\n", + "Could not download 7.711429930184023, 4.579425538183985 despite several retries and waiting\n", + "Could not download 7.726401851586015, 4.579425538183985 despite several retries and waiting\n", + "Could not download 7.741373772988007, 4.579425538183985 despite several retries and waiting\n", + "Could not download 7.711429930184023, 4.594397459585977 despite several retries and waiting\n", + "Could not download 7.726401851586015, 4.594397459585977 despite several retries and waiting\n", + "Could not download 7.741373772988007, 4.594397459585977 despite several retries and waiting\n", + "Could not download 7.756345694389999, 4.594397459585977 despite several retries and waiting\n", + "Could not download 7.771317615791991, 4.594397459585977 despite several retries and waiting\n", + "Could not download 7.713079722004023, 4.576122964501992 despite several retries and waiting\n", + "Could not download 7.728051643406015, 4.576122964501992 despite several retries and waiting\n", + "Could not download 7.7430235648080075, 4.576122964501992 
despite several retries and waiting\n", + "Could not download 7.713079722004023, 4.591094885903984 despite several retries and waiting\n", + "Could not download 7.728051643406015, 4.591094885903984 despite several retries and waiting\n", + "Could not download 7.7430235648080075, 4.591094885903984 despite several retries and waiting\n", + "Could not download 7.713079722004023, 4.606066807305976 despite several retries and waiting\n", + "Could not download 7.728051643406015, 4.606066807305976 despite several retries and waiting\n", + "Could not download 7.7430235648080075, 4.606066807305976 despite several retries and waiting\n", + "Could not download 7.75799548621, 4.606066807305976 despite several retries and waiting\n", + "Could not download 7.772967407611992, 4.606066807305976 despite several retries and waiting\n", + "Could not download 7.724900485984023, 4.577488744251992 despite several retries and waiting\n", + "Could not download 7.739872407386015, 4.577488744251992 despite several retries and waiting\n", + "Could not download 7.724900485984023, 4.592460665653984 despite several retries and waiting\n", + "Could not download 7.739872407386015, 4.592460665653984 despite several retries and waiting\n", + "Could not download 7.754844328788007, 4.592460665653984 despite several retries and waiting\n", + "Could not download 7.769816250189999, 4.592460665653984 despite several retries and waiting\n", + "Could not download 7.724900485984023, 4.6074325870559765 despite several retries and waiting\n", + "Could not download 7.739872407386015, 4.6074325870559765 despite several retries and waiting\n", + "Could not download 7.754844328788007, 4.6074325870559765 despite several retries and waiting\n", + "Could not download 7.769816250189999, 4.6074325870559765 despite several retries and waiting\n", + "Could not download 7.735016919137214, 4.572560505861917 despite several retries and waiting\n", + "Could not download 7.758157468948006, 4.756039090326016 despite several 
retries and waiting\n", + "Could not download 7.812351810281992, 4.733887108564023 despite several retries and waiting\n", + "Could not download 7.79737988888, 4.793774794171991 despite several retries and waiting\n", + "Could not download 7.827323731683984, 4.8237186369759755 despite several retries and waiting\n", + "Could not download 7.773978860634022, 4.605081113835977 despite several retries and waiting\n", + "Could not download 7.839901875710001, 4.907028441278008 despite several retries and waiting\n", + "Could not download 7.843275973094025, 5.1299048293139835 despite several retries and waiting\n", + "Could not download 7.878635412896016, 4.702326264971992 despite several retries and waiting\n", + "Could not download 7.9085792557, 4.702326264971992 despite several retries and waiting\n", + "Could not download 7.9085792557, 4.717298186373984 despite several retries and waiting\n", + "Could not download 7.878635412896016, 4.732270107775976 despite several retries and waiting\n", + "Could not download 7.872328893934022, 5.787084440111991 despite several retries and waiting\n", + "Could not download 7.919674841769999, 4.6938192515359765 despite several retries and waiting\n", + "Could not download 7.916264688568008, 4.689968792436016 despite several retries and waiting\n", + "Could not download 7.916264688568008, 4.764828399445976 despite several retries and waiting\n", + "Could not download 8.301023658063984, 4.233550498574024 despite several retries and waiting\n", + "Could not download 8.445723187251993, 4.603914315263984 despite several retries and waiting\n", + "Could not download 8.398237999484023, 5.033339678128008 despite several retries and waiting\n" + ], + "name": "stdout" + }, + { + "output_type": "stream", + "text": [ + "ERROR:root:Error-could not download 8.445122201819999, 10.5898676009\n", + "Traceback (most recent call last):\n", + " File \"\", line 46, in download_images\n", + " im = imd.download_image(lat, lon, min_year, min_month, 
max_year, max_month)\n", + " File \"gdrive/MyDrive/geo/utils/planet_downloader.py\", line 95, in download_image\n", + " res = json.loads(result.text)\n", + " File \"/usr/lib/python3.7/json/__init__.py\", line 348, in loads\n", + " return _default_decoder.decode(s)\n", + " File \"/usr/lib/python3.7/json/decoder.py\", line 337, in decode\n", + " obj, end = self.raw_decode(s, idx=_w(s, 0).end())\n", + " File \"/usr/lib/python3.7/json/decoder.py\", line 355, in raw_decode\n", + " raise JSONDecodeError(\"Expecting value\", s, err.value) from None\n", + "json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)\n", + "ERROR:root:Error-could not download 8.430150280418006, 10.604839522301992\n", + "Traceback (most recent call last):\n", + " File \"\", line 46, in download_images\n", + " im = imd.download_image(lat, lon, min_year, min_month, max_year, max_month)\n", + " File \"gdrive/MyDrive/geo/utils/planet_downloader.py\", line 95, in download_image\n", + " res = json.loads(result.text)\n", + " File \"/usr/lib/python3.7/json/__init__.py\", line 348, in loads\n", + " return _default_decoder.decode(s)\n", + " File \"/usr/lib/python3.7/json/decoder.py\", line 337, in decode\n", + " obj, end = self.raw_decode(s, idx=_w(s, 0).end())\n", + " File \"/usr/lib/python3.7/json/decoder.py\", line 355, in raw_decode\n", + " raise JSONDecodeError(\"Expecting value\", s, err.value) from None\n", + "json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)\n" + ], + "name": "stderr" + }, + { + "output_type": "stream", + "text": [ + "Could not download 8.442183483694023, 4.600277017325976 despite several retries and waiting\n", + "Could not download 8.449753561004023, 4.592374400183984 despite several retries and waiting\n", + "Could not download 8.449753561004023, 4.607346321585976 despite several retries and waiting\n", + "Could not download 9.167971645683984, 8.795682521465977 despite several retries and waiting\n", + "Could not download 9.182943567085976, 
8.795682521465977 despite several retries and waiting\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "xVOodZH2iqTZ" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/predicting-poverty-education-replication/scripts/feature_extract.ipynb b/predicting-poverty-education-replication/scripts/feature_extract.ipynb new file mode 100644 index 0000000..5878a9d --- /dev/null +++ b/predicting-poverty-education-replication/scripts/feature_extract.ipynb @@ -0,0 +1,744 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using the images marked as valid per cluster, we pass them through the CNN and extract their feature vectors. The results are stored on a per-country basis. For example, all Malawi feature extractions will go into results/malawi_2016/cnn." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import shutil\n", + "import numpy as np\n", + "import pandas as pd\n", + "from tqdm.notebook import tqdm\n", + "import pickle\n", + "\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.optim as optim\n", + "import numpy as np\n", + "import torchvision\n", + "from torchvision import datasets, models, transforms\n", + "import matplotlib.pyplot as plt\n", + "import time\n", + "import copy" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "BASE_DIR = '..'\n", + "COUNTRIES_DIR = os.path.join(BASE_DIR, 'data', 'countries')\n", + "PROCESSED_DIR = os.path.join(BASE_DIR, 'data', 'processed')\n", + "RESULTS_DIR = os.path.join(BASE_DIR, 'results')\n", + "CNN_TRAIN_IMAGE_DIR = os.path.join(BASE_DIR, 'data', 'cnn_images')\n", + "CNN_DIR = os.path.join(BASE_DIR, 'models', 'trained_model.pt')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + 
"source": [ + "os.makedirs(RESULTS_DIR, exist_ok=True)\n", + "for country in ['malawi_2016', 'ethiopia_2015', 'nigeria_2015']:\n", + " os.makedirs(os.path.join(RESULTS_DIR, country), exist_ok=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Feature extract with CNN\n", + "If you have run this step before, you can skip it and run the commented out code in the next section to quick-start." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "df_images = pd.read_csv(os.path.join(PROCESSED_DIR, 'image_download_actual.csv'))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
image_nameimage_latimage_loncluster_latcluster_loncons_pcnightlightscountrynightlights_binis_train
04.31578611574_6.223837135554024_4.31578611574_...4.3157866.2238374.3157866.2687534.3177170.123354ng1True
14.330758037141992_6.223837135554024_4.31578611...4.3307586.2238374.3157866.2687534.3177170.123354ng1True
24.285842272936016_6.238809056956016_4.31578611...4.2858426.2388094.3157866.2687534.3177170.123354ng1True
34.270870351534024_6.253780978358008_4.31578611...4.2708706.2537814.3157866.2687534.3177170.123354ng1True
44.345729958543984_6.253780978358008_4.31578611...4.3457306.2537814.3157866.2687534.3177170.123354ng1True
\n", + "
" + ], + "text/plain": [ + " image_name image_lat image_lon \\\n", + "0 4.31578611574_6.223837135554024_4.31578611574_... 4.315786 6.223837 \n", + "1 4.330758037141992_6.223837135554024_4.31578611... 4.330758 6.223837 \n", + "2 4.285842272936016_6.238809056956016_4.31578611... 4.285842 6.238809 \n", + "3 4.270870351534024_6.253780978358008_4.31578611... 4.270870 6.253781 \n", + "4 4.345729958543984_6.253780978358008_4.31578611... 4.345730 6.253781 \n", + "\n", + " cluster_lat cluster_lon cons_pc nightlights country nightlights_bin \\\n", + "0 4.315786 6.268753 4.317717 0.123354 ng 1 \n", + "1 4.315786 6.268753 4.317717 0.123354 ng 1 \n", + "2 4.315786 6.268753 4.317717 0.123354 ng 1 \n", + "3 4.315786 6.268753 4.317717 0.123354 ng 1 \n", + "4 4.315786 6.268753 4.317717 0.123354 ng 1 \n", + "\n", + " is_train \n", + "0 True \n", + "1 True \n", + "2 True \n", + "3 True \n", + "4 True " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_images.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using cpu as backend\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/serialization.py:493: SourceChangeWarning: source code of class 'torch.nn.modules.container.Sequential' has changed. 
you can retrieve the original source code by accessing the object's source attribute or set `torch.nn.Module.dump_patches = True` and use the patch tool to revert the changes.\n", + " warnings.warn(msg, SourceChangeWarning)\n" + ] + } + ], + "source": [ + "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", + "print(f'Using {device} as backend')\n", + "model = torch.load(CNN_DIR, map_location=device)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Sequential(\n", + " (0): Linear(in_features=25088, out_features=4096, bias=True)\n", + " (1): ReLU(inplace=True)\n", + " (2): Dropout(p=0.5, inplace=False)\n", + " (3): Linear(in_features=4096, out_features=4096, bias=True)\n", + " (4): ReLU(inplace=True)\n", + " (5): Dropout(p=0.5, inplace=False)\n", + " (6): Linear(in_features=4096, out_features=3, bias=True)\n", + ")" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# rip off the final layers\n", + "model.classifier = model.classifier[:4]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Sequential(\n", + " (0): Linear(in_features=25088, out_features=4096, bias=True)\n", + " (1): ReLU(inplace=True)\n", + " (2): Dropout(p=0.5, inplace=False)\n", + " (3): Linear(in_features=4096, out_features=4096, bias=True)\n", + ")" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1712e3d5767847a9b3fef1fe5f4ab51b", + "version_major": 2, + "version_minor": 0 + }, + 
"text/plain": [ + " 0%| | 0/154 [00:00\n data = [self.dataset[idx] for idx in possibly_batched_index]\n File \"\", line 22, in __getitem__\n X = self.filename_to_im_tensor(self.image_dir + '/' + image_name)\n File \"\", line 28, in filename_to_im_tensor\n im = plt.imread(file)[:,:,:3]\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/matplotlib/pyplot.py\", line 2407, in imread\n return matplotlib.image.imread(fname, format)\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/matplotlib/image.py\", line 1501, in imread\n with img_open(fname) as image:\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/PIL/ImageFile.py\", line 95, in __init__\n self.fp = open(fp, \"rb\")\nFileNotFoundError: [Errno 2] No such file or directory: '/Users/TinotendaMatsika/Documents/predicting-poverty-education-replication/data/cnn_images/valid/0/11.965055592105976_8.717694503334023_11.9201398279_8.76261026754.png'\n", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 43\u001b[0m \u001b[0mimage_order\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mdataset\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mimage_list\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 44\u001b[0m \u001b[0;31m# forward pass for this class\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 45\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataloader\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 46\u001b[0m \u001b[0minputs\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0minputs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 47\u001b[0m \u001b[0moutputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/tqdm/notebook.py\u001b[0m in \u001b[0;36m__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 252\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__iter__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 253\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 254\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mobj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtqdm_notebook\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__iter__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 255\u001b[0m \u001b[0;31m# return super(tqdm...) 
will not catch exception\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 256\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/tqdm/std.py\u001b[0m in \u001b[0;36m__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1176\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1177\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1178\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mobj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0miterable\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1179\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1180\u001b[0m \u001b[0;31m# Update and possibly print the progressbar.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/utils/data/dataloader.py\u001b[0m in \u001b[0;36m__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 817\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 818\u001b[0m \u001b[0;32mdel\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_task_info\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 819\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_process_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 820\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 821\u001b[0m \u001b[0mnext\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0m__next__\u001b[0m \u001b[0;31m# Python 2 compatibility\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/utils/data/dataloader.py\u001b[0m in \u001b[0;36m_process_data\u001b[0;34m(self, data)\u001b[0m\n\u001b[1;32m 844\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_try_put_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 845\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mExceptionWrapper\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 846\u001b[0;31m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreraise\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 847\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 848\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/_utils.py\u001b[0m in \u001b[0;36mreraise\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 383\u001b[0m \u001b[0;31m# (https://bugs.python.org/issue2651), so we work around it.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 384\u001b[0m \u001b[0mmsg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mKeyErrorMessage\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 385\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexc_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + 
"\u001b[0;31mFileNotFoundError\u001b[0m: Caught FileNotFoundError in DataLoader worker process 0.\nOriginal Traceback (most recent call last):\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py\", line 178, in _worker_loop\n data = fetcher.fetch(index)\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py\", line 44, in fetch\n data = [self.dataset[idx] for idx in possibly_batched_index]\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py\", line 44, in \n data = [self.dataset[idx] for idx in possibly_batched_index]\n File \"\", line 22, in __getitem__\n X = self.filename_to_im_tensor(self.image_dir + '/' + image_name)\n File \"\", line 28, in filename_to_im_tensor\n im = plt.imread(file)[:,:,:3]\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/matplotlib/pyplot.py\", line 2407, in imread\n return matplotlib.image.imread(fname, format)\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/matplotlib/image.py\", line 1501, in imread\n with img_open(fname) as image:\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/PIL/ImageFile.py\", line 95, in __init__\n self.fp = open(fp, \"rb\")\nFileNotFoundError: [Errno 2] No such file or directory: '/Users/TinotendaMatsika/Documents/predicting-poverty-education-replication/data/cnn_images/valid/0/11.965055592105976_8.717694503334023_11.9201398279_8.76261026754.png'\n" + ] + } + ], + "source": [ + "transformer = transforms.Compose([\n", + " transforms.ToTensor(),\n", + " transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])\n", + " ])\n", + "\n", + "# custom dataset for fast image loading and processing\n", + "# does not follow the usual style of folder -> folder for each class -> image\n", + "# we just want one folder with images\n", + "class ForwardPassDataset(torch.utils.data.Dataset):\n", + " def __init__(self, image_dir, 
transformer):\n", + " self.image_dir = image_dir\n", + " self.image_list = os.listdir(self.image_dir)\n", + " self.transformer = transformer\n", + "\n", + " def __len__(self):\n", + " return len(self.image_list)\n", + "\n", + " def __getitem__(self, index):\n", + " image_name = self.image_list[index]\n", + "\n", + " # Load image\n", + " X = self.filename_to_im_tensor(self.image_dir + '/' + image_name)\n", + " \n", + " # dataloaders need to return a label, but for the forward pass we don't really care\n", + " return X, -1\n", + " \n", + " def filename_to_im_tensor(self, file):\n", + " im = plt.imread(file)[:,:,:3]\n", + " im = self.transformer(im)\n", + " return im\n", + "\n", + "model.eval() \n", + "classes = [0, 1, 2]\n", + "# shape of final array will be (num_validation_images, 4096)\n", + "# we also want to record the image each index represents\n", + "feats = np.zeros(((~df_images['is_train']).sum(), 4096))\n", + "image_order = []\n", + "i = 0\n", + "for c in classes:\n", + " # use the validation images to do the forward pass\n", + " dataset = ForwardPassDataset(os.path.join(CNN_TRAIN_IMAGE_DIR, 'valid', str(c)), transformer)\n", + " dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=False, num_workers=4)\n", + " image_order += dataset.image_list\n", + " # forward pass for this class\n", + " for inputs, _ in tqdm(dataloader):\n", + " inputs = inputs.to(device)\n", + " outputs = model(inputs)\n", + " feats[i:i+len(inputs),:] = outputs.cpu().detach().numpy()\n", + " i += len(inputs)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[-1.26415765, -0.28140783, -0.29993755, ..., 0.33739716,\n", + " -0.96456331, -0.95310527],\n", + " [ 0.55027246, -0.06091447, 0.11403629, ..., -0.08996978,\n", + " -0.62236136, -0.96085918],\n", + " [ 0.52193987, -0.29220241, -0.45371717, ..., 0.34175205,\n", + " -1.1439786 , -0.85960728],\n", + " ...,\n", + " [-0.50936353, 
0.39209121, -0.29870456, ..., 0.0661362 ,\n", + " 0.43009469, -0.34069228],\n", + " [ 0.24428365, 0.07818466, -0.89307284, ..., 0.29522306,\n", + " -0.72958505, -1.24356151],\n", + " [-0.30123377, 0.6785413 , -0.19940855, ..., 0.14395328,\n", + " 0.52420121, -1.16047859]])" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "feats" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
image_namefeat_index
010.513181862198008_39.768191057994024_10.52815...0
1-14.632534157196016_34.981995235794024_-14.662...1
2-14.526346764205977_35.593520078598004_-14.481...2
37.4290196173360155_7.26950147266_7.45896346014...3
4-10.405547698244352_34.14279535535209_-10.4038...4
\n", + "
" + ], + "text/plain": [ + " image_name feat_index\n", + "0 10.513181862198008_39.768191057994024_10.52815... 0\n", + "1 -14.632534157196016_34.981995235794024_-14.662... 1\n", + "2 -14.526346764205977_35.593520078598004_-14.481... 2\n", + "3 7.4290196173360155_7.26950147266_7.45896346014... 3\n", + "4 -10.405547698244352_34.14279535535209_-10.4038... 4" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "forward_pass_df = pd.DataFrame.from_dict({'image_name': image_order, 'feat_index': np.arange(len(image_order))})\n", + "forward_pass_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "df_consumption = pd.merge(left=df_images, right=forward_pass_df, on='image_name')" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "# have we maintained all validation images?\n", + "assert len(df_consumption) == (~df_images['is_train']).sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
image_nameimage_latimage_loncluster_latcluster_loncons_pcnightlightscountrynightlights_binis_trainfeat_index
0-17.125093842803985_35.18726915719602_-17.0951...-17.12509435.187269-17.09515035.2172131.4232390.025206mw0False318
1-17.140065764205975_35.232184921401995_-17.095...-17.14006635.232185-17.09515035.2172131.4232390.025206mw0False1861
2-17.065206157196016_35.262128764205976_-17.095...-17.06520635.262129-17.09515035.2172131.4232390.025206mw0False836
3-17.07737907859801_35.069727235794026_-17.0923...-17.07737935.069727-17.09235135.1146431.2662040.000000mw0False18
4-17.137266764205975_35.08469915719602_-17.0923...-17.13726735.084699-17.09235135.1146431.2662040.000000mw0False1051
\n", + "
" + ], + "text/plain": [ + " image_name image_lat image_lon \\\n", + "0 -17.125093842803985_35.18726915719602_-17.0951... -17.125094 35.187269 \n", + "1 -17.140065764205975_35.232184921401995_-17.095... -17.140066 35.232185 \n", + "2 -17.065206157196016_35.262128764205976_-17.095... -17.065206 35.262129 \n", + "3 -17.07737907859801_35.069727235794026_-17.0923... -17.077379 35.069727 \n", + "4 -17.137266764205975_35.08469915719602_-17.0923... -17.137267 35.084699 \n", + "\n", + " cluster_lat cluster_lon cons_pc nightlights country nightlights_bin \\\n", + "0 -17.095150 35.217213 1.423239 0.025206 mw 0 \n", + "1 -17.095150 35.217213 1.423239 0.025206 mw 0 \n", + "2 -17.095150 35.217213 1.423239 0.025206 mw 0 \n", + "3 -17.092351 35.114643 1.266204 0.000000 mw 0 \n", + "4 -17.092351 35.114643 1.266204 0.000000 mw 0 \n", + "\n", + " is_train feat_index \n", + "0 False 318 \n", + "1 False 1861 \n", + "2 False 836 \n", + "3 False 18 \n", + "4 False 1051 " + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_consumption.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Aggregate Features\n", + "For each country, we aggregate the image features per cluster and save them to results/country/cnn" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "country_abbrv = ['mw', 'eth', 'ng']\n", + "country_dir = ['malawi_2016', 'ethiopia_2015', 'nigeria_2015']\n", + "\n", + "for ca, cd in zip(country_abbrv, country_dir):\n", + " df_c = df_consumption[df_consumption['country'] == ca]\n", + " group = df_c.groupby(['cluster_lat', 'cluster_lon'])\n", + " x = np.zeros((len(group), 4096))\n", + " cluster_list = [] # the corresponding clusters (lat, lon) to the x aggregate feature array\n", + " for i, g in enumerate(group):\n", + " lat, lon = g[0]\n", + " im_sub = df_consumption[(df_consumption['cluster_lat'] == lat) & 
(df_consumption['cluster_lon'] == lon)].reset_index(drop=True)\n", + " agg_feats = np.zeros((len(im_sub), 4096))\n", + " for j, d in im_sub.iterrows():\n", + " agg_feats[j,:] = feats[d.feat_index]\n", + " agg_feats = agg_feats.mean(axis=0) # averages the features across all images in the cluster\n", + "\n", + " x[i,:] = agg_feats\n", + " cluster_list.append([lat, lon])\n", + " # save to the correct directory\n", + " save_dir = os.path.join(RESULTS_DIR, cd, 'cnn')\n", + " os.makedirs(save_dir, exist_ok=True)\n", + " np.save(os.path.join(save_dir, 'cluster_feats.npy'), x)\n", + " pickle.dump(cluster_list, open(os.path.join(save_dir, 'cluster_order.pkl'), 'wb')) \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "environment": { + "name": "pytorch-gpu.1-4.m46", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-4:m46" + }, + "kernelspec": { + "display_name": "testenv", + "language": "python", + "name": "testenv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/predicting-poverty-education-replication/scripts/predict_consumption.ipynb b/predicting-poverty-education-replication/scripts/predict_consumption.ipynb new file mode 100644 index 0000000..86f8a5a --- /dev/null +++ b/predicting-poverty-education-replication/scripts/predict_consumption.ipynb @@ -0,0 +1,462 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using the cluster_feats saved in each country's results folder, we will finally predict the metric of interest (consumption per capita). There are many ways to evaluate our results, I do the following:
\n", + "1) randomized CV - use 5-fold cross validation randomly on all clusters, all countries
\n", + "2) randomized CV per country - use 5-fold cross validation within a country for all countries
\n", + "3) spatial CV per country - use 5-fold cross validation but folds consist of clusters that are geographically close
\n", + "4) cross-country CV - we have three countries, so hold one country out and perform cross validation
\n", + "\n", + "Jean et al use the 2nd evaluation method. One important point: in their code, they use pearson R and square it to get R^2. The more conventional way is shown here: https://en.wikipedia.org/wiki/Coefficient_of_determination. I use the latter method, as in my experience this is standard for reporting R^2. Jean et al's method will likely lead to a higher \"R^2\" than the conventional method, and doing so prevents a negative R^2 whereas the conventional method allows that possibility." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import numpy as np\n", + "import pandas as pd\n", + "import pickle\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [], + "source": [ + "BASE_DIR = '..'\n", + "\n", + "RANDOM_SEED = 7 # for reproducibility\n", + "COUNTRIES_DIR = os.path.join(BASE_DIR, 'data', 'countries')\n", + "RESULTS_DIR = os.path.join(BASE_DIR, 'results')\n", + "FIGURES_DIR = os.path.join(BASE_DIR, 'figures')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(BASE_DIR)\n", + "from utils import merge_on_lat_lon, assign_groups, run_randomized_cv, run_spatial_cv" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Predict Consumption" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def load_country(country):\n", + " '''\n", + " Organizes the country's dataframe so that each index corresponds to the index in the cluster features\n", + " Returns the cluster features and the organized dataframe\n", + " '''\n", + " country_processed_dir = os.path.join(COUNTRIES_DIR, country, 'processed')\n", + " country_results_dir = os.path.join(RESULTS_DIR, country, 'cnn')\n", + " x = np.load(os.path.join(country_results_dir, 
'cluster_feats.npy'))\n", + " cluster_list = pickle.load(open(os.path.join(country_results_dir, 'cluster_order.pkl'), 'rb'))\n", + " cluster_list = pd.DataFrame.from_records(cluster_list, columns=['cluster_lat', 'cluster_lon'])\n", + " cluster_list['feat_index'] = np.arange(len(cluster_list))\n", + " \n", + " df_clusters = pd.read_csv(os.path.join(country_processed_dir, 'clusters.csv'))\n", + " assert len(df_clusters) == len(cluster_list)\n", + "\n", + " df = merge_on_lat_lon(df_clusters, cluster_list, keys=['cluster_lat', 'cluster_lon'])\n", + " assert len(df) == len(df_clusters) == len(cluster_list)\n", + " df.sort_values('feat_index', ascending=True, inplace=True)\n", + " return x, df" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "x_mw, df_mw = load_country('malawi_2016')\n", + "x_eth, df_eth = load_country('ethiopia_2015')\n", + "x_ng, df_ng = load_country('nigeria_2015')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "y_mw = df_mw['cons_pc'].values\n", + "y_eth = df_eth['cons_pc'].values\n", + "y_ng = df_ng['cons_pc'].values" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "def test_fully_randomized_cv():\n", + " print(\"Testing fully randomized CV:\\n--------------\\n\")\n", + " x_all = np.concatenate([x_mw, x_eth, x_ng], axis=0)\n", + " y_all = np.concatenate([y_mw, y_eth, y_ng], axis=0)\n", + " r2_direct, _ = run_randomized_cv(x_all, y_all, random_seed=RANDOM_SEED, to_print=False)\n", + " r2_log, _ = run_randomized_cv(x_all, np.log(y_all), random_seed=RANDOM_SEED)\n", + " print(f\"For fully randomized cv: direct r2: {r2_direct}, log r2: {r2_log}\")\n", + " \n", + "def test_randomized_cv_per_country():\n", + " print(\"Testing per country randomized CV:\\n--------------\\n\")\n", + " xs = [x_mw, x_eth, x_ng]\n", + " ys = [y_mw, y_eth, y_ng]\n", + " countries = 
['malawi_2016', 'ethiopia_2015', 'nigeria_2015']\n", + " for x, y, c in zip(xs, ys, countries):\n", + " r2_direct, _ = run_randomized_cv(x, y, random_seed=RANDOM_SEED, to_print=False)\n", + " r2_log, _ = run_randomized_cv(x, np.log(y), random_seed=RANDOM_SEED)\n", + " print(f\"For {c}, direct r2: {r2_direct}, log r2: {r2_log}\")\n", + " \n", + "def test_spatial_cv_per_country():\n", + " print(\"Testing per country spatial CV:\\n--------------\\n\")\n", + " xs = [x_mw, x_eth, x_ng]\n", + " ys = [y_mw, y_eth, y_ng]\n", + " dfs = [df_mw, df_eth, df_ng]\n", + " countries = ['malawi_2016', 'ethiopia_2015', 'nigeria_2015']\n", + " for x, y, df, c in zip(xs, ys, dfs, countries):\n", + " groups, _ = assign_groups(df, k=5, random_seed=RANDOM_SEED)\n", + " r2_direct, _ = run_spatial_cv(x, y, groups, random_seed=RANDOM_SEED)\n", + " r2_log, _ = run_spatial_cv(x, np.log(y), groups, random_seed=RANDOM_SEED)\n", + " print(f\"For {c}, direct r2: {r2_direct}, log r2: {r2_log}\")\n", + " \n", + "def test_cross_country_cv():\n", + " print(\"Testing cross country CV:\\n--------------\\n\")\n", + " x_all = np.concatenate([x_mw, x_eth, x_ng], axis=0)\n", + " y_all = np.concatenate([y_mw, y_eth, y_ng], axis=0)\n", + " groups = np.zeros_like(y_all)\n", + " groups[len(y_mw) : len(y_mw) + len(y_eth)] = 1 # ethiopia indices become 1\n", + " groups[len(y_mw) + len(y_eth):] = 2 # nigeria indices become 2\n", + " r2_direct, _ = run_spatial_cv(x_all, y_all, groups, random_seed=RANDOM_SEED, k_inner=10)\n", + " r2_log, _ = run_spatial_cv(x_all, np.log(y_all), groups, random_seed=RANDOM_SEED, k_inner=10)\n", + " print(f\"For cross country cv: direct r2: {r2_direct}, log r2: {r2_log}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing fully randomized CV:\n", + "--------------\n", + "\n", + "For fully randomized cv: direct r2: 0.3026856411592201, log r2: 
0.44629388573117235\n" + ] + } + ], + "source": [ + "# this concatenates all countries and runs randomized CV\n", + "test_fully_randomized_cv()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing per country randomized CV:\n", + "--------------\n", + "\n", + "For malawi_2016, direct r2: 0.2563769526470949, log r2: 0.40946728194107446\n", + "For ethiopia_2015, direct r2: 0.15408096665923968, log r2: 0.13757502459231657\n", + "For nigeria_2015, direct r2: 0.19112920163918376, log r2: 0.34070324404134966\n" + ] + } + ], + "source": [ + "# this runs randomized CV per country; this is how Jean et al report their results\n", + "# we can see that the model does perform well, although how well it performs varies greatly\n", + "# in our three countries. It is difficult to make statements about why the model \n", + "# does better in certain countries (and why predicting the log doesn't do better in ethiopia)\n", + "# without more analysis and more countries\n", + "test_randomized_cv_per_country()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing per country spatial CV:\n", + "--------------\n", + "\n", + "For malawi_2016, direct r2: -0.040709767100400904, log r2: 0.15253299922530078\n", + "For ethiopia_2015, direct r2: 0.0876498448297943, log r2: -0.01924967612917705\n", + "For nigeria_2015, direct r2: -0.19857908515776138, log r2: -0.10348441015409007\n" + ] + } + ], + "source": [ + "# spatial CV provides very inconsistent results. this indicates that the model is \n", + "# greatly advantaged in randomized CV, most likely because it can train on one cluster and \n", + "# validate on a nearby cluster. 
this suggests there is still more to be desired for a truly generalizable model\n", + "test_spatial_cv_per_country()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing cross country CV:\n", + "--------------\n", + "\n", + "For cross country cv: direct r2: -1.0104939224653613, log r2: -3.1636548554218886\n" + ] + } + ], + "source": [ + "# the model does not generalize if trained on two countries to predict the third\n", + "# slight tweaks to this function can show how each country fared when held out\n", + "test_cross_country_cv()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Plots" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_predictions(y, yhat, r2, country, max_y=None):\n", + " if max_y is not None:\n", + " yhat = yhat[y < max_y]\n", + " y = y[y < max_y]\n", + " fig = plt.figure(figsize=(8,5))\n", + " plt.scatter(y, yhat, alpha=0.6)\n", + " plt.plot(np.unique(y), np.poly1d(np.polyfit(y, yhat, 1))(np.unique(y)), color='g')\n", + " plt.text(15.5, 7, f'r^2={round(r2, 2)}', size=12)\n", + " plt.xlabel('Actual Consumption($/day)')\n", + " plt.ylabel('Predicted Consumption($/day)')\n", + " plt.title(f'{country} Results')\n", + " return fig" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [], + "source": [ + "r2, yhat_mw = run_randomized_cv(x_mw, y_mw, random_seed=RANDOM_SEED, to_print=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "fig = plot_predictions(y_mw, yhat_mw, r2, 'Malawi', max_y=20)" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [], + "source": [ + "fig.savefig(os.path.join(FIGURES_DIR, 'malawi_results.png'))" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.savefig('malawi_results.png')" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [], + "source": [ + "r2, yhat_eth = run_randomized_cv(x_eth, y_eth, random_seed=RANDOM_SEED, to_print=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "fig = plot_predictions(y_eth, yhat_eth, r2, 'Ethiopia', max_y=30)" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [], + "source": [ + "fig.savefig(os.path.join(FIGURES_DIR, 'ethiopia_results.png'))" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [], + "source": [ + "r2, yhat_ng = run_randomized_cv(x_ng, y_ng, random_seed=RANDOM_SEED, to_print=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "fig = plot_predictions(y_ng, yhat_ng, r2, 'Nigeria', max_y=20)" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": {}, + "outputs": [], + "source": [ + "fig.savefig(os.path.join(FIGURES_DIR, 'nigeria_results.png'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "environment": { + "name": "pytorch-gpu.1-4.m46", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-4:m46" + }, + "kernelspec": { + "display_name": "predicting-poverty-replication", + "language": "python", + "name": "predicting-poverty-replication" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/predicting-poverty-education-replication/scripts/process_survey_data.ipynb b/predicting-poverty-education-replication/scripts/process_survey_data.ipynb new file mode 100644 index 0000000..242dd2a --- /dev/null +++ b/predicting-poverty-education-replication/scripts/process_survey_data.ipynb @@ -0,0 +1,1283 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "environment": { + "name": "pytorch-gpu.1-4.m46", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-4:m46" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + }, + 
"colab": { + "name": "process_survey_data.ipynb", + "provenance": [], + "collapsed_sections": [] + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "QluoD5EBbpfs" + }, + "source": [ + "Make sure you download the 2016 Household LSMS survey data for Malawi, Ethiopia, and Nigeria from https://microdata.worldbank.org/index.php/catalog/lsms and put it in `../data/countries/`. Malawi's data should be named `malawi_2016/LSMS`, Ethiopia's should be named `ethiopia_2015/LSMS`, and Nigeria's should be named `nigeria_2015/LSMS`. Nightlights data should be downloaded from https://ngdc.noaa.gov/eog/viirs/download_dnb_composites.html using the annual composite from 2015 in tile 2 and tile 5." + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Tmzdkl_Qbs5T", + "outputId": "b56bde99-06e3-4428-fb82-2ea94d627415" + }, + "source": [ + "from google.colab import drive\n", + "drive.mount('/content/gdrive')" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Mounted at /content/gdrive\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "u7P0Nfidb4Nk", + "outputId": "bda6ae71-94a4-4a1a-8e2c-5299ff88fd83" + }, + "source": [ + "!pip install geoio" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Collecting geoio\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/4b/26/2647daf5ef5cfb9327abb2709024810cd9b1916f7b334eee33fb81af9073/geoio-1.3.0-py3-none-any.whl (60kB)\n", + "\u001b[K |████████████████████████████████| 61kB 2.3MB/s \n", + "\u001b[?25hRequirement already satisfied: gdal in /usr/local/lib/python3.7/dist-packages (from geoio) (2.2.2)\n", + "Collecting tinytools\n", + " Downloading 
https://files.pythonhosted.org/packages/c3/e6/e335406a22be352c8b680ed5d4d28937ac911dbda9756625006e502bf787/tinytools-1.1.0-py3-none-any.whl\n", + "Requirement already satisfied: ephem in /usr/local/lib/python3.7/dist-packages (from geoio) (3.7.7.1)\n", + "Collecting tzwhere\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/3d/e9/18e4822f6e4640332b97c744378da427bc28d2399235520349bb17e06aa4/tzwhere-3.0.3.tar.gz (23.7MB)\n", + "\u001b[K |████████████████████████████████| 23.7MB 77.2MB/s \n", + "\u001b[?25hRequirement already satisfied: plotly in /usr/local/lib/python3.7/dist-packages (from geoio) (4.4.1)\n", + "Collecting xmltodict\n", + " Downloading https://files.pythonhosted.org/packages/28/fd/30d5c1d3ac29ce229f6bdc40bbc20b28f716e8b363140c26eff19122d8a5/xmltodict-0.12.0-py2.py3-none-any.whl\n", + "Requirement already satisfied: pytz in /usr/local/lib/python3.7/dist-packages (from geoio) (2018.9)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from geoio) (1.19.5)\n", + "Requirement already satisfied: shapely in /usr/local/lib/python3.7/dist-packages (from tzwhere->geoio) (1.7.1)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from plotly->geoio) (1.15.0)\n", + "Requirement already satisfied: retrying>=1.3.3 in /usr/local/lib/python3.7/dist-packages (from plotly->geoio) (1.3.3)\n", + "Building wheels for collected packages: tzwhere\n", + " Building wheel for tzwhere (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for tzwhere: filename=tzwhere-3.0.3-cp37-none-any.whl size=23742602 sha256=63f1d0c4f692cdd75dddfdc4be8ae4685fe39ab26b8b374316052dc096d42839\n", + " Stored in directory: /root/.cache/pip/wheels/89/18/2f/01c958c82b1223d6fe763c6b2fadf45b4f6ce6e8a9fce7bbd5\n", + "Successfully built tzwhere\n", + "Installing collected packages: tinytools, tzwhere, xmltodict, geoio\n", + "Successfully installed geoio-1.3.0 tinytools-1.1.0 tzwhere-3.0.3 xmltodict-0.12.0\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "s_AAVuW-bpfw" + }, + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import os\n", + "import geoio" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "09AsPn0Ubpfy" + }, + "source": [ + "BASE_DIR = 'gdrive/MyDrive/geo'\n", + "NIGHTLIGHTS_DIRS = [os.path.join(BASE_DIR, 'data/nightlights/viirs_2015_00N060W.tif'),\n", + " os.path.join(BASE_DIR, 'data/nightlights/viirs_2015_75N060W.tif')]\n", + "\n", + "COUNTRIES_DIR = os.path.join(BASE_DIR, 'data', 'countries')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "q6bYA-O7bpfy" + }, + "source": [ + "import sys\n", + "sys.path.append(BASE_DIR)\n", + "from utils import create_space" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "qnNhORkUbpfz" + }, + "source": [ + "'''\n", + "The goal of each of these functions is to output a dataframe with the following columns:\n", + "country, cluster_lat, cluster_lon, cons_pc\n", + "\n", + "Each row should represent one cluster by combining the household data\n", + "'''\n", + "\n", + "def process_malawi():\n", + " lsms_dir = os.path.join(COUNTRIES_DIR, 'malawi_2016', 'LSMS')\n", + " consumption_file = 'IHS4 Consumption Aggregate.csv'\n", + " consumption_ph_col = 'rexpagg' # per household\n", + " hhsize_col = 'hhsize' # 
people in household\n", + "\n", + " geovariables_file = 'HouseholdGeovariables_csv/HouseholdGeovariablesIHS4.csv'\n", + " lat_col = 'lat_modified'\n", + " lon_col = 'lon_modified'\n", + "\n", + " # purchasing power parity for malawi in 2016 (https://data.worldbank.org/indicator/PA.NUS.PRVT.PP?locations=MW)\n", + " ppp = 215.182\n", + " \n", + " for file in [consumption_file, geovariables_file]:\n", + " assert os.path.isfile(os.path.join(lsms_dir, file)), print(f'Could not find {file}')\n", + " \n", + " df = pd.read_csv(os.path.join(lsms_dir, consumption_file))\n", + " df['cons_ph'] = df[consumption_ph_col]\n", + " df['pph'] = df[hhsize_col]\n", + " df['cons_ph'] = df['cons_ph'] / ppp / 365\n", + " df = df[['case_id', 'cons_ph', 'pph']]\n", + "\n", + " df_geo = pd.read_csv(os.path.join(lsms_dir, geovariables_file))\n", + " df_cords = df_geo[['case_id', 'HHID', lat_col, lon_col]]\n", + " df_cords.rename(columns={lat_col: 'cluster_lat', lon_col: 'cluster_lon'}, inplace=True)\n", + " df_combined = pd.merge(df, df_cords, on='case_id')\n", + " df_combined.drop(['case_id', 'HHID'], axis=1, inplace=True)\n", + " df_combined.dropna(inplace=True) # can't use na values\n", + " \n", + " df_clusters = df_combined.groupby(['cluster_lat', 'cluster_lon']).sum().reset_index()\n", + " df_clusters['cons_pc'] = df_clusters['cons_ph'] / df_clusters['pph'] # divides total cluster income by people\n", + " df_clusters['country'] = 'mw'\n", + " return df_clusters[['country', 'cluster_lat', 'cluster_lon', 'cons_pc']]\n", + "\n", + "def process_ethiopia():\n", + " lsms_dir = os.path.join(COUNTRIES_DIR, 'ethiopia_2015', 'LSMS')\n", + " consumption_file = 'Consumption Aggregate/cons_agg_w3.csv'\n", + " consumption_pc_col = 'total_cons_ann' # per capita\n", + " hhsize_col = 'hh_size' # people in household\n", + "\n", + " geovariables_file = 'ETH_2015_ESS_v03_M_CSV/Geovariables/ETH_HouseholdGeovars_y3.csv'\n", + " lat_col = 'lat_dd_mod'\n", + " lon_col = 'lon_dd_mod'\n", + "\n", + " # purchasing 
power parity for ethiopia in 2015 (https://data.worldbank.org/indicator/PA.NUS.PRVT.PP?locations=ET)\n", + " ppp = 7.882\n", + " \n", + " for file in [consumption_file, geovariables_file]:\n", + " assert os.path.isfile(os.path.join(lsms_dir, file)), print(f'Could not find {file}')\n", + " \n", + " df = pd.read_csv(os.path.join(lsms_dir, consumption_file))\n", + " df['cons_ph'] = df[consumption_pc_col] * df[hhsize_col]\n", + " df['pph'] = df[hhsize_col]\n", + " df['cons_ph'] = df['cons_ph'] / ppp / 365\n", + " df = df[['household_id2', 'cons_ph', 'pph']]\n", + "\n", + " df_geo = pd.read_csv(os.path.join(lsms_dir, geovariables_file))\n", + " df_cords = df_geo[['household_id2', lat_col, lon_col]]\n", + " df_cords.rename(columns={lat_col: 'cluster_lat', lon_col: 'cluster_lon'}, inplace=True)\n", + " df_combined = pd.merge(df, df_cords, on='household_id2')\n", + " df_combined.drop(['household_id2'], axis=1, inplace=True)\n", + " df_combined.dropna(inplace=True) # can't use na values\n", + " \n", + " df_clusters = df_combined.groupby(['cluster_lat', 'cluster_lon']).sum().reset_index()\n", + " df_clusters['cons_pc'] = df_clusters['cons_ph'] / df_clusters['pph'] # divides total cluster income by people\n", + " df_clusters['country'] = 'eth'\n", + " return df_clusters[['country', 'cluster_lat', 'cluster_lon', 'cons_pc']]\n", + "\n", + "def process_nigeria():\n", + " lsms_dir = os.path.join(COUNTRIES_DIR, 'nigeria_2015', 'LSMS')\n", + " consumption_file = 'cons_agg_wave3_visit1.csv'\n", + " consumption_pc_col = 'totcons' # per capita\n", + " hhsize_col = 'hhsize' # people in household\n", + "\n", + " geovariables_file = 'nga_householdgeovars_y3.csv'\n", + " lat_col = 'LAT_DD_MOD'\n", + " lon_col = 'LON_DD_MOD'\n", + "\n", + " # purchasing power parity for nigeria in 2015 (https://data.worldbank.org/indicator/PA.NUS.PRVT.PP?locations=NG)\n", + " ppp = 95.255\n", + " \n", + " for file in [consumption_file, geovariables_file]:\n", + " assert os.path.isfile(os.path.join(lsms_dir, 
file)), print(f'Could not find {file}')\n", + " \n", + " df = pd.read_csv(os.path.join(lsms_dir, consumption_file))\n", + " df['cons_ph'] = df[consumption_pc_col] * df[hhsize_col]\n", + " df['pph'] = df[hhsize_col]\n", + " df['cons_ph'] = df['cons_ph'] / ppp / 365\n", + " df = df[['hhid', 'cons_ph', 'pph']]\n", + "\n", + " df_geo = pd.read_csv(os.path.join(lsms_dir, geovariables_file))\n", + " df_cords = df_geo[['hhid', lat_col, lon_col]]\n", + " df_cords.rename(columns={lat_col: 'cluster_lat', lon_col: 'cluster_lon'}, inplace=True)\n", + " df_combined = pd.merge(df, df_cords, on='hhid')\n", + " df_combined.drop(['hhid'], axis=1, inplace=True)\n", + " df_combined.dropna(inplace=True) # can't use na values\n", + " \n", + " df_clusters = df_combined.groupby(['cluster_lat', 'cluster_lon']).sum().reset_index()\n", + " df_clusters['cons_pc'] = df_clusters['cons_ph'] / df_clusters['pph'] # divides total cluster income by people\n", + " df_clusters['country'] = 'ng'\n", + " return df_clusters[['country', 'cluster_lat', 'cluster_lon', 'cons_pc']]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "SnENheh-bpgA", + "outputId": "0fb9e549-ff70-4e85-a63a-0f1bf724355d" + }, + "source": [ + "df_mw = process_malawi()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.7/dist-packages/pandas/core/frame.py:4308: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n" + ], + "name": "stderr" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "YLPmDweFbpgg", + "outputId": "61022e01-56cc-4dcc-bb80-5e9d830a2600" + }, + 
"source": [ + "df_eth = process_ethiopia()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.7/dist-packages/pandas/core/frame.py:4308: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n" + ], + "name": "stderr" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "BYqikODMbpgh", + "outputId": "7165dacd-b00e-474a-884f-e815cbdcbf7d" + }, + "source": [ + "df_ng = process_nigeria()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.7/dist-packages/pandas/core/frame.py:4308: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n" + ], + "name": "stderr" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "z_0vdo9Sbpgj", + "outputId": "14120956-919e-4848-fdff-b92ef129e2c5" + }, + "source": [ + "df_mw.shape, df_eth.shape, df_ng.shape" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "((780, 4), (523, 4), (664, 4))" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 18 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "5shtFwIEbpgm" + }, + "source": [ + "tifs = [geoio.GeoImage(ndir) for ndir in NIGHTLIGHTS_DIRS]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "uqhnSwr1bpgo" + }, + "source": [ + "# loading both of 
these into memory requires A LOT of free memory (at least 4 gigs)\n", + "# using a swapfile of size 2 GB still did not fix my issues\n", + "# instead, I knew ahead of time the 0th tif is for Malawi, and the 1st tif is for Ethiopia and Nigeria\n", + "# I'll use this to only load one tif at a time\n", + "# thankfully, the countries did not span across two tifs\n", + "tif_array = np.squeeze(tifs[0].get_data())" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "C1Xqn-iKbpgo" + }, + "source": [ + "def add_nightlights(df, tif, tif_array):\n", + " ''' \n", + " This takes a dataframe with columns cluster_lat, cluster_lon and finds the average \n", + " nightlights in 2015 using a 10kmx10km box around the point\n", + " \n", + " Only the given nightlights tif is checked; if a cluster's box falls outside of it, a ValueError is raised\n", + " '''\n", + " cluster_nightlights = []\n", + " for i,r in df.iterrows():\n", + " min_lat, min_lon, max_lat, max_lon = create_space(r.cluster_lat, r.cluster_lon)\n", + " \n", + " xminPixel, ymaxPixel = tif.proj_to_raster(min_lon, min_lat)\n", + " xmaxPixel, yminPixel = tif.proj_to_raster(max_lon, max_lat)\n", + " assert xminPixel < xmaxPixel, print(r.cluster_lat, r.cluster_lon)\n", + " assert yminPixel < ymaxPixel, print(r.cluster_lat, r.cluster_lon)\n", + " if xminPixel < 0 or xmaxPixel >= tif_array.shape[1]:\n", + " print(f\"no match for {r.cluster_lat}, {r.cluster_lon}\")\n", + " raise ValueError()\n", + " elif yminPixel < 0 or ymaxPixel >= tif_array.shape[0]:\n", + " print(f\"no match for {r.cluster_lat}, {r.cluster_lon}\")\n", + " raise ValueError()\n", + " xminPixel, yminPixel, xmaxPixel, ymaxPixel = int(xminPixel), int(yminPixel), int(xmaxPixel), int(ymaxPixel)\n", + " cluster_nightlights.append(tif_array[yminPixel:ymaxPixel,xminPixel:xmaxPixel].mean())\n", + " \n", + " df['nightlights'] = cluster_nightlights" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": 
"code", + "metadata": { + "id": "llBn6PwlbphG" + }, + "source": [ + "add_nightlights(df_mw, tifs[0], tif_array)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "rjw2-jPpbphK", + "outputId": "1aa3db94-f477-4bb9-c615-2bef35fb1c2e" + }, + "source": [ + "del tif_array\n", + "import gc\n", + "gc.collect()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "795" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 24 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-WlyCWQxbphS", + "outputId": "f62fd991-e5b6-4bed-9528-7890db4afeca" + }, + "source": [ + "import psutil\n", + "psutil.virtual_memory()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "svmem(total=13624659968, available=11927236608, percent=12.5, used=5066326016, free=247844864, active=1919000576, inactive=11074854912, buffers=100814848, cached=8209674240, shared=1175552, slab=318877696)" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 25 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "_24EvyZibphT" + }, + "source": [ + "tif_array = np.squeeze(tifs[1].get_data())" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "YDxv2jnvbphT" + }, + "source": [ + "add_nightlights(df_eth, tifs[1], tif_array)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "RiY3p31cbphU" + }, + "source": [ + "add_nightlights(df_ng, tifs[1], tif_array)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 204 + }, + "id": "-iYX9cuMbphU", + "outputId": 
"6534303a-bb14-4366-9f39-9f432e04834e" + }, + "source": [ + "df_mw.head()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countrycluster_latcluster_loncons_pcnightlights
0mw-17.09515035.2172131.4232390.025206
1mw-17.09235135.1146431.2662040.000000
2mw-17.01669835.0796291.5668700.000000
3mw-16.97724335.2057061.6692450.008266
4mw-16.95638535.1689671.0898910.002295
\n", + "
" + ], + "text/plain": [ + " country cluster_lat cluster_lon cons_pc nightlights\n", + "0 mw -17.095150 35.217213 1.423239 0.025206\n", + "1 mw -17.092351 35.114643 1.266204 0.000000\n", + "2 mw -17.016698 35.079629 1.566870 0.000000\n", + "3 mw -16.977243 35.205706 1.669245 0.008266\n", + "4 mw -16.956385 35.168967 1.089891 0.002295" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 29 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 204 + }, + "id": "u6DQZKTPbphX", + "outputId": "d48983ef-b560-45c5-d5f6-7ec1b38c01a5" + }, + "source": [ + "df_eth.head()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countrycluster_latcluster_loncons_pcnightlights
0eth3.45570139.51599414.8546340.0
1eth3.54993739.18423414.3120220.0
2eth3.86424339.10136612.4701450.0
3eth3.98293138.4913688.3365390.0
4eth4.04819441.9309284.7625260.0
\n", + "
" + ], + "text/plain": [ + " country cluster_lat cluster_lon cons_pc nightlights\n", + "0 eth 3.455701 39.515994 14.854634 0.0\n", + "1 eth 3.549937 39.184234 14.312022 0.0\n", + "2 eth 3.864243 39.101366 12.470145 0.0\n", + "3 eth 3.982931 38.491368 8.336539 0.0\n", + "4 eth 4.048194 41.930928 4.762526 0.0" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 31 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 204 + }, + "id": "wVlRj71GbphY", + "outputId": "c5203111-bd2b-4f55-9f07-f37b74bb01e9" + }, + "source": [ + "df_ng.head()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countrycluster_latcluster_loncons_pcnightlights
0ng4.3157866.2687534.3177170.123354
1ng4.3287196.3081724.8807110.013713
2ng4.3984277.1839628.76725838.470989
3ng4.4251927.16693510.77450440.519035
4ng4.6193777.6849465.1913330.000000
\n", + "
" + ], + "text/plain": [ + " country cluster_lat cluster_lon cons_pc nightlights\n", + "0 ng 4.315786 6.268753 4.317717 0.123354\n", + "1 ng 4.328719 6.308172 4.880711 0.013713\n", + "2 ng 4.398427 7.183962 8.767258 38.470989\n", + "3 ng 4.425192 7.166935 10.774504 40.519035\n", + "4 ng 4.619377 7.684946 5.191333 0.000000" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 32 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "guLp-gR3bphY", + "outputId": "da603f30-583d-44b0-fa9f-fb5e4f2f706d" + }, + "source": [ + "df_mw['nightlights'].mean()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0.6038162894313969" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 33 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PbA7va6CbphZ", + "outputId": "71adfd2f-af0f-4b7e-e9c7-6e85c3d55d4a" + }, + "source": [ + "df_eth['nightlights'].mean()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0.6727544504689029" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 34 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "RRltkzEfbphZ", + "outputId": "212ed948-2b2d-4d0d-9a10-8accd1cff0ac" + }, + "source": [ + "df_ng['nightlights'].mean()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "1.6584013095848769" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 35 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 173 + }, + "id": "hlTR1aPEbpha", + "outputId": "7af5b849-bd5b-4b2c-f6f1-9e6e8205e85d" + }, + "source": [ + "df_mw.corr()" + ], + 
"execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cluster_latcluster_loncons_pcnightlights
cluster_lat1.000000-0.702793-0.026563-0.083273
cluster_lon-0.7027931.000000-0.002947-0.033367
cons_pc-0.026563-0.0029471.0000000.384939
nightlights-0.083273-0.0333670.3849391.000000
\n", + "
" + ], + "text/plain": [ + " cluster_lat cluster_lon cons_pc nightlights\n", + "cluster_lat 1.000000 -0.702793 -0.026563 -0.083273\n", + "cluster_lon -0.702793 1.000000 -0.002947 -0.033367\n", + "cons_pc -0.026563 -0.002947 1.000000 0.384939\n", + "nightlights -0.083273 -0.033367 0.384939 1.000000" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 36 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 173 + }, + "id": "RRgNzLNcbpha", + "outputId": "9083ad1e-0b07-4b8b-deab-1dba6b1b8437" + }, + "source": [ + "df_eth.corr()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cluster_latcluster_loncons_pcnightlights
cluster_lat1.0000000.157130-0.052673-0.006627
cluster_lon0.1571301.0000000.2118460.076039
cons_pc-0.0526730.2118461.0000000.343879
nightlights-0.0066270.0760390.3438791.000000
\n", + "
" + ], + "text/plain": [ + " cluster_lat cluster_lon cons_pc nightlights\n", + "cluster_lat 1.000000 0.157130 -0.052673 -0.006627\n", + "cluster_lon 0.157130 1.000000 0.211846 0.076039\n", + "cons_pc -0.052673 0.211846 1.000000 0.343879\n", + "nightlights -0.006627 0.076039 0.343879 1.000000" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 37 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 173 + }, + "id": "uMgDYNdebpha", + "outputId": "d6ae08c0-5912-40e4-e56f-d24b951732ca" + }, + "source": [ + "df_ng.corr()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cluster_latcluster_loncons_pcnightlights
cluster_lat1.0000000.330910-0.376356-0.188281
cluster_lon0.3309101.000000-0.246189-0.196671
cons_pc-0.376356-0.2461891.0000000.264398
nightlights-0.188281-0.1966710.2643981.000000
\n", + "
" + ], + "text/plain": [ + " cluster_lat cluster_lon cons_pc nightlights\n", + "cluster_lat 1.000000 0.330910 -0.376356 -0.188281\n", + "cluster_lon 0.330910 1.000000 -0.246189 -0.196671\n", + "cons_pc -0.376356 -0.246189 1.000000 0.264398\n", + "nightlights -0.188281 -0.196671 0.264398 1.000000" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 39 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "aVWtjMbhbphb" + }, + "source": [ + "for country in ['malawi_2016', 'ethiopia_2015', 'nigeria_2015']:\n", + " os.makedirs(os.path.join(COUNTRIES_DIR, country, 'processed'), exist_ok=True)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "fTsVoPIKbphb" + }, + "source": [ + "df_mw.to_csv(os.path.join(COUNTRIES_DIR, 'malawi_2016', 'processed/clusters.csv'), index=False)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "nbMXdSBRbphb" + }, + "source": [ + "df_eth.to_csv(os.path.join(COUNTRIES_DIR, 'ethiopia_2015', 'processed/clusters.csv'), index=False)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "T4CO_MvHbphb" + }, + "source": [ + "df_ng.to_csv(os.path.join(COUNTRIES_DIR, 'nigeria_2015', 'processed/clusters.csv'), index=False)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "AzQAGdaObphb" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/predicting-poverty-education-replication/scripts/train_cnn.ipynb b/predicting-poverty-education-replication/scripts/train_cnn.ipynb new file mode 100644 index 0000000..c87e48d --- /dev/null +++ b/predicting-poverty-education-replication/scripts/train_cnn.ipynb @@ -0,0 +1,1190 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + 
"import shutil\n", + "import numpy as np\n", + "import pandas as pd\n", + "from tqdm.notebook import tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "BASE_DIR = '..'\n", + "RANDOM_SEED = 7 # for reproducibility\n", + "COUNTRIES_DIR = os.path.join(BASE_DIR, 'data', 'countries')\n", + "PROCESSED_DIR = os.path.join(BASE_DIR, 'data', 'processed')\n", + "\n", + "# these relate to training the CNN to predict nightlights\n", + "CNN_TRAIN_IMAGE_DIR = os.path.join(BASE_DIR, 'data', 'cnn_images')\n", + "CNN_SAVE_DIR = os.path.join(BASE_DIR, 'models')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "os.makedirs(CNN_TRAIN_IMAGE_DIR, exist_ok=True)\n", + "os.makedirs(CNN_SAVE_DIR, exist_ok=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Preprocess\n", + "After doing this once, you can skip to the training if the script broke" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "actually downloaded: 25246, expected: 14500\n" + ] + } + ], + "source": [ + "df_download = pd.read_csv(os.path.join(PROCESSED_DIR, 'image_download_locs.csv'))\n", + "downloaded = os.listdir(os.path.join(COUNTRIES_DIR, 'malawi_2016', 'images')) + \\\n", + " os.listdir(os.path.join(COUNTRIES_DIR, 'ethiopia_2015', 'images')) + \\\n", + " os.listdir(os.path.join(COUNTRIES_DIR, 'nigeria_2015', 'images'))\n", + "\n", + "print(f\"actually downloaded: {len(downloaded)}, expected: {len(df_download)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
image_nameimage_latimage_loncluster_latcluster_loncons_pcnightlightscountrynightlights_bin
04.31578611574_6.223837135554024_4.31578611574_...4.3157866.2238374.3157866.2687534.3177170.123354ng1
14.330758037141992_6.223837135554024_4.31578611...4.3307586.2238374.3157866.2687534.3177170.123354ng1
24.285842272936016_6.238809056956016_4.31578611...4.2858426.2388094.3157866.2687534.3177170.123354ng1
34.270870351534024_6.253780978358008_4.31578611...4.2708706.2537814.3157866.2687534.3177170.123354ng1
44.345729958543984_6.253780978358008_4.31578611...4.3457306.2537814.3157866.2687534.3177170.123354ng1
\n", + "
" + ], + "text/plain": [ + " image_name image_lat image_lon \\\n", + "0 4.31578611574_6.223837135554024_4.31578611574_... 4.315786 6.223837 \n", + "1 4.330758037141992_6.223837135554024_4.31578611... 4.330758 6.223837 \n", + "2 4.285842272936016_6.238809056956016_4.31578611... 4.285842 6.238809 \n", + "3 4.270870351534024_6.253780978358008_4.31578611... 4.270870 6.253781 \n", + "4 4.345729958543984_6.253780978358008_4.31578611... 4.345730 6.253781 \n", + "\n", + " cluster_lat cluster_lon cons_pc nightlights country nightlights_bin \n", + "0 4.315786 6.268753 4.317717 0.123354 ng 1 \n", + "1 4.315786 6.268753 4.317717 0.123354 ng 1 \n", + "2 4.315786 6.268753 4.317717 0.123354 ng 1 \n", + "3 4.315786 6.268753 4.317717 0.123354 ng 1 \n", + "4 4.315786 6.268753 4.317717 0.123354 ng 1 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_download.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "df_download['row'] = np.arange(len(df_download))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "\"['-13.642146764205977_34.846897078598005_-13.597231_34.861869.png'\\n '-15.84688_35.071951921402_-15.84688_35.05698.png'\\n '-11.884834921401993_34.132379_-11.869863_34.132379.png' ...\\n '-11.305565235794024_33.48830884280398_-11.350481_33.458365.png'\\n '.DS_Store' '.DS_Store'] not found in axis\"", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0midx_not_download\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mdf_download\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'image_name'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdownloaded\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'row'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtolist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mdf_download\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0midx_not_download\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36mdrop\u001b[0;34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[0m\n\u001b[1;32m 4313\u001b[0m \u001b[0mlevel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlevel\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4314\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minplace\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4315\u001b[0;31m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4316\u001b[0m )\n\u001b[1;32m 4317\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36mdrop\u001b[0;34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[0m\n\u001b[1;32m 4151\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mlabels\u001b[0m \u001b[0;32min\u001b[0m \u001b[0maxes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4152\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlabels\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4153\u001b[0;31m \u001b[0mobj\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_drop_axis\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlevel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlevel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4154\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4155\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m_drop_axis\u001b[0;34m(self, labels, axis, level, errors)\u001b[0m\n\u001b[1;32m 4186\u001b[0m \u001b[0mnew_axis\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlevel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlevel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4187\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4188\u001b[0;31m \u001b[0mnew_axis\u001b[0m 
\u001b[0;34m=\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4189\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreindex\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0maxis_name\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mnew_axis\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4190\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mdrop\u001b[0;34m(self, labels, errors)\u001b[0m\n\u001b[1;32m 5589\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmask\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0many\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5590\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0merrors\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;34m\"ignore\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5591\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"{labels[mask]} not found in axis\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5592\u001b[0m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mindexer\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m~\u001b[0m\u001b[0mmask\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5593\u001b[0m \u001b[0;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdelete\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: \"['-13.642146764205977_34.846897078598005_-13.597231_34.861869.png'\\n '-15.84688_35.071951921402_-15.84688_35.05698.png'\\n '-11.884834921401993_34.132379_-11.869863_34.132379.png' ...\\n '-11.305565235794024_33.48830884280398_-11.350481_33.458365.png'\\n '.DS_Store' '.DS_Store'] not found in axis\"" + ] + } + ], + "source": [ + "idx_not_download = df_download.set_index('image_name').drop(downloaded)['row'].values.tolist()\n", + "df_download.drop(idx_not_download, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "df_download.drop('row', axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(0.41379310344827586, 0.3586206896551724, 0.22758620689655173)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# the distribution\n", + "(df_download['nightlights_bin']==0).mean(), (df_download['nightlights_bin']==1).mean(), (df_download['nightlights_bin']==2).mean()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Split images into train/valid.\n", + "Each cluster will contribute 80% of images for training, and 20% for validation." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "df_download.reset_index(drop=True, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
image_nameimage_latimage_loncluster_latcluster_loncons_pcnightlightscountrynightlights_bin
04.31578611574_6.223837135554024_4.31578611574_...4.3157866.2238374.3157866.2687534.3177170.123354ng1
14.330758037141992_6.223837135554024_4.31578611...4.3307586.2238374.3157866.2687534.3177170.123354ng1
24.285842272936016_6.238809056956016_4.31578611...4.2858426.2388094.3157866.2687534.3177170.123354ng1
34.270870351534024_6.253780978358008_4.31578611...4.2708706.2537814.3157866.2687534.3177170.123354ng1
44.345729958543984_6.253780978358008_4.31578611...4.3457306.2537814.3157866.2687534.3177170.123354ng1
\n", + "
" + ], + "text/plain": [ + " image_name image_lat image_lon \\\n", + "0 4.31578611574_6.223837135554024_4.31578611574_... 4.315786 6.223837 \n", + "1 4.330758037141992_6.223837135554024_4.31578611... 4.330758 6.223837 \n", + "2 4.285842272936016_6.238809056956016_4.31578611... 4.285842 6.238809 \n", + "3 4.270870351534024_6.253780978358008_4.31578611... 4.270870 6.253781 \n", + "4 4.345729958543984_6.253780978358008_4.31578611... 4.345730 6.253781 \n", + "\n", + " cluster_lat cluster_lon cons_pc nightlights country nightlights_bin \n", + "0 4.315786 6.268753 4.317717 0.123354 ng 1 \n", + "1 4.315786 6.268753 4.317717 0.123354 ng 1 \n", + "2 4.315786 6.268753 4.317717 0.123354 ng 1 \n", + "3 4.315786 6.268753 4.317717 0.123354 ng 1 \n", + "4 4.315786 6.268753 4.317717 0.123354 ng 1 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_download.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "df_download['is_train'] = True" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/pandas/core/indexing.py:1637: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " self._setitem_single_block(indexer, value, name)\n" + ] + } + ], + "source": [ + "np.random.seed(RANDOM_SEED)\n", + "groups = df_download.groupby(['cluster_lat', 'cluster_lon'])\n", + "for _, g in groups:\n", + " n_ims = len(g)\n", + " n_train = int(0.8 * n_ims)\n", + " n_valid = n_ims - n_train\n", + " valid_choices = np.random.choice(np.arange(n_ims), replace=False, size=n_valid).tolist()\n", + " current_index = 
g.index\n", + " idx_valid = current_index[valid_choices]\n", + " df_download['is_train'].loc[idx_valid] = False" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.7978620689655173" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_download['is_train'].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# save this new dataframe\n", + "df_download.to_csv(os.path.join(PROCESSED_DIR, 'image_download_actual.csv'), index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "os.makedirs(os.path.join(CNN_TRAIN_IMAGE_DIR, 'train'), exist_ok=False)\n", + "os.makedirs(os.path.join(CNN_TRAIN_IMAGE_DIR, 'valid'), exist_ok=False)\n", + "\n", + "labels = ['0', '1', '2']\n", + "for l in labels:\n", + " os.makedirs(os.path.join(CNN_TRAIN_IMAGE_DIR, 'train', l), exist_ok=False)\n", + " os.makedirs(os.path.join(CNN_TRAIN_IMAGE_DIR, 'valid', l), exist_ok=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "t = df_download[df_download['is_train']]\n", + "v = df_download[~df_download['is_train']]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(11569, 2931)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(t), len(v)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "copying train images\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c73db387267d48fc8812f72932669dda", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| 
| 0/11569 [00:00 10:\n", + " # fine tune whole model\n", + " for param in model_ft.parameters():\n", + " param.requires_grad = True\n", + " optimizer = optim.SGD(model_ft.parameters(), lr=1e-4, momentum=0.9)\n", + "\n", + " # Each epoch has a training and validation phase\n", + " for phase in ['train', 'valid']:\n", + " if phase == 'train':\n", + " model.train() # Set model to training mode\n", + " else:\n", + " model.eval() # Set model to evaluate mode\n", + "\n", + " running_loss = 0.0\n", + " running_corrects = 0\n", + "\n", + " # Iterate over data.\n", + " for inputs, labels in tqdm(dataloaders[phase]):\n", + " inputs = inputs.to(device)\n", + " labels = labels.to(device)\n", + "\n", + " # zero the parameter gradients\n", + " optimizer.zero_grad()\n", + "\n", + " # forward\n", + " # track history if only in train\n", + " with torch.set_grad_enabled(phase == 'train'):\n", + " outputs = model(inputs)\n", + " loss = criterion(outputs, labels)\n", + "\n", + " _, preds = torch.max(outputs, 1)\n", + "\n", + " # backward + optimize only if in training phase\n", + " if phase == 'train':\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " # statistics\n", + " running_loss += loss.item() * inputs.size(0)\n", + " running_corrects += torch.sum(preds == labels.data)\n", + "\n", + " epoch_loss = running_loss / len(dataloaders[phase].dataset)\n", + " epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)\n", + "\n", + " print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))\n", + "\n", + " # deep copy the model\n", + " if phase == 'valid' and epoch_acc > best_acc:\n", + " best_acc = epoch_acc\n", + " best_model_wts = copy.deepcopy(model.state_dict())\n", + " if phase == 'valid':\n", + " val_acc_history.append(epoch_acc)\n", + " \n", + " print()\n", + "\n", + " time_elapsed = time.time() - since\n", + " print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))\n", + " print('Best val Acc: 
{:4f}'.format(best_acc))\n", + "\n", + " # load best model weights\n", + " model.load_state_dict(best_model_wts)\n", + " return model, val_acc_history" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 0/19\n", + "----------\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "98513fb13ff84100b70ad72b4ebc3a18", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1447 [00:00\n data = [self.dataset[idx] for idx in possibly_batched_index]\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torchvision/datasets/folder.py\", line 138, in __getitem__\n sample = self.loader(path)\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torchvision/datasets/folder.py\", line 174, in default_loader\n return pil_loader(path)\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torchvision/datasets/folder.py\", line 155, in pil_loader\n with open(path, 'rb') as f:\nFileNotFoundError: [Errno 2] No such file or directory: '/Users/TinotendaMatsika/Documents/predicting-poverty-education-replication/data/cnn_images/train/1/5.577058854163986_5.771977164594023_5.547115011360002_5.8168929288.png'\n", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;31m# Train and evaluate\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mmodel_ft\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhist\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mtrain_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel_ft\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdataloaders_dict\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcriterion\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moptimizer_ft\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnum_epochs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnum_epochs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in \u001b[0;36mtrain_model\u001b[0;34m(model, dataloaders, criterion, optimizer, num_epochs)\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[0;31m# Iterate over data.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 29\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataloaders\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mphase\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 30\u001b[0m \u001b[0minputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[0mlabels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlabels\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/tqdm/notebook.py\u001b[0m in \u001b[0;36m__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 252\u001b[0m \u001b[0;32mdef\u001b[0m 
\u001b[0m__iter__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 253\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 254\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mobj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtqdm_notebook\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__iter__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 255\u001b[0m \u001b[0;31m# return super(tqdm...) will not catch exception\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 256\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/tqdm/std.py\u001b[0m in \u001b[0;36m__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1176\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1177\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1178\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mobj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0miterable\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1179\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1180\u001b[0m \u001b[0;31m# Update and possibly print the progressbar.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/utils/data/dataloader.py\u001b[0m in 
\u001b[0;36m__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 817\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 818\u001b[0m \u001b[0;32mdel\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_task_info\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 819\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_process_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 820\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 821\u001b[0m \u001b[0mnext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m__next__\u001b[0m \u001b[0;31m# Python 2 compatibility\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/utils/data/dataloader.py\u001b[0m in \u001b[0;36m_process_data\u001b[0;34m(self, data)\u001b[0m\n\u001b[1;32m 844\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_try_put_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 845\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mExceptionWrapper\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 846\u001b[0;31m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreraise\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 847\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 848\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + 
"\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/_utils.py\u001b[0m in \u001b[0;36mreraise\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 383\u001b[0m \u001b[0;31m# (https://bugs.python.org/issue2651), so we work around it.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 384\u001b[0m \u001b[0mmsg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mKeyErrorMessage\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 385\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexc_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m: Caught FileNotFoundError in DataLoader worker process 0.\nOriginal Traceback (most recent call last):\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py\", line 178, in _worker_loop\n data = fetcher.fetch(index)\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py\", line 44, in fetch\n data = [self.dataset[idx] for idx in possibly_batched_index]\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py\", line 44, in \n data = [self.dataset[idx] for idx in possibly_batched_index]\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torchvision/datasets/folder.py\", line 138, in __getitem__\n sample = self.loader(path)\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torchvision/datasets/folder.py\", line 174, in default_loader\n return pil_loader(path)\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torchvision/datasets/folder.py\", line 155, in pil_loader\n with open(path, 'rb') as f:\nFileNotFoundError: [Errno 2] No such file or directory: 
'/Users/TinotendaMatsika/Documents/predicting-poverty-education-replication/data/cnn_images/train/1/5.577058854163986_5.771977164594023_5.547115011360002_5.8168929288.png'\n" + ] + } + ], + "source": [ + "# Setup the loss fxn\n", + "criterion = nn.CrossEntropyLoss()\n", + "\n", + "# Train and evaluate\n", + "model_ft, hist = train_model(model_ft, dataloaders_dict, criterion, optimizer_ft, num_epochs=num_epochs)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "A model is already saved at this location\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "None", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mpath\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mCNN_SAVE_DIR\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'trained_model.pt'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misfile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'A model is already saved at this location'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf'Saving model to 
{path}'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel_ft\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mAssertionError\u001b[0m: None" + ] + } + ], + "source": [ + "path = os.path.join(CNN_SAVE_DIR, 'trained_model.pt')\n", + "assert not os.path.isfile(path), print('A model is already saved at this location')\n", + "print(f'Saving model to {path}')\n", + "torch.save(model_ft, path)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2fe8181ce2654062ba50450b26a2f647", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1447 [00:00\n data = [self.dataset[idx] for idx in possibly_batched_index]\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torchvision/datasets/folder.py\", line 138, in __getitem__\n sample = self.loader(path)\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torchvision/datasets/folder.py\", line 174, in default_loader\n return pil_loader(path)\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torchvision/datasets/folder.py\", line 155, in pil_loader\n with open(path, 'rb') as f:\nFileNotFoundError: [Errno 2] No such file or directory: '/Users/TinotendaMatsika/Documents/predicting-poverty-education-replication/data/cnn_images/train/0/10.271252764598009_9.682578659135977_10.286224686_9.637662894930001.png'\n", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in 
\u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;31m# Iterate over data.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataloaders_dict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'train'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0minputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0mlabels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlabels\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/tqdm/notebook.py\u001b[0m in \u001b[0;36m__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 252\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__iter__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 253\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 254\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mobj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtqdm_notebook\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__iter__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 255\u001b[0m \u001b[0;31m# return super(tqdm...) will not catch exception\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 256\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/tqdm/std.py\u001b[0m in \u001b[0;36m__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1176\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1177\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1178\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mobj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0miterable\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1179\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1180\u001b[0m \u001b[0;31m# Update and possibly print the progressbar.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/utils/data/dataloader.py\u001b[0m in \u001b[0;36m__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 817\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 818\u001b[0m \u001b[0;32mdel\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_task_info\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 819\u001b[0;31m \u001b[0;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_process_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 820\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 821\u001b[0m \u001b[0mnext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m__next__\u001b[0m \u001b[0;31m# Python 2 compatibility\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/utils/data/dataloader.py\u001b[0m in \u001b[0;36m_process_data\u001b[0;34m(self, data)\u001b[0m\n\u001b[1;32m 844\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_try_put_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 845\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mExceptionWrapper\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 846\u001b[0;31m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreraise\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 847\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 848\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/_utils.py\u001b[0m in \u001b[0;36mreraise\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 383\u001b[0m \u001b[0;31m# (https://bugs.python.org/issue2651), so we work around it.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 384\u001b[0m \u001b[0mmsg\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mKeyErrorMessage\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 385\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexc_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m: Caught FileNotFoundError in DataLoader worker process 0.\nOriginal Traceback (most recent call last):\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py\", line 178, in _worker_loop\n data = fetcher.fetch(index)\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py\", line 44, in fetch\n data = [self.dataset[idx] for idx in possibly_batched_index]\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py\", line 44, in \n data = [self.dataset[idx] for idx in possibly_batched_index]\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torchvision/datasets/folder.py\", line 138, in __getitem__\n sample = self.loader(path)\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torchvision/datasets/folder.py\", line 174, in default_loader\n return pil_loader(path)\n File \"/opt/anaconda3/envs/testenv/lib/python3.7/site-packages/torchvision/datasets/folder.py\", line 155, in pil_loader\n with open(path, 'rb') as f:\nFileNotFoundError: [Errno 2] No such file or directory: '/Users/TinotendaMatsika/Documents/predicting-poverty-education-replication/data/cnn_images/train/0/10.271252764598009_9.682578659135977_10.286224686_9.637662894930001.png'\n" + ] + } + ], + "source": [ + "# you can run below if you want to see the final accuracy on nightlights over the train set\n", + "model_ft.eval() # Set model to evaluate mode\n", + "\n", + "criterion = nn.CrossEntropyLoss()\n", + 
"running_loss = 0.0\n", + "running_corrects = 0\n", + "total = 0\n", + "\n", + "# Iterate over data.\n", + "for inputs, labels in tqdm(dataloaders_dict['train']):\n", + " inputs = inputs.to(device)\n", + " labels = labels.to(device)\n", + "\n", + " # forward\n", + " # track history if only in train\n", + " with torch.set_grad_enabled(False):\n", + " outputs = model_ft(inputs)\n", + " loss = criterion(outputs, labels)\n", + "\n", + " _, preds = torch.max(outputs, 1)\n", + "\n", + " # statistics\n", + " running_loss += loss.item() * inputs.size(0)\n", + " running_corrects += torch.sum(preds == labels.data)\n", + " \n", + " total += len(preds)\n", + " \n", + "print(running_corrects.double()/total)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "environment": { + "name": "pytorch-gpu.1-4.m46", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-4:m46" + }, + "kernelspec": { + "display_name": "testenv", + "language": "python", + "name": "testenv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/predicting-poverty-education-replication/scripts/use_paper_model/caffemodel2pytorch/README.md b/predicting-poverty-education-replication/scripts/use_paper_model/caffemodel2pytorch/README.md new file mode 100644 index 0000000..93abe9a --- /dev/null +++ b/predicting-poverty-education-replication/scripts/use_paper_model/caffemodel2pytorch/README.md @@ -0,0 +1,348 @@ +This converter 
can be useful for porting Caffe code and layers to PyTorch. Features: +* dump caffemodel weights to hdf5, npy, pt and json formats +* load Caffe models and use them from PyTorch +* mock the PyCaffe API to allow for smooth porting of Caffe-using code (a drop-in script for [OICR](https://github.com/ppengtang/oicr) that changes the backend in train/eval to PyTorch is below): + * Net, Blob, SGDSolver +* wrapping Caffe's Python layers (see the OICR example) +* example of ROI pooling in PyTorch without manual CUDA code compilation (see the OICR example) + +The layer support isn't as complete as in https://github.com/marvis/pytorch-caffe. Currently it supports the following Caffe layers: +* convolution (num_output, kernel_size, stride, pad, dilation; constant and gaussian weight/bias fillers) +* inner_product (num_output; constant and gaussian weight/bias fillers) +* max / avg pooling (kernel_size, stride, pad) +* relu +* dropout (dropout_ratio) +* eltwise (prod, sum, max) +* softmax (axis) +* local response norm (local_size, alpha, beta) + +Dependencies: protobuf with Python bindings, including the `protoc` binary in `PATH`. + +PRs to enable other layers or layer params are very welcome (see the definition of the `modules` dictionary in the code)! + +The license is MIT. 
+ +## Dump weights to PT or HDF5 +```shell +# prototxt and caffemodel from https://gist.github.com/ksimonyan/211839e770f7b538e2d8#file-readme-md + +# dumps to PT by default to VGG_ILSVRC_16_layers.caffemodel.pt +python -m caffemodel2pytorch VGG_ILSVRC_16_layers.caffemodel + +# dumps to HDF5 converted.h5 +python -m caffemodel2pytorch VGG_ILSVRC_16_layers.caffemodel -o converted.h5 +``` + +```python +# load dumped VGG16 in PyTorch +import collections, torch, torchvision, numpy, h5py +model = torchvision.models.vgg16() +model.features = torch.nn.Sequential(collections.OrderedDict(zip(['conv1_1', 'relu1_1', 'conv1_2', 'relu1_2', 'pool1', 'conv2_1', 'relu2_1', 'conv2_2', 'relu2_2', 'pool2', 'conv3_1', 'relu3_1', 'conv3_2', 'relu3_2', 'conv3_3', 'relu3_3', 'pool3', 'conv4_1', 'relu4_1', 'conv4_2', 'relu4_2', 'conv4_3', 'relu4_3', 'pool4', 'conv5_1', 'relu5_1', 'conv5_2', 'relu5_2', 'conv5_3', 'relu5_3', 'pool5'], model.features))) +model.classifier = torch.nn.Sequential(collections.OrderedDict(zip(['fc6', 'relu6', 'drop6', 'fc7', 'relu7', 'drop7', 'fc8'], model.classifier))) + +state_dict = h5py.File('converted.h5', 'r') # torch.load('VGG_ILSVRC_16_layers.caffemodel.pt') +model.load_state_dict({l : torch.from_numpy(numpy.array(v)).view_as(p) for k, v in state_dict.items() for l, p in model.named_parameters() if k in l}) +``` + +## Run Caffe models using PyTorch as backend +```python +import torch +import caffemodel2pytorch + +model = caffemodel2pytorch.Net( + prototxt = 'VGG_ILSVRC_16_layers_deploy.prototxt', + weights = 'VGG_ILSVRC_16_layers.caffemodel', + caffe_proto = 'https://raw.githubusercontent.com/BVLC/caffe/master/src/caffe/proto/caffe.proto' +) +model.cuda() +model.eval() +torch.set_grad_enabled(False) + +# make sure to have right procedure of image normalization and channel reordering +image = torch.autograd.Variable(torch.Tensor(8, 3, 224, 224).cuda()) + +# outputs dict of PyTorch Variables +# in this example the dict contains the only key "prob" 
+#output_dict = model(data = image) + +# you can remove unneeded layers: +del model.prob +del model.fc8 + +# a single input variable is interpreted as an input blob named "data" +# in this example the dict contains the only key "fc7" +output_dict = model(image) +``` + +## Imitate pycaffe interface to help in porting + +```python +import numpy as np +import caffemodel2pytorch as caffe + +caffe.set_mode_gpu() +caffe.set_device(0) + +# === LOADING AND USING THE NET IN EVAL MODE === + +net = caffe.Net('VGG_ILSVRC_16_layers_deploy.prototxt', caffe.TEST, weights = 'VGG_ILSVRC_16_layers.caffemodel') + +# outputs a dict of NumPy arrays, data layer is sidestepped +blobs_out = net.forward(data = np.zeros((8, 3, 224, 224), dtype = np.float32)) + +# access the last layer +layer = net.layers[-1] + +# converts and provides the output as NumPy array +numpy_array = net.blobs['conv1_1'].data + +# access the loss weights +loss_weights = net.blob_loss_weights + +# === BASIC OPTIMIZER === + +# this example uses paths from https://github.com/ppengtang/oicr + +# create an SGD solver, loads the net in train mode +# it knows about base_lr, weight_decay, momentum, lr_mult, decay_mult, iter_size, lr policy step, step_size, gamma +# it finds train.prototxt from the solver.prototxt's train_net or net parameters +solver = caffe.SGDSolver('oicr/models/VGG16/solver.prototxt') + +# load pretrained weights +solver.net.copy_from('oicr/data/imagenet_models/VGG16.v2.caffemodel') + +# runs one iteration of forward, backward, optimization; returns a float loss value +# data layer must be registered or inputs must be provided as keyword arguments +loss = solver.step(1) +``` + +## Drop-in script for OICR enabling PyTorch as backend for eval and training +Place `caffe_pytorch_oicr.py` and `caffemodel2pytorch.py` in the root `oicr` directory. 
To use the PyTorch backend in testing and in training, put a line `import caffe_pytorch_oicr` at the very top (before `import _init_paths`) in `tools/test_net.py` and `tools/train_net.py` respectively. It requires PyTorch and CuPy (for on-the-fly CUDA kernel compilation). + +```python +# caffe_pytorch_oicr.py + +import collections +import torch +import torch.nn.functional as F +import cupy +import caffemodel2pytorch + +caffemodel2pytorch.initialize('./caffe-oicr/src/caffe/proto/caffe.proto') # needs to be called explicitly for these porting scenarios to enable caffe.proto.caffe_pb2 variable +caffemodel2pytorch.set_mode_gpu() +caffemodel2pytorch.modules['GlobalSumPooling'] = lambda param: lambda pred: pred.sum(dim = 0, keepdim = True) +caffemodel2pytorch.modules['MulticlassCrossEntropyLoss'] = lambda param: lambda pred, labels, eps = 1e-6: F.binary_cross_entropy(pred.clamp(eps, 1 - eps), labels) +caffemodel2pytorch.modules['data'] = lambda param: __import__('roi_data_layer.layer').layer.RoIDataLayer() # wrapping a PyCaffe layer +caffemodel2pytorch.modules['OICRLayer'] = lambda param: OICRLayer # wrapping a PyTorch function +caffemodel2pytorch.modules['WeightedSoftmaxWithLoss'] = lambda param: WeightedSoftmaxWithLoss +caffemodel2pytorch.modules['ReLU'] = lambda param: torch.nn.ReLU(inplace = True) # wrapping a PyTorch module +caffemodel2pytorch.modules['ROIPooling'] = lambda param: lambda input, rois: RoiPooling(param['pooled_h'], param['pooled_w'], param['spatial_scale'])(input, rois) # wrapping a PyTorch autograd function + +def WeightedSoftmaxWithLoss(prob, labels_ic, cls_loss_weights, eps = 1e-12): + loss = -cls_loss_weights * F.log_softmax(prob, dim = -1).gather(-1, labels_ic.long().unsqueeze(-1)).squeeze(-1) + valid_sum = cls_loss_weights.gt(eps).float().sum() + return loss.sum() / (loss.numel() if valid_sum == 0 else valid_sum) + +def OICRLayer(boxes, cls_prob, im_labels, cfg_TRAIN_FG_THRESH = 0.5): + cls_prob = (cls_prob if cls_prob.size(-1) == 
im_labels.size(-1) else cls_prob[..., 1:]).clone() + boxes = boxes[..., 1:] + gt_boxes, gt_classes, gt_scores = [], [], [] + for i in im_labels.eq(1).nonzero()[:, 1]: + max_index = int(cls_prob[:, i].max(dim = 0)[1]) + gt_boxes.append(boxes[max_index]) + gt_classes.append(int(i) + 1) + gt_scores.append(float(cls_prob[max_index, i])) + cls_prob[max_index] = 0 + max_overlaps, gt_assignment = overlap(boxes, torch.stack(gt_boxes)).max(dim = 1) + return gt_assignment.new(gt_classes)[gt_assignment] * (max_overlaps > cfg_TRAIN_FG_THRESH).type_as(gt_assignment), max_overlaps.new(gt_scores)[gt_assignment] + +class RoiPooling(torch.autograd.Function): + CUDA_NUM_THREADS = 1024 + GET_BLOCKS = staticmethod(lambda N: (N + RoiPooling.CUDA_NUM_THREADS - 1) // RoiPooling.CUDA_NUM_THREADS) + Stream = collections.namedtuple('Stream', ['ptr']) + + kernel_forward = b''' + #define FLT_MAX 340282346638528859811704183484516925440.0f + typedef float Dtype; + #define CUDA_KERNEL_LOOP(i, n) for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) + extern "C" + __global__ void ROIPoolForward(const int nthreads, const Dtype* bottom_data, + const Dtype spatial_scale, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const Dtype* bottom_rois, Dtype* top_data, int* argmax_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + bottom_rois += n * 5; + int roi_batch_ind = bottom_rois[0]; + int roi_start_w = round(bottom_rois[1] * spatial_scale); + int roi_start_h = round(bottom_rois[2] * spatial_scale); + int roi_end_w = round(bottom_rois[3] * spatial_scale); + int roi_end_h = round(bottom_rois[4] * spatial_scale); + + // Force malformed ROIs to be 1x1 + int 
roi_width = max(roi_end_w - roi_start_w + 1, 1); + int roi_height = max(roi_end_h - roi_start_h + 1, 1); + Dtype bin_size_h = static_cast<Dtype>(roi_height) + / static_cast<Dtype>(pooled_height); + Dtype bin_size_w = static_cast<Dtype>(roi_width) + / static_cast<Dtype>(pooled_width); + + int hstart = static_cast<int>(floor(static_cast<Dtype>(ph) + * bin_size_h)); + int wstart = static_cast<int>(floor(static_cast<Dtype>(pw) + * bin_size_w)); + int hend = static_cast<int>(ceil(static_cast<Dtype>(ph + 1) + * bin_size_h)); + int wend = static_cast<int>(ceil(static_cast<Dtype>(pw + 1) + * bin_size_w)); + + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart + roi_start_h, 0), height); + hend = min(max(hend + roi_start_h, 0), height); + wstart = min(max(wstart + roi_start_w, 0), width); + wend = min(max(wend + roi_start_w, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + // Define an empty pooling region to be zero + Dtype maxval = is_empty ? 0 : -FLT_MAX; + // If nothing is pooled, argmax = -1 causes nothing to be backprop'd + int maxidx = -1; + bottom_data += (roi_batch_ind * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int bottom_index = h * width + w; + if (bottom_data[bottom_index] > maxval) { + maxval = bottom_data[bottom_index]; + maxidx = bottom_index; + } + } + } + top_data[index] = maxval; + argmax_data[index] = maxidx; + } + } + ''' + + kernel_backward = b''' + typedef float Dtype; + #define CUDA_KERNEL_LOOP(i, n) for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) + extern "C" + __global__ void ROIPoolBackward(const int nthreads, const Dtype* top_diff, + const int* argmax_data, const int num_rois, const Dtype spatial_scale, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, Dtype* bottom_diff, + const Dtype* bottom_rois) { + CUDA_KERNEL_LOOP(index, nthreads) { + // (n, c, h, w) coords in bottom data + int w = index % width; + 
int h = (index / width) % height; + int c = (index / width / height) % channels; + int n = index / width / height / channels; + + Dtype gradient = 0; + // Accumulate gradient over all ROIs that pooled this element + for (int roi_n = 0; roi_n < num_rois; ++roi_n) { + const Dtype* offset_bottom_rois = bottom_rois + roi_n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + // Skip if ROI's batch index doesn't match n + if (n != roi_batch_ind) { + continue; + } + + int roi_start_w = round(offset_bottom_rois[1] * spatial_scale); + int roi_start_h = round(offset_bottom_rois[2] * spatial_scale); + int roi_end_w = round(offset_bottom_rois[3] * spatial_scale); + int roi_end_h = round(offset_bottom_rois[4] * spatial_scale); + + // Skip if ROI doesn't include (h, w) + const bool in_roi = (w >= roi_start_w && w <= roi_end_w && + h >= roi_start_h && h <= roi_end_h); + if (!in_roi) { + continue; + } + + int offset = (roi_n * channels + c) * pooled_height * pooled_width; + const Dtype* offset_top_diff = top_diff + offset; + const int* offset_argmax_data = argmax_data + offset; + + // Compute feasible set of pooled units that could have pooled + // this bottom unit + + // Force malformed ROIs to be 1x1 + int roi_width = max(roi_end_w - roi_start_w + 1, 1); + int roi_height = max(roi_end_h - roi_start_h + 1, 1); + + Dtype bin_size_h = static_cast(roi_height) + / static_cast(pooled_height); + Dtype bin_size_w = static_cast(roi_width) + / static_cast(pooled_width); + + int phstart = floor(static_cast(h - roi_start_h) / bin_size_h); + int phend = ceil(static_cast(h - roi_start_h + 1) / bin_size_h); + int pwstart = floor(static_cast(w - roi_start_w) / bin_size_w); + int pwend = ceil(static_cast(w - roi_start_w + 1) / bin_size_w); + + phstart = min(max(phstart, 0), pooled_height); + phend = min(max(phend, 0), pooled_height); + pwstart = min(max(pwstart, 0), pooled_width); + pwend = min(max(pwend, 0), pooled_width); + + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; 
pw < pwend; ++pw) { + if (offset_argmax_data[ph * pooled_width + pw] == (h * width + w)) { + gradient += offset_top_diff[ph * pooled_width + pw]; + } + } + } + } + bottom_diff[index] = gradient; + } + } + ''' + cupy_init = cupy.array([]) + compiled_forward = cupy.cuda.compiler.compile_with_cache(kernel_forward).get_function('ROIPoolForward') + compiled_backward = cupy.cuda.compiler.compile_with_cache(kernel_backward).get_function('ROIPoolBackward') + + def __init__(self, pooled_height, pooled_width, spatial_scale): + self.pooled_height = pooled_height + self.pooled_width = pooled_width + self.spatial_scale = spatial_scale + + def forward(self, images, rois): + output = torch.cuda.FloatTensor(len(rois), images.size(1) * self.pooled_height * self.pooled_width) + self.argmax = torch.cuda.IntTensor(output.size()).fill_(-1) + self.input_size = images.size() + self.save_for_backward(rois) + RoiPooling.compiled_forward(grid = (RoiPooling.GET_BLOCKS(output.numel()), 1, 1), block = (RoiPooling.CUDA_NUM_THREADS, 1, 1), args=[ + output.numel(), images.data_ptr(), cupy.float32(self.spatial_scale), self.input_size[-3], self.input_size[-2], self.input_size[-1], + self.pooled_height, self.pooled_width, rois.data_ptr(), output.data_ptr(), self.argmax.data_ptr() + ], stream=RoiPooling.Stream(ptr=torch.cuda.current_stream().cuda_stream)) + return output + + def backward(self, grad_output): + rois, = self.saved_tensors + grad_input = torch.cuda.FloatTensor(*self.input_size).zero_() + RoiPooling.compiled_backward(grid = (RoiPooling.GET_BLOCKS(grad_input.numel()), 1, 1), block = (RoiPooling.CUDA_NUM_THREADS, 1, 1), args=[ + grad_input.numel(), grad_output.data_ptr(), self.argmax.data_ptr(), len(rois), cupy.float32(self.spatial_scale), self.input_size[-3], + self.input_size[-2], self.input_size[-1], self.pooled_height, self.pooled_width, grad_input.data_ptr(), rois.data_ptr() + ], stream=RoiPooling.Stream(ptr=torch.cuda.current_stream().cuda_stream)) + return grad_input, None + +def 
def overlap(box1, box2):
    """Return the pairwise intersection-over-union of two box sets.

    Both arguments are 2-D tensors whose rows are ``(x1, y1, x2, y2)``; the
    result has one row per box in ``box1`` and one column per box in ``box2``.
    """
    cols1 = box1.t().contiguous()
    cols2 = box2.t().contiguous()
    # Corners of the intersection rectangle for every (box1, box2) pair.
    left = torch.max(cols1[0].unsqueeze(1), cols2[0].unsqueeze(0))
    top = torch.max(cols1[1].unsqueeze(1), cols2[1].unsqueeze(0))
    right = torch.min(cols1[2].unsqueeze(1), cols2[2].unsqueeze(0))
    bottom = torch.min(cols1[3].unsqueeze(1), cols2[3].unsqueeze(0))
    inter = area(x1 = left, y1 = top, x2 = right, y2 = bottom)
    union = area(cols1.t()).unsqueeze(1) + area(cols2.t()).unsqueeze(0) - inter
    return inter / union

def area(boxes = None, x1 = None, y1 = None, x2 = None, y2 = None):
    """Return box areas using inclusive pixel coordinates (the ``+ 1`` terms).

    With ``boxes`` the area is computed from its last dimension
    ``(x1, y1, x2, y2)``. Otherwise it is computed from the separate corner
    tensors, clamping negative extents to zero.
    """
    if boxes is None:
        return (x2 - x1 + 1).clamp(min = 0) * (y2 - y1 + 1).clamp(min = 0)
    heights = boxes[..., 3] - boxes[..., 1] + 1
    widths = boxes[..., 2] - boxes[..., 0] + 1
    return heights * widths
google.protobuf.descriptor +import google.protobuf.descriptor_pool +import google.protobuf.symbol_database +import google.protobuf.text_format +from google.protobuf.descriptor import FieldDescriptor as FD + +TRAIN = 0 + +TEST = 1 + +caffe_pb2 = None + +def initialize(caffe_proto = 'https://raw.githubusercontent.com/BVLC/caffe/master/src/caffe/proto/caffe.proto', codegen_dir = tempfile.mkdtemp(), shadow_caffe = True): + global caffe_pb2 + if caffe_pb2 is None: + local_caffe_proto = os.path.join(codegen_dir, os.path.basename(caffe_proto)) + with open(local_caffe_proto, 'w') as f: + mybytes = urlopen(caffe_proto).read() + mystr = mybytes.decode('ascii', 'ignore') + f.write(mystr) + #f.write((urlopen if 'http' in caffe_proto else open)(caffe_proto).read()) + subprocess.check_call(['protoc', '--proto_path', os.path.dirname(local_caffe_proto), '--python_out', codegen_dir, local_caffe_proto]) + sys.path.insert(0, codegen_dir) + old_pool = google.protobuf.descriptor._message.default_pool + old_symdb = google.protobuf.symbol_database._DEFAULT + google.protobuf.descriptor._message.default_pool = google.protobuf.descriptor_pool.DescriptorPool() + google.protobuf.symbol_database._DEFAULT = google.protobuf.symbol_database.SymbolDatabase(pool = google.protobuf.descriptor._message.default_pool) + import caffe_pb2 as caffe_pb2 + google.protobuf.descriptor._message.default_pool = old_pool + google.protobuf.symbol_database._DEFAULT = old_symdb + sys.modules[__name__ + '.proto'] = sys.modules[__name__] + if shadow_caffe: + sys.modules['caffe'] = sys.modules[__name__] + sys.modules['caffe.proto'] = sys.modules[__name__] + return caffe_pb2 + + def set_mode_gpu(): + global convert_to_gpu_if_enabled + convert_to_gpu_if_enabled = lambda obj: obj.cuda() + + def set_device(gpu_id): + torch.cuda.set_device(gpu_id) + +class Net(nn.Module): + def __init__(self, prototxt, *args, **kwargs): + super(Net, self).__init__() + # to account for both constructors, see 
class Net(nn.Module):
    """Network built from a Caffe prototxt, mimicking the ``caffe.Net`` interface.

    Every prototxt layer whose type or name matches a key of the ``modules``
    dict becomes a child module carrying ``caffe_*`` bookkeeping attributes;
    other layers are skipped with a message.
    """

    def __init__(self, prototxt, *args, **kwargs):
        """Parse ``prototxt`` and build the child modules.

        ``weights`` and ``phase`` may be given positionally (in that order) or
        as keywords; ``caffe_proto`` is keyword-only. When ``weights`` is
        given it is loaded through ``copy_from``.
        """
        super(Net, self).__init__()
        # to account for both constructors, see https://github.com/BVLC/caffe/blob/master/python/caffe/test/test_net.py#L145-L147
        caffe_proto = kwargs.pop('caffe_proto', None)
        weights = kwargs.pop('weights', None)
        phase = kwargs.pop('phase', None)
        weights = weights or (args + (None, None))[0]
        phase = phase or (args + (None, None))[1]

        # Only forward caffe_proto when supplied, so initialize() keeps its own default.
        proto_module = initialize(caffe_proto) if caffe_proto is not None else initialize()
        self.net_param = proto_module.NetParameter()
        with open(prototxt) as f:
            google.protobuf.text_format.Parse(f.read(), self.net_param)

        for layer in list(self.net_param.layer) + list(self.net_param.layers):
            layer_type = layer.type if layer.type != 'Python' else layer.python_param.layer
            if isinstance(layer_type, int):
                layer_type = layer.LayerType.Name(layer_type)
            # Match ignoring case and underscores, against layer type or layer name.
            wanted = [layer_type.replace('_', '').upper(), layer.name.replace('_', '').upper()]
            module_constructor = ([v for k, v in modules.items() if k.replace('_', '').upper() in wanted] + [None])[0]
            if module_constructor is not None:
                param = to_dict(([v for f, v in layer.ListFields() if f.name.endswith('_param')] + [None])[0])
                caffe_input_variable_names = list(layer.bottom)
                caffe_output_variable_names = list(layer.top)
                caffe_loss_weight = (list(layer.loss_weight) or [1.0 if layer_type.upper().endswith('LOSS') else 0.0]) * len(layer.top)
                caffe_propagate_down = list(getattr(layer, 'propagate_down', [])) or [True] * len(caffe_input_variable_names)
                caffe_optimization_params = to_dict(layer.param)
                param['inplace'] = len(caffe_input_variable_names) == 1 and caffe_input_variable_names == caffe_output_variable_names
                module = module_constructor(param)
                # Wrap plain callables / Caffe Python layers so everything is an nn.Module.
                self.add_module(layer.name, module if isinstance(module, nn.Module) else CaffePythonLayerModule(module, caffe_input_variable_names, caffe_output_variable_names, param.get('param_str', '')) if type(module).__name__.endswith('Layer') else FunctionModule(module))
                module = getattr(self, layer.name)
                module.caffe_layer_name = layer.name
                module.caffe_layer_type = layer_type
                module.caffe_input_variable_names = caffe_input_variable_names
                module.caffe_output_variable_names = caffe_output_variable_names
                module.caffe_loss_weight = caffe_loss_weight
                module.caffe_propagate_down = caffe_propagate_down
                module.caffe_optimization_params = caffe_optimization_params
                # lr_mult == 0 freezes the corresponding parameter.
                for optim_param, p in zip(caffe_optimization_params, module.parameters()):
                    p.requires_grad = optim_param.get('lr_mult', 1) != 0
            else:
                print('Skipping layer [{}, {}, {}]: not found in caffemodel2pytorch.modules dict'.format(layer.name, layer_type, layer.type))

        if weights is not None:
            self.copy_from(weights)

        self.blobs = collections.defaultdict(Blob)
        self.blob_loss_weights = {name : loss_weight for module in self.children() for name, loss_weight in zip(module.caffe_output_variable_names, module.caffe_loss_weight)}

        self.train(phase != TEST)
        convert_to_gpu_if_enabled(self)

    def forward(self, data = None, **variables):
        """Run the child modules in order and return the network outputs.

        Inputs are passed by blob name (``data`` is a shortcut for the ``data``
        blob). If any input is not a torch tensor, all inputs are converted
        with ``torch.from_numpy`` and the outputs are returned as numpy arrays.
        Returns a dict with the blobs that some layer produces and that no
        other layer consumes.
        """
        if data is not None:
            variables['data'] = data
        numpy = not all(map(torch.is_tensor, variables.values()))
        variables = {k : convert_to_gpu_if_enabled(torch.from_numpy(v.copy()) if numpy else v) for k, v in variables.items()}

        # The list is built up front: modules whose outputs were all supplied by the caller are skipped.
        pending = [module for module in self.children() if not all(name in variables for name in module.caffe_output_variable_names)]
        for module in pending:
            for name in module.caffe_input_variable_names:
                assert name in variables, 'Variable [{}] does not exist. Pass it as a keyword argument or provide a layer which produces it.'.format(name)
            inputs = [variables[name] if propagate_down else variables[name].detach() for name, propagate_down in zip(module.caffe_input_variable_names, module.caffe_propagate_down)]
            outputs = module(*inputs)
            if not isinstance(outputs, tuple):
                outputs = (outputs, )
            variables.update(dict(zip(module.caffe_output_variable_names, outputs)))

        self.blobs.update({k : Blob(data = v, numpy = numpy) for k, v in variables.items()})
        caffe_output_variable_names = set([name for module in self.children() for name in module.caffe_output_variable_names]) - set([name for module in self.children() for name in module.caffe_input_variable_names if name not in module.caffe_output_variable_names])
        return {k : v.detach().cpu().numpy() if numpy else v for k, v in variables.items() if k in caffe_output_variable_names}

    def copy_from(self, weights):
        """Load parameters from ``weights``: HDF5 first, then binary caffemodel.

        Any failure of the HDF5 attempt (including a missing ``h5py``) is
        reported and triggers the caffemodel fallback.
        """
        try:
            import h5py, numpy
            state_dict = self.state_dict()
            with h5py.File(weights, 'r') as h:
                for k, v in h.items():
                    if k in state_dict:
                        state_dict[k].resize_(v.shape).copy_(torch.from_numpy(numpy.array(v)))
            print('caffemodel2pytorch: loaded model from [{}] in HDF5 format'.format(weights))
        except Exception as e:
            print('caffemodel2pytorch: loading model from [{}] in HDF5 format failed [{}], falling back to caffemodel format'.format(weights, e))
            with open(weights, 'rb') as f:
                bytes_weights = f.read()
            bytes_parsed = self.net_param.ParseFromString(bytes_weights)
            if bytes_parsed != len(bytes_weights):
                print('caffemodel2pytorch: loading model from [{}] in caffemodel format, WARNING: file length [{}] is not equal to number of parsed bytes [{}]'.format(weights, len(bytes_weights), bytes_parsed))
            for layer in list(self.net_param.layer) + list(self.net_param.layers):
                module = getattr(self, layer.name, None)
                if module is None:
                    continue
                # Blobs are taken in order as weight, bias; shape comes from blob.shape or the num/channels/height/width fields.
                parameters = {name : convert_to_gpu_if_enabled(torch.FloatTensor(blob.data)).view(list(blob.shape.dim) if len(blob.shape.dim) > 0 else [blob.num, blob.channels, blob.height, blob.width]) for name, blob in zip(['weight', 'bias'], layer.blobs)}
                if len(parameters) > 0:
                    module.set_parameters(**parameters)
            print('caffemodel2pytorch: loaded model from [{}] in caffemodel format'.format(weights))

    def save(self, weights):
        """Write the state dict to ``weights`` as an HDF5 file."""
        import h5py
        with h5py.File(weights, 'w') as h:
            for k, v in self.state_dict().items():
                h[k] = v.cpu().numpy()
        print('caffemodel2pytorch: saved model to [{}] in HDF5 format'.format(weights))

    @property
    def layers(self):
        """List of child modules, in registration order."""
        return list(self.children())
class Blob(object):
    """Minimal stand-in for a Caffe blob holding ``data`` and ``diff``.

    When no value is supplied, ``data``/``diff`` are ``AssignmentAdapter``
    objects: ``blob.data[...] = value`` stores ``value`` as ``contents``
    (the index is ignored) and ``shape`` is read from those contents.
    """
    AssignmentAdapter = type('', (object, ), dict(shape = property(lambda self: self.contents.shape), __setitem__ = lambda self, indices, values: setattr(self, 'contents', values)))

    def __init__(self, data = None, diff = None, numpy = False):
        self.data_ = data if data is not None else Blob.AssignmentAdapter()
        self.diff_ = diff if diff is not None else Blob.AssignmentAdapter()
        self.shape_ = None
        # When true, tensors are converted to numpy arrays on first access.
        self.numpy = numpy

    def reshape(self, *args):
        """Record an explicit shape; the stored data is not touched."""
        self.shape_ = args

    def count(self, *axis):
        """Return the product of the dimensions in ``[start, end)``.

        ``count()`` covers every axis, ``count(start)`` runs to the last axis
        and ``count(start, end)`` stops before ``end``. An empty range gives 1.
        """
        # Pad with None (open end) rather than -1, which would drop the last axis.
        bounds = (tuple(axis) + (None, ))[:2]
        return reduce(lambda x, y: x * y, self.shape[slice(*bounds)], 1)

    @property
    def data(self):
        if self.numpy and isinstance(self.data_, torch.autograd.Variable):
            self.data_ = self.data_.detach().cpu().numpy()
        return self.data_

    @property
    def diff(self):
        if self.numpy and isinstance(self.diff_, torch.autograd.Variable):
            self.diff_ = self.diff_.detach().cpu().numpy()
        return self.diff_

    @property
    def shape(self):
        # An explicit reshape() wins over the shape of the stored data.
        return self.shape_ if self.shape_ is not None else self.data_.shape

    @property
    def num(self):
        return self.shape[0]

    @property
    def channels(self):
        return self.shape[1]

    @property
    def height(self):
        return self.shape[2]

    @property
    def width(self):
        return self.shape[3]
class Layer(torch.autograd.Function):
    """Autograd wrapper that runs a Caffe Python layer on numpy ``Blob`` objects.

    Written in the instance-based ``torch.autograd.Function`` style.
    NOTE(review): recent PyTorch releases expect static ``forward``/``backward``
    -- confirm the targeted PyTorch version.
    """

    def __init__(self, caffe_python_layer = None, caffe_input_variable_names = None, caffe_output_variable_names = None, caffe_propagate_down = None):
        self.caffe_python_layer = caffe_python_layer
        self.caffe_input_variable_names = caffe_input_variable_names
        self.caffe_output_variable_names = caffe_output_variable_names
        self.caffe_propagate_down = caffe_propagate_down

    def forward(self, *inputs):
        """Call the layer's ``setup`` (first call only) and ``forward``; return a tuple of tensors."""
        bottom = [Blob(data = v.cpu().numpy()) for v in inputs]
        top = [Blob() for name in self.caffe_output_variable_names]

        self.caffe_python_layer.setup(bottom, top)
        # setup() must run only once: replace it with a no-op afterwards.
        self.caffe_python_layer.setup = lambda *args: None

        self.caffe_python_layer.forward(bottom, top)
        outputs = tuple(convert_to_gpu_if_enabled(torch.from_numpy(v.data.contents.reshape(*v.shape))) for v in top)
        self.save_for_backward(*(inputs + outputs))
        return outputs

    def backward(self, *grad_outputs):
        """Call the layer's ``backward`` and return one gradient (or ``None``) per input.

        Accepts one gradient per output; a single tuple/list of gradients is
        also accepted for callers using the former one-argument form.
        """
        if len(grad_outputs) == 1 and isinstance(grad_outputs[0], (tuple, list)):
            grad_outputs = tuple(grad_outputs[0])
        num_inputs = len(self.caffe_input_variable_names)
        inputs, outputs = self.saved_tensors[:num_inputs], self.saved_tensors[num_inputs:]
        bottom = [Blob(data = v.cpu().numpy()) for v in inputs]
        top = [Blob(data = output.cpu().numpy(), diff = grad_output.cpu().numpy()) for grad_output, output in zip(grad_outputs, outputs)]
        # Propagate to every input when no explicit flags were given.
        propagate_down = self.caffe_propagate_down if self.caffe_propagate_down is not None else [True] * len(bottom)
        self.caffe_python_layer.backward(top, propagate_down, bottom)
        # Each bottom blob's diff was filled by the layer; reshape it to the input's shape.
        return tuple(convert_to_gpu_if_enabled(torch.from_numpy(v.diff.contents.reshape(*v.shape))) if flag else None for v, flag in zip(bottom, propagate_down))
class SGDSolver(object):
    """Caffe-like SGD solver driving a ``Net`` built from a solver prototxt."""

    def __init__(self, solver_prototxt):
        """Parse the solver prototxt and build the training net.

        The optimizer and scheduler are created lazily, on the first ``step``.
        """
        solver_param = initialize().SolverParameter()
        with open(solver_prototxt) as f:
            google.protobuf.text_format.Parse(f.read(), solver_param)
        solver_param = to_dict(solver_param)
        self.net = Net(solver_param.get('train_net') or solver_param.get('net'), phase = TRAIN)
        self.iter = 1
        self.iter_size = solver_param.get('iter_size', 1)
        # The base learning rate is divided by iter_size; step() sums gradients over iter_size passes.
        self.optimizer_params = dict(lr = solver_param.get('base_lr') / self.iter_size, momentum = solver_param.get('momentum', 0), weight_decay = solver_param.get('weight_decay', 0))
        self.lr_scheduler_params = dict(policy = solver_param.get('lr_policy'), step_size = solver_param.get('stepsize'), gamma = solver_param.get('gamma'))
        self.optimizer, self.scheduler = None, None

    def init_optimizer_scheduler(self):
        """Create the SGD optimizer (one group per trainable parameter) and the LR scheduler."""
        self.optimizer = torch.optim.SGD([dict(params = [param], lr = self.optimizer_params['lr'] * mult.get('lr_mult', 1), weight_decay = self.optimizer_params['weight_decay'] * mult.get('decay_mult', 1), momentum = self.optimizer_params['momentum']) for module in self.net.children() for param, mult in zip(module.parameters(), module.caffe_optimization_params + [{}, {}]) if param.requires_grad])
        # Only the 'step' policy is scheduled; any other policy gets a no-op scheduler.
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size = self.lr_scheduler_params['step_size'], gamma = self.lr_scheduler_params['gamma']) if self.lr_scheduler_params.get('policy') == 'step' else type('', (object, ), dict(step = lambda self: None))()

    def step(self, iterations = 1, **inputs):
        """Run ``iterations`` optimisation steps on ``inputs``; return the summed loss."""
        loss_total = 0.0
        for _ in range(iterations):
            tic = time.time()
            if self.optimizer is not None:
                self.optimizer.zero_grad()

            loss_batch = 0
            losses_batch = collections.defaultdict(float)
            for _ in range(self.iter_size):
                # Only outputs with a non-zero loss weight contribute to the loss.
                outputs = [kv for kv in self.net(**inputs).items() if self.net.blob_loss_weights[kv[0]] != 0]
                loss = sum([self.net.blob_loss_weights[k] * v.sum() for k, v in outputs])
                loss_batch += float(loss) / self.iter_size
                for k, v in outputs:
                    losses_batch[k] += float(v.sum()) / self.iter_size
                if self.optimizer is None:
                    # Created after the first forward pass rather than in __init__.
                    self.init_optimizer_scheduler()
                    self.optimizer.zero_grad()
                loss.backward()

            loss_total += loss_batch
            self.optimizer.step()
            self.scheduler.step()
            self.iter += 1

            log_prefix = self.__module__ + '.' + type(self).__name__
            print('{}] Iteration {}, loss: {}'.format(log_prefix, self.iter, loss_batch))
            for index, (name, value) in enumerate(sorted(losses_batch.items())):
                print('{}]     Train net output #{}: {} = {} (* {} = {} loss)'.format(log_prefix, index, name, value, self.net.blob_loss_weights[name], self.net.blob_loss_weights[name] * value))
            print('{}] Iteration {}, lr = {}, time = {}'.format(log_prefix, self.iter, self.optimizer_params['lr'], time.time() - tic))

        return loss_total
# Maps a Caffe layer type (or layer name) to a constructor taking the layer's
# parameter dict. Fields left unset in the prototxt are absent from that dict,
# so the Caffe defaults from caffe.proto are supplied here.
modules = dict(
    Convolution = lambda param: Convolution(param),
    InnerProduct = lambda param: InnerProduct(param),
    # pool: 0 = MAX (Caffe default), 1 = AVE
    Pooling = lambda param: [nn.MaxPool2d, nn.AvgPool2d][param.get('pool', 0)](kernel_size = first_or(param, 'kernel_size', 1), stride = first_or(param, 'stride', 1), padding = first_or(param, 'pad', 0)),
    Softmax = lambda param: nn.Softmax(dim = param.get('axis', -1)),
    ReLU = lambda param: nn.ReLU(),
    Dropout = lambda param: nn.Dropout(p = param.get('dropout_ratio', 0.5)),
    # operation: 0 = PROD, 1 = SUM (default), 2 = MAX
    Eltwise = lambda param: [torch.mul, torch.add, torch.max][param.get('operation', 1)],
    LRN = lambda param: nn.LocalResponseNorm(size = param.get('local_size', 5), alpha = param.get('alpha', 1.0), beta = param.get('beta', 0.75))
)

class FunctionModule(nn.Module):
    """Wrap a plain callable so it can be registered as a child module."""

    def __init__(self, forward):
        super(FunctionModule, self).__init__()
        self.forward_func = forward

    def forward(self, *inputs):
        return self.forward_func(*inputs)

class CaffePythonLayerModule(nn.Module):
    """Module adapter around a Caffe Python layer object.

    Attributes not found on the module are looked up on the wrapped layer.
    """

    def __init__(self, caffe_python_layer, caffe_input_variable_names, caffe_output_variable_names, param_str):
        super(CaffePythonLayerModule, self).__init__()
        caffe_python_layer.param_str = param_str
        self.caffe_python_layer = caffe_python_layer
        self.caffe_input_variable_names = caffe_input_variable_names
        self.caffe_output_variable_names = caffe_output_variable_names

    def forward(self, *inputs):
        # Hand over the propagate_down flags when they were attached to this
        # module, so Layer.backward can use them; None otherwise.
        propagate_down = getattr(self, 'caffe_propagate_down', None)
        return Layer(self.caffe_python_layer, self.caffe_input_variable_names, self.caffe_output_variable_names, propagate_down)(*inputs)

    def __getattr__(self, name):
        return nn.Module.__getattr__(self, name) if name in dir(self) else getattr(self.caffe_python_layer, name)
class Convolution(nn.Conv2d):
    """``nn.Conv2d`` whose input channel count is resolved lazily.

    The layer starts with empty ``weight``/``bias`` parameters. They are either
    loaded through ``set_parameters`` or created on the first ``forward`` call
    from the input's channel dimension and the Caffe filler settings.
    """

    def __init__(self, param):
        # in_channels is a placeholder (equal to the group count) until the real value is known.
        super(Convolution, self).__init__(first_or(param,'group',1), param['num_output'], kernel_size = first_or(param, 'kernel_size', 1), stride = first_or(param, 'stride', 1), padding = first_or(param, 'pad', 0), dilation = first_or(param, 'dilation', 1), groups = first_or(param, 'group', 1))
        self.weight, self.bias = nn.Parameter(), nn.Parameter()
        self.weight_init, self.bias_init = param.get('weight_filler', {}), param.get('bias_filler', {})

    def forward(self, x):
        if self.weight.numel() == 0 and self.bias.numel() == 0:
            requires_grad = [self.weight.requires_grad, self.bias.requires_grad]
            # Re-run the constructor with the real channel count; groups must be
            # forwarded as well or a grouped convolution would become a dense one.
            super(Convolution, self).__init__(x.size(1), self.out_channels, kernel_size = self.kernel_size, stride = self.stride, padding = self.padding, dilation = self.dilation, groups = self.groups)
            convert_to_gpu_if_enabled(self)
            init_weight_bias(self, requires_grad = requires_grad)
        return super(Convolution, self).forward(x)

    def set_parameters(self, weight = None, bias = None):
        """Install loaded ``weight``/``bias`` tensors (bias is flattened)."""
        init_weight_bias(self, weight = weight, bias = bias.view(-1) if bias is not None else bias)
        # weight.size(1) is the per-group channel count.
        self.in_channels = self.weight.size(1) * self.groups

class InnerProduct(nn.Linear):
    """``nn.Linear`` whose input feature count is resolved lazily (see ``Convolution``)."""

    def __init__(self, param):
        # in_features = 1 is a placeholder until the real value is known.
        super(InnerProduct, self).__init__(1, param['num_output'])
        self.weight, self.bias = nn.Parameter(), nn.Parameter()
        self.weight_init, self.bias_init = param.get('weight_filler', {}), param.get('bias_filler', {})

    def forward(self, x):
        if self.weight.numel() == 0 and self.bias.numel() == 0:
            requires_grad = [self.weight.requires_grad, self.bias.requires_grad]
            # forward() flattens inputs whose last dimension differs from
            # in_features, so size the layer for the flattened input in that case.
            in_features = x.size(1) if x.size(-1) == x.size(1) else x[0].numel()
            super(InnerProduct, self).__init__(in_features, self.out_features)
            convert_to_gpu_if_enabled(self)
            init_weight_bias(self, requires_grad = requires_grad)
        return super(InnerProduct, self).forward(x if x.size(-1) == self.in_features else x.view(len(x), -1))

    def set_parameters(self, weight = None, bias = None):
        """Install loaded tensors; the weight is viewed as 2-D using its last two dimensions."""
        init_weight_bias(self, weight = weight.view(weight.size(-2), weight.size(-1)) if weight is not None else None, bias = bias.view(-1) if bias is not None else None)
        self.in_features = self.weight.size(1)
def init_weight_bias(self, weight = None, bias = None, requires_grad = ()):
    """Install and/or initialise the ``weight`` and ``bias`` of module ``self``.

    Args:
        weight, bias: tensors that replace the current parameters, keeping
            each parameter's dtype and ``requires_grad`` flag.
        requires_grad: up to two flags (weight, bias). For each flag given,
            the parameter is initialised from ``self.weight_init`` /
            ``self.bias_init`` ('gaussian' or 'constant' fillers) and its
            ``requires_grad`` is set to the flag.
    """
    if weight is not None:
        self.weight = nn.Parameter(weight.type_as(self.weight), requires_grad = self.weight.requires_grad)
    if bias is not None:
        self.bias = nn.Parameter(bias.type_as(self.bias), requires_grad = self.bias.requires_grad)
    for name, flag in zip(['weight', 'bias'], requires_grad):
        param, init = getattr(self, name), getattr(self, name + '_init')
        if init.get('type') == 'gaussian':
            nn.init.normal_(param, std = init['std'])
        elif init.get('type') == 'constant':
            nn.init.constant_(param, val = init['value'])
        param.requires_grad = flag

def convert_to_gpu_if_enabled(obj):
    """Return ``obj`` unchanged (CPU mode)."""
    return obj

def first_or(param, key, default):
    """Return ``param[key]`` if it is an int, else its first element, else ``default``."""
    return param[key] if isinstance(param.get(key), int) else (param.get(key, []) + [default])[0]

def to_dict(obj):
    """Recursively convert a protobuf message (or an iterable of them) to plain Python.

    Iterables become lists, ``None`` becomes ``{}`` and a message becomes a
    dict of its set fields (``ListFields``), with nested messages converted
    recursively.
    """
    # collections.Iterable was removed in Python 3.10; the ABC lives in collections.abc.
    from collections.abc import Iterable
    if isinstance(obj, Iterable):
        return list(map(to_dict, obj))
    if obj is None:
        return {}
    converters = {
        FD.TYPE_DOUBLE: float, FD.TYPE_SFIXED32: float, FD.TYPE_SFIXED64: float, FD.TYPE_SINT32: int, FD.TYPE_SINT64: int,
        FD.TYPE_FLOAT: float, FD.TYPE_ENUM: int, FD.TYPE_UINT32: int, FD.TYPE_INT64: int, FD.TYPE_UINT64: int,
        FD.TYPE_INT32: int, FD.TYPE_FIXED64: float, FD.TYPE_FIXED32: float, FD.TYPE_BOOL: bool, FD.TYPE_STRING: str,
        # Python 3 bytes have no encode('string_escape'); produce an escaped text form instead.
        FD.TYPE_BYTES: lambda x: x.decode('latin-1').encode('unicode_escape').decode('ascii'),
        FD.TYPE_MESSAGE: to_dict,
    }
    result = {}
    for f, v in obj.ListFields():
        converter = converters[f.type]
        result[f.name] = converter(v) if f.label != FD.LABEL_REPEATED else list(map(converter, v))
    return result
def main():
    """Command-line entry point: convert a ``.caffemodel`` to h5/npy/npz/json/pt.

    The output format is chosen from the extension of the ``-o`` path, which
    defaults to the input path with ``.pt`` appended.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(metavar = 'model.caffemodel', dest = 'model_caffemodel', help = 'Path to model.caffemodel')
    parser.add_argument('-o', dest = 'output_path', help = 'Path to converted model, supported file extensions are: h5, npy, npz, json, pt')
    parser.add_argument('--caffe.proto', metavar = '--caffe.proto', dest = 'caffe_proto', help = 'Path to caffe.proto (typically located at CAFFE_ROOT/src/caffe/proto/caffe.proto)', default = 'https://raw.githubusercontent.com/BVLC/caffe/master/src/caffe/proto/caffe.proto')
    args = parser.parse_args()
    args.output_path = args.output_path or args.model_caffemodel + '.pt'

    net_param = initialize(args.caffe_proto).NetParameter()
    with open(args.model_caffemodel, 'rb') as f:
        net_param.ParseFromString(f.read())
    # One entry per "<layer name>.weight" / "<layer name>.bias", blobs taken in that order.
    blobs = {layer.name + '.' + name : dict(data = blob.data, shape = list(blob.shape.dim) if len(blob.shape.dim) > 0 else [blob.num, blob.channels, blob.height, blob.width]) for layer in list(net_param.layer) + list(net_param.layers) for name, blob in zip(['weight', 'bias'], layer.blobs)}

    if args.output_path.endswith('.json'):
        import json
        with open(args.output_path, 'w') as f:
            # Protobuf repeated containers are not JSON serialisable: copy them to lists.
            json.dump({k : dict(data = list(blob['data']), shape = blob['shape']) for k, blob in blobs.items()}, f)
    elif args.output_path.endswith('.h5'):
        import h5py, numpy
        with h5py.File(args.output_path, 'w') as h:
            h.update(**{k : numpy.array(blob['data'], dtype = numpy.float32).reshape(*blob['shape']) for k, blob in blobs.items()})
    elif args.output_path.endswith('.npy') or args.output_path.endswith('.npz'):
        import numpy
        arrays = {k : numpy.array(blob['data'], dtype = numpy.float32).reshape(*blob['shape']) for k, blob in blobs.items()}
        if args.output_path[-1] == 'z':
            numpy.savez(args.output_path, **arrays)
        else:
            # numpy.save stores a single object: the dict is saved as a pickled
            # 0-d object array (load with allow_pickle=True and call .item()).
            numpy.save(args.output_path, arrays)
    elif args.output_path.endswith('.pt'):
        torch.save({k : torch.FloatTensor(blob['data']).view(*blob['shape']) for k, blob in blobs.items()}, args.output_path)

# The guard must compare against '__main__'; 'main' never matches, so the
# converter could not be run as a script.
if __name__ == '__main__':
    main()
b/predicting-poverty-education-replication/scripts/use_paper_model/caffemodel2pytorch/original.py @@ -0,0 +1,402 @@ +import os +import sys +import time +import argparse +import tempfile +import subprocess +import collections +import torch +import torch.nn as nn +import torch.nn.functional as F +from functools import reduce + +try: + from urllib.request import urlopen +except: + from urllib2 import urlopen # Python 2 support. + +import google.protobuf.descriptor +import google.protobuf.descriptor_pool +import google.protobuf.symbol_database +import google.protobuf.text_format +from google.protobuf.descriptor import FieldDescriptor as FD + +TRAIN = 0 + +TEST = 1 + +caffe_pb2 = None + +def initialize(caffe_proto = 'https://raw.githubusercontent.com/BVLC/caffe/master/src/caffe/proto/caffe.proto', codegen_dir = tempfile.mkdtemp(), shadow_caffe = True): + global caffe_pb2 + if caffe_pb2 is None: + local_caffe_proto = os.path.join(codegen_dir, os.path.basename(caffe_proto)) + with open(local_caffe_proto, 'w') as f: + mybytes = urlopen(caffe_proto).read() + mystr = mybytes.decode('ascii', 'ignore') + f.write(mystr) + #f.write((urlopen if 'http' in caffe_proto else open)(caffe_proto).read()) + subprocess.check_call(['protoc', '--proto_path', os.path.dirname(local_caffe_proto), '--python_out', codegen_dir, local_caffe_proto]) + sys.path.insert(0, codegen_dir) + old_pool = google.protobuf.descriptor._message.default_pool + old_symdb = google.protobuf.symbol_database._DEFAULT + google.protobuf.descriptor._message.default_pool = google.protobuf.descriptor_pool.DescriptorPool() + google.protobuf.symbol_database._DEFAULT = google.protobuf.symbol_database.SymbolDatabase(pool = google.protobuf.descriptor._message.default_pool) + import caffe_pb2 as caffe_pb2 + google.protobuf.descriptor._message.default_pool = old_pool + google.protobuf.symbol_database._DEFAULT = old_symdb + sys.modules[__name__ + '.proto'] = sys.modules[__name__] + if shadow_caffe: + sys.modules['caffe'] = 
sys.modules[__name__] + sys.modules['caffe.proto'] = sys.modules[__name__] + return caffe_pb2 + +def set_mode_gpu(): + global convert_to_gpu_if_enabled + convert_to_gpu_if_enabled = lambda obj: obj.cuda() + +def set_device(gpu_id): + torch.cuda.set_device(gpu_id) + +class Net(nn.Module): + def __init__(self, prototxt, *args, **kwargs): + super(Net, self).__init__() + # to account for both constructors, see https://github.com/BVLC/caffe/blob/master/python/caffe/test/test_net.py#L145-L147 + caffe_proto = kwargs.pop('caffe_proto', None) + weights = kwargs.pop('weights', None) + phase = kwargs.pop('phase', None) + weights = weights or (args + (None, None))[0] + phase = phase or (args + (None, None))[1] + + self.net_param = initialize(caffe_proto).NetParameter() + google.protobuf.text_format.Parse(open(prototxt).read(), self.net_param) + + for layer in list(self.net_param.layer) + list(self.net_param.layers): + layer_type = layer.type if layer.type != 'Python' else layer.python_param.layer + if isinstance(layer_type, int): + layer_type = layer.LayerType.Name(layer_type) + module_constructor = ([v for k, v in modules.items() if k.replace('_', '').upper() in [layer_type.replace('_', '').upper(), layer.name.replace('_', '').upper()]] + [None])[0] + if module_constructor is not None: + param = to_dict(([v for f, v in layer.ListFields() if f.name.endswith('_param')] + [None])[0]) + caffe_input_variable_names = list(layer.bottom) + caffe_output_variable_names = list(layer.top) + caffe_loss_weight = (list(layer.loss_weight) or [1.0 if layer_type.upper().endswith('LOSS') else 0.0]) * len(layer.top) + caffe_propagate_down = list(getattr(layer, 'propagate_down', [])) or [True] * len(caffe_input_variable_names) + caffe_optimization_params = to_dict(layer.param) + param['inplace'] = len(caffe_input_variable_names) == 1 and caffe_input_variable_names == caffe_output_variable_names + module = module_constructor(param) + self.add_module(layer.name, module if isinstance(module, 
nn.Module) else CaffePythonLayerModule(module, caffe_input_variable_names, caffe_output_variable_names, param.get('param_str', '')) if type(module).__name__.endswith('Layer') else FunctionModule(module)) + module = getattr(self, layer.name) + module.caffe_layer_name = layer.name + module.caffe_layer_type = layer_type + module.caffe_input_variable_names = caffe_input_variable_names + module.caffe_output_variable_names = caffe_output_variable_names + module.caffe_loss_weight = caffe_loss_weight + module.caffe_propagate_down = caffe_propagate_down + module.caffe_optimization_params = caffe_optimization_params + for optim_param, p in zip(caffe_optimization_params, module.parameters()): + p.requires_grad = optim_param.get('lr_mult', 1) != 0 + else: + print('Skipping layer [{}, {}, {}]: not found in caffemodel2pytorch.modules dict'.format(layer.name, layer_type, layer.type)) + + if weights is not None: + self.copy_from(weights) + + self.blobs = collections.defaultdict(Blob) + self.blob_loss_weights = {name : loss_weight for module in self.children() for name, loss_weight in zip(module.caffe_output_variable_names, module.caffe_loss_weight)} + + self.train(phase != TEST) + convert_to_gpu_if_enabled(self) + + def forward(self, data = None, **variables): + if data is not None: + variables['data'] = data + numpy = not all(map(torch.is_tensor, variables.values())) + variables = {k : convert_to_gpu_if_enabled(torch.from_numpy(v.copy()) if numpy else v) for k, v in variables.items()} + + for module in [module for module in self.children() if not all(name in variables for name in module.caffe_output_variable_names)]: + for name in module.caffe_input_variable_names: + assert name in variables, 'Variable [{}] does not exist. 
Pass it as a keyword argument or provide a layer which produces it.'.format(name) + inputs = [variables[name] if propagate_down else variables[name].detach() for name, propagate_down in zip(module.caffe_input_variable_names, module.caffe_propagate_down)] + outputs = module(*inputs) + if not isinstance(outputs, tuple): + outputs = (outputs, ) + variables.update(dict(zip(module.caffe_output_variable_names, outputs))) + + self.blobs.update({k : Blob(data = v, numpy = numpy) for k, v in variables.items()}) + caffe_output_variable_names = set([name for module in self.children() for name in module.caffe_output_variable_names]) - set([name for module in self.children() for name in module.caffe_input_variable_names if name not in module.caffe_output_variable_names]) + return {k : v.detach().cpu().numpy() if numpy else v for k, v in variables.items() if k in caffe_output_variable_names} + + def copy_from(self, weights): + try: + import h5py, numpy + state_dict = self.state_dict() + for k, v in h5py.File(weights, 'r').items(): + if k in state_dict: + state_dict[k].resize_(v.shape).copy_(torch.from_numpy(numpy.array(v))) + print('caffemodel2pytorch: loaded model from [{}] in HDF5 format'.format(weights)) + except Exception as e: + print('caffemodel2pytorch: loading model from [{}] in HDF5 format failed [{}], falling back to caffemodel format'.format(weights, e.message)) + bytes_weights = open(weights).read() + bytes_parsed = self.net_param.ParseFromString(bytes_weights) + if bytes_parsed != len(bytes_weights): + print('caffemodel2pytorch: loading model from [{}] in caffemodel format, WARNING: file length [{}] is not equal to number of parsed bytes [{}]'.format(weights, len(bytes_weights), bytes_parsed)) + for layer in list(self.net_param.layer) + list(self.net_param.layers): + module = getattr(self, layer.name, None) + if module is None: + continue + parameters = {name : convert_to_gpu_if_enabled(torch.FloatTensor(blob.data)).view(list(blob.shape.dim) if len(blob.shape.dim) > 
0 else [blob.num, blob.channels, blob.height, blob.width]) for name, blob in zip(['weight', 'bias'], layer.blobs)} + if len(parameters) > 0: + module.set_parameters(**parameters) + print('caffemodel2pytorch: loaded model from [{}] in caffemodel format'.format(weights)) + + def save(self, weights): + import h5py + with h5py.File(weights, 'w') as h: + for k, v in self.state_dict().items(): + h[k] = v.cpu().numpy() + print('caffemodel2pytorch: saved model to [{}] in HDF5 format'.format(weights)) + + @property + def layers(self): + return list(self.children()) + +class Blob(object): + AssignmentAdapter = type('', (object, ), dict(shape = property(lambda self: self.contents.shape), __setitem__ = lambda self, indices, values: setattr(self, 'contents', values))) + + def __init__(self, data = None, diff = None, numpy = False): + self.data_ = data if data is not None else Blob.AssignmentAdapter() + self.diff_ = diff if diff is not None else Blob.AssignmentAdapter() + self.shape_ = None + self.numpy = numpy + + def reshape(self, *args): + self.shape_ = args + + def count(self, *axis): + return reduce(lambda x, y: x * y, self.shape_[slice(*(axis + [-1])[:2])]) + + @property + def data(self): + if self.numpy and isinstance(self.data_, torch.autograd.Variable): + self.data_ = self.data_.detach().cpu().numpy() + return self.data_ + + @property + def diff(self): + if self.numpy and isinstance(self.diff_, torch.autograd.Variable): + self.diff_ = self.diff_.detach().cpu().numpy() + return self.diff_ + + @property + def shape(self): + return self.shape_ if self.shape_ is not None else self.data_.shape + + @property + def num(self): + return self.shape[0] + + @property + def channels(self): + return self.shape[1] + + @property + def height(self): + return self.shape[2] + + @property + def width(self): + return self.shape[3] + +class Layer(torch.autograd.Function): + def __init__(self, caffe_python_layer = None, caffe_input_variable_names = None, caffe_output_variable_names = None, 
caffe_propagate_down = None): + self.caffe_python_layer = caffe_python_layer + self.caffe_input_variable_names = caffe_input_variable_names + self.caffe_output_variable_names = caffe_output_variable_names + self.caffe_propagate_down = caffe_propagate_down + + def forward(self, *inputs): + bottom = [Blob(data = v.cpu().numpy()) for v in inputs] + top = [Blob() for name in self.caffe_output_variable_names] + + #self.caffe_python_layer.reshape() + self.caffe_python_layer.setup(bottom, top) + self.caffe_python_layer.setup = lambda *args: None + + self.caffe_python_layer.forward(bottom, top) + outputs = tuple(convert_to_gpu_if_enabled(torch.from_numpy(v.data.contents.reshape(*v.shape))) for v in top) + self.save_for_backward(*(inputs + outputs)) + return outputs + + def backward(self, grad_outputs): + inputs, outputs = self.saved_tensors[:len(self.caffe_input_variable_names)], self.saved_tensors[len(self.caffe_input_variable_names):] + bottom = [Blob(data = v.cpu().numpy()) for v in inputs] + top = [Blob(data = output.cpu().numpy(), diff = grad_output.cpu().numpy()) for grad_output, output in zip(grad_outputs, outputs)] + self.caffe_python_layer.backward(top, self.caffe_propagate_down, bottom) + return tuple(convert_to_gpu_if_enabled(torch.from_numpy(blob.diff.contents.reshape(*v.reshape))) if propagate_down else None for v, propagate_down in zip(bottom, self.caffe_propagate_down)) + +class SGDSolver(object): + def __init__(self, solver_prototxt): + solver_param = initialize().SolverParameter() + google.protobuf.text_format.Parse(open(solver_prototxt).read(), solver_param) + solver_param = to_dict(solver_param) + self.net = Net(solver_param.get('train_net') or solver_param.get('net'), phase = TRAIN) + self.iter = 1 + self.iter_size = solver_param.get('iter_size', 1) + self.optimizer_params = dict(lr = solver_param.get('base_lr') / self.iter_size, momentum = solver_param.get('momentum', 0), weight_decay = solver_param.get('weight_decay', 0)) + self.lr_scheduler_params = 
dict(policy = solver_param.get('lr_policy'), step_size = solver_param.get('stepsize'), gamma = solver_param.get('gamma')) + self.optimizer, self.scheduler = None, None + + def init_optimizer_scheduler(self): + self.optimizer = torch.optim.SGD([dict(params = [param], lr = self.optimizer_params['lr'] * mult.get('lr_mult', 1), weight_decay = self.optimizer_params['weight_decay'] * mult.get('decay_mult', 1), momentum = self.optimizer_params['momentum']) for module in self.net.children() for param, mult in zip(module.parameters(), module.caffe_optimization_params + [{}, {}]) if param.requires_grad]) + self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size = self.lr_scheduler_params['step_size'], gamma = self.lr_scheduler_params['gamma']) if self.lr_scheduler_params.get('policy') == 'step' else type('', (object, ), dict(step = lambda self: None))() + + def step(self, iterations = 1, **inputs): + loss_total = 0.0 + for i in range(iterations): + tic = time.time() + if self.optimizer is not None: + self.optimizer.zero_grad() + + loss_batch = 0 + losses_batch = collections.defaultdict(float) + for j in range(self.iter_size): + outputs = [kv for kv in self.net(**inputs).items() if self.net.blob_loss_weights[kv[0]] != 0] + loss = sum([self.net.blob_loss_weights[k] * v.sum() for k, v in outputs]) + loss_batch += float(loss) / self.iter_size + for k, v in outputs: + losses_batch[k] += float(v.sum()) / self.iter_size + if self.optimizer is None: + self.init_optimizer_scheduler() + self.optimizer.zero_grad() + loss.backward() + + loss_total += loss_batch + self.optimizer.step() + self.scheduler.step() + self.iter += 1 + + log_prefix = self.__module__ + '.' 
+ type(self).__name__ + print('{}] Iteration {}, loss: {}'.format(log_prefix, self.iter, loss_batch)) + for i, (name, loss) in enumerate(sorted(losses_batch.items())): + print('{}] Train net output #{}: {} = {} (* {} = {} loss)'.format(log_prefix, i, name, loss, self.net.blob_loss_weights[name], self.net.blob_loss_weights[name] * loss)) + print('{}] Iteration {}, lr = {}, time = {}'.format(log_prefix, self.iter, self.optimizer_params['lr'], time.time() - tic)) + + return loss_total + +modules = dict( + Convolution = lambda param: Convolution(param), + InnerProduct = lambda param: InnerProduct(param), + Pooling = lambda param: [nn.MaxPool2d, nn.AvgPool2d][param['pool']](kernel_size = first_or(param, 'kernel_size', 1), stride = first_or(param, 'stride', 1), padding = first_or(param, 'pad', 0)), + Softmax = lambda param: nn.Softmax(dim = param.get('axis', -1)), + ReLU = lambda param: nn.ReLU(), + Dropout = lambda param: nn.Dropout(p = param['dropout_ratio']), + Eltwise = lambda param: [torch.mul, torch.add, torch.max][param.get('operation', 1)], + LRN = lambda param: nn.LocalResponseNorm(size = param['local_size'], alpha = param['alpha'], beta = param['beta']) +) + +class FunctionModule(nn.Module): + def __init__(self, forward): + super(FunctionModule, self).__init__() + self.forward_func = forward + + def forward(self, *inputs): + return self.forward_func(*inputs) + +class CaffePythonLayerModule(nn.Module): + def __init__(self, caffe_python_layer, caffe_input_variable_names, caffe_output_variable_names, param_str): + super(CaffePythonLayerModule, self).__init__() + caffe_python_layer.param_str = param_str + self.caffe_python_layer = caffe_python_layer + self.caffe_input_variable_names = caffe_input_variable_names + self.caffe_output_variable_names = caffe_output_variable_names + + def forward(self, *inputs): + return Layer(self.caffe_python_layer, self.caffe_input_variable_names, self.caffe_output_variable_names)(*inputs) + + def __getattr__(self, name): + return 
nn.Module.__getattr__(self, name) if name in dir(self) else getattr(self.caffe_python_layer, name) + +class Convolution(nn.Conv2d): + def __init__(self, param): + super(Convolution, self).__init__(first_or(param,'group',1), param['num_output'], kernel_size = first_or(param, 'kernel_size', 1), stride = first_or(param, 'stride', 1), padding = first_or(param, 'pad', 0), dilation = first_or(param, 'dilation', 1), groups = first_or(param, 'group', 1)) + self.weight, self.bias = nn.Parameter(), nn.Parameter() + self.weight_init, self.bias_init = param.get('weight_filler', {}), param.get('bias_filler', {}) + + def forward(self, x): + if self.weight.numel() == 0 and self.bias.numel() == 0: + requires_grad = [self.weight.requires_grad, self.bias.requires_grad] + super(Convolution, self).__init__(x.size(1), self.out_channels, kernel_size = self.kernel_size, stride = self.stride, padding = self.padding, dilation = self.dilation) + convert_to_gpu_if_enabled(self) + init_weight_bias(self, requires_grad = requires_grad) + return super(Convolution, self).forward(x) + + def set_parameters(self, weight = None, bias = None): + init_weight_bias(self, weight = weight, bias = bias.view(-1) if bias is not None else bias) + self.in_channels = self.weight.size(1) + +class InnerProduct(nn.Linear): + def __init__(self, param): + super(InnerProduct, self).__init__(1, param['num_output']) + self.weight, self.bias = nn.Parameter(), nn.Parameter() + self.weight_init, self.bias_init = param.get('weight_filler', {}), param.get('bias_filler', {}) + + def forward(self, x): + if self.weight.numel() == 0 and self.bias.numel() == 0: + requires_grad = [self.weight.requires_grad, self.bias.requires_grad] + super(InnerProduct, self).__init__(x.size(1), self.out_features) + convert_to_gpu_if_enabled(self) + init_weight_bias(self, requires_grad = requires_grad) + return super(InnerProduct, self).forward(x if x.size(-1) == self.in_features else x.view(len(x), -1)) + + def set_parameters(self, weight = None, 
bias = None): + init_weight_bias(self, weight = weight.view(weight.size(-2), weight.size(-1)) if weight is not None else None, bias = bias.view(-1) if bias is not None else None) + self.in_features = self.weight.size(1) + +def init_weight_bias(self, weight = None, bias = None, requires_grad = []): + if weight is not None: + self.weight = nn.Parameter(weight.type_as(self.weight), requires_grad = self.weight.requires_grad) + if bias is not None: + self.bias = nn.Parameter(bias.type_as(self.bias), requires_grad = self.bias.requires_grad) + for name, requires_grad in zip(['weight', 'bias'], requires_grad): + param, init = getattr(self, name), getattr(self, name + '_init') + if init.get('type') == 'gaussian': + nn.init.normal_(param, std = init['std']) + elif init.get('type') == 'constant': + nn.init.constant_(param, val = init['value']) + param.requires_grad = requires_grad + +def convert_to_gpu_if_enabled(obj): + return obj + +def first_or(param, key, default): + return param[key] if isinstance(param.get(key), int) else (param.get(key, []) + [default])[0] + +def to_dict(obj): + return list(map(to_dict, obj)) if isinstance(obj, collections.Iterable) else {} if obj is None else {f.name : converter(v) if f.label != FD.LABEL_REPEATED else list(map(converter, v)) for f, v in obj.ListFields() for converter in [{FD.TYPE_DOUBLE: float, FD.TYPE_SFIXED32: float, FD.TYPE_SFIXED64: float, FD.TYPE_SINT32: int, FD.TYPE_SINT64: int, FD.TYPE_FLOAT: float, FD.TYPE_ENUM: int, FD.TYPE_UINT32: int, FD.TYPE_INT64: int, FD.TYPE_UINT64: int, FD.TYPE_INT32: int, FD.TYPE_FIXED64: float, FD.TYPE_FIXED32: float, FD.TYPE_BOOL: bool, FD.TYPE_STRING: str, FD.TYPE_BYTES: lambda x: x.encode('string_escape'), FD.TYPE_MESSAGE: to_dict}[f.type]]} + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument(metavar = 'model.caffemodel', dest = 'model_caffemodel', help = 'Path to model.caffemodel') + parser.add_argument('-o', dest = 'output_path', help = 'Path to converted 
model, supported file extensions are: h5, npy, npz, json, pt') + parser.add_argument('--caffe.proto', metavar = '--caffe.proto', dest = 'caffe_proto', help = 'Path to caffe.proto (typically located at CAFFE_ROOT/src/caffe/proto/caffe.proto)', default = 'https://raw.githubusercontent.com/BVLC/caffe/master/src/caffe/proto/caffe.proto') + args = parser.parse_args() + args.output_path = args.output_path or args.model_caffemodel + '.pt' + + net_param = initialize(args.caffe_proto).NetParameter() + net_param.ParseFromString(open(args.model_caffemodel, 'rb').read()) + blobs = {layer.name + '.' + name : dict(data = blob.data, shape = list(blob.shape.dim) if len(blob.shape.dim) > 0 else [blob.num, blob.channels, blob.height, blob.width]) for layer in list(net_param.layer) + list(net_param.layers) for name, blob in zip(['weight', 'bias'], layer.blobs)} + + if args.output_path.endswith('.json'): + import json + with open(args.output_path, 'w') as f: + json.dump(blobs, f) + elif args.output_path.endswith('.h5'): + import h5py, numpy + with h5py.File(args.output_path, 'w') as h: + h.update(**{k : numpy.array(blob['data'], dtype = numpy.float32).reshape(*blob['shape']) for k, blob in blobs.items()}) + elif args.output_path.endswith('.npy') or args.output_path.endswith('.npz'): + import numpy + (numpy.savez if args.output_path[-1] == 'z' else numpy.save)(args.output_path, **{k : numpy.array(blob['data'], dtype = numpy.float32).reshape(*blob['shape']) for k, blob in blobs.items()}) + elif args.output_path.endswith('.pt'): + torch.save({k : torch.FloatTensor(blob['data']).view(*blob['shape']) for k, blob in blobs.items()}, args.output_path) diff --git a/predicting-poverty-education-replication/scripts/use_paper_model/forward_pass.ipynb b/predicting-poverty-education-replication/scripts/use_paper_model/forward_pass.ipynb new file mode 100644 index 0000000..b7e48ab --- /dev/null +++ b/predicting-poverty-education-replication/scripts/use_paper_model/forward_pass.ipynb @@ -0,0 +1,358 
@@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This file uses a Python2 library from Github (https://github.com/vadimkantorov/caffemodel2pytorch) to convert Caffe models into PyTorch. The original model was trained with Caffe. Remarkably, you don't ever have to install caffe for it to work!\n", + "\n", + "I converted it to work on Python3." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "import torch.optim as optim\n", + "import numpy as np\n", + "import torchvision\n", + "from torchvision import datasets, models, transforms\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "BASE_DIR = '..'\n", + "COUNTRIES_DIR = os.path.join(BASE_DIR, 'data', 'countries')\n", + "PROCESSED_DIR = os.path.join(BASE_DIR, 'data', 'processed')\n", + "RESULTS_DIR = os.path.join(BASE_DIR, 'results')\n", + "CNN_TRAIN_IMAGE_DIR = os.path.join(BASE_DIR, 'data', 'cnn_images')" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from caffemodel2pytorch import *" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "caffemodel2pytorch: loading model from [predicting_poverty_trained.caffemodel] in HDF5 format failed [Unable to open file (file signature not found)], falling back to caffemodel format\n", + "caffemodel2pytorch: loaded model from [predicting_poverty_trained.caffemodel] in caffemodel format\n" + ] + } + ], + "source": [ + "model = Net(\n", + " prototxt = 'predicting_poverty_deploy.prototxt',\n", + " weights = 'predicting_poverty_trained.caffemodel',\n", + " caffe_proto = 'https://raw.githubusercontent.com/BVLC/caffe/master/src/caffe/proto/caffe.proto'\n", + ")" + ] + 
}, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Net(\n", + " (conv1): Convolution(3, 64, kernel_size=(11, 11), stride=(4, 4))\n", + " (relu1): ReLU()\n", + " (norm1): LocalResponseNorm(5, alpha=0.0005000000237487257, beta=0.75, k=1.0)\n", + " (pool1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)\n", + " (conv2): Convolution(64, 256, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))\n", + " (relu2): ReLU()\n", + " (norm2): LocalResponseNorm(5, alpha=0.0005000000237487257, beta=0.75, k=1.0)\n", + " (pool2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)\n", + " (conv3): Convolution(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (relu3): ReLU()\n", + " (conv4): Convolution(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (relu4): ReLU()\n", + " (conv5): Convolution(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (relu5): ReLU()\n", + " (pool5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)\n", + " (conv6): Convolution(256, 4096, kernel_size=(6, 6), stride=(6, 6))\n", + " (relu6): ReLU()\n", + " (conv7): Convolution(4096, 4096, kernel_size=(1, 1), stride=(1, 1))\n", + " (relu7): ReLU()\n", + " (conv8): Convolution(4096, 3, kernel_size=(1, 1), stride=(1, 1))\n", + " (pool6): AvgPool2d(kernel_size=2, stride=1, padding=0)\n", + " (prob): Softmax()\n", + ")" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# we strip the last layers, so the output is just what was at the conv7 layer\n", + "# the paper uses the data at this layer as the \"features\" for the image\n", + "del model.prob, model.pool6, model.conv8, model.relu7" + ] + }, + { + "cell_type": "code", + 
"execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Net(\n", + " (conv1): Convolution(3, 64, kernel_size=(11, 11), stride=(4, 4))\n", + " (relu1): ReLU()\n", + " (norm1): LocalResponseNorm(5, alpha=0.0005000000237487257, beta=0.75, k=1.0)\n", + " (pool1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)\n", + " (conv2): Convolution(64, 256, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))\n", + " (relu2): ReLU()\n", + " (norm2): LocalResponseNorm(5, alpha=0.0005000000237487257, beta=0.75, k=1.0)\n", + " (pool2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)\n", + " (conv3): Convolution(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (relu3): ReLU()\n", + " (conv4): Convolution(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (relu4): ReLU()\n", + " (conv5): Convolution(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", + " (relu5): ReLU()\n", + " (pool5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)\n", + " (conv6): Convolution(256, 4096, kernel_size=(6, 6), stride=(6, 6))\n", + " (relu6): ReLU()\n", + " (conv7): Convolution(4096, 4096, kernel_size=(1, 1), stride=(1, 1))\n", + ")" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", + "model.to(device)\n", + "model.eval()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_images = pd.read_csv(os.path.join(PROCESSED_DIR, 'image_download_actual.csv'))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from torchvision import datasets, models, transforms" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "transformer = 
transforms.Compose([\n", + " transforms.ToTensor(),\n", + " transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])\n", + " ])\n", + "\n", + "# custom dataset for fast image loading and processing\n", + "# does not follow the usual style of folder -> folder for each class -> image\n", + "# we just want one folder with images\n", + "class ForwardPassDataset(torch.utils.data.Dataset):\n", + " def __init__(self, image_dir, transformer):\n", + " self.image_dir = image_dir\n", + " self.image_list = os.listdir(self.image_dir)\n", + " self.transformer = transformer\n", + "\n", + " def __len__(self):\n", + " return len(self.image_list)\n", + "\n", + " def __getitem__(self, index):\n", + " image_name = self.image_list[index]\n", + "\n", + " # Load image\n", + " X = self.filename_to_im_tensor(self.image_dir + '/' + image_name)\n", + " \n", + " # dataloaders need to return a label, but for the forward pass we don't really care\n", + " return X, -1\n", + " \n", + " def filename_to_im_tensor(self, file):\n", + " im = plt.imread(file)[:,:,:3]\n", + " im = self.transformer(im)\n", + " return im\n", + "\n", + "model.eval() \n", + "classes = [0, 1, 2]\n", + "# shape of final array will be (num_validation_images, 4096)\n", + "# we also want to record the image each index represents\n", + "feats = np.zeros(((~df_images['is_train']).sum(), 4096))\n", + "image_order = []\n", + "i = 0\n", + "for c in classes:\n", + " # use the validation images to do the forward pass\n", + " dataset = ForwardPassDataset(os.path.join(CNN_TRAIN_IMAGE_DIR, 'valid', str(c)), transformer)\n", + " dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=False, num_workers=4)\n", + " image_order += dataset.image_list\n", + " # forward pass for this class\n", + " for inputs, _ in tqdm(dataloader):\n", + " inputs = inputs.to(device)\n", + " outputs = model(inputs)\n", + " feats[i:i+len(inputs),:] = outputs.cpu().detach().numpy()\n", + " i += len(inputs)" + ] + }, + { + "cell_type": 
"code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "forward_pass_df = pd.DataFrame.from_dict({'image_name': image_order, 'feat_index': np.arange(len(image_order))})\n", + "forward_pass_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_consumption = pd.merge(left=df_images, right=forward_pass_df, on='image_name')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# have we maintained all validation images?\n", + "assert len(df_consumption) == (~df_images['is_train']).sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Aggregate Features\n", + "For each country, we aggregate the image features per cluster and save them to results/country/cnn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "country_abbrv = ['mw', 'eth', 'ng']\n", + "country_dir = ['malawi_2016', 'ethiopia_2015', 'nigeria_2015']\n", + "\n", + "for ca, cd in zip(country_abbrv, country_dir):\n", + " df_c = df_consumption[df_consumption['country'] == ca]\n", + " group = df_c.groupby(['cluster_lat', 'cluster_lon'])\n", + " x = np.zeros((len(group), 4096))\n", + " cluster_list = [] # the corresponding clusters (lat, lon) to the x aggregate feature array\n", + " for i, g in enumerate(group):\n", + " lat, lon = g[0]\n", + " im_sub = df_consumption[(df_consumption['cluster_lat'] == lat) & (df_consumption['cluster_lon'] == lon)].reset_index(drop=True)\n", + " agg_feats = np.zeros((len(im_sub), 4096))\n", + " for j, d in im_sub.iterrows():\n", + " agg_feats[j,:] = feats[d.feat_index]\n", + " agg_feats = agg_feats.mean(axis=0) # averages the features across all images in the cluster\n", + "\n", + " x[i,:] = agg_feats\n", + " cluster_list.append([lat, lon])\n", + " # save to the correct directory\n", + " save_dir = os.path.join(RESULTS_DIR, 
cd, 'cnn')\n", + " os.makedirs(save_dir, exist_ok=True)\n", + " np.save(os.path.join(save_dir, 'cluster_feats.npy'), x)\n", + " pickle.dump(cluster_list, open(os.path.join(save_dir, 'cluster_order.pkl'), 'wb')) \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "environment": { + "name": "pytorch-gpu.1-4.m46", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-4:m46" + }, + "kernelspec": { + "display_name": "predicting-poverty-replication", + "language": "python", + "name": "predicting-poverty-replication" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/predicting-poverty-education-replication/scripts/use_paper_model/predicting_poverty_deploy.prototxt b/predicting-poverty-education-replication/scripts/use_paper_model/predicting_poverty_deploy.prototxt new file mode 100644 index 0000000..09e56d1 --- /dev/null +++ b/predicting-poverty-education-replication/scripts/use_paper_model/predicting_poverty_deploy.prototxt @@ -0,0 +1,282 @@ +name: "POVERTY_PREDICT" +input: "data" +input_dim: 32 +input_dim: 3 +input_dim: 400 +input_dim: 400 + +layers { + bottom: "data" + top: "conv1" + name: "conv1" + type: CONVOLUTION + convolution_param { + num_output: 64 + kernel_size: 11 + stride: 4 + + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.1 + } + + } + blobs_lr: 0 + blobs_lr: 0 +} + +layers { + bottom: "conv1" + top: "conv1" + name: 
"relu1" + type: RELU +} +layers { + bottom: "conv1" + top: "norm1" + name: "norm1" + type: LRN + lrn_param { + local_size: 5 + alpha: 0.0005 + beta: 0.75 + k: 2 + } +} +layers { + bottom: "norm1" + top: "pool1" + name: "pool1" + type: POOLING + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layers { + bottom: "pool1" + top: "conv2" + name: "conv2" + type: CONVOLUTION + convolution_param { + num_output: 256 + pad: 2 + kernel_size: 5 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layers { + bottom: "conv2" + top: "conv2" + name: "relu2" + type: RELU +} +layers { + bottom: "conv2" + top: "norm2" + name: "norm2" + type: LRN + lrn_param { + local_size: 5 + alpha: 0.0005 + beta: 0.75 + k: 2 + } +} +layers { + bottom: "norm2" + top: "pool2" + name: "pool2" + type: POOLING + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layers { + bottom: "pool2" + top: "conv3" + name: "conv3" + type: CONVOLUTION + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.1 + } + } + blobs_lr:0.1 + blobs_lr:1 +} +layers { + bottom: "conv3" + top: "conv3" + name: "relu3" + type: RELU +} +layers { + bottom: "conv3" + top: "conv4" + name: "conv4" + type: CONVOLUTION + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.1 + } + } + blobs_lr:0.1 + blobs_lr:1 +} +layers { + bottom: "conv4" + top: "conv4" + name: "relu4" + type: RELU +} +layers { + bottom: "conv4" + top: "conv5" + name: "conv5" + type: CONVOLUTION + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.1 + } + } + blobs_lr:0.1 + blobs_lr:1 +} +layers { + bottom: "conv5" + top: "conv5" + name: "relu5" + type: RELU +} +layers { + bottom: "conv5" + top: "pool5" + 
name: "pool5" + type: POOLING + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layers { + bottom: "pool5" + top: "conv6" + name: "conv6" + type: CONVOLUTION + convolution_param { + num_output: 4096 + pad: 0 + kernel_size: 6 + stride:6 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layers { + bottom: "conv6" + top: "conv6" + name: "relu6" + type: RELU +} +layers { + bottom: "conv6" + top: "conv7" + name: "conv7" + type: CONVOLUTION + convolution_param { + num_output: 4096 + pad: 0 + kernel_size: 1 + stride:1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layers { + bottom: "conv7" + top: "conv7" + name: "relu7" + type: RELU +} +layers { + bottom: "conv7" + top: "conv8" + name: "conv8" + type: CONVOLUTION + convolution_param { + num_output: 3 + pad: 0 + kernel_size: 1 + stride:1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layers { + bottom: "conv8" + top: "pool6" + name: "pool6" + type: POOLING + pooling_param { + pool: AVE + kernel_size:2 + stride: 1 + } +} + +layers { + bottom: "pool6" + top: "prob" + name: "prob" + type: SOFTMAX +} diff --git a/predicting-poverty-education-replication/scripts/utils/__init__.py b/predicting-poverty-education-replication/scripts/utils/__init__.py new file mode 100644 index 0000000..342d8b0 --- /dev/null +++ b/predicting-poverty-education-replication/scripts/utils/__init__.py @@ -0,0 +1,4 @@ +from .utils import * +from .google_downloader import * +from .planet_downloader import * +from .ridge_training import * diff --git a/predicting-poverty-education-replication/scripts/utils/__pycache__/__init__.cpython-37.pyc b/predicting-poverty-education-replication/scripts/utils/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..29c8079 Binary files /dev/null and 
b/predicting-poverty-education-replication/scripts/utils/__pycache__/__init__.cpython-37.pyc differ diff --git a/predicting-poverty-education-replication/scripts/utils/__pycache__/google_downloader.cpython-37.pyc b/predicting-poverty-education-replication/scripts/utils/__pycache__/google_downloader.cpython-37.pyc new file mode 100644 index 0000000..4990eea Binary files /dev/null and b/predicting-poverty-education-replication/scripts/utils/__pycache__/google_downloader.cpython-37.pyc differ diff --git a/predicting-poverty-education-replication/scripts/utils/__pycache__/planet_downloader.cpython-37.pyc b/predicting-poverty-education-replication/scripts/utils/__pycache__/planet_downloader.cpython-37.pyc new file mode 100644 index 0000000..897d6b6 Binary files /dev/null and b/predicting-poverty-education-replication/scripts/utils/__pycache__/planet_downloader.cpython-37.pyc differ diff --git a/predicting-poverty-education-replication/scripts/utils/__pycache__/ridge_training.cpython-37.pyc b/predicting-poverty-education-replication/scripts/utils/__pycache__/ridge_training.cpython-37.pyc new file mode 100644 index 0000000..447a99a Binary files /dev/null and b/predicting-poverty-education-replication/scripts/utils/__pycache__/ridge_training.cpython-37.pyc differ diff --git a/predicting-poverty-education-replication/scripts/utils/__pycache__/utils.cpython-37.pyc b/predicting-poverty-education-replication/scripts/utils/__pycache__/utils.cpython-37.pyc new file mode 100644 index 0000000..16a6c43 Binary files /dev/null and b/predicting-poverty-education-replication/scripts/utils/__pycache__/utils.cpython-37.pyc differ diff --git a/predicting-poverty-education-replication/scripts/utils/google_downloader.py b/predicting-poverty-education-replication/scripts/utils/google_downloader.py new file mode 100644 index 0000000..48b318a --- /dev/null +++ b/predicting-poverty-education-replication/scripts/utils/google_downloader.py @@ -0,0 +1,18 @@ +''' +Very simple download interface to 
download images from Google's Static Maps API +''' + +class GoogleDownloader: + def __init__(self, access_token): + self.access_token = access_token + self.url = 'https://maps.googleapis.com/maps/api/staticmap?center={},{}&zoom={}&size=400x400&maptype=satellite&key={}' + + def download(self, lat, long, zoom): + res = requests.get(self.url.format(lat, long, zoom, self.access_token)) + # server needs to make image available, takes a few seconds + if res.status_code == 403: + return 'RETRY' + assert res.status_code < 400, print(f'Error - failed to download {lat}, {long}, {zoom}') + image = plt.imread(BytesIO(res.content)) + return image + \ No newline at end of file diff --git a/predicting-poverty-education-replication/scripts/utils/planet_downloader.py b/predicting-poverty-education-replication/scripts/utils/planet_downloader.py new file mode 100644 index 0000000..909cc12 --- /dev/null +++ b/predicting-poverty-education-replication/scripts/utils/planet_downloader.py @@ -0,0 +1,172 @@ +''' +More complex interface than Google's to download images from Planet. Unlike Google, Planet allows us to query images at a certain time. However, Planet's API is not great and there are a variety of issues ranging from a little annoying to fairly serious. This interface simplifies the use from a user's perspective. 
+''' + +import math +import requests +import matplotlib.pyplot as plt +from requests.auth import HTTPBasicAuth +import os +import json +from io import BytesIO +from shapely.geometry import Polygon + + +class PlanetDownloader: + def __init__(self, api_key, item_type='PSScene3Band'): + self.api_key = api_key + self.item_type = item_type + + def create_cords(lat, lon, zoom): + xtile, ytile = deg_to_tile(lat, lon, zoom) + + coords = [tilexy_to_deg(xtile, ytile, zoom, a, b) for a,b in [(0,0), (0,255), (255,255), (255,0)]] + return [[b,a] for a,b in coords] + + def download_image(self, lat, lon, min_year, min_month, max_year, max_month, zoom=14, cloud_max=0.05): + ''' + Use this method to download an image at a lat, lon in some time range + If multiple images are available, the latest is downloaded + + I would not increase zoom + cloud_max is the maximum cloud filter, defaulting to 5% + ''' + assert 0 <= cloud_max <= 1.0 + if min_month < 10: + min_month = '0' + str(min_month) + + if max_month < 10: + max_month = '0' + str(max_month) + + cords = PlanetDownloader.create_cords(lat, lon, zoom) + geo_json_geometry = { + "type": "Polygon", + "coordinates": [ + cords + ], + + } + + # filter for items the overlap with our chosen geometry + geometry_filter = { + "type": "GeometryFilter", + "field_name": "geometry", + "config": geo_json_geometry, + } + + # filter images acquired in a certain date range + date_range_filter = { + "type": "DateRangeFilter", + "field_name": "acquired", + "config": { + "gte": "{}-{}-01T00:00:00.000Z".format(min_year, min_month), + "lte": "{}-{}-01T00:00:00.000Z".format(max_year, max_month) + } + } + + # filter any images which are more than 50% clouds + cloud_cover_filter = { + "type": "RangeFilter", + "field_name": "cloud_cover", + "config": { + "lte": cloud_max + } + } + + # create a filter that combines our geo and date filters + # could also use an "OrFilter" + reservoir = { + "type": "AndFilter", + "config": [geometry_filter, date_range_filter, 
cloud_cover_filter] + } + + # Search API request object + search_endpoint_request = { + "item_types": [self.item_type], + "filter": reservoir + } + + result = \ + requests.post( + 'https://api.planet.com/data/v1/quick-search', + auth=HTTPBasicAuth(self.api_key, ''), + json=search_endpoint_request) + + res = json.loads(result.text) + + x, y = deg_to_tile(lat, lon, zoom) + item_id = None + + if len(res['features']) == 0: + # print('No image found, try widening your search or using a different satellite') + return None + else: + # planet for some reason will return results that don't even contain the requested geometry -_- + # this will look for the LATEST (closest to the max time) match that actually contains our geometry + polya = Polygon(cords) + b_cords = [tilexy_to_deg(x,y,zoom,a,b) for a,b in [(0,0), (1,0), (1,1), (0,1)]] + polyb = Polygon([(b,a) for (a,b) in b_cords]) + + for idx in range(len(res['features']) - 1, -1, -1): + polyc = Polygon(res['features'][idx]['geometry']['coordinates'][0]) + + if polyc.contains(polya) and polyc.contains(polyb): + item_id = res['features'][idx]['id'] + break + + if item_id is None: + # print('No good images found') + return None + + url = 'https://tiles0.planet.com/data/v1/{}/{}/{}/{}/{}.png?api_key={}'.format(self.item_type, item_id, zoom, x, y, self.api_key) + + res = requests.get(url) + if res.status_code >= 400: + # print('download error') + return None + + return plt.imread(BytesIO(res.content)) + + +""" +Important geoconversion functions +""" + +def tilexy_to_deg(xtile, ytile, zoom, x, y): + """Converts a specific location on a tile (x,y) to geocoordinates.""" + decimal_x = xtile + x / 256 + decimal_y = ytile + y / 256 + n = 2.0 ** zoom + lon_deg = decimal_x / n * 360.0 - 180.0 + lat_rad = math.atan(math.sinh(math.pi * (1 - 2 * decimal_y / n))) + lat_deg = math.degrees(lat_rad) + return (lat_deg, lon_deg) + +def deg_to_tilexy(lat_deg, lon_deg, zoom): + """Converts geocoordinates to an x,y position on a tile.""" + lat_rad 
= math.radians(lat_deg) + n = 2.0 ** zoom + x = ((lon_deg + 180.0) / 360.0 * n) + y = ((1.0 - math.log(math.tan(lat_rad) + (1 / math.cos(lat_rad))) + / math.pi) / 2.0 * n) + return (int((x % 1) * 256), int((y % 1) * 256)) + +def tile_to_deg(xtile, ytile, zoom): + """Returns the coordinates of the northwest corner of a Slippy Map + x,y tile""" + n = 2.0 ** zoom + lon_deg = xtile / n * 360.0 - 180.0 + lat_rad = math.atan(math.sinh(math.pi * (1 - 2 * ytile / n))) + lat_deg = math.degrees(lat_rad) + return (lat_deg, lon_deg) + +def deg_to_tile(lat_deg, lon_deg, zoom): + """Converts coordinates into the nearest x,y Slippy Map tile""" + lat_rad = math.radians(lat_deg) + n = 2.0 ** zoom + xtile = int((lon_deg + 180.0) / 360.0 * n) + ytile = int((1.0 - math.log(math.tan(lat_rad) + (1 / math.cos(lat_rad))) + / math.pi) / 2.0 * n) + return (xtile, ytile) + + diff --git a/predicting-poverty-education-replication/scripts/utils/ridge_training.py b/predicting-poverty-education-replication/scripts/utils/ridge_training.py new file mode 100644 index 0000000..1818681 --- /dev/null +++ b/predicting-poverty-education-replication/scripts/utils/ridge_training.py @@ -0,0 +1,153 @@ +# This is based on code from the Jean et al Github that is modified to work with Python3 and our metrics + +import numpy as np +import pandas as pd +import random +from scipy import stats +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import KFold +import sklearn.linear_model as linear_model +import matplotlib.pyplot as plt +import sklearn.metrics as metrics +from sklearn.cluster import KMeans + +def run_randomized_cv(X, y, k=5, k_inner=5, random_seed=7, points=10, + alpha_low=1, alpha_high=5, to_print=False): + """ + Run randomized CV on given X and y + Returns r2, yhat + """ + np.random.seed(random_seed) + alphas = np.logspace(alpha_low, alpha_high, points) + r2s = [] + y_hat = np.zeros_like(y) + kf = KFold(n_splits=k, shuffle=True) + fold = 0 + for train_idx, test_idx in 
kf.split(X): + if to_print: + print(f"fold: {fold}", end='\r') + r2, y_p = evaluate_fold(X, y, train_idx, test_idx, k_inner, alphas, to_print) + r2s.append(r2) + y_hat[test_idx] = y_p + fold += 1 + return np.mean(r2s), y_hat + + +def scale_features(X_train, X_test): + """ + Scales features using StandardScaler. + """ + X_scaler = StandardScaler(with_mean=True, with_std=False) + X_train = X_scaler.fit_transform(X_train) + X_test = X_scaler.transform(X_test) + return X_train, X_test + + +def train_and_predict_ridge(alpha, X_train, y_train, X_test): + """ + Trains ridge model and predicts test set. + """ + ridge = linear_model.Ridge(alpha) + ridge.fit(X_train, y_train) + y_hat = ridge.predict(X_test) + return y_hat + +def find_best_alpha(X, y, k_inner, alphas, to_print=False): + """ + Finds the best alpha in an inner fully randomized CV loop. + """ + kf = KFold(n_splits=k_inner, shuffle=True) + best_alpha = 0 + best_r2 = 0 + for idx, alpha in enumerate(alphas): + y_hat = np.zeros_like(y) + for train_idx, test_idx in kf.split(X): + X_train, X_test = X[train_idx], X[test_idx] + y_train, y_test = y[train_idx], y[test_idx] + X_train, X_test = scale_features(X_train, X_test) + y_hat[test_idx] = train_and_predict_ridge(alpha, X_train, y_train, X_test) + r2 = metrics.r2_score(y, y_hat) + if r2 > best_r2: + best_alpha = alpha + best_r2 = r2 + if to_print: + print(best_alpha) + return best_alpha + + +def evaluate_fold(X, y, train_idx, test_idx, k_inner, alphas, to_print=False): + """ + Evaluates one fold of outer CV. 
+ """ + X_train, X_test = X[train_idx], X[test_idx] + y_train, y_test = y[train_idx], y[test_idx] + best_alpha = find_best_alpha(X_train, y_train, k_inner, alphas, to_print) + X_train, X_test = scale_features(X_train, X_test) + y_test_hat = train_and_predict_ridge(best_alpha, X_train, y_train, X_test) + r2 = metrics.r2_score(y_test, y_test_hat) + return r2, y_test_hat + + +def run_spatial_cv(X, y, groups, k_inner=5, random_seed=7, points=10, + alpha_low=1, alpha_high=5, to_print=False): + """ + Run randomized CV on given X and y + Returns r2, yhat + """ + np.random.seed(random_seed) + alphas = np.logspace(alpha_low, alpha_high, points) + k = int(groups.max() + 1) + r2s = [] + y_hat = np.zeros_like(y) + fold = 0 + for i in range(k): + train_idx = groups != i + test_idx = groups == i + if to_print: + print(f"fold: {fold}", end='\r') + r2, y_p = evaluate_fold(X, y, train_idx, test_idx, k_inner, alphas) + # could use this function to do inner-fold spatial validation + # r2, y_p = evaluate_spatial_fold(X, y, groups, train_idx, test_idx, alphas) + r2s.append(r2) + y_hat[test_idx] = y_p + fold += 1 + return np.mean(r2s), y_hat + +def evaluate_spatial_fold(X, y, groups, train_idx, test_idx, alphas): + X_train, X_test = X[train_idx], X[test_idx] + y_train, y_test = y[train_idx], y[test_idx] + groups_train = groups[train_idx] + best_alpha = find_best_alpha_spatial(X_train, y_train, groups_train, alphas) + X_train, X_test = scale_features(X_train, X_test) + y_test_hat = train_and_predict_ridge(best_alpha, X_train, y_train, X_test) + r2 = metrics.r2_score(y_test, y_test_hat) + return r2, y_test_hat + +def find_best_alpha_spatial(X, y, groups, alphas): + """ + Finds the best alpha in an inner spatial CV loop. 
+ """ + gs = np.unique(groups) + best_alpha = 0 + best_r2 = 0 + for alpha in alphas: + y_hat = np.zeros_like(y) + for g in gs: + # hold out each g in the inner spatial loop while choosing the best alpha + train_idx = groups != g + test_idx = groups == g + X_train, X_test = X[train_idx], X[test_idx] + y_train, y_test = y[train_idx], y[test_idx] + X_train, X_test = scale_features(X_train, X_test) + y_hat[test_idx] = train_and_predict_ridge(alpha, X_train, y_train, X_test) + r2 = metrics.r2_score(y, y_hat) + if r2 > best_r2: + best_alpha = alpha + best_r2 = r2 + return best_alpha + +def assign_groups(df, k, random_seed=7): + ''' Assign clusters in df (columns cluster_lat, cluster_lon) into k groups, also returns cluster centers''' + np.random.seed(random_seed) + km = KMeans(k) + return km.fit_predict(df[['cluster_lat', 'cluster_lon']]), km.cluster_centers_ diff --git a/predicting-poverty-education-replication/scripts/utils/utils.py b/predicting-poverty-education-replication/scripts/utils/utils.py new file mode 100644 index 0000000..22308e1 --- /dev/null +++ b/predicting-poverty-education-replication/scripts/utils/utils.py @@ -0,0 +1,32 @@ +''' +Handful of utility functions used throughout the repo +''' + +import math +import pandas as pd + +def merge_on_lat_lon(df1, df2, keys=['cluster_lat', 'cluster_lon'], how='inner'): + """ + Allows two dataframes to be merged on lat/lon + Necessary because pandas has trouble merging on floats (understandably so) + """ + df1 = df1.copy() + df2 = df2.copy() + + # must use ints for merging, as floats induce errors + df1['merge_lat'] = (10000 * df1[keys[0]]).astype(int) + df1['merge_lon'] = (10000 * df1[keys[1]]).astype(int) + + df2['merge_lat'] = (10000 * df2[keys[0]]).astype(int) + df2['merge_lon'] = (10000 * df2[keys[1]]).astype(int) + + df2.drop(keys, axis=1, inplace=True) + merged = pd.merge(df1, df2, on=['merge_lat', 'merge_lon'], how=how) + merged.drop(['merge_lat', 'merge_lon'], axis=1, inplace=True) + return merged + +def 
create_space(lat, lon, s=10): + """Creates a s km x s km square centered on (lat, lon)""" + v = (180/math.pi)*(500/6378137)*s # roughly 0.045 for s=10 + return lat - v, lon - v, lat + v, lon + v + \ No newline at end of file diff --git a/predicting-poverty-education-replication/utils/__init__.py b/predicting-poverty-education-replication/utils/__init__.py new file mode 100644 index 0000000..342d8b0 --- /dev/null +++ b/predicting-poverty-education-replication/utils/__init__.py @@ -0,0 +1,4 @@ +from .utils import * +from .google_downloader import * +from .planet_downloader import * +from .ridge_training import * diff --git a/predicting-poverty-education-replication/utils/__pycache__/__init__.cpython-37.pyc b/predicting-poverty-education-replication/utils/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..29c8079 Binary files /dev/null and b/predicting-poverty-education-replication/utils/__pycache__/__init__.cpython-37.pyc differ diff --git a/predicting-poverty-education-replication/utils/__pycache__/google_downloader.cpython-37.pyc b/predicting-poverty-education-replication/utils/__pycache__/google_downloader.cpython-37.pyc new file mode 100644 index 0000000..4990eea Binary files /dev/null and b/predicting-poverty-education-replication/utils/__pycache__/google_downloader.cpython-37.pyc differ diff --git a/predicting-poverty-education-replication/utils/__pycache__/planet_downloader.cpython-37.pyc b/predicting-poverty-education-replication/utils/__pycache__/planet_downloader.cpython-37.pyc new file mode 100644 index 0000000..897d6b6 Binary files /dev/null and b/predicting-poverty-education-replication/utils/__pycache__/planet_downloader.cpython-37.pyc differ diff --git a/predicting-poverty-education-replication/utils/__pycache__/ridge_training.cpython-37.pyc b/predicting-poverty-education-replication/utils/__pycache__/ridge_training.cpython-37.pyc new file mode 100644 index 0000000..447a99a Binary files /dev/null and 
b/predicting-poverty-education-replication/utils/__pycache__/ridge_training.cpython-37.pyc differ diff --git a/predicting-poverty-education-replication/utils/__pycache__/utils.cpython-37.pyc b/predicting-poverty-education-replication/utils/__pycache__/utils.cpython-37.pyc new file mode 100644 index 0000000..16a6c43 Binary files /dev/null and b/predicting-poverty-education-replication/utils/__pycache__/utils.cpython-37.pyc differ diff --git a/predicting-poverty-education-replication/utils/google_downloader.py b/predicting-poverty-education-replication/utils/google_downloader.py new file mode 100644 index 0000000..48b318a --- /dev/null +++ b/predicting-poverty-education-replication/utils/google_downloader.py @@ -0,0 +1,18 @@ +''' +Very simple download interface to download images from Google's Static Maps API +''' + +class GoogleDownloader: + def __init__(self, access_token): + self.access_token = access_token + self.url = 'https://maps.googleapis.com/maps/api/staticmap?center={},{}&zoom={}&size=400x400&maptype=satellite&key={}' + + def download(self, lat, long, zoom): + res = requests.get(self.url.format(lat, long, zoom, self.access_token)) + # server needs to make image available, takes a few seconds + if res.status_code == 403: + return 'RETRY' + assert res.status_code < 400, print(f'Error - failed to download {lat}, {long}, {zoom}') + image = plt.imread(BytesIO(res.content)) + return image + \ No newline at end of file diff --git a/predicting-poverty-education-replication/utils/planet_downloader.py b/predicting-poverty-education-replication/utils/planet_downloader.py new file mode 100644 index 0000000..909cc12 --- /dev/null +++ b/predicting-poverty-education-replication/utils/planet_downloader.py @@ -0,0 +1,172 @@ +''' +More complex interface than Google's to download images from Planet. Unlike Google, Planet allows us to query images at a certain time. 
However, Planet's API is not great and there are a variety of issues ranging from a little annoying to fairly serious. This interface simplifies the use from a user's perspective. +''' + +import math +import requests +import matplotlib.pyplot as plt +from requests.auth import HTTPBasicAuth +import os +import json +from io import BytesIO +from shapely.geometry import Polygon + + +class PlanetDownloader: + def __init__(self, api_key, item_type='PSScene3Band'): + self.api_key = api_key + self.item_type = item_type + + def create_cords(lat, lon, zoom): + xtile, ytile = deg_to_tile(lat, lon, zoom) + + coords = [tilexy_to_deg(xtile, ytile, zoom, a, b) for a,b in [(0,0), (0,255), (255,255), (255,0)]] + return [[b,a] for a,b in coords] + + def download_image(self, lat, lon, min_year, min_month, max_year, max_month, zoom=14, cloud_max=0.05): + ''' + Use this method to download an image at a lat, lon in some time range + If multiple images are available, the latest is downloaded + + I would not increase zoom + cloud_max is the maximum cloud filter, defaulting to 5% + ''' + assert 0 <= cloud_max <= 1.0 + if min_month < 10: + min_month = '0' + str(min_month) + + if max_month < 10: + max_month = '0' + str(max_month) + + cords = PlanetDownloader.create_cords(lat, lon, zoom) + geo_json_geometry = { + "type": "Polygon", + "coordinates": [ + cords + ], + + } + + # filter for items the overlap with our chosen geometry + geometry_filter = { + "type": "GeometryFilter", + "field_name": "geometry", + "config": geo_json_geometry, + } + + # filter images acquired in a certain date range + date_range_filter = { + "type": "DateRangeFilter", + "field_name": "acquired", + "config": { + "gte": "{}-{}-01T00:00:00.000Z".format(min_year, min_month), + "lte": "{}-{}-01T00:00:00.000Z".format(max_year, max_month) + } + } + + # filter any images which are more than 50% clouds + cloud_cover_filter = { + "type": "RangeFilter", + "field_name": "cloud_cover", + "config": { + "lte": cloud_max + } + } + + 
# create a filter that combines our geo and date filters + # could also use an "OrFilter" + reservoir = { + "type": "AndFilter", + "config": [geometry_filter, date_range_filter, cloud_cover_filter] + } + + # Search API request object + search_endpoint_request = { + "item_types": [self.item_type], + "filter": reservoir + } + + result = \ + requests.post( + 'https://api.planet.com/data/v1/quick-search', + auth=HTTPBasicAuth(self.api_key, ''), + json=search_endpoint_request) + + res = json.loads(result.text) + + x, y = deg_to_tile(lat, lon, zoom) + item_id = None + + if len(res['features']) == 0: + # print('No image found, try widening your search or using a different satellite') + return None + else: + # planet for some reason will return results that don't even contain the requested geometry -_- + # this will look for the LATEST (closest to the max time) match that actually contains our geometry + polya = Polygon(cords) + b_cords = [tilexy_to_deg(x,y,zoom,a,b) for a,b in [(0,0), (1,0), (1,1), (0,1)]] + polyb = Polygon([(b,a) for (a,b) in b_cords]) + + for idx in range(len(res['features']) - 1, -1, -1): + polyc = Polygon(res['features'][idx]['geometry']['coordinates'][0]) + + if polyc.contains(polya) and polyc.contains(polyb): + item_id = res['features'][idx]['id'] + break + + if item_id is None: + # print('No good images found') + return None + + url = 'https://tiles0.planet.com/data/v1/{}/{}/{}/{}/{}.png?api_key={}'.format(self.item_type, item_id, zoom, x, y, self.api_key) + + res = requests.get(url) + if res.status_code >= 400: + # print('download error') + return None + + return plt.imread(BytesIO(res.content)) + + +""" +Important geoconversion functions +""" + +def tilexy_to_deg(xtile, ytile, zoom, x, y): + """Converts a specific location on a tile (x,y) to geocoordinates.""" + decimal_x = xtile + x / 256 + decimal_y = ytile + y / 256 + n = 2.0 ** zoom + lon_deg = decimal_x / n * 360.0 - 180.0 + lat_rad = math.atan(math.sinh(math.pi * (1 - 2 * decimal_y / n))) + 
lat_deg = math.degrees(lat_rad) + return (lat_deg, lon_deg) + +def deg_to_tilexy(lat_deg, lon_deg, zoom): + """Converts geocoordinates to an x,y position on a tile.""" + lat_rad = math.radians(lat_deg) + n = 2.0 ** zoom + x = ((lon_deg + 180.0) / 360.0 * n) + y = ((1.0 - math.log(math.tan(lat_rad) + (1 / math.cos(lat_rad))) + / math.pi) / 2.0 * n) + return (int((x % 1) * 256), int((y % 1) * 256)) + +def tile_to_deg(xtile, ytile, zoom): + """Returns the coordinates of the northwest corner of a Slippy Map + x,y tile""" + n = 2.0 ** zoom + lon_deg = xtile / n * 360.0 - 180.0 + lat_rad = math.atan(math.sinh(math.pi * (1 - 2 * ytile / n))) + lat_deg = math.degrees(lat_rad) + return (lat_deg, lon_deg) + +def deg_to_tile(lat_deg, lon_deg, zoom): + """Converts coordinates into the nearest x,y Slippy Map tile""" + lat_rad = math.radians(lat_deg) + n = 2.0 ** zoom + xtile = int((lon_deg + 180.0) / 360.0 * n) + ytile = int((1.0 - math.log(math.tan(lat_rad) + (1 / math.cos(lat_rad))) + / math.pi) / 2.0 * n) + return (xtile, ytile) + + diff --git a/predicting-poverty-education-replication/utils/ridge_training.py b/predicting-poverty-education-replication/utils/ridge_training.py new file mode 100644 index 0000000..1818681 --- /dev/null +++ b/predicting-poverty-education-replication/utils/ridge_training.py @@ -0,0 +1,153 @@ +# This is based on code from the Jean et al Github that is modified to work with Python3 and our metrics + +import numpy as np +import pandas as pd +import random +from scipy import stats +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import KFold +import sklearn.linear_model as linear_model +import matplotlib.pyplot as plt +import sklearn.metrics as metrics +from sklearn.cluster import KMeans + +def run_randomized_cv(X, y, k=5, k_inner=5, random_seed=7, points=10, + alpha_low=1, alpha_high=5, to_print=False): + """ + Run randomized CV on given X and y + Returns r2, yhat + """ + np.random.seed(random_seed) + alphas = 
np.logspace(alpha_low, alpha_high, points) + r2s = [] + y_hat = np.zeros_like(y) + kf = KFold(n_splits=k, shuffle=True) + fold = 0 + for train_idx, test_idx in kf.split(X): + if to_print: + print(f"fold: {fold}", end='\r') + r2, y_p = evaluate_fold(X, y, train_idx, test_idx, k_inner, alphas, to_print) + r2s.append(r2) + y_hat[test_idx] = y_p + fold += 1 + return np.mean(r2s), y_hat + + +def scale_features(X_train, X_test): + """ + Scales features using StandardScaler. + """ + X_scaler = StandardScaler(with_mean=True, with_std=False) + X_train = X_scaler.fit_transform(X_train) + X_test = X_scaler.transform(X_test) + return X_train, X_test + + +def train_and_predict_ridge(alpha, X_train, y_train, X_test): + """ + Trains ridge model and predicts test set. + """ + ridge = linear_model.Ridge(alpha) + ridge.fit(X_train, y_train) + y_hat = ridge.predict(X_test) + return y_hat + +def find_best_alpha(X, y, k_inner, alphas, to_print=False): + """ + Finds the best alpha in an inner fully randomized CV loop. + """ + kf = KFold(n_splits=k_inner, shuffle=True) + best_alpha = 0 + best_r2 = 0 + for idx, alpha in enumerate(alphas): + y_hat = np.zeros_like(y) + for train_idx, test_idx in kf.split(X): + X_train, X_test = X[train_idx], X[test_idx] + y_train, y_test = y[train_idx], y[test_idx] + X_train, X_test = scale_features(X_train, X_test) + y_hat[test_idx] = train_and_predict_ridge(alpha, X_train, y_train, X_test) + r2 = metrics.r2_score(y, y_hat) + if r2 > best_r2: + best_alpha = alpha + best_r2 = r2 + if to_print: + print(best_alpha) + return best_alpha + + +def evaluate_fold(X, y, train_idx, test_idx, k_inner, alphas, to_print=False): + """ + Evaluates one fold of outer CV. 
+ """ + X_train, X_test = X[train_idx], X[test_idx] + y_train, y_test = y[train_idx], y[test_idx] + best_alpha = find_best_alpha(X_train, y_train, k_inner, alphas, to_print) + X_train, X_test = scale_features(X_train, X_test) + y_test_hat = train_and_predict_ridge(best_alpha, X_train, y_train, X_test) + r2 = metrics.r2_score(y_test, y_test_hat) + return r2, y_test_hat + + +def run_spatial_cv(X, y, groups, k_inner=5, random_seed=7, points=10, + alpha_low=1, alpha_high=5, to_print=False): + """ + Run randomized CV on given X and y + Returns r2, yhat + """ + np.random.seed(random_seed) + alphas = np.logspace(alpha_low, alpha_high, points) + k = int(groups.max() + 1) + r2s = [] + y_hat = np.zeros_like(y) + fold = 0 + for i in range(k): + train_idx = groups != i + test_idx = groups == i + if to_print: + print(f"fold: {fold}", end='\r') + r2, y_p = evaluate_fold(X, y, train_idx, test_idx, k_inner, alphas) + # could use this function to do inner-fold spatial validation + # r2, y_p = evaluate_spatial_fold(X, y, groups, train_idx, test_idx, alphas) + r2s.append(r2) + y_hat[test_idx] = y_p + fold += 1 + return np.mean(r2s), y_hat + +def evaluate_spatial_fold(X, y, groups, train_idx, test_idx, alphas): + X_train, X_test = X[train_idx], X[test_idx] + y_train, y_test = y[train_idx], y[test_idx] + groups_train = groups[train_idx] + best_alpha = find_best_alpha_spatial(X_train, y_train, groups_train, alphas) + X_train, X_test = scale_features(X_train, X_test) + y_test_hat = train_and_predict_ridge(best_alpha, X_train, y_train, X_test) + r2 = metrics.r2_score(y_test, y_test_hat) + return r2, y_test_hat + +def find_best_alpha_spatial(X, y, groups, alphas): + """ + Finds the best alpha in an inner spatial CV loop. 
+ """ + gs = np.unique(groups) + best_alpha = 0 + best_r2 = 0 + for alpha in alphas: + y_hat = np.zeros_like(y) + for g in gs: + # hold out each g in the inner spatial loop while choosing the best alpha + train_idx = groups != g + test_idx = groups == g + X_train, X_test = X[train_idx], X[test_idx] + y_train, y_test = y[train_idx], y[test_idx] + X_train, X_test = scale_features(X_train, X_test) + y_hat[test_idx] = train_and_predict_ridge(alpha, X_train, y_train, X_test) + r2 = metrics.r2_score(y, y_hat) + if r2 > best_r2: + best_alpha = alpha + best_r2 = r2 + return best_alpha + +def assign_groups(df, k, random_seed=7): + ''' Assign clusters in df (columns cluster_lat, cluster_lon) into k groups, also returns cluster centers''' + np.random.seed(random_seed) + km = KMeans(k) + return km.fit_predict(df[['cluster_lat', 'cluster_lon']]), km.cluster_centers_ diff --git a/predicting-poverty-education-replication/utils/utils.py b/predicting-poverty-education-replication/utils/utils.py new file mode 100644 index 0000000..22308e1 --- /dev/null +++ b/predicting-poverty-education-replication/utils/utils.py @@ -0,0 +1,32 @@ +''' +Handful of utility functions used throughout the repo +''' + +import math +import pandas as pd + +def merge_on_lat_lon(df1, df2, keys=['cluster_lat', 'cluster_lon'], how='inner'): + """ + Allows two dataframes to be merged on lat/lon + Necessary because pandas has trouble merging on floats (understandably so) + """ + df1 = df1.copy() + df2 = df2.copy() + + # must use ints for merging, as floats induce errors + df1['merge_lat'] = (10000 * df1[keys[0]]).astype(int) + df1['merge_lon'] = (10000 * df1[keys[1]]).astype(int) + + df2['merge_lat'] = (10000 * df2[keys[0]]).astype(int) + df2['merge_lon'] = (10000 * df2[keys[1]]).astype(int) + + df2.drop(keys, axis=1, inplace=True) + merged = pd.merge(df1, df2, on=['merge_lat', 'merge_lon'], how=how) + merged.drop(['merge_lat', 'merge_lon'], axis=1, inplace=True) + return merged + +def create_space(lat, lon, 
s=10): + """Creates a s km x s km square centered on (lat, lon)""" + v = (180/math.pi)*(500/6378137)*s # roughly 0.045 for s=10 + return lat - v, lon - v, lat + v, lon + v + \ No newline at end of file