civictechdc · nathanielconroy · Apr 12, 2018 · Apr 12, 2018 · Apr 12, 2018 · Apr 12, 2018
diff --git a/scripts/feature_engineering/extract_building_permit_features/README.md b/scripts/feature_engineering/extract_building_permit_features/README.md
@@ -0,0 +1,32 @@
+This is a Python script that generates the building permit feature table described below.
+
+Adapted from [issue 3](https://github.com/jasonasher/dc_doh_hackathon/tree/master/issue_3) in the dc_doh_hackathon.
+
+When running the script, please provide the following arguments:
+1. Path to the folder containing the source building permit data csv files.
+2. Path to the shape file.
+3. Path to the csv file where the output should be saved.
+
+Sample usage:
+
+`python building_features_with_census.py C:\RatHackData\Data\Building-Permit-Data C:\RatHackData\ShapeFiles\tl_2016_11_tabblock10.shp output.csv`
+
+The script writes a feature data table, as a CSV file, giving the number of new building permits issued in the last 4 weeks.
+
+**Input:**  
+CSV files with data for each given year  
+A shape file  
+
+**Output:**  
+A CSV file with the format given below:  
+
+- 1 row for each building permit type and subtype, and each week, year, and census block  
+- The data set should include the following columns:  
+
+`feature_id`: The ID for the feature, in this case, "building_permits_issued_last_4_weeks"  
+`feature_type`: Building permit type  
+`feature_subtype`: Building permit subtype  
+`year`: The ISO-8601 year of the feature value  
+`week`: The ISO-8601 week number of the feature value  
+`census_block_2010`: The 2010 Census Block of the feature value  
+`value`: The value of the feature, i.e. the number of new building permits of the specified type and subtype issued in the given census block during the 4-week window made up of the year and week above plus the three weeks before it.  
diff --git a/...pts/feature_engineering/extract_building_permit_features/building_features_with_census.py b/...pts/feature_engineering/extract_building_permit_features/building_features_with_census.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import pandas as pd
+import geopandas.tools
+from shapely.geometry import Point
+import sys
+import glob
+
+source_data = sys.argv[1]
+shape_file = sys.argv[2]
+output_file_name = sys.argv[3]
+
+# read in data
+fields = ['ISSUE_DATE', 'PERMIT_TYPE_NAME', 'PERMIT_SUBTYPE_NAME', 'LONGITUDE', 'LATITUDE']
+# df = pd.read_csv(sys.argv[1], usecols=fields)
+
+data_frames = []
+for file in glob.glob(sys.argv[1] + "/*.csv"):
+    df = pd.read_csv(file, index_col=None, header=0, usecols=fields)
+    data_frames.append(df)
+df = pd.concat(data_frames)
+
+# geocode lat long to census block
+df['geometry'] = df.apply(lambda row: Point(row['LONGITUDE'], row['LATITUDE']), axis=1)
+df = geopandas.GeoDataFrame(df, geometry='geometry')
+df.crs = {'init': 'epsg:4326'}
+
+census_blocks = geopandas.GeoDataFrame.from_file(shape_file)
+census_blocks.crs = {'init': 'epsg:4326'}
+
+result = geopandas.tools.sjoin(df[['geometry']], census_blocks[['BLOCKCE10', 'geometry']], how='left')
+
+df['census_block_2010'] = result['BLOCKCE10']
+df = df[fields + ['census_block_2010']]
+
+# clean up, rename
+del df['LONGITUDE']
+del df['LATITUDE']
+df = df.rename(columns={'PERMIT_TYPE_NAME': 'feature_type', 'PERMIT_SUBTYPE_NAME': 'feature_subtype'})
+
+# adding value and feature_id field
+df = df.groupby(['feature_type', 'feature_subtype', 'census_block_2010', 'ISSUE_DATE']).size()
+df = df.reset_index()
+df = df.rename(columns={0: 'value'})
+
+# convert date column to date type
+df['ISSUE_DATE'] = pd.to_datetime(df.ISSUE_DATE)
+df.index = df['ISSUE_DATE']
+
+# Resample to weekly and fill in blanks with zeros
+WeeklyData = df.groupby(['feature_type', 'feature_subtype', 'census_block_2010']).resample('W-MON', closed='left',
+                                                                                           on='ISSUE_DATE',
+                                                                                           label='left').sum().reset_index()
+WeeklyData.sort_values(by=['feature_type', 'feature_subtype', 'census_block_2010', 'ISSUE_DATE'])
+WeeklyData = WeeklyData.fillna(0)
+
+# Sum over rolling 4 week periods from weekly data
+Avg4Week = WeeklyData.groupby(by=['feature_type', 'feature_subtype', 'census_block_2010']).rolling(4, min_periods=1,
+                                                                                                   on='ISSUE_DATE').sum()
+
+# Add integer year and weeks
+Avg4Week = Avg4Week[['feature_type', 'feature_subtype', 'census_block_2010', 'ISSUE_DATE', 'value']].reset_index(
+    drop=True)
+Avg4Week['year'], Avg4Week['week'] = Avg4Week['ISSUE_DATE'].apply(lambda x: x.isocalendar()[0]), Avg4Week[
+    'ISSUE_DATE'].apply(lambda x: x.isocalendar()[1])
+Avg4Week['feature_id'] = 'building_permits_issued_last_4_weeks'
+Avg4Week = Avg4Week[['feature_id', 'feature_type', 'feature_subtype', 'year', 'week', 'census_block_2010', 'value']]
+
+# Output File to CSV
+Avg4Week.to_csv(output_file_name, index=False)