From 9dd35ea68faa2b1098227d67cf80baa333ef7858 Mon Sep 17 00:00:00 2001 From: nathanielconroy Date: Wed, 11 Apr 2018 20:51:18 -0400 Subject: [PATCH 1/6] Modified building features with census script so that it takes command line arguments. Added readme. --- .../README.md | 30 ++++++++ .../building_features_with_census.py | 75 +++++++++++++++++++ 2 files changed, 105 insertions(+) create mode 100644 scripts/feature_engineering/extract_building_permit_features/README.md create mode 100644 scripts/feature_engineering/extract_building_permit_features/building_features_with_census.py diff --git a/scripts/feature_engineering/extract_building_permit_features/README.md b/scripts/feature_engineering/extract_building_permit_features/README.md new file mode 100644 index 0000000..f172cda --- /dev/null +++ b/scripts/feature_engineering/extract_building_permit_features/README.md @@ -0,0 +1,30 @@ +This is a python script that will generate a feature table as described below. + +When running the script, please provide the following arguments: +1. Path to the folder containing the source building permit data csv files. +2. Path to the shape file. +3. Path to the csv file where the output should be saved. + +Sample usage: + +python building_features_with_census.py C:\RatHackData\Data\Building-Permit-Data C:\RatHackData\ShapeFiles\tl_2016_11_tabblock10.shp output.csv + +The script will return a feature data table for the number of new building permits issued in the last 4 weeks. + +**Input:** +CSV files with data for each given year +A shape file + +**Output:** +A CSV file with the format given below: + +- 1 row for each building permit type and subtype, and each week, year, and census block +- The data set should include the following columns: + +`feature_id`: The ID for the feature, in this case, "building_permits_issued_last_4_weeks" +`feature_type`: Building permit type +`feature_subtype`: Building permit subtype +`year`: The ISO-8601 year of the feature value +`week`: The ISO-8601 week number of the feature value +`census_block_2010`: The 2010 Census Block of the feature value +`value`: The value of the feature, i.e. the number of new building permits of the specified types and subtypes issued in the given census block during the previous 4 weeks starting from the year and week above. \ No newline at end of file diff --git a/scripts/feature_engineering/extract_building_permit_features/building_features_with_census.py b/scripts/feature_engineering/extract_building_permit_features/building_features_with_census.py new file mode 100644 index 0000000..8b16767 --- /dev/null +++ b/scripts/feature_engineering/extract_building_permit_features/building_features_with_census.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import pandas as pd +import geopandas.tools +from shapely.geometry import Point +import sys +import glob + +# argv[1] = 'Building Permits/All_Building_permits.csv' +# argv[2] = shapefiles and geospatial information/dc_2010_block_shapefiles/tl_2016_11_tabblock10.shp +# argv[3] = + +source_data = sys.argv[1] +shape_file = sys.argv[2] +output_file_name = sys.argv[3] + +# read in data +fields = ['ISSUE_DATE', 'PERMIT_TYPE_NAME', 'PERMIT_SUBTYPE_NAME', 'LONGITUDE', 'LATITUDE'] +# df = pd.read_csv(sys.argv[1], usecols=fields) + +data_frames = [] +for file in glob.glob(sys.argv[1] + "/*.csv"): + df = pd.read_csv(file, index_col=None, header=0, usecols=fields) + data_frames.append(df) +df = pd.concat(data_frames) + +# geocode lat long to census block +df['geometry'] = df.apply(lambda row: Point(row['LONGITUDE'], row['LATITUDE']), axis=1) +df = geopandas.GeoDataFrame(df, geometry='geometry') +df.crs = {'init': 'epsg:4326'} + +census_blocks = geopandas.GeoDataFrame.from_file(shape_file) +census_blocks.crs = {'init': 'epsg:4326'} + +result = geopandas.tools.sjoin(df[['geometry']], census_blocks[['BLOCKCE10', 'geometry']], how='left') + +df['census_block_2010'] = result['BLOCKCE10'] +df = df[fields + ['census_block_2010']] + +# clean up, rename +del df['LONGITUDE'] +del df['LATITUDE'] +df = df.rename(columns={'PERMIT_TYPE_NAME': 'feature_type', 'PERMIT_SUBTYPE_NAME': 'feature_subtype'}) + +# adding value and feature_id field +df = df.groupby(['feature_type', 'feature_subtype', 'census_block_2010', 'ISSUE_DATE']).size() +df = df.reset_index() +df = df.rename(columns={0: 'value'}) + +# convert date column to date type +df['ISSUE_DATE'] = pd.to_datetime(df.ISSUE_DATE) +df.index = df['ISSUE_DATE'] + +# Resample to weekly and fill in blanks with zeros +WeeklyData = df.groupby(['feature_type', 'feature_subtype', 'census_block_2010']).resample('W-MON', closed='left', + on='ISSUE_DATE', + label='left').sum().reset_index() +WeeklyData.sort_values(by=['feature_type', 'feature_subtype', 'census_block_2010', 'ISSUE_DATE']) +WeeklyData = WeeklyData.fillna(0) + +# Sum over rolling 4 week periods from weekly data +Avg4Week = WeeklyData.groupby(by=['feature_type', 'feature_subtype', 'census_block_2010']).rolling(4, min_periods=1, + on='ISSUE_DATE').sum() + +# Add integer year and weeks +Avg4Week = Avg4Week[['feature_type', 'feature_subtype', 'census_block_2010', 'ISSUE_DATE', 'value']].reset_index( + drop=True) +Avg4Week['year'], Avg4Week['week'] = Avg4Week['ISSUE_DATE'].apply(lambda x: x.isocalendar()[0]), Avg4Week[ + 'ISSUE_DATE'].apply(lambda x: x.isocalendar()[1]) +Avg4Week['feature_id'] = 'building_permits_issued_last_4_weeks' +Avg4Week = Avg4Week[['feature_id', 'feature_type', 'feature_subtype', 'year', 'week', 'census_block_2010', 'value']] + +# Output File to CSV +Avg4Week.to_csv(output_file_name, index=False) From 3ad16e64d87c5ccdb76381c7e2950e93e7c07630 Mon Sep 17 00:00:00 2001 From: nathanielconroy Date: Wed, 11 Apr 2018 20:51:18 -0400 Subject: [PATCH 2/6] Modified building features with census script so that it takes command line arguments. Added readme. --- .../README.md | 30 ++++++++ .../building_features_with_census.py | 75 +++++++++++++++++++ 2 files changed, 105 insertions(+) create mode 100644 scripts/feature_engineering/extract_building_permit_features/README.md create mode 100644 scripts/feature_engineering/extract_building_permit_features/building_features_with_census.py diff --git a/scripts/feature_engineering/extract_building_permit_features/README.md b/scripts/feature_engineering/extract_building_permit_features/README.md new file mode 100644 index 0000000..f172cda --- /dev/null +++ b/scripts/feature_engineering/extract_building_permit_features/README.md @@ -0,0 +1,30 @@ +This is a python script that will generate a feature table as described below. + +When running the script, please provide the following arguments: +1. Path to the folder containing the source building permit data csv files. +2. Path to the shape file. +3. Path to the csv file where the output should be saved. + +Sample usage: + +python building_features_with_census.py C:\RatHackData\Data\Building-Permit-Data C:\RatHackData\ShapeFiles\tl_2016_11_tabblock10.shp output.csv + +The script will return a feature data table for the number of new building permits issued in the last 4 weeks. + +**Input:** +CSV files with data for each given year +A shape file + +**Output:** +A CSV file with the format given below: + +- 1 row for each building permit type and subtype, and each week, year, and census block +- The data set should include the following columns: + +`feature_id`: The ID for the feature, in this case, "building_permits_issued_last_4_weeks" +`feature_type`: Building permit type +`feature_subtype`: Building permit subtype +`year`: The ISO-8601 year of the feature value +`week`: The ISO-8601 week number of the feature value +`census_block_2010`: The 2010 Census Block of the feature value +`value`: The value of the feature, i.e. the number of new building permits of the specified types and subtypes issued in the given census block during the previous 4 weeks starting from the year and week above. \ No newline at end of file diff --git a/scripts/feature_engineering/extract_building_permit_features/building_features_with_census.py b/scripts/feature_engineering/extract_building_permit_features/building_features_with_census.py new file mode 100644 index 0000000..8b16767 --- /dev/null +++ b/scripts/feature_engineering/extract_building_permit_features/building_features_with_census.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import pandas as pd +import geopandas.tools +from shapely.geometry import Point +import sys +import glob + +# argv[1] = 'Building Permits/All_Building_permits.csv' +# argv[2] = shapefiles and geospatial information/dc_2010_block_shapefiles/tl_2016_11_tabblock10.shp +# argv[3] = + +source_data = sys.argv[1] +shape_file = sys.argv[2] +output_file_name = sys.argv[3] + +# read in data +fields = ['ISSUE_DATE', 'PERMIT_TYPE_NAME', 'PERMIT_SUBTYPE_NAME', 'LONGITUDE', 'LATITUDE'] +# df = pd.read_csv(sys.argv[1], usecols=fields) + +data_frames = [] +for file in glob.glob(sys.argv[1] + "/*.csv"): + df = pd.read_csv(file, index_col=None, header=0, usecols=fields) + data_frames.append(df) +df = pd.concat(data_frames) + +# geocode lat long to census block +df['geometry'] = df.apply(lambda row: Point(row['LONGITUDE'], row['LATITUDE']), axis=1) +df = geopandas.GeoDataFrame(df, geometry='geometry') +df.crs = {'init': 'epsg:4326'} + +census_blocks = geopandas.GeoDataFrame.from_file(shape_file) +census_blocks.crs = {'init': 'epsg:4326'} + +result = geopandas.tools.sjoin(df[['geometry']], census_blocks[['BLOCKCE10', 'geometry']], how='left') + +df['census_block_2010'] = result['BLOCKCE10'] +df = df[fields + ['census_block_2010']] + +# clean up, rename +del df['LONGITUDE'] +del df['LATITUDE'] +df = df.rename(columns={'PERMIT_TYPE_NAME': 'feature_type', 'PERMIT_SUBTYPE_NAME': 'feature_subtype'}) + +# adding value and feature_id field +df = df.groupby(['feature_type', 'feature_subtype', 'census_block_2010', 'ISSUE_DATE']).size() +df = df.reset_index() +df = df.rename(columns={0: 'value'}) + +# convert date column to date type +df['ISSUE_DATE'] = pd.to_datetime(df.ISSUE_DATE) +df.index = df['ISSUE_DATE'] + +# Resample to weekly and fill in blanks with zeros +WeeklyData = df.groupby(['feature_type', 'feature_subtype', 'census_block_2010']).resample('W-MON', closed='left', + on='ISSUE_DATE', + label='left').sum().reset_index() +WeeklyData.sort_values(by=['feature_type', 'feature_subtype', 'census_block_2010', 'ISSUE_DATE']) +WeeklyData = WeeklyData.fillna(0) + +# Sum over rolling 4 week periods from weekly data +Avg4Week = WeeklyData.groupby(by=['feature_type', 'feature_subtype', 'census_block_2010']).rolling(4, min_periods=1, + on='ISSUE_DATE').sum() + +# Add integer year and weeks +Avg4Week = Avg4Week[['feature_type', 'feature_subtype', 'census_block_2010', 'ISSUE_DATE', 'value']].reset_index( + drop=True) +Avg4Week['year'], Avg4Week['week'] = Avg4Week['ISSUE_DATE'].apply(lambda x: x.isocalendar()[0]), Avg4Week[ + 'ISSUE_DATE'].apply(lambda x: x.isocalendar()[1]) +Avg4Week['feature_id'] = 'building_permits_issued_last_4_weeks' +Avg4Week = Avg4Week[['feature_id', 'feature_type', 'feature_subtype', 'year', 'week', 'census_block_2010', 'value']] + +# Output File to CSV +Avg4Week.to_csv(output_file_name, index=False) From 7da9cef98d934d4c2f9ec2d884055c1fe0dfe4ac Mon Sep 17 00:00:00 2001 From: nathanielconroy Date: Wed, 11 Apr 2018 20:59:22 -0400 Subject: [PATCH 3/6] fixed markup file --- .../extract_building_permit_features/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/feature_engineering/extract_building_permit_features/README.md b/scripts/feature_engineering/extract_building_permit_features/README.md index f172cda..b262a0b 100644 --- a/scripts/feature_engineering/extract_building_permit_features/README.md +++ b/scripts/feature_engineering/extract_building_permit_features/README.md @@ -27,4 +27,4 @@ A CSV file with the format given below: `year`: The ISO-8601 year of the feature value `week`: The ISO-8601 week number of the feature value `census_block_2010`: The 2010 Census Block of the feature value -`value`: The value of the feature, i.e. the number of new building permits of the specified types and subtypes issued in the given census block during the previous 4 weeks starting from the year and week above. \ No newline at end of file +`value`: The value of the feature, i.e. the number of new building permits of the specified types and subtypes issued in the given census block during the previous 4 weeks starting from the year and week above. \ No newline at end of file From 927dd0d07fe119f93929c79405d453b6636883f6 Mon Sep 17 00:00:00 2001 From: nathanielconroy Date: Wed, 11 Apr 2018 21:03:22 -0400 Subject: [PATCH 4/6] fixed readme --- .../README.md | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/scripts/feature_engineering/extract_building_permit_features/README.md b/scripts/feature_engineering/extract_building_permit_features/README.md index b262a0b..a077375 100644 --- a/scripts/feature_engineering/extract_building_permit_features/README.md +++ b/scripts/feature_engineering/extract_building_permit_features/README.md @@ -11,20 +11,20 @@ python building_features_with_census.py C:\RatHackData\Data\Building-Permit-Data The script will return a feature data table for the number of new building permits issued in the last 4 weeks. -**Input:** -CSV files with data for each given year -A shape file - -**Output:** -A CSV file with the format given below: - -- 1 row for each building permit type and subtype, and each week, year, and census block -- The data set should include the following columns: - -`feature_id`: The ID for the feature, in this case, "building_permits_issued_last_4_weeks" -`feature_type`: Building permit type -`feature_subtype`: Building permit subtype -`year`: The ISO-8601 year of the feature value -`week`: The ISO-8601 week number of the feature value -`census_block_2010`: The 2010 Census Block of the feature value +**Input:** +CSV files with data for each given year +A shape file + +**Output:** +A CSV file with the format given below: + +- 1 row for each building permit type and subtype, and each week, year, and census block +- The data set should include the following columns: + +`feature_id`: The ID for the feature, in this case, "building_permits_issued_last_4_weeks" +`feature_type`: Building permit type +`feature_subtype`: Building permit subtype +`year`: The ISO-8601 year of the feature value +`week`: The ISO-8601 week number of the feature value +`census_block_2010`: The 2010 Census Block of the feature value `value`: The value of the feature, i.e. the number of new building permits of the specified types and subtypes issued in the given census block during the previous 4 weeks starting from the year and week above. \ No newline at end of file From 5c92e5ce23153f2477c728af0b34227066ed3eb1 Mon Sep 17 00:00:00 2001 From: nathanielconroy Date: Wed, 11 Apr 2018 21:05:40 -0400 Subject: [PATCH 5/6] removed extraneous comments --- .../building_features_with_census.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/scripts/feature_engineering/extract_building_permit_features/building_features_with_census.py b/scripts/feature_engineering/extract_building_permit_features/building_features_with_census.py index 8b16767..9e2e394 100644 --- a/scripts/feature_engineering/extract_building_permit_features/building_features_with_census.py +++ b/scripts/feature_engineering/extract_building_permit_features/building_features_with_census.py @@ -7,10 +7,6 @@ import sys import glob -# argv[1] = 'Building Permits/All_Building_permits.csv' -# argv[2] = shapefiles and geospatial information/dc_2010_block_shapefiles/tl_2016_11_tabblock10.shp -# argv[3] = - source_data = sys.argv[1] shape_file = sys.argv[2] output_file_name = sys.argv[3] From 0f550a8a630e1bd21b8b4309a7e125787119ce58 Mon Sep 17 00:00:00 2001 From: nathanielconroy Date: Wed, 11 Apr 2018 21:12:32 -0400 Subject: [PATCH 6/6] added link to original issue --- .../extract_building_permit_features/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/feature_engineering/extract_building_permit_features/README.md b/scripts/feature_engineering/extract_building_permit_features/README.md index a077375..bb49667 100644 --- a/scripts/feature_engineering/extract_building_permit_features/README.md +++ b/scripts/feature_engineering/extract_building_permit_features/README.md @@ -1,5 +1,7 @@ This is a python script that will generate a feature table as described below. +Adapted from [issue 3](https://github.com/jasonasher/dc_doh_hackathon/tree/master/issue_3) in the dc_doh_hackathon. + When running the script, please provide the following arguments: 1. Path to the folder containing the source building permit data csv files. 2. Path to the shape file.