Skip to content

Latest commit

 

History

History
287 lines (261 loc) · 13.3 KB

part_06_cleaning_up_messy_data.md

File metadata and controls

287 lines (261 loc) · 13.3 KB

CB-06: Cleaning Up Messy Data

As in part 2 of the cookbook, we are going to use the 311 service requests data from NYC Open Data. The data should already be downloaded and ready to load:

(def complaints
  (g/read-csv! spark complaints-data-path {:kebab-columns true}))

6.1 Messy Zip Codes

When scanning through the schema, notice that the zip codes are parsed as strings:

(-> complaints g/dtypes :incident-zip)
=> "StringType"

There are two issues, namely dash-separated zip codes and comments such as "NO CLUE":

(-> complaints
    (g/select :incident-zip)
    g/distinct
    (g/collect-col :incident-zip)
    sort)
=> (nil "00000" "000000" "00083" "02061" "06901" "07020" "07087" "07093" "07109" "07114"
    "07201" "07208" "07306" "07604" "08807" "10000" "10001" "10002" "10003" "10004"
    "10005" "10006" "10007" "10009" "10010" "10011" "10012" "10013" "10014" "10016"
    "10017" "10018" "10019" "10020" "10021" "10022" "10023" "10024" "10025" "10026"
    "10027" "10028" "10029" "10030" "10031" "10032" "10033" "10034" "10035" "10036"
    "10037" "10038" "10039" "10040" "10044" "10048" "10065" "10069" "10075" "10103"
    "10107" "10112" "10119" "10128" "10129" "10153" "10162" "10280" "10281" "10282"
    "10301" "10302" "10303" "10304" "10305" "10306" "10307" "10308" "10309" "10310"
    "10312" "10314" "10451" "10452" "10453" "10454" "10455" "10456" "10457" "10458" 
    "10459" "10460" "10461" "10462" "10463" "10464" "10465" "10466" "10467" "10468"
    "10469" "10470" "10471" "10472" "10473" "10474" "10475" "10573" "10803" "10954"
    "11001" "11003" "11004" "11005" "11040" "11042" "11101" "11102" "11103" "11104"
    "11105" "11106" "11109" "11111" "11201" "11203" "11204" "11205" "11206" "11207"
    "11208" "11209" "11210" "11211" "11212" "11213" "11214" "11215" "11216" "11217"
    "11218" "11219" "11220" "11221" "11222" "11223" "11224" "11225" "11226" "11228"
    "11229" "11230" "11231" "11232" "11233" "11234" "11235" "11236" "11237" "11238"
    "11239" "11249" "11354" "11355" "11356" "11357" "11358" "11360" "11361" "11362"
    "11363" "11364" "11365" "11366" "11367" "11368" "11369" "11370" "11372" "11373"
    "11374" "11375" "11377" "11378" "11379" "11385" "11411" "11412" "11413" "11414"
    "11415" "11416" "11417" "11418" "11419" "11420" "11421" "11422" "11423" "11426"
    "11427" "11428" "11429" "11430" "11432" "11433" "11434" "11435" "11436" "11501"
    "11518" "11520" "11530" "11549-3650" "11559" "11563" "11575" "11577" "11580"
    "11590" "11691" "11692" "11693" "11694" "11697" "11716" "11722" "11735" "11747"
    "11776" "11788" "11797" "13221" "14225" "19711" "23502" "23541" "29616-0759" 
    "35209-3114" "41042" "55164-0737" "61702" "70711" "77056" "77092-2016" "90010"
    "92123" "N/A" "NA" "NO CLUE")

6.2 Fixing NaN Values Confusion

We flag up the actual values that should be converted to nulls, and we use g/when with the then and else clauses. Unlike Clojure's if and when, Geni's when can behave like either special forms depending on the number of arguments. Note that nil is interpreted as a null literal:

(def faulty-zips ["NO CLUE" "N/A" "NA" "00000" "000000"])

(def fixed-faulty-zips
  (-> complaints
      (g/with-column
        :incident-zip
        (g/when (g/isin :incident-zip faulty-zips) nil :incident-zip))))

(-> fixed-faulty-zips
    (g/select :incident-zip)
    g/distinct
    (g/collect-col :incident-zip)
    sort)
=> (nil "00083" "02061" "06901" "07020" "07087" "07093" "07109" "07114" "07201" "07208"
    "07306" "07604" "08807" "10000" "10001" "10002" "10003" "10004" "10005" "10006"
    "10007" "10009" "10010" "10011" "10012" "10013" "10014" "10016" "10017" "10018"
    "10019" "10020" "10021" "10022" "10023" "10024" "10025" "10026" "10027" "10028" 
    "10029" "10030" "10031" "10032" "10033" "10034" "10035" "10036" "10037" "10038"
    "10039" "10040" "10044" "10048" "10065" "10069" "10075" "10103" "10107" "10112"
    "10119" "10128" "10129" "10153" "10162" "10280" "10281" "10282" "10301" "10302"
    "10303" "10304" "10305" "10306" "10307" "10308" "10309" "10310" "10312" "10314" 
    "10451" "10452" "10453" "10454" "10455" "10456" "10457" "10458" "10459" "10460"
    "10461" "10462" "10463" "10464" "10465" "10466" "10467" "10468" "10469" "10470"
    "10471" "10472" "10473" "10474" "10475" "10573" "10803" "10954" "11001" "11003"
    "11004" "11005" "11040" "11042" "11101" "11102" "11103" "11104" "11105" "11106"
    "11109" "11111" "11201" "11203" "11204" "11205" "11206" "11207" "11208" "11209"
    "11210" "11211" "11212" "11213" "11214" "11215" "11216" "11217" "11218" "11219"
    "11220" "11221" "11222" "11223" "11224" "11225" "11226" "11228" "11229" "11230"
    "11231" "11232" "11233" "11234" "11235" "11236" "11237" "11238" "11239" "11249"
    "11354" "11355" "11356" "11357" "11358" "11360" "11361" "11362" "11363" "11364"
    "11365" "11366" "11367" "11368" "11369" "11370" "11372" "11373" "11374" "11375"
    "11377" "11378" "11379" "11385" "11411" "11412" "11413" "11414" "11415" "11416"
    "11417" "11418" "11419" "11420" "11421" "11422" "11423" "11426" "11427" "11428"
    "11429" "11430" "11432" "11433" "11434" "11435" "11436" "11501" "11518" "11520"
    "11530" "11549-3650" "11559" "11563" "11575" "11577" "11580" "11590" "11691" 
    "11692" "11693" "11694" "11697" "11716" "11722" "11735" "11747" "11776" "11788"
    "11797" "13221" "14225" "19711" "23502" "23541" "29616-0759" "35209-3114" "41042" 
    "55164-0737" "61702" "70711" "77056" "77092-2016" "90010" "92123")

6.3 What's Up With The Dashes?

We expect the zip codes to be only five digits. We can check non-five-digit zips as follows:

(-> fixed-faulty-zips
    (g/filter (g/=!= (g/length :incident-zip) 5))
    (g/select :incident-zip)
    g/distinct
    g/show)
; +------------+
; |incident-zip|
; +------------+
; |11549-3650  |
; |29616-0759  |
; |35209-3114  |
; |77092-2016  |
; |55164-0737  |
; +------------+

We can handle these zips by simply truncating the value to the first five characters:

(def fixed-dashed-zips
  (-> fixed-faulty-zips
      (g/with-column :incident-zip (g/substring :incident-zip 0 5))))

(-> fixed-dashed-zips
    (g/select :incident-zip)
    g/distinct
    (g/collect-col :incident-zip)
    sort)
=> (nil "00083" "02061" "06901" "07020" "07087" "07093" "07109" "07114" "07201" "07208"
    "07306" "07604" "08807" "10000" "10001" "10002" "10003" "10004" "10005" "10006"
    "10007" "10009" "10010" "10011" "10012" "10013" "10014" "10016" "10017" "10018"
    "10019" "10020" "10021" "10022" "10023" "10024" "10025" "10026" "10027" "10028"
    "10029" "10030" "10031" "10032" "10033" "10034" "10035" "10036" "10037" "10038"
    "10039" "10040" "10044" "10048" "10065" "10069" "10075" "10103" "10107" "10112"
    "10119" "10128" "10129" "10153" "10162" "10280" "10281" "10282" "10301" "10302"
    "10303" "10304" "10305" "10306" "10307" "10308" "10309" "10310" "10312" "10314"
    "10451" "10452" "10453" "10454" "10455" "10456" "10457" "10458" "10459" "10460"
    "10461" "10462" "10463" "10464" "10465" "10466" "10467" "10468" "10469" "10470"
    "10471" "10472" "10473" "10474" "10475" "10573" "10803" "10954" "11001" "11003"
    "11004" "11005" "11040" "11042" "11101" "11102" "11103" "11104" "11105" "11106"
    "11109" "11111" "11201" "11203" "11204" "11205" "11206" "11207" "11208" "11209"
    "11210" "11211" "11212" "11213" "11214" "11215" "11216" "11217" "11218" "11219"
    "11220" "11221" "11222" "11223" "11224" "11225" "11226" "11228" "11229" "11230"
    "11231" "11232" "11233" "11234" "11235" "11236" "11237" "11238" "11239" "11249"
    "11354" "11355" "11356" "11357" "11358" "11360" "11361" "11362" "11363" "11364"
    "11365" "11366" "11367" "11368" "11369" "11370" "11372" "11373" "11374" "11375"
    "11377" "11378" "11379" "11385" "11411" "11412" "11413" "11414" "11415" "11416"
    "11417" "11418" "11419" "11420" "11421" "11422" "11423" "11426" "11427" "11428"
    "11429" "11430" "11432" "11433" "11434" "11435" "11436" "11501" "11518" "11520"
    "11530" "11549" "11559" "11563" "11575" "11577" "11580" "11590" "11691" "11692"
    "11693" "11694" "11697" "11716" "11722" "11735" "11747" "11776" "11788" "11797"
    "13221" "14225" "19711" "23502" "23541" "29616" "35209" "41042" "55164" "61702"
    "70711" "77056" "77092" "90010" "92123")

As a sanity check, we look up non-null zip codes that do not start with zeroes or ones to see if they look okay:

(def close?
  (g/||
    (g/starts-with :incident-zip "0")
    (g/starts-with :incident-zip "1")))

(-> fixed-dashed-zips
    (g/filter (g/&& (g/not close?) (g/not-null? :incident-zip)))
    (g/select :incident-zip :descriptor :city)
    g/distinct
    (g/order-by :incident-zip)
    g/show)
; +------------+-----------------+-----------+
; |incident-zip|descriptor       |city       |
; +------------+-----------------+-----------+
; |23502       |Harassment       |NORFOLK    |
; |23541       |Harassment       |NORFOLK    |
; |29616       |Debt Not Owed    |GREENVILLE |
; |35209       |Harassment       |BIRMINGHAM |
; |41042       |Harassment       |FLORENCE   |
; |55164       |Harassment       |ST. PAUL   |
; |61702       |Billing Dispute  |BLOOMIGTON |
; |70711       |Contract Dispute |CLIFTON    |
; |77056       |Debt Not Owed    |HOUSTON    |
; |77092       |False Advertising|HOUSTON    |
; |90010       |Billing Dispute  |LOS ANGELES|
; |92123       |Billing Dispute  |SAN DIEGO  |
; |92123       |Harassment       |SAN DIEGO  |
; +------------+-----------------+-----------+

Finally, check the number of complaints per city is reasonable:

(-> fixed-dashed-zips
    (g/select (g/upper :city))
    g/value-counts
    (g/show {:num-rows 40}))
; +-------------------+-----+
; |upper(city)        |count|
; +-------------------+-----+
; |BROOKLYN           |31662|
; |NEW YORK           |22664|
; |BRONX              |18438|
; |null               |12208|
; |STATEN ISLAND      |4766 |
; |JAMAICA            |2246 |
; |FLUSHING           |1803 |
; |ASTORIA            |1568 |
; |RIDGEWOOD          |1073 |
; |CORONA             |707  |
; |OZONE PARK         |693  |
; |LONG ISLAND CITY   |678  |
; |FAR ROCKAWAY       |652  |
; |ELMHURST           |647  |
; |WOODSIDE           |609  |
; |EAST ELMHURST      |562  |
; |QUEENS VILLAGE     |549  |
; |FOREST HILLS       |541  |
; |JACKSON HEIGHTS    |541  |
; |SOUTH RICHMOND HILL|521  |
; |MASPETH            |473  |
; |WOODHAVEN          |464  |
; |FRESH MEADOWS      |435  |
; |SPRINGFIELD GARDENS|434  |
; |BAYSIDE            |411  |
; |SOUTH OZONE PARK   |410  |
; |RICHMOND HILL      |404  |
; |REGO PARK          |402  |
; |MIDDLE VILLAGE     |396  |
; |SAINT ALBANS       |387  |
; |WHITESTONE         |348  |
; |SUNNYSIDE          |337  |
; |COLLEGE POINT      |315  |
; |HOWARD BEACH       |308  |
; |LITTLE NECK        |293  |
; |HOLLIS             |289  |
; |OAKLAND GARDENS    |277  |
; |ROCKAWAY PARK      |276  |
; |KEW GARDENS        |238  |
; |CAMBRIA HEIGHTS    |234  |
; +-------------------+-----+
; only showing top 40 rows

6.4 All In One Form

For conciseness, we can simply do everything in one form:

(-> complaints
    (g/with-column
      :incident-zip
      (g/when (g/isin :incident-zip faulty-zips)
        nil
        (g/substring :incident-zip 0 5)))
    (g/select :incident-zip)
    g/distinct
    (g/collect-col :incident-zip)
    sort)
(nil "00083" "02061" "06901" "07020" "07087" "07093" "07109" "07114" "07201"
 "07208" "07306" "07604" "08807" "10000" "10001" "10002" "10003" "10004" "10005"
 "10006" "10007" "10009" "10010" "10011" "10012" "10013" "10014" "10016" "10017"
 "10018" "10019" "10020" "10021" "10022" "10023" "10024" "10025" "10026" "10027"
 "10028" "10029" "10030" "10031" "10032" "10033" "10034" "10035" "10036" "10037"
 "10038" "10039" "10040" "10044" "10048" "10065" "10069" "10075" "10103" "10107"
 "10112" "10119" "10128" "10129" "10153" "10162" "10280" "10281" "10282" "10301"
 "10302" "10303" "10304" "10305" "10306" "10307" "10308" "10309" "10310" "10312"
 "10314" "10451" "10452" "10453" "10454" "10455" "10456" "10457" "10458" "10459"
 "10460" "10461" "10462" "10463" "10464" "10465" "10466" "10467" "10468" "10469"
 "10470" "10471" "10472" "10473" "10474" "10475" "10573" "10803" "10954" "11001"
 "11003" "11004" "11005" "11040" "11042" "11101" "11102" "11103" "11104" "11105"
 "11106" "11109" "11111" "11201" "11203" "11204" "11205" "11206" "11207" "11208"
 "11209" "11210" "11211" "11212" "11213" "11214" "11215" "11216" "11217" "11218"
 "11219" "11220" "11221" "11222" "11223" "11224" "11225" "11226" "11228" "11229"
 "11230" "11231" "11232" "11233" "11234" "11235" "11236" "11237" "11238" "11239"
 "11249" "11354" "11355" "11356" "11357" "11358" "11360" "11361" "11362" "11363"
 "11364" "11365" "11366" "11367" "11368" "11369" "11370" "11372" "11373" "11374"
 "11375" "11377" "11378" "11379" "11385" "11411" "11412" "11413" "11414" "11415"
 "11416" "11417" "11418" "11419" "11420" "11421" "11422" "11423" "11426" "11427"
 "11428" "11429" "11430" "11432" "11433" "11434" "11435" "11436" "11501" "11518"
 "11520" "11530" "11549" "11559" "11563" "11575" "11577" "11580" "11590" "11691"
 "11692" "11693" "11694" "11697" "11716" "11722" "11735" "11747" "11776" "11788"
 "11797" "13221" "14225" "19711" "23502" "23541" "29616" "35209" "41042" "55164"
 "61702" "70711" "77056" "77092" "90010" "92123")