Skip to content

Commit

Permalink
fix formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
trevorspreadbury committed May 22, 2024
1 parent bad8d30 commit 4e4336e
Showing 1 changed file with 8 additions and 9 deletions.
17 changes: 8 additions & 9 deletions src/utils/classify_infogroup_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def sic_matcher(if_sic_code: float, relevant_sic_code_df: pd.DataFrame) -> str:
def get_classification(row: pd.Series) -> str:
"""Gets the final classification of the company
Returns the final classification of the company based on the Primary SIC Code and the SIC
Returns the final classification of the company based on the Primary SIC Code and the SIC
Code Column.
Theoretically these should be the same (both fossil fuel, both clean energy, or neither)
Function should be used with .apply() on a row of the DataFrame.
Expand All @@ -69,7 +69,7 @@ def get_classification(row: pd.Series) -> str:
row: a row of the InfoGroup DataFrame
Returns:
the str classification of the company (f or c). if neither SIC code is relevant,
the str classification of the company (f or c). if neither SIC code is relevant,
returns None. if they return different classifications, returns "ambiguous"
"""
# if the same classification, just return one of them
Expand Down Expand Up @@ -99,12 +99,12 @@ def prepare_infogroup_data(
Args:
infogroup_csv: the InfoGroup csv file
sic6_codes_df: DataFrame of the relevant SIC6 codes w/ corresponding regex codes
sic6_codes_df: DataFrame of the relevant SIC6 codes w/ corresponding regex codes
and descriptions
output_file_path: the resulting df will be written as a csv to this file path location
testing: Boolean - True if code is being tested on only several chunks, False if
testing: Boolean - True if code is being tested on only several chunks, False if
whole InfoGroup csv should be used
chunksize: the number of rows per chunk in the IG dataset (default 10,000 but can be
chunksize: the number of rows per chunk in the IG dataset (default 10,000 but can be
changed for testing purposes)
num_testing_chunks: number of chunks to iterate through when testing = True
Expand Down Expand Up @@ -151,7 +151,7 @@ def prepare_infogroup_data(
counter = 0 # will keep track of which chunk is being processed
business_data_df = pd.read_csv(
infogroup_csv, sep=",", header=0, chunksize=10000
) # chunksize = 10,000
) # chunksize = 10,000
for chunk in business_data_df:
print("processing chunk", counter, "...")

Expand Down Expand Up @@ -366,9 +366,9 @@ def get_infogroup_df(
sic6_codes_csv: a csv of relevant SIC6 codes and NAICS codes and their descriptions
infogroup_csv: the InfoGroup csv file
output_file_path: the output df will be written as a csv to this file path
cached: True if you want to use existing files to bypass creating the InfoGroup data.
cached: True if you want to use existing files to bypass creating the InfoGroup data.
Will return a df of the output file path in this case
testing_subset: Boolean - True if code is being tested on only several chunks,
testing_subset: Boolean - True if code is being tested on only several chunks,
False if whole InfoGroup csv should be used
Returns:
a cleaned InfoGroup DataFrame that is formatted in the same schema as the aggregated
Expand All @@ -387,4 +387,3 @@ def get_infogroup_df(
testing=testing_subset,
)
return infogroup_df

0 comments on commit 4e4336e

Please sign in to comment.