Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] More control on creating the output dataframe for conditional_join - PR no 1 #1396

Merged
merged 11 commits into from
Sep 14, 2024
1,577 changes: 858 additions & 719 deletions examples/notebooks/Pivoting Data from Wide to Long.ipynb

Large diffs are not rendered by default.

73 changes: 47 additions & 26 deletions examples/notebooks/anime.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@
"metadata": {},
"outputs": [],
"source": [
"filename = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-04-23/raw_anime.csv'\n",
"filename = \"https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-04-23/raw_anime.csv\"\n",
"df = pd.read_csv(filename)\n",
"\n",
"\n",
Expand Down Expand Up @@ -124,10 +124,11 @@
" stop: int = None,\n",
" pat: str = \" \",\n",
" *args,\n",
" **kwargs\n",
" **kwargs,\n",
"):\n",
" \"\"\"\n",
" Wrapper around `df.str.split` with additional `start` and `end` arguments\n",
" Wrapper around `df.str.split`\n",
" with additional `start` and `end` arguments\n",
" to select a slice of the list of words.\n",
" \"\"\"\n",
"\n",
Expand All @@ -148,7 +149,12 @@
"\n",
"@pf.register_dataframe_method\n",
"def str_slice(\n",
" df, column_name: str, start: int = None, stop: int = None, *args, **kwargs\n",
" df, \n",
" column_name: str,\n",
" start: int = None, \n",
" stop: int = None, \n",
" *args, \n",
" **kwargs\n",
"):\n",
" \"\"\"\n",
"    Wrapper around `df.str.slice`\n",
Expand All @@ -175,7 +181,9 @@
" .str_word(column_name=\"aired\", start=0, stop=2, pat=\",\")\n",
" .str_join(column_name=\"aired\", sep=\",\")\n",
" .deconcatenate_column(\n",
" column_name=\"aired\", new_column_names=[\"start_date\", \"end_date\"], sep=\",\"\n",
" column_name=\"aired\",\n",
" new_column_names=[\"start_date\", \"end_date\"],\n",
" sep=\",\",\n",
" )\n",
" .remove_columns(column_names=[\"aired\"])\n",
" .str_remove(column_name=\"start_date\", pat=\"'\")\n",
Expand Down Expand Up @@ -858,19 +866,22 @@
"outputs": [],
"source": [
"@pf.register_dataframe_method\n",
"def str_remove(df, column_name: str, pat: str, *args, **kwargs):\n",
"def str_remove(df, column_name: str, pat: str, *args, **kwargs): # noqa: F811\n",
" \"\"\"\n",
" Wrapper around df.str.replace\n",
" The function will loop through regex patterns and remove them from the desired column.\n",
" The function will loop through regex patterns \n",
" and remove them from the desired column.\n",
"\n",
" :param df: A pandas DataFrame.\n",
" :param column_name: A `str` indicating which column the string removal action is to be made.\n",
" :param column_name: A `str` indicating which column \n",
" the string removal action is to be made.\n",
" :param pat: A regex pattern to match and remove.\n",
" \"\"\"\n",
"\n",
" if not isinstance(pat, str):\n",
" raise TypeError(\n",
" f\"Pattern should be a valid regex pattern. Received pattern: {pat} with dtype: {type(pat)}\"\n",
" f\"Pattern should be a valid regex pattern. \"\n",
" f\"Received pattern: {pat} with dtype: {type(pat)}\"\n",
" )\n",
" df[column_name] = df[column_name].str.replace(pat, \"\", *args, **kwargs)\n",
" return df"
Expand Down Expand Up @@ -939,13 +950,17 @@
"outputs": [],
"source": [
"@pf.register_dataframe_method\n",
"def explode(df: pd.DataFrame, column_name: str, sep: str):\n",
"def explode(df: pd.DataFrame, column_name: str, sep: str): # noqa: F811\n",
" \"\"\"\n",
" For rows with a list of values, this function will create new rows for each value in the list\n",
" For rows with a list of values,\n",
" this function will create new rows\n",
" for each value in the list\n",
"\n",
" :param df: A pandas DataFrame.\n",
" :param column_name: A `str` indicating which column the string removal action is to be made.\n",
" :param sep: The delimiter. Example delimiters include `|`, `, `, `,` etc.\n",
" :param column_name: A `str` indicating which column\n",
" the string removal action is to be made.\n",
" :param sep: The delimiter.\n",
" Example delimiters include `|`, `, `, `,` etc.\n",
" \"\"\"\n",
"\n",
" df[\"id\"] = df.index\n",
Expand Down Expand Up @@ -1046,7 +1061,7 @@
"outputs": [],
"source": [
"@pf.register_dataframe_method\n",
"def str_trim(df, column_name: str, *args, **kwargs):\n",
"def str_trim(df, column_name: str, *args, **kwargs): #noqa: F811\n",
" \"\"\"Remove trailing and leading characters, in a given column\"\"\"\n",
" df[column_name] = df[column_name].str.strip(*args, **kwargs)\n",
" return df"
Expand Down Expand Up @@ -1300,54 +1315,60 @@
"outputs": [],
"source": [
"@pf.register_dataframe_method\n",
"def str_word(\n",
"def str_word( #noqa: F811\n",
" df,\n",
" column_name: str,\n",
" start: int = None,\n",
" stop: int = None,\n",
" pat: str = \" \",\n",
" *args,\n",
" **kwargs\n",
"):\n",
"): #noqa: F811\n",
" \"\"\"\n",
" Wrapper around `df.str.split` with additional `start` and `end` arguments\n",
" Wrapper around `df.str.split`,\n",
" with additional `start` and `end` arguments\n",
" to select a slice of the list of words.\n",
"\n",
" :param df: A pandas DataFrame.\n",
" :param column_name: A `str` indicating which column the split action is to be made.\n",
" :param column_name: A `str` indicating which column \n",
" the split action is to be made.\n",
" :param start: optional An `int` for the start index of the slice\n",
" :param stop: optional An `int` for the end index of the slice\n",
" :param pat: String or regular expression to split on. If not specified, split on whitespace.\n",
" :param pat: String or regular expression to split on. \n",
" If not specified, split on whitespace.\n",
"\n",
" \"\"\"\n",
" df[column_name] = df[column_name].str.split(pat).str[start:stop]\n",
" return df\n",
"\n",
"\n",
"@pf.register_dataframe_method\n",
"def str_join(df, column_name: str, sep: str, *args, **kwargs):\n",
"def str_join(df, column_name: str, sep: str, *args, **kwargs): #noqa: F811\n",
" \"\"\"\n",
" Wrapper around `df.str.join`\n",
" Joins items in a list.\n",
"\n",
" :param df: A pandas DataFrame.\n",
" :param column_name: A `str` indicating which column the split action is to be made.\n",
" :param sep: The delimiter. Example delimiters include `|`, `, `, `,` etc.\n",
" :param column_name: A `str` indicating which column \n",
" the split action is to be made.\n",
" :param sep: The delimiter. Example delimiters \n",
" include `|`, `, `, `,` etc.\n",
" \"\"\"\n",
" df[column_name] = df[column_name].str.join(sep)\n",
" return df\n",
"\n",
"\n",
"@pf.register_dataframe_method\n",
"def str_slice(\n",
"def str_slice( #noqa: F811\n",
" df, column_name: str, start: int = None, stop: int = None, *args, **kwargs\n",
"):\n",
"): #noqa: F811\n",
" \"\"\"\n",
"    Wrapper around `df.str.slice`\n",
" Slices strings.\n",
"\n",
" :param df: A pandas DataFrame.\n",
" :param column_name: A `str` indicating which column the split action is to be made.\n",
" :param column_name: A `str` indicating which column \n",
" the split action is to be made.\n",
" :param start: 'int' indicating start of slice.\n",
" :param stop: 'int' indicating stop of slice.\n",
" \"\"\"\n",
Expand Down Expand Up @@ -1745,7 +1766,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
"version": "3.9.16"
}
},
"nbformat": 4,
Expand Down
11 changes: 7 additions & 4 deletions examples/notebooks/bird_call.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -597,7 +597,8 @@
"source": [
"clean_birds = (\n",
" raw_birds\n",
" .merge(clean_call, how='left') # merge the raw_birds dataframe with clean_raw dataframe\n",
" # merge the raw_birds dataframe with clean_raw dataframe\n",
" .merge(clean_call, how='left') \n",
" .select_columns(\n",
" [\n",
" \"Genus\",\n",
Expand All @@ -611,9 +612,11 @@
" ]\n",
" ) # include list of cols\n",
" .clean_names()\n",
" .rename_column(\"collisions\", \"family\") # rename 'collisions' column to 'family' in merged dataframe\n",
" # rename 'collisions' column to 'family' in merged dataframe\n",
" .rename_column(\"collisions\", \"family\") \n",
" .rename_column(\"call\", \"flight_call\")\n",
" .dropna() # drop all rows which contain a NaN\n",
" # drop all rows which contain a NaN\n",
" .dropna() \n",
")"
]
},
Expand Down Expand Up @@ -755,7 +758,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
"version": "3.9.16"
}
},
"nbformat": 4,
Expand Down
14 changes: 9 additions & 5 deletions examples/notebooks/board_games.ipynb

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions examples/notebooks/complete.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -493,7 +493,7 @@
}
],
"source": [
"new_year_values = lambda year: range(year.min(), year.max() + 1)\n",
"new_year_values = lambda year: range(year.min(), year.max() + 1) # noqa: E731\n",
"\n",
"df.complete({\"Year\": new_year_values}, \"Taxon\")"
]
Expand Down Expand Up @@ -963,7 +963,7 @@
}
],
"source": [
"new_year_values = lambda year: range(year.min(), year.max() + 1)\n",
"new_year_values = lambda year: range(year.min(), year.max() + 1) # noqa: E731\n",
"\n",
"df.complete(\n",
" {'year': new_year_values},\n",
Expand Down Expand Up @@ -1163,7 +1163,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.15"
"version": "3.9.16"
},
"orig_nbformat": 4
},
Expand Down
17 changes: 11 additions & 6 deletions examples/notebooks/french_trains.ipynb

Large diffs are not rendered by default.

88 changes: 50 additions & 38 deletions examples/notebooks/medium_franchise.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,8 @@
"metadata": {},
"outputs": [],
"source": [
"# Suppress user warnings when we try overwriting our custom pandas flavor functions\n",
"# Suppress user warnings \n",
"# when we try overwriting our custom pandas flavor functions\n",
"import warnings\n",
"\n",
"warnings.filterwarnings('ignore')"
Expand Down Expand Up @@ -191,7 +192,9 @@
}
],
"source": [
"fileurl = '../data/medium_franchise_raw_table.csv' # originally from https://en.wikipedia.org/wiki/List_of_highest-grossing_media_franchises\n",
"# originally from \n",
"# https://en.wikipedia.org/wiki/List_of_highest-grossing_media_franchises\n",
"fileurl = '../data/medium_franchise_raw_table.csv' \n",
"df_raw = pd.read_csv(fileurl)\n",
"df_raw.head(3)"
]
Expand Down Expand Up @@ -883,20 +886,23 @@
"source": [
"# Value mapper `revenue_category`\n",
"value_mapper = {\n",
" 'box office': 'Box Office',\n",
" 'dvd|blu|vhs|home video|video rentals|video sales|streaming|home entertainment': 'Home Video/Entertainment',\n",
" 'video game|computer game|mobile game|console|game|pachinko|pet|card': 'Video Games/Games',\n",
" 'comic|manga': 'Comic or Manga',\n",
" 'music|soundtrac': 'Music',\n",
" 'tv': 'TV',\n",
" 'merchandise|licens|mall|stage|retail': 'Merchandise, Licensing & Retail',\n",
" \"box office\": \"Box Office\",\n",
" \"dvd|blu|vhs|home video|video rentals|video sales|streaming|home entertainment\": \"Home Video/Entertainment\", # noqa: E501\n",
" \"video game|computer game|mobile game|console|game|pachinko|pet|card\": \"Video Games/Games\", # noqa: E501\n",
" \"comic|manga\": \"Comic or Manga\",\n",
" \"music|soundtrac\": \"Music\",\n",
" \"tv\": \"TV\",\n",
" \"merchandise|licens|mall|stage|retail\": \"Merchandise, Licensing & Retail\",\n",
"}\n",
"\n",
"column_name = 'revenue_category'\n",
"column_name = \"revenue_category\"\n",
"# [pyjanitor] convert to lower case\n",
"df_clean_category = (\n",
" df_clean_category.transform_column(column_name, str.lower) # [pyjanitor] convert to lower case\n",
" .transform_column(column_name, str.strip) # [pyjanitor] strip leading/trailing white space\n",
" .fuzzy_match_replace(column_name, mapper=value_mapper) # [pyjanitor + pandas_flavor]\n",
" df_clean_category.transform_column(column_name, str.lower)\n",
" # [pyjanitor] strip leading/trailing white space\n",
" .transform_column(column_name, str.strip)\n",
" # [pyjanitor + pandas_flavor]\n",
" .fuzzy_match_replace(column_name, mapper=value_mapper)\n",
")\n",
"df_clean_category.head(3)"
]
Expand Down Expand Up @@ -1127,31 +1133,37 @@
"df_clean = (\n",
" pd.read_csv(fileurl)\n",
" .rename(\n",
" columns={col_old: col_new for col_old, col_new in zip(df_raw.columns, colnames)}\n",
" columns={\n",
" col_old: col_new\n",
" for col_old, col_new in zip(df_raw.columns, colnames)\n",
" }\n",
" )\n",
" .str_remove('total_revenue', pattern='est.') # [pandas-flavor]\n",
" .str_trim('total_revenue') # [pandas-flavor]\n",
" .str_remove('total_revenue', pattern='\\$') # [pandas-flavor]\n",
" .str_slice('total_revenue', start=0, stop=2) # [pandas-flavor]\n",
" .change_type('total_revenue', float) # [pyjanitor]\n",
" .separate_rows('revenue_items', sep='\\[') # [pandas-flavor]\n",
" .filter_string('revenue_items', 'illion') # [pyjanitor]\n",
" .separate(\n",
" 'revenue_items', into=['revenue_category', 'revenue'], sep='\\$'\n",
" ) # [pyjanitor + pandas-flavor]\n",
" .str_remove('revenue_category', pattern=' – ') # [pandas-flavor]\n",
" .str_remove('revenue_category', pattern='.*\\]') # [pandas-flavor]\n",
" .str_remove('revenue_category', pattern='\\n') # [pandas-flavor]\n",
" .transform_column('revenue_category', str.lower) # [pyjanitor] convert to lower case\n",
" .transform_column('revenue_category', str.strip) # [pyjanitor] strip leading/trailing white space\n",
" .fuzzy_match_replace('revenue_category', mapper=value_mapper) # [pyjanitor + pandas_flavor]\n",
" .str_remove('revenue', 'illion') # [pandas-flavor]\n",
" .str_trim('revenue') # [pandas-flavor]\n",
" .str_remove('revenue', ' ') # [pandas-flavor]\n",
" .str_replace('revenue', '\\s*b', '') # [pandas-flavor]\n",
" .str_replace('revenue', '\\s*m', 'e-3') # [pandas-flavor]\n",
" .str_remove(\"total_revenue\", pattern=\"est.\") # [pandas-flavor]\n",
" .str_trim(\"total_revenue\") # [pandas-flavor]\n",
" .str_remove(\"total_revenue\", pattern=\"\\$\") # [pandas-flavor]\n",
" .str_slice(\"total_revenue\", start=0, stop=2) # [pandas-flavor]\n",
" .change_type(\"total_revenue\", float) # [pyjanitor]\n",
" .separate_rows(\"revenue_items\", sep=\"\\[\") # [pandas-flavor]\n",
" .filter_string(\"revenue_items\", \"illion\") # [pyjanitor]\n",
" # [pyjanitor + pandas-flavor]\n",
" .separate(\"revenue_items\", into=[\"revenue_category\", \"revenue\"], sep=\"\\$\")\n",
" # [pandas-flavor]\n",
" .str_remove(\"revenue_category\", pattern=\" – \")\n",
" .str_remove(\"revenue_category\", pattern=\".*\\]\")\n",
" .str_remove(\"revenue_category\", pattern=\"\\n\")\n",
" # [pyjanitor] convert to lower case\n",
" .transform_column(\"revenue_category\", str.lower)\n",
" # [pyjanitor] strip leading/trailing white space\n",
" .transform_column(\"revenue_category\", str.strip)\n",
" # [pyjanitor + pandas_flavor]\n",
" .fuzzy_match_replace(\"revenue_category\", mapper=value_mapper)\n",
" .str_remove(\"revenue\", \"illion\") # [pandas-flavor]\n",
" .str_trim(\"revenue\") # [pandas-flavor]\n",
" .str_remove(\"revenue\", \" \") # [pandas-flavor]\n",
" .str_replace(\"revenue\", \"\\s*b\", \"\") # [pandas-flavor]\n",
" .str_replace(\"revenue\", \"\\s*m\", \"e-3\") # [pandas-flavor]\n",
" .parse_number() # [pandas-flavor]\n",
" .str_remove('original_media', '\\[.+') # [pandas-flavor]\n",
" .str_remove(\"original_media\", \"\\[.+\") # [pandas-flavor]\n",
")"
]
},
Expand Down Expand Up @@ -1443,9 +1455,9 @@
"# Generate final dataframe\n",
"df_final = (\n",
" pd.merge(\n",
" df_sum, df_metadata, how='left', on=['franchise', 'revenue_category']\n",
" df_sum, df_metadata, how=\"left\", on=[\"franchise\", \"revenue_category\"]\n",
" )\n",
" .drop_duplicates(keep='first')\n",
" .drop_duplicates(keep=\"first\")\n",
" .reset_index(drop=True)\n",
")\n",
"df_final.head(3)"
Expand Down
8 changes: 5 additions & 3 deletions examples/notebooks/teacher_pupil.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -263,8 +263,10 @@
"\n",
"@pf.register_dataframe_method\n",
"def drop_duplicated_column(df, column_name: str, column_order: int=0):\n",
" \"\"\"Remove duplicated columns and retain only a column given its order.\n",
" Order 0 is to remove the first column, Order 1 is to remove the second column, and etc\"\"\"\n",
" \"\"\"Remove duplicated columns \n",
" and retain only a column given its order.\n",
" Order 0 is to remove the first column, \n",
" Order 1 is to remove the second column, and etc\"\"\"\n",
"\n",
" cols = list(df.columns)\n",
" col_indexes = [\n",
Expand Down Expand Up @@ -467,7 +469,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
"version": "3.9.16"
},
"stem_cell": {
"cell_type": "raw",
Expand Down
Loading
Loading