diff --git a/.gitignore b/.gitignore index 1c7b6c1fa..6233a8a48 100644 --- a/.gitignore +++ b/.gitignore @@ -15,7 +15,7 @@ build/ dist/ .eggs/ *.egg-info/ -optimus/__pycache__/ +__pycache__/ # Spark-Package release *.zip diff --git a/.travis.yml b/.travis.yml index 9d794797c..6df6d7c88 100644 --- a/.travis.yml +++ b/.travis.yml @@ -38,7 +38,7 @@ jdk: - oraclejdk8 script: - - py.test + - py.test -v - sonar-scanner deploy: diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 000000000..31c0e2b49 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SPHINXPROJ = Optimus +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 000000000..4251e41bb --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Optimus documentation build configuration file, created by +# sphinx-quickstart on Wed Oct 11 19:21:00 2017. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = ['sphinx.ext.autodoc', + 'sphinx.ext.doctest', + 'sphinx.ext.intersphinx', + 'sphinx.ext.mathjax'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = 'Optimus' +copyright = '2017, Favio Vazquez' +author = 'Favio Vazquez' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '1.1' +# The full version, including alpha/beta/rc tags. +release = '1.1.0' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. 
+# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "sphinx_rtd_theme" + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + + +# -- Options for HTMLHelp output ------------------------------------------ + +# Output file base name for HTML help builder. +htmlhelp_basename = 'Optimusdoc' + + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'Optimus.tex', 'Optimus Documentation', + 'Favio Vazquez', 'manual'), +] + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'optimus', 'Optimus Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'Optimus', 'Optimus Documentation', + author, 'Optimus', 'One line description of project.', + 'Miscellaneous'), +] + + + + +# Example configuration for intersphinx: refer to the Python standard library. +intersphinx_mapping = {'https://docs.python.org/': None} diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 000000000..4043d240b --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,32 @@ +.. Optimus documentation master file, created by + sphinx-quickstart on Wed Oct 11 19:21:00 2017. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to Optimus's documentation! +=================================== + +.. image:: sections/images/logoOptimus.png + +As data scientists, we care about extracting the best information out of our data. 
Data is the new soil: you have to get in and get your hands dirty, because without cleaning and preparing it, it is just useless. + +Data preparation accounts for about 80% of the work of data scientists, so having a solution that connects to your database or file system, uses the most important framework for machine learning and data science at the moment (Apache Spark), and can handle lots of information, working both in a cluster in a parallelized fashion or locally on your laptop, is really important. + +Say Hi! to Optimus_ and visit our web page. + +.. _Optimus: https://hioptimus.com + +Prepare, process and explore your Big Data with the fastest open source library on the planet using Apache Spark and Python (PySpark). + +.. toctree:: + :maxdepth: 2 + + sections/overview + sections/installing + sections/analyzing + sections/transforming + sections/feature + +Library maintained by `Favio Vazquez`_ +------------------------------------------ +.. _Favio Vazquez: https://github.com/faviovazquez \ No newline at end of file diff --git a/docs/source/sections/analyzing.rst b/docs/source/sections/analyzing.rst new file mode 100644 index 000000000..71c930701 --- /dev/null +++ b/docs/source/sections/analyzing.rst @@ -0,0 +1,607 @@ +Analyzing your Data +==================== + +DataFrameProfiler class +----------------------- + +This class builds a profile of a given dataframe and its general features. +Based on spark-df-profiling by Julio Soto. + +Initially it is a good idea to see a general view of the DataFrame to be analyzed. + +Let's assume you have the following dataset, called foo.csv, in your current directory: + ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| id | firstName | lastName | billingId | product | price | birth | dummyCol | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 1 | Luis | Alvarez$$%! 
| 123 | Cake | 10 | 1980/07/07 | never | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 2 | André | Ampère | 423 | piza | 8 | 1950/07/08 | gonna | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 3 | NiELS | Böhr//((%% | 551 | pizza | 8 | 1990/07/09 | give | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 4 | PAUL | dirac$ | 521 | pizza | 8 | 1954/07/10 | you | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 5 | Albert | Einstein | 634 | pizza | 8 | 1990/07/11 | up | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 6 | Galileo | GALiLEI | 672 | arepa | 5 | 1930/08/12 | never | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 7 | CaRL | Ga%%%uss | 323 | taco | 3 | 1970/07/13 | gonna | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 8 | David | H$$$ilbert | 624 | taaaccoo | 3 | 1950/07/14 | let | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 9 | Johannes | KEPLER | 735 | taco | 3 | 1920/04/22 | you | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 10 | JaMES | M$$ax%%well | 875 | taco | 3 | 1923/03/12 | down | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 11 | Isaac | Newton | 992 | pasta | 9 | 1999/02/15 | never | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 12 | Emmy%% | Nöether$ | 234 | pasta | 9 | 1993/12/08 | gonna | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 13 | Max!!! | Planck!!! | 111 | hamburguer | 4 | 1994/01/04 | run | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 14 | Fred | Hoy&&&le | 553 | pizzza | 8 | 1997/06/27 | around | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 15 | ((( Heinrich ))))) | Hertz | 116 | pizza | 8 | 1956/11/30 | and | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 16 | William | Gilbert### | 886 | BEER | 2 | 1958/03/26 | desert | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 17 | Marie | CURIE | 912 | Rice | 1 | 2000/03/22 | you | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 18 | Arthur | COM%%%pton | 812 | 110790 | 5 | 1899/01/01 | # | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 19 | JAMES | Chadwick | 467 | null | 10 | 1921/05/03 | # | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ + +.. code:: python + + # Import optimus + import optimus as op + #Import os module for system tools + import os + + # Reading dataframe. os.getcwd() returns de current directory of the notebook + # 'file:///' is a prefix that specifies the type of file system used, in this + # case, local file system (hard drive of the pc) is used. 
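+ # Instance of Utilities class (read_csv is provided by it, as in the DataFrameAnalyzer example below) + tools = op.Utilities()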
+ filePath = "file:///" + os.getcwd() + "/foo.csv" + + df = tools.read_csv(path=filePath, + sep=',') + + # Instance of profiler class + profiler = op.DataFrameProfiler(df) + profiler.profiler() + +This overview presents basic information about the DataFrame: the number of variables it has, +how many values are missing and in which columns, the type of each variable, and some statistical information +that describes each variable, plus a frequency plot and a table that specifies the existing datatypes in each column +of the dataFrame, among other features. Also, for this particular case, the dataType table is shown in order to visualize +a sample of the column content. + +DataFrameAnalyzer class +----------------------- + +The DataFrameAnalyzer class analyzes the datatype of the rows in each column of +a dataFrame. + +**DataFrameAnalyzer methods** + +- DataFrameAnalyzer.column_analyze(column_list, plots=True, values_bar=True, print_type=False, num_bars=10) +- DataFrameAnalyzer.plot_hist(df_one_col, hist_dict, type_hist, num_bars=20, values_bar=True) +- DataFrameAnalyzer.get_categorical_hist(df_one_col, num_bars) +- DataFrameAnalyzer.get_numerical_hist(df_one_col, num_bars) +- DataFrameAnalyzer.unique_values_col(column) +- DataFrameAnalyzer.write_json(json_cols, path_to_json_file) +- DataFrameAnalyzer.get_frequency(columns, sort_by_count=True) + +Let's assume you have the following dataset, called foo.csv, in your current directory: + ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| id | firstName | lastName | billingId | product | price | birth | dummyCol | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 1 | Luis | Alvarez$$%! | 123 | Cake | 10 | 1980/07/07 | never | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 2 | André | Ampère | 423 | piza | 8 | 1950/07/08 | gonna | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 3 | NiELS | Böhr//((%% | 551 | pizza | 8 | 1990/07/09 | give | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 4 | PAUL | dirac$ | 521 | pizza | 8 | 1954/07/10 | you | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 5 | Albert | Einstein | 634 | pizza | 8 | 1990/07/11 | up | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 6 | Galileo | GALiLEI | 672 | arepa | 5 | 1930/08/12 | never | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 7 | CaRL | Ga%%%uss | 323 | taco | 3 | 1970/07/13 | gonna | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 8 | David | H$$$ilbert | 624 | taaaccoo | 3 | 1950/07/14 | let | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 9 | Johannes | KEPLER | 735 | taco | 3 | 1920/04/22 | you | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 10 | JaMES | M$$ax%%well | 875 | taco | 3 | 1923/03/12 | down | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 11 | Isaac | Newton | 992 | pasta | 9 | 1999/02/15 | never | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 12 | Emmy%% 
| Nöether$ | 234 | pasta | 9 | 1993/12/08 | gonna | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 13 | Max!!! | Planck!!! | 111 | hamburguer | 4 | 1994/01/04 | run | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 14 | Fred | Hoy&&&le | 553 | pizzza | 8 | 1997/06/27 | around | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 15 | ((( Heinrich ))))) | Hertz | 116 | pizza | 8 | 1956/11/30 | and | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 16 | William | Gilbert### | 886 | BEER | 2 | 1958/03/26 | desert | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 17 | Marie | CURIE | 912 | Rice | 1 | 2000/03/22 | you | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 18 | Arthur | COM%%%pton | 812 | 110790 | 5 | 1899/01/01 | # | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ +| 19 | JAMES | Chadwick | 467 | null | 10 | 1921/05/03 | # | ++----+----------------------+-------------+-----------+------------+-------+------------+----------+ + +The following code shows how to instantiate the class to analyze a dataFrame: + +.. code:: python + + # Import optimus + import optimus as op + # Instance of Utilities class + tools = op.Utilities() + + # Reading dataframe. os.getcwd() returns de current directory of the notebook + # 'file:///' is a prefix that specifies the type of file system used, in this + # case, local file system (hard drive of the pc) is used. + filePath = "file:///" + os.getcwd() + "/foo.csv" + + df = tools.read_csv(path=filePath, sep=',') + + analyzer = op.DataFrameAnalyzer(df=df,pathFile=filePath) + +Analyzer.column_analyze(column_list, plots=True, values_bar=True, print_type=False, num_bars=10) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This function counts the number of registers in a column that are numbers (integers, floats) and the number of string registers. + +Input: + +``column_list``: A list or a string column name. + +``plots``: Can be True or False. If true it will output the predefined plots. + +``values_bar (optional)``: Can be True or False. If it is True, frequency values are placed over each bar. + +``print_type (optional)``: Can be one of the following strings: 'integer', 'string', 'float'. Depending of what string +is provided, a list of distinct values of that type is printed. + +``num_bars``: number of bars printed in histogram + +The method outputs a list containing the number of the different datatypes [nulls, strings, integers, floats]. + +Example: + +.. 
code:: python + + analyzer.column_analyze("*", plots=False, values_bar=True, print_type=False, num_bars=10) + ++-----------+----------+------------+----------------------+ +| | | | Column name: id | ++-----------+----------+------------+----------------------+ +| | | | Column datatype: int | ++-----------+----------+------------+----------------------+ +| Datatype | Quantity | Percentage | | ++-----------+----------+------------+----------------------+ +| None | 0 | 0.00 % | | ++-----------+----------+------------+----------------------+ +| Empty str | 0 | 0.00 % | | ++-----------+----------+------------+----------------------+ +| String | 0 | 0.00 % | | ++-----------+----------+------------+----------------------+ +| Integer | 19 | 100.00 % | | ++-----------+----------+------------+----------------------+ +| Float | 0 | 0.00 % | | ++-----------+----------+------------+----------------------+ + +Min value: 1 + +Max value: 19 + +end of __analyze 4.059180021286011 + ++-----------+----------+------------+-------------------------+ +| | | | Column name: firstName | ++-----------+----------+------------+-------------------------+ +| | | | Column datatype: string | ++-----------+----------+------------+-------------------------+ +| Datatype | Quantity | Percentage | | ++-----------+----------+------------+-------------------------+ +| None | 0 | 0.00 % | | ++-----------+----------+------------+-------------------------+ +| Empty str | 0 | 0.00 % | | ++-----------+----------+------------+-------------------------+ +| String | 19 | 100.00 % | | ++-----------+----------+------------+-------------------------+ +| Integer | 0 | 0.00 % | | ++-----------+----------+------------+-------------------------+ +| Float | 0 | 0.00 % | | ++-----------+----------+------------+-------------------------+ + +end of __analyze 1.1431787014007568 + ++-----------+----------+------------+-------------------------+ +| | | | Column name: lastName | ++-----------+----------+------------+-------------------------+ +| | | | Column datatype: string | ++-----------+----------+------------+-------------------------+ +| Datatype | Quantity | Percentage | | ++-----------+----------+------------+-------------------------+ +| None | 0 | 0.00 % | | ++-----------+----------+------------+-------------------------+ +| Empty str | 0 | 0.00 % | | ++-----------+----------+------------+-------------------------+ +| String | 19 | 100.00 % | | ++-----------+----------+------------+-------------------------+ +| Integer | 0 | 0.00 % | | ++-----------+----------+------------+-------------------------+ +| Float | 0 | 0.00 % | | ++-----------+----------+------------+-------------------------+ + +end of __analyze 0.9663524627685547 + ++-----------+----------+------------+------------------------+ +| | | | Column name: billingId | ++-----------+----------+------------+------------------------+ +| | | | Column datatype: int | ++-----------+----------+------------+------------------------+ +| Datatype | Quantity | Percentage | | ++-----------+----------+------------+------------------------+ +| None | 0 | 0.00 % | | ++-----------+----------+------------+------------------------+ +| Empty str | 0 | 0.00 % | | ++-----------+----------+------------+------------------------+ +| String | 0 | 0.00 % | | ++-----------+----------+------------+------------------------+ +| Integer | 19 | 100.00 % | | ++-----------+----------+------------+------------------------+ +| Float | 0 | 0.00 % | | ++-----------+----------+------------+------------------------+ + +Min 
value: 111 + +Max value: 992 + +end of __analyze 4.292513847351074 + ++-----------+----------+------------+-------------------------+ +| | | | Column name: product | ++-----------+----------+------------+-------------------------+ +| | | | Column datatype: string | ++-----------+----------+------------+-------------------------+ +| Datatype | Quantity | Percentage | | ++-----------+----------+------------+-------------------------+ +| None | 0 | 0.00 % | | ++-----------+----------+------------+-------------------------+ +| Empty str | 0 | 0.00 % | | ++-----------+----------+------------+-------------------------+ +| String | 18 | 94.74 % | | ++-----------+----------+------------+-------------------------+ +| Integer | 1 | 5.26 % | | ++-----------+----------+------------+-------------------------+ +| Float | 0 | 0.00 % | | ++-----------+----------+------------+-------------------------+ + +end of __analyze 1.180891990661621 + ++-----------+----------+------------+------------------------+ +| | | | Column name: price | ++-----------+----------+------------+------------------------+ +| | | | Column datatype: int | ++-----------+----------+------------+------------------------+ +| Datatype | Quantity | Percentage | | ++-----------+----------+------------+------------------------+ +| None | 0 | 0.00 % | | ++-----------+----------+------------+------------------------+ +| Empty str | 0 | 0.00 % | | ++-----------+----------+------------+------------------------+ +| String | 0 | 0.00 % | | ++-----------+----------+------------+------------------------+ +| Integer | 19 | 100.00 % | | ++-----------+----------+------------+------------------------+ +| Float | 0 | 0.00 % | | ++-----------+----------+------------+------------------------+ + +Min value: 1 + +Max value: 10 + +end of __analyze 4.364053964614868 + ++-----------+----------+------------+-------------------------+ +| | | | Column name: birth | ++-----------+----------+------------+-------------------------+ +| | | | Column datatype: string | ++-----------+----------+------------+-------------------------+ +| Datatype | Quantity | Percentage | | ++-----------+----------+------------+-------------------------+ +| None | 0 | 0.00 % | | ++-----------+----------+------------+-------------------------+ +| Empty str | 0 | 0.00 % | | ++-----------+----------+------------+-------------------------+ +| String | 19 | 100.00 % | | ++-----------+----------+------------+-------------------------+ +| Integer | 0 | 0.00 % | | ++-----------+----------+------------+-------------------------+ +| Float | 0 | 0.00 % | | ++-----------+----------+------------+-------------------------+ + +end of __analyze 0.9144570827484131 + ++-----------+----------+------------+-------------------------+ +| | | | Column name: dummyCol | ++-----------+----------+------------+-------------------------+ +| | | | Column datatype: string | ++-----------+----------+------------+-------------------------+ +| Datatype | Quantity | Percentage | | ++-----------+----------+------------+-------------------------+ +| None | 0 | 0.00 % | | ++-----------+----------+------------+-------------------------+ +| Empty str | 0 | 0.00 % | | ++-----------+----------+------------+-------------------------+ +| String | 19 | 100.00 % | | ++-----------+----------+------------+-------------------------+ +| Integer | 0 | 0.00 % | | ++-----------+----------+------------+-------------------------+ +| Float | 0 | 0.00 % | | ++-----------+----------+------------+-------------------------+ + +end of __analyze 
0.9651758670806885 + +Total execution time: 17.98968768119812 + ++-----------+------------------+---------------------+ +| | | General Description | ++-----------+------------------+---------------------+ +| Features | Name or Quantity | | ++-----------+------------------+---------------------+ +| File Name | foo.csv | | ++-----------+------------------+---------------------+ +| Columns | 8 | | ++-----------+------------------+---------------------+ +| Rows | 19 | | ++-----------+------------------+---------------------+ + +Analyzer.get_categorical_hist(df_one_col, num_bars) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This function analyzes a dataframe of a single column (only string type columns) and returns a dictionary with bins and values of frequency. + +Input: + +``df_one_col``: One column dataFrame. + +``num_bars``: Number of bars or histogram bins. + +The method outputs a dictionary with bins and values of frequency for string type columns only. + +Example: + +Let's say we want to plot a histogram of frequencies for the ``product`` column. We first need to obtain the dictionary of frequencies for each value. This is what this function does for categorical data. Remember that if you run the ``column_analyze()`` method with ``plots = True`` this is done for you. + +.. code:: python + + productDf = analyzer.get_data_frame.select("product") #or df.select("product") + hist_dictPro = analyzer.get_categorical_hist(df_one_col=productDf, num_bars=10) + print(hist_dictPro) + +.. code:: python + + #Output + """[{'cont': 4, 'value': 'pizza'}, {'cont': 3, 'value': 'taco'}, {'cont': 2, 'value': 'pasta'}, {'cont': 1, 'value': 'hamburguer'}, {'cont': 1, 'value': 'BEER'}, {'cont': 1, 'value': 'Rice'}, {'cont': 1, 'value': 'piza'}, {'cont': 1, 'value': 'Cake'}, {'cont': 1, 'value': 'arepa'}, {'cont': 1, 'value': '110790'}]""" + +Now that we have the dictionary we just need to call ``plot_hist()``. + +Analyzer.get_numerical_hist(df_one_col, num_bars) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This function analyzes a dataframe of a single column (only numerical columns) and returns a dictionary with bins and values of frequency. + +Input: + +``df_one_col``: One column dataFrame. + +``num_bars``: Number of bars or histogram bins. + +The method outputs a dictionary with bins and values of frequency for numerical columns only. + +Example: + +Let's say we want to plot a histogram of frequencies for the ``price`` column. We first need to obtain the dictionary of frequencies for each value. This is what this function does for numerical data. Remember that if you run the ``column_analyze()`` method with ``plots = True`` this is done for you. + +.. code:: python + + priceDf = analyzer.get_data_frame.select("price") #or df.select("price") + hist_dictPri = analyzer.get_numerical_hist(df_one_col=priceDf, num_bars=10) + print(hist_dictPri) + +.. code:: python + + #Output + """[{'cont': 2, 'value': 9.55}, {'cont': 2, 'value': 8.649999999999999}, {'cont': 6, 'value': 7.749999999999999}, {'cont': 2, 'value': 5.05}, {'cont': 1, 'value': 4.1499999999999995}, {'cont': 4, 'value': 3.25}, {'cont': 1, 'value': 2.3499999999999996}, {'cont': 1, 'value': 1.45}]""" + + +Analyzer.plot_hist(df_one_col, hist_dict, type_hist, num_bars=20, values_bar=True) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This function builds the histogram (bins) of a categorical or numerical single-column dataframe. + +Input: + +``df_one_col``: A dataFrame of one column. 
+ +``hist_dict``: Python dictionary with histogram values. + +``type_hist``: Type of histogram to be generated, numerical or categorical. + +``num_bars``: Number of bars in the histogram. + +``values_bar``: If values_bar is True, frequency values are plotted over the bars. + +The method outputs a plot of the histogram for a categorical or numerical column. + +Example: + +.. code:: python + + # For a categorical DF + analyzer.plot_hist(df_one_col=productDf, hist_dict=hist_dictPro, type_hist='categorical') + +.. image:: images/productHist.png + +.. code:: python + + # For a numerical DF + analyzer.plot_hist(df_one_col=priceDf, hist_dict=hist_dictPri, type_hist='numerical') + +.. image:: images/priceHist.png + +Analyzer.unique_values_col(column) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This function counts the number of unique values and the total number of values in a column, and then returns both counts. + +Input: + +``column``: Name of the dataFrame column; this argument must be a string. + +The method outputs a dictionary of the counted values, for example: ``{'unique': 10, 'total': 15}``. + +Example: + +.. code:: python + + print(analyzer.unique_values_col("product")) + print(analyzer.unique_values_col("price")) + +.. code:: python + + #Output + {'unique': 13, 'total': 19} + {'unique': 8, 'total': 19} + +Analyzer.write_json(json_cols, path_to_json_file) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This function outputs a JSON for the DataFrame in the specified path. + +Input: + +``json_cols``: Dictionary that represents the dataframe. + +``path_to_json_file``: Specified path to write the returned JSON. + +The method outputs the dataFrame as a JSON. To use it in a simple way, first run: + +.. code:: python + + json_cols = analyzer.column_analyze(column_list="*", print_type=False, plots=False) + +And you will have the desired dictionary to pass to the write_json function. + +Example: + +.. code:: python + + analyzer.write_json(json_cols=json_cols, path_to_json_file=os.getcwd() + "/foo.json") + +Analyzer.get_frequency(columns, sort_by_count=True) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This function gets the frequencies of the values inside the specified columns. + +Input: + +``columns``: String or list of columns to analyze. + +``sort_by_count``: Boolean; if True, the counts are sorted in descending order. + +The method outputs a Spark DataFrame with the counts of the existing values in each column. + +To use it, first let's create a sample DataFrame: + +.. code:: python + + import random + import optimus as op + from pyspark.sql.types import StringType, StructType, IntegerType, FloatType, DoubleType, StructField + + schema = StructType( + [ + StructField("strings", StringType(), True), + StructField("integers", IntegerType(), True), + StructField("integers2", IntegerType(), True), + StructField("floats", FloatType(), True), + StructField("double", DoubleType(), True) + ] + ) + + size = 200 + # Generating strings column: + foods = [' pizza! 
', 'pizza', 'PIZZA;', 'pizza', 'pízza¡', 'Pizza', 'Piz;za'] + foods = [foods[random.randint(0,6)] for count in range(size)] + # Generating integer column: + num_col_1 = [random.randint(0,9) for number in range(size)] + # Generating integer column: + num_col_2 = [random.randint(0,9) for number in range(size)] + # Generating integer column: + num_col_3 = [random.random() for number in range(size)] + # Generating integer column: + num_col_4 = [random.random() for number in range(size)] + + # Building DataFrame + df = op.spark.createDataFrame(list(zip(foods, num_col_1, num_col_2, num_col_3, num_col_4)),schema=schema) + + # Instantiate Analyzer + analyzer = op.DataFrameAnalyzer(df) + + # Get frequency DataFrame + df_counts = analyzer.get_frequency(["strings", "integers"], True) + +And you will get (note that these are random generated values): + ++-----------------+-----+ +| strings|count| ++-----------------+-----+ +| pizza| 48| ++-----------------+-----+ +| Piz;za| 38| ++-----------------+-----+ +| Pizza| 37| ++-----------------+-----+ +| pízza¡| 29| ++-----------------+-----+ +| pizza! | 25| ++-----------------+-----+ +| PIZZA;| 23| ++-----------------+-----+ + ++--------+-----+ +|integers|count| ++--------+-----+ +| 8| 31| ++--------+-----+ +| 5| 24| ++--------+-----+ +| 1| 24| ++--------+-----+ +| 9| 20| ++--------+-----+ +| 6| 20| ++--------+-----+ +| 2| 19| ++--------+-----+ +| 3| 19| ++--------+-----+ +| 0| 17| ++--------+-----+ +| 4| 14| ++--------+-----+ +| 7| 12| ++--------+-----+ \ No newline at end of file diff --git a/docs/source/sections/feature.rst b/docs/source/sections/feature.rst new file mode 100644 index 000000000..ddabdf846 --- /dev/null +++ b/docs/source/sections/feature.rst @@ -0,0 +1,358 @@ +Feature Engineering with Optimus +================================== + +Now with Optimus we have made easy the process of Feature Engineering. + + +When we talk about Feature Engineering we refer to creating new features from your existing ones to improve model +performance. Sometimes this is the case, or sometimes you need to do it because a certain model doesn't recognize +the data as you have it, so these transformations let you run most of Machine and Deep Learning algorithms. + +These methods are part of the DataFrameTransformer, and they are a high level of abstraction for Spark Feature +Engineering methods. You'll see how easy it is to prepare your data with Optimus for Machine Learning. + + +Methods for Feature Engineering +--------------------------------- + +Transformer.string_to_index(input_cols) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This method maps a string column of labels to an ML column of label indices. If the input column is numeric, we cast it +to string and index the string values. + +``input_cols`` argument receives a list of columns to be indexed. + +Let's start by creating a DataFrame with Optimus. + +.. 
code:: python + + # Importing Optimus + import optimus as op + #Importing utilities + tools = op.Utilities() + + # Creating DF with Optimus + data = [('Japan', 'Tokyo', 37800000),('USA', 'New York', 19795791),('France', 'Paris', 12341418), + ('Spain','Madrid',6489162)] + df = tools.create_data_frame(data, ["country", "city", "population"]) + + # Instantiating transformer + transformer = op.DataFrameTransformer(df) + + # Show DF + transformer.show() + ++-------+--------+----------+ +|country| city|population| ++-------+--------+----------+ +| Japan| Tokyo| 37800000| ++-------+--------+----------+ +| USA|New York| 19795791| ++-------+--------+----------+ +| France| Paris| 12341418| ++-------+--------+----------+ +| Spain| Madrid| 6489162| ++-------+--------+----------+ + +.. code:: python + + # Indexing columns 'city" and 'country' + transformer.string_to_index(["city", "country"]) + + # Show indexed DF + transformer.show() + ++-------+--------+----------+----------+-------------+ +|country| city|population|city_index|country_index| ++-------+--------+----------+----------+-------------+ +| Japan| Tokyo| 37800000| 1.0| 1.0| ++-------+--------+----------+----------+-------------+ +| USA|New York| 19795791| 2.0| 3.0| ++-------+--------+----------+----------+-------------+ +| France| Paris| 12341418| 3.0| 2.0| ++-------+--------+----------+----------+-------------+ +| Spain| Madrid| 6489162| 0.0| 0.0| ++-------+--------+----------+----------+-------------+ + + +Transformer.index_to_string(input_cols) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This method maps a column of indices back to a new column of corresponding string values. The index-string mapping is +either from the ML (Spark) attributes of the input column, or from user-supplied labels (which take precedence over +ML attributes). + +``input_cols`` argument receives a list of columns to be indexed. + +Let's go back to strings with the DataFrame we created in the last step. + +.. code:: python + + # Importing Optimus + import optimus as op + #Importing utilities + tools = op.Utilities() + + # Instantiating transformer + transformer = op.DataFrameTransformer(df) + + # Show DF + transformer.show() + ++-------+--------+----------+ +|country| city|population| ++-------+--------+----------+ +| Japan| Tokyo| 37800000| ++-------+--------+----------+ +| USA|New York| 19795791| ++-------+--------+----------+ +| France| Paris| 12341418| ++-------+--------+----------+ +| Spain| Madrid| 6489162| ++-------+--------+----------+ + + +.. code:: python + + # Indexing columns 'city" and 'country' + transformer.string_to_index(["city", "country"]) + + # Show indexed DF + transformer.show() + ++-------+--------+----------+----------+-------------+ +|country| city|population|city_index|country_index| ++-------+--------+----------+----------+-------------+ +| Japan| Tokyo| 37800000| 1.0| 1.0| ++-------+--------+----------+----------+-------------+ +| USA|New York| 19795791| 2.0| 3.0| ++-------+--------+----------+----------+-------------+ +| France| Paris| 12341418| 3.0| 2.0| ++-------+--------+----------+----------+-------------+ +| Spain| Madrid| 6489162| 0.0| 0.0| ++-------+--------+----------+----------+-------------+ + +.. 
code:: python + + # Going back to strings from index + transformer.index_to_string(["country_index"]) + + # Show DF with column "county_index" back to string + transformer.show() + ++-------+--------+----------+-------------+----------+--------------------+ +|country| city|population|country_index|city_index|country_index_string| ++-------+--------+----------+-------------+----------+--------------------+ +| Japan| Tokyo| 37800000| 1.0| 1.0| Japan | ++-------+--------+----------+-------------+----------+--------------------+ +| USA|New York| 19795791| 3.0| 2.0| USA | ++-------+--------+----------+-------------+----------+--------------------+ +| France| Paris| 12341418| 2.0| 3.0| France | ++-------+--------+----------+-------------+----------+--------------------+ +| Spain| Madrid| 6489162| 0.0| 0.0| Spain | ++-------+--------+----------+-------------+----------+--------------------+ + + +Transformer.one_hot_encoder(input_cols) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This method maps a column of label indices to a column of binary vectors, with at most a single one-value. + +``input_cols`` argument receives a list of columns to be encoded. + +Let's create a sample dataframe to see what OHE does: + +.. code:: python + + # Importing Optimus + import optimus as op + #Importing utilities + tools = op.Utilities() + + # Creating DataFrame + data = [ + (0, "a"), + (1, "b"), + (2, "c"), + (3, "a"), + (4, "a"), + (5, "c") + ] + df = tools.create_data_frame(data,["id", "category"]) + + # Instantiating the transformer + transformer = op.DataFrameTransformer(df) + + # One Hot Encoding + transformer.one_hot_encoder(["id"]) + + # Show encoded dataframe + transformer.show() + ++---+--------+-------------+ +| id|category| id_encoded| ++---+--------+-------------+ +| 0| a|(5,[0],[1.0])| ++---+--------+-------------+ +| 1| b|(5,[1],[1.0])| ++---+--------+-------------+ +| 2| c|(5,[2],[1.0])| ++---+--------+-------------+ +| 3| a|(5,[3],[1.0])| ++---+--------+-------------+ +| 4| a|(5,[4],[1.0])| ++---+--------+-------------+ +| 5| c| (5,[],[])| ++---+--------+-------------+ + +Transformer.sql(sql_expression) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This method implements the transformations which are defined by SQL statement. Spark only support +SQL syntax like "SELECT ... FROM __THIS__ ..." where "__THIS__" represents the +underlying table of the input dataframe. Thank Spark for this amazing function. + +`sql_expression`` argument receives a string that contains SQL expression. + +Let's create a sample DataFrame to test this function. + +.. code:: python + + # Importing Optimus + import optimus as op + #Importing utilities + tools = op.Utilities() + + # Creating DataFrame + data = [ + (0, 1.0, 3.0), + (2, 2.0, 5.0) + ] + + df = tools.create_data_frame(data,["id", "v1", "v2"]) + + # Instantiating the transformer + transformer = op.DataFrameTransformer(df) + + +This dataframe is just this: + ++---+---+---+ +| id| v1| v2| ++---+---+---+ +| 0|1.0|3.0| ++---+---+---+ +| 2|2.0|5.0| ++---+---+---+ + +Now let's create two new columns from these ones. The first will be the sum of the columns `v1` and `v2`, and +the second one will be the multiplication of this two columns. With the `sql()` function we just need to +pass the sql expression and use at the end `FROM __THIS__` that will be the underlying table of the input dataframe. + +So: + +.. 
code:: python + + transformer.sql("SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__") + + +And this will output: + ++---+---+---+---+----+ +| id| v1| v2| v3| v4| ++---+---+---+---+----+ +| 0|1.0|3.0|4.0| 3.0| ++---+---+---+---+----+ +| 2|2.0|5.0|7.0|10.0| ++---+---+---+---+----+ + +Transformer.vector_assembler(input_cols) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This method combines a given list of columns into a single vector column. + +``input_cols`` argument receives the list of columns to be assembled. + +This is very important because lots of Machine Learning algorithms in Spark need this format to work. + +Let's create a sample dataframe to see what vector assembler does: + +.. code:: python + + # Importing Optimus + import optimus as op + #Importing utilities + tools = op.Utilities() + # Import Vectors + from pyspark.ml.linalg import Vectors + + # Creating DataFrame + data = [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)] + + df = tools.create_data_frame(data,["id", "hour", "mobile", "user_features", "clicked"]) + + # Instantiating the transformer + transformer = op.DataFrameTransformer(df) + + # Assemble features + transformer.vector_assembler(["hour", "mobile", "user_features"]) + + + # Show assembled df + print("Assembled columns 'hour', 'mobile', 'user_features' to vector column 'features'") + transformer.get_data_frame.select("features", "clicked").show(truncate=False) + + ++-----------------------+-------+ +|features |clicked| ++-----------------------+-------+ +|[18.0,1.0,0.0,10.0,0.5]|1.0 | ++-----------------------+-------+ + +Transformer.normalizer(input_cols,p=2.0) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This method transforms a dataset of Vector rows, normalizing each Vector to have unit norm. It takes a parameter ``p``, which +specifies the p-norm used for normalization (p=2 by default). + +``input_cols`` argument receives a list of columns to be normalized. + +``p`` argument is the p-norm used for normalization. + + + +Let's create a sample dataframe to see what normalizer does: + +.. 
code:: python + + # Importing Optimus + import optimus as op + #Importing utilities + tools = op.Utilities() + # Import Vectors + from pyspark.ml.linalg import Vectors + + data = [ + (0, Vectors.dense([1.0, 0.5, -1.0]),), + (1, Vectors.dense([2.0, 1.0, 1.0]),), + (2, Vectors.dense([4.0, 10.0, 2.0]),) + ] + + df = tools.create_data_frame(data,["id", "features"]) + + # Instantiating the transformer + transformer = op.DataFrameTransformer(df) + + # Normalize the features column + transformer.normalizer(["features"], p=2.0) + + # Show normalized data + transformer.show(truncate=False) + + ++---+--------------+-----------------------------------------------------------+ +|id |features |features_normalized | ++---+--------------+-----------------------------------------------------------+ +|0 |[1.0,0.5,-1.0]|[0.6666666666666666,0.3333333333333333,-0.6666666666666666]| ++---+--------------+-----------------------------------------------------------+ +|1 |[2.0,1.0,1.0] |[0.8164965809277261,0.4082482904638631,0.4082482904638631] | ++---+--------------+-----------------------------------------------------------+ +|2 |[4.0,10.0,2.0]|[0.3651483716701107,0.9128709291752769,0.18257418583505536]| ++---+--------------+-----------------------------------------------------------+ \ No newline at end of file diff --git a/docs/images/logoOptimus.png b/docs/source/sections/images/logoOptimus.png similarity index 100% rename from docs/images/logoOptimus.png rename to docs/source/sections/images/logoOptimus.png diff --git a/docs/images/priceHist.png b/docs/source/sections/images/priceHist.png similarity index 100% rename from docs/images/priceHist.png rename to docs/source/sections/images/priceHist.png diff --git a/docs/images/productHist.png b/docs/source/sections/images/productHist.png similarity index 100% rename from docs/images/productHist.png rename to docs/source/sections/images/productHist.png diff --git a/docs/images/robotOptimus.png b/docs/source/sections/images/robotOptimus.png similarity index 100% rename from docs/images/robotOptimus.png rename to docs/source/sections/images/robotOptimus.png diff --git a/docs/source/sections/installing.rst b/docs/source/sections/installing.rst new file mode 100644 index 000000000..23a84b362 --- /dev/null +++ b/docs/source/sections/installing.rst @@ -0,0 +1,15 @@ +Installation +=============== + +In your terminal just type: + +.. code:: bash + + pip install optimuspyspark + + +Requirements +---------------- + +- Apache Spark >= 2.2.0 +- Python >= 3.5 \ No newline at end of file diff --git a/docs/source/sections/overview.rst b/docs/source/sections/overview.rst new file mode 100644 index 000000000..31741f9d4 --- /dev/null +++ b/docs/source/sections/overview.rst @@ -0,0 +1,15 @@ +Overview +============= + +.. image:: images/logoOptimus.png + +Description +------------ + +Optimus (by Iron_) is the missing framework for cleaning and pre-processing data in a distributed fashion. It uses all the power of `Apache Spark`_ (optimized via Catalyst_) to do it. It implements several handy tools for data wrangling and munging that will make your life much easier. The first obvious advantage over any other public data cleaning library or framework is that it will work on your laptop or your big cluster, and second, it is amazingly easy to install, use and understand. + +.. _Iron: https://github.com/ironmussa + +.. _Apache Spark: https://spark.apache.org + +.. 
_Catalyst: https://static.javadoc.io/org.apache.spark/spark-catalyst_2.10/1.0.1/index.html#org.apache.spark.sql.catalyst.package diff --git a/docs/index.rst b/docs/source/sections/transforming.rst similarity index 51% rename from docs/index.rst rename to docs/source/sections/transforming.rst index 6ff2bc4e9..cb3075e01 100644 --- a/docs/index.rst +++ b/docs/source/sections/transforming.rst @@ -1,656 +1,5 @@ -Optimus (By Iron_) -===================== - -.. image:: images/logoOptimus.png - - -.. _Iron: https://github.com/ironmussa - -Description ------------- - -Optimus is the missing framework for cleaning and pre-processing data in a distributed fashion. It uses all the power of `Apache Spark`_ (optimized via Catalyst_) to do it. It implements several handy tools for data wrangling and munging that will make your life much easier. The first obvious advantage over any other public data cleaning library or framerwork is that it will work on your laptop or your big cluster, and second, it is amazingly easy to install, use and understand. - -.. _Apache Spark: https://spark.apache. - -.. _Catalyst: https://static.javadoc.io/org.apache.spark/spark-catalyst_2.10/1.0.1/index.html#org.apache.spark.sql.catalyst.package - -Requirements ------------- - -- Apache Spark 2.2.0 -- Python 3.5 - -Installation -------------- - -In your terminal just type: - -.. code:: bash - - pip install optimuspyspark - - -DataFrameTransformer --------------------- - -DataFrameTransformer is a powerful and flexible library to make -dataFrame transformations in Apache Spark (pySpark). - -This library contains several transformation functions based in spark -original modules but with some features added to facilitate its use. - -Since functions in this library are mounted in the Spark SQL Context, it -offers not only the high performance of original Spark SQL functions but -also an easier usability. - -DataFrameProfiler class ------------------------ - -This class makes a profile for a given dataframe and its different general features. -Based on spark-df-profiling by Julio Soto. - -Initially it is a good idea to see a general view of the DataFrame to be analyzed. - -Lets assume you have the following dataset, called foo.csv, in your current directory: - -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| id | firstName | lastName | billingId | product | price | birth | dummyCol | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 1 | Luis | Alvarez$$%! 
| 123 | Cake | 10 | 1980/07/07 | never | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 2 | André | Ampère | 423 | piza | 8 | 1950/07/08 | gonna | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 3 | NiELS | Böhr//((%% | 551 | pizza | 8 | 1990/07/09 | give | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 4 | PAUL | dirac$ | 521 | pizza | 8 | 1954/07/10 | you | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 5 | Albert | Einstein | 634 | pizza | 8 | 1990/07/11 | up | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 6 | Galileo | GALiLEI | 672 | arepa | 5 | 1930/08/12 | never | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 7 | CaRL | Ga%%%uss | 323 | taco | 3 | 1970/07/13 | gonna | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 8 | David | H$$$ilbert | 624 | taaaccoo | 3 | 1950/07/14 | let | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 9 | Johannes | KEPLER | 735 | taco | 3 | 1920/04/22 | you | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 10 | JaMES | M$$ax%%well | 875 | taco | 3 | 1923/03/12 | down | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 11 | Isaac | Newton | 992 | pasta | 9 | 1999/02/15 | never | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 12 | Emmy%% | Nöether$ | 234 | pasta | 9 | 1993/12/08 | gonna | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 13 | Max!!! | Planck!!! | 111 | hamburguer | 4 | 1994/01/04 | run | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 14 | Fred | Hoy&&&le | 553 | pizzza | 8 | 1997/06/27 | around | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 15 | ((( Heinrich ))))) | Hertz | 116 | pizza | 8 | 1956/11/30 | and | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 16 | William | Gilbert### | 886 | BEER | 2 | 1958/03/26 | desert | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 17 | Marie | CURIE | 912 | Rice | 1 | 2000/03/22 | you | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 18 | Arthur | COM%%%pton | 812 | 110790 | 5 | 1899/01/01 | # | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 19 | JAMES | Chadwick | 467 | null | 10 | 1921/05/03 | # | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ - -.. code:: python - - # Import optimus - import optimus as op - #Import os module for system tools - import os - - # Reading dataframe. os.getcwd() returns de current directory of the notebook - # 'file:///' is a prefix that specifies the type of file system used, in this - # case, local file system (hard drive of the pc) is used. 
- filePath = "file:///" + os.getcwd() + "/foo.csv" - - df = tools.read_csv(path=filePath, - sep=',') - - # Instance of profiler class - profiler = op.DataFrameProfiler(df) - profiler.profiler() - -This overview presents basic information about the DataFrame, like number of variable it has, -how many are missing values and in which column, the types of each varaible, also some statistical information -that describes the variable plus a frecuency plot. table that specifies the existing datatypes in each column -dataFrame and other features. Also, for this particular case, the table of dataType is shown in order to visualize -a sample of column content. - -DataFrameAnalyzer class ------------------------ - -DataFrameAnalyzer class analyze dataType of rows in each columns of -dataFrames. - -**DataFrameAnalyzer methods** - -- DataFrameAnalyzer.column_analyze(column_list, plots=True, values_bar=True, print_type=False, num_bars=10) -- DataFrameAnalyzer.plot_hist(df_one_col, hist_dict, type_hist, num_bars=20, values_bar=True) -- DataFrameAnalyzer.get_categorical_hist(df_one_col, num_bars) -- DataFrameAnalyzer.get_numerical_hist(df_one_col, num_bars) -- DataFrameAnalyzer.unique_values_col(column) -- DataFrameAnalyzer.write_json(json_cols, path_to_json_file) -- DataFrameAnalyzer.get_frequency(columns, sort_by_count=True) - -Lets assume you have the following dataset, called foo.csv, in your current directory: - -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| id | firstName | lastName | billingId | product | price | birth | dummyCol | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 1 | Luis | Alvarez$$%! | 123 | Cake | 10 | 1980/07/07 | never | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 2 | André | Ampère | 423 | piza | 8 | 1950/07/08 | gonna | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 3 | NiELS | Böhr//((%% | 551 | pizza | 8 | 1990/07/09 | give | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 4 | PAUL | dirac$ | 521 | pizza | 8 | 1954/07/10 | you | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 5 | Albert | Einstein | 634 | pizza | 8 | 1990/07/11 | up | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 6 | Galileo | GALiLEI | 672 | arepa | 5 | 1930/08/12 | never | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 7 | CaRL | Ga%%%uss | 323 | taco | 3 | 1970/07/13 | gonna | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 8 | David | H$$$ilbert | 624 | taaaccoo | 3 | 1950/07/14 | let | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 9 | Johannes | KEPLER | 735 | taco | 3 | 1920/04/22 | you | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 10 | JaMES | M$$ax%%well | 875 | taco | 3 | 1923/03/12 | down | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 11 | Isaac | Newton | 992 | pasta | 9 | 1999/02/15 | never | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 12 | Emmy%% 
| Nöether$ | 234 | pasta | 9 | 1993/12/08 | gonna | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 13 | Max!!! | Planck!!! | 111 | hamburguer | 4 | 1994/01/04 | run | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 14 | Fred | Hoy&&&le | 553 | pizzza | 8 | 1997/06/27 | around | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 15 | ((( Heinrich ))))) | Hertz | 116 | pizza | 8 | 1956/11/30 | and | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 16 | William | Gilbert### | 886 | BEER | 2 | 1958/03/26 | desert | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 17 | Marie | CURIE | 912 | Rice | 1 | 2000/03/22 | you | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 18 | Arthur | COM%%%pton | 812 | 110790 | 5 | 1899/01/01 | # | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ -| 19 | JAMES | Chadwick | 467 | null | 10 | 1921/05/03 | # | -+----+----------------------+-------------+-----------+------------+-------+------------+----------+ - -The following code shows how to instantiate the class to analyze a dataFrame: - -.. code:: python - - # Import optimus - import optimus as op - # Instance of Utilities class - tools = op.Utilities() - - # Reading dataframe. os.getcwd() returns de current directory of the notebook - # 'file:///' is a prefix that specifies the type of file system used, in this - # case, local file system (hard drive of the pc) is used. - filePath = "file:///" + os.getcwd() + "/foo.csv" - - df = tools.read_csv(path=filePath, sep=',') - - analyzer = op.DataFrameAnalyzer(df=df,pathFile=filePath) - -Methods --------- - -Analyzer.column_analyze(column_list, plots=True, values_bar=True, print_type=False, num_bars=10) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This function counts the number of registers in a column that are numbers (integers, floats) and the number of string registers. - -Input: - -``column_list``: A list or a string column name. - -``plots``: Can be True or False. If true it will output the predefined plots. - -``values_bar (optional)``: Can be True or False. If it is True, frequency values are placed over each bar. - -``print_type (optional)``: Can be one of the following strings: 'integer', 'string', 'float'. Depending of what string -is provided, a list of distinct values of that type is printed. - -``num_bars``: number of bars printed in histogram - -The method outputs a list containing the number of the different datatypes [nulls, strings, integers, floats]. - -Example: - -.. 
code:: python - - analyzer.column_analyze("*", plots=False, values_bar=True, print_type=False, num_bars=10) - -+-----------+----------+------------+----------------------+ -| | | | Column name: id | -+-----------+----------+------------+----------------------+ -| | | | Column datatype: int | -+-----------+----------+------------+----------------------+ -| Datatype | Quantity | Percentage | | -+-----------+----------+------------+----------------------+ -| None | 0 | 0.00 % | | -+-----------+----------+------------+----------------------+ -| Empty str | 0 | 0.00 % | | -+-----------+----------+------------+----------------------+ -| String | 0 | 0.00 % | | -+-----------+----------+------------+----------------------+ -| Integer | 19 | 100.00 % | | -+-----------+----------+------------+----------------------+ -| Float | 0 | 0.00 % | | -+-----------+----------+------------+----------------------+ - -Min value: 1 - -Max value: 19 - -end of __analyze 4.059180021286011 - -+-----------+----------+------------+-------------------------+ -| | | | Column name: firstName | -+-----------+----------+------------+-------------------------+ -| | | | Column datatype: string | -+-----------+----------+------------+-------------------------+ -| Datatype | Quantity | Percentage | | -+-----------+----------+------------+-------------------------+ -| None | 0 | 0.00 % | | -+-----------+----------+------------+-------------------------+ -| Empty str | 0 | 0.00 % | | -+-----------+----------+------------+-------------------------+ -| String | 19 | 100.00 % | | -+-----------+----------+------------+-------------------------+ -| Integer | 0 | 0.00 % | | -+-----------+----------+------------+-------------------------+ -| Float | 0 | 0.00 % | | -+-----------+----------+------------+-------------------------+ - -end of __analyze 1.1431787014007568 - -+-----------+----------+------------+-------------------------+ -| | | | Column name: lastName | -+-----------+----------+------------+-------------------------+ -| | | | Column datatype: string | -+-----------+----------+------------+-------------------------+ -| Datatype | Quantity | Percentage | | -+-----------+----------+------------+-------------------------+ -| None | 0 | 0.00 % | | -+-----------+----------+------------+-------------------------+ -| Empty str | 0 | 0.00 % | | -+-----------+----------+------------+-------------------------+ -| String | 19 | 100.00 % | | -+-----------+----------+------------+-------------------------+ -| Integer | 0 | 0.00 % | | -+-----------+----------+------------+-------------------------+ -| Float | 0 | 0.00 % | | -+-----------+----------+------------+-------------------------+ - -end of __analyze 0.9663524627685547 - -+-----------+----------+------------+------------------------+ -| | | | Column name: billingId | -+-----------+----------+------------+------------------------+ -| | | | Column datatype: int | -+-----------+----------+------------+------------------------+ -| Datatype | Quantity | Percentage | | -+-----------+----------+------------+------------------------+ -| None | 0 | 0.00 % | | -+-----------+----------+------------+------------------------+ -| Empty str | 0 | 0.00 % | | -+-----------+----------+------------+------------------------+ -| String | 0 | 0.00 % | | -+-----------+----------+------------+------------------------+ -| Integer | 19 | 100.00 % | | -+-----------+----------+------------+------------------------+ -| Float | 0 | 0.00 % | | -+-----------+----------+------------+------------------------+ - -Min 
value: 111 - -Max value: 992 - -end of __analyze 4.292513847351074 - -+-----------+----------+------------+-------------------------+ -| | | | Column name: product | -+-----------+----------+------------+-------------------------+ -| | | | Column datatype: string | -+-----------+----------+------------+-------------------------+ -| Datatype | Quantity | Percentage | | -+-----------+----------+------------+-------------------------+ -| None | 0 | 0.00 % | | -+-----------+----------+------------+-------------------------+ -| Empty str | 0 | 0.00 % | | -+-----------+----------+------------+-------------------------+ -| String | 18 | 94.74 % | | -+-----------+----------+------------+-------------------------+ -| Integer | 1 | 5.26 % | | -+-----------+----------+------------+-------------------------+ -| Float | 0 | 0.00 % | | -+-----------+----------+------------+-------------------------+ - -end of __analyze 1.180891990661621 - -+-----------+----------+------------+------------------------+ -| | | | Column name: price | -+-----------+----------+------------+------------------------+ -| | | | Column datatype: int | -+-----------+----------+------------+------------------------+ -| Datatype | Quantity | Percentage | | -+-----------+----------+------------+------------------------+ -| None | 0 | 0.00 % | | -+-----------+----------+------------+------------------------+ -| Empty str | 0 | 0.00 % | | -+-----------+----------+------------+------------------------+ -| String | 0 | 0.00 % | | -+-----------+----------+------------+------------------------+ -| Integer | 19 | 100.00 % | | -+-----------+----------+------------+------------------------+ -| Float | 0 | 0.00 % | | -+-----------+----------+------------+------------------------+ - -Min value: 1 - -Max value: 10 - -end of __analyze 4.364053964614868 - -+-----------+----------+------------+-------------------------+ -| | | | Column name: birth | -+-----------+----------+------------+-------------------------+ -| | | | Column datatype: string | -+-----------+----------+------------+-------------------------+ -| Datatype | Quantity | Percentage | | -+-----------+----------+------------+-------------------------+ -| None | 0 | 0.00 % | | -+-----------+----------+------------+-------------------------+ -| Empty str | 0 | 0.00 % | | -+-----------+----------+------------+-------------------------+ -| String | 19 | 100.00 % | | -+-----------+----------+------------+-------------------------+ -| Integer | 0 | 0.00 % | | -+-----------+----------+------------+-------------------------+ -| Float | 0 | 0.00 % | | -+-----------+----------+------------+-------------------------+ - -end of __analyze 0.9144570827484131 - -+-----------+----------+------------+-------------------------+ -| | | | Column name: dummyCol | -+-----------+----------+------------+-------------------------+ -| | | | Column datatype: string | -+-----------+----------+------------+-------------------------+ -| Datatype | Quantity | Percentage | | -+-----------+----------+------------+-------------------------+ -| None | 0 | 0.00 % | | -+-----------+----------+------------+-------------------------+ -| Empty str | 0 | 0.00 % | | -+-----------+----------+------------+-------------------------+ -| String | 19 | 100.00 % | | -+-----------+----------+------------+-------------------------+ -| Integer | 0 | 0.00 % | | -+-----------+----------+------------+-------------------------+ -| Float | 0 | 0.00 % | | -+-----------+----------+------------+-------------------------+ - -end of __analyze 
0.9651758670806885 - -Total execution time: 17.98968768119812 - -+-----------+------------------+---------------------+ -| | | General Description | -+-----------+------------------+---------------------+ -| Features | Name or Quantity | | -+-----------+------------------+---------------------+ -| File Name | foo.csv | | -+-----------+------------------+---------------------+ -| Columns | 8 | | -+-----------+------------------+---------------------+ -| Rows | 19 | | -+-----------+------------------+---------------------+ - -Analyzer.get_categorical_hist(df_one_col, num_bars) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This function analyzes a dataframe of a single column (only string type columns) and returns a dictionary with bins and values of frequency. - -Input: - -``df_one_col``:One column dataFrame. - -``num_bars``: Number of bars or histogram bins. - -The method outputs a dictionary with bins and values of frequency for only type strings colmuns. - -Example: - -Lets say we want to plot a histogram of frecuencies for the ``product`` column. We first need to obtain the dictionary of the frecuencies for each one. This is what this function does for categorical data. Remember that if you run the ``columnAnalyze()`` method with ``plots = True`` this is done for you. - -.. code:: python - - productDf = analyzer.get_data_frame.select("product") #or df.select("product") - hist_dictPro = analyzer.get_categorical_hist(df_one_col=productDf, num_bars=10) - print(hist_dictPro) - -.. code:: python - - #Output - """[{'cont': 4, 'value': 'pizza'}, {'cont': 3, 'value': 'taco'}, {'cont': 2, 'value': 'pasta'}, {'cont': 1, 'value': 'hamburguer'}, {'cont': 1, 'value': 'BEER'}, {'cont': 1, 'value': 'Rice'}, {'cont': 1, 'value': 'piza'}, {'cont': 1, 'value': 'Cake'}, {'cont': 1, 'value': 'arepa'}, {'cont': 1, 'value': '110790'}]""" - -Now that we have the dictionary we just need to call ``plot_hist()``. - -Analyzer.get_numerical_hist(df_one_col, num_bars) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This function analyzes a dataframe of a single column (only numerical columns) and returns a dictionary with bins and values of frequency. - -Input: - -``df_one_col``:One column dataFrame. - -``num_bars``: Number of bars or histogram bins. - -The method outputs a dictionary with bins and values of frequency for only numerical colmuns. - -Example: - -Lets say we want to plot a histogram of frecuencies for the ``price`` column. We first need to obtain the dictionary of the frecuencies for each one. This is what this function does for numerical data. Remember that if you run the ``columnAnalyze()`` method with ``plots = True`` this is done for you. - -.. code:: python - - priceDf = analyzer.get_data_frame.select("price") #or df.select("price") - hist_dictPri = analyzer.get_numerical_hist(df_one_col=priceDf, num_bars=10) - print(hist_dictPri) - -.. code:: python - - #Output - """[{'cont': 2, 'value': 9.55}, {'cont': 2, 'value': 8.649999999999999}, {'cont': 6, 'value': 7.749999999999999}, {'cont': 2, 'value': 5.05}, {'cont': 1, 'value': 4.1499999999999995}, {'cont': 4, 'value': 3.25}, {'cont': 1, 'value': 2.3499999999999996}, {'cont': 1, 'value': 1.45}]""" - - -Analyzer.plot_hist(df_one_col, hist_dict, type_hist, num_bars=20, values_bar=True) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This function builds the histogram (bins) of a categorical or numerical column dataframe. - -Input: - -``df_one_col``: A dataFrame of one column. 
- -``hist_dict``: Python dictionary with histogram values. - -``type_hist``: type of histogram to be generated, numerical or categorical. - -``num_bars``: Number of bars in histogram. - -``values_bar``: If values_bar is True, values of frequency are plotted over bars. - -The method outputs a plot of the histogram for a categorical or numerical column. - -Example: - -.. code:: python - - # For a categorical DF - analyzer.plot_hist(df_one_col=productDf,hist_dict= hist_dictPro, type_hist='categorical') - -.. image:: images/productHist.png - -.. code:: python - - # For a numerical DF - analyzer.plot_hist(df_one_col=priceDf,hist_dict= hist_dictPri, type_hist='categorical') - -.. image:: images/priceHist.png - -Analyzer.unique_values_col(column) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This function counts the number of values that are unique and also the total number of values. Then, returns the values obtained. - -Input: - -``column``: Name of column dataFrame, this argument must be string type. - -The method outputs a dictionary of values counted, as an example: ``{'unique': 10, 'total': 15}``. - -Example: - -.. code:: python - - print(analyzer.unique_values_col("product")) - print(analyzer.unique_values_col("price")) - -.. code:: python - - #Output - {'unique': 13, 'total': 19} - {'unique': 8, 'total': 19} - -Analyzer.write_json(json_cols, path_to_json_file) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This functions outputs a JSON for the DataFrame in the specified path. - -Input: - -``json_cols``: Dictionary that represents the dataframe. - -``path_to_json_file``: Specified path to write the returned JSON. - -The method outputs the dataFrame as a JSON. To use it in a simple way first run - -.. code:: python - - json_cols = analyzer.column_analyze(column_list="*", print_type=False, plots=False) - -And you will have the desired dictionary to pass to the write_json function. - -Example: - -.. code:: python - - analyzer.write_json(json_cols=json_cols, path_to_json_file= os.getcwd() + "/foo.json") - -Analyzer.get_frequency(self, columns, sort_by_count=True) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This function gets the frequencies for values inside the specified columns. - -Input: - -``columns``: String or List of columns to analyze - -``sort_by_count``: Boolean if true the counts will be sort desc. - -The method outputs a Spark Dataframe with counts per existing values in each column. - -Tu use it, first lets create a sample DataFrame: - -.. code:: python - - import random - import optimus as op - from pyspark.sql.types import StringType, StructType, IntegerType, FloatType, DoubleType, StructField - - schema = StructType( - [ - StructField("strings", StringType(), True), - StructField("integers", IntegerType(), True), - StructField("integers2", IntegerType(), True), - StructField("floats", FloatType(), True), - StructField("double", DoubleType(), True) - ] - ) - - size = 200 - # Generating strings column: - foods = [' pizza! 
', 'pizza', 'PIZZA;', 'pizza', 'pízza¡', 'Pizza', 'Piz;za'] - foods = [foods[random.randint(0,6)] for count in range(size)] - # Generating integer column: - num_col_1 = [random.randint(0,9) for number in range(size)] - # Generating integer column: - num_col_2 = [random.randint(0,9) for number in range(size)] - # Generating integer column: - num_col_3 = [random.random() for number in range(size)] - # Generating integer column: - num_col_4 = [random.random() for number in range(size)] - - # Building DataFrame - df = op.spark.createDataFrame(list(zip(foods, num_col_1, num_col_2, num_col_3, num_col_4)),schema=schema) - - # Instantiate Analyzer - analyzer = op.DataFrameAnalyzer(df) - - # Get frequency DataFrame - df_counts = analyzer.get_frequency(["strings", "integers"], True) - -And you will get (note that these are random generated values): - -+-----------------+-----+ -| strings|count| -+-----------------+-----+ -| pizza| 48| -+-----------------+-----+ -| Piz;za| 38| -+-----------------+-----+ -| Pizza| 37| -+-----------------+-----+ -| pízza¡| 29| -+-----------------+-----+ -| pizza! | 25| -+-----------------+-----+ -| PIZZA;| 23| -+-----------------+-----+ - -+--------+-----+ -|integers|count| -+--------+-----+ -| 8| 31| -+--------+-----+ -| 5| 24| -+--------+-----+ -| 1| 24| -+--------+-----+ -| 9| 20| -+--------+-----+ -| 6| 20| -+--------+-----+ -| 2| 19| -+--------+-----+ -| 3| 19| -+--------+-----+ -| 0| 17| -+--------+-----+ -| 4| 14| -+--------+-----+ -| 7| 12| -+--------+-----+ +Transforming your Data +======================= DataFrameTransformer class -------------------------- @@ -680,7 +29,7 @@ DataFrameTransformer class - DataFrameTransformer.remove_special_chars(columns) - DataFrameTransformer.date_transform(column, dateFormat) -* **General operation function**: +* **General operation function**: - DataFrameTransformer.set_col(columns, func, dataType) @@ -721,9 +70,9 @@ dataFrame: transformer = op.DataFrameTransformer(df) transformer.show() - + Output: - + +-----------+-------+----------+ | city|country|population| +-----------+-------+----------+ @@ -735,9 +84,6 @@ Output: +-----------+-------+----------+ | Madrid| Spain| 6489162| +-----------+-------+----------+ - -Methods -------- Transformer.trim_col(columns) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -892,7 +238,7 @@ Original dataFrame: +-----------+-------+----------+ New dataFrame: - + +-----------+----------+ | city|population| +-----------+----------+ @@ -1062,7 +408,7 @@ Original dataFrame: +-----------+-------+----------+ New dataFrame: - + +-----------+-------+----------+ | city|country|population| +-----------+-------+----------+ @@ -1106,7 +452,7 @@ Here some examples: print (' Replacing a number if value in cell is greater than 5:') - # Replacing a number: + # Replacing a number: func = lambda cell: (cell * 2) if (cell > 14000000 ) else cell transformer.set_col(['population'], func, 'integer') @@ -1828,7 +1174,73 @@ Now lets write this DF as a CSV This will create a folder with the name "test.csv" in the current path, and inside it will be te CSV with the concept. But with the ``read_csv`` function you can just pass the name "test.csv" and Optimus will understand. - -Library mantained by `Favio Vazquez`_ ------------------------------------------- -.. _Favio Vazquez: https://github.com/faviovazquez + +DataFrameTransformer.replace_na(value, columns=None) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This method replace nulls with specified value. 
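+
+In its simplest form you pass a single scalar; ``value`` can also be a dict that maps column names to per-column
+replacement values. A minimal sketch (the column names ``a`` and ``b`` are taken from the sample dataset used in the
+full example below; the replacement values are invented):
+
+.. code:: python
+
+    transformer.replace_na({"a": 0.0, "b": 99.0})
+
+Parameter details and a complete worked example follow.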
+
+``columns`` argument is an optional list of column names to consider. Columns specified in ``columns`` that do not
+have a matching data type are ignored. For example, if value is a string and ``columns`` contains a non-string
+column, then the non-string column is simply ignored. If `columns == "*"` then it will choose all columns.
+
+``value`` argument is the value to replace nulls with. If the value is a dict, then ``columns`` is ignored and value
+must be a mapping from column name (string) to replacement value. The replacement value must be an int, long,
+float, or string.
+
+Let's download some sample data using our amazing `read_url` function.
+
+
+.. code:: python
+
+    # Import optimus
+    import optimus as op
+    # Instance of Utilities class
+    tools = op.Utilities()
+    # Reading df from web
+    url = "https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/impute_data.csv"
+    df = tools.read_url(path=url)
+
+If we examine this DF we see that there are some missing values.
+
++---+---+
+|  a|  b|
++---+---+
+|1.0|NaN|
++---+---+
+|2.0|NaN|
++---+---+
+|NaN|3.0|
++---+---+
+|4.0|4.0|
++---+---+
+|5.0|5.0|
++---+---+
+
+Remember that we have the `impute_missing` function that lets you choose to use the mean or the median of the columns in
+which the missing values are located for your imputation. But with `replace_na` you can simply replace the nulls in one,
+or all, columns of the dataframe with a specific value. For this example we will replace NA with 0's.
+
+.. code:: python
+
+    # Instantiation of DataTransformer class:
+    transformer = op.DataFrameTransformer(df)
+    # Replace NA with 0's
+    transformer.replace_na(0.0, columns="*")
+    # Show DF
+    transformer.show()
+
++---+---+
+|  a|  b|
++---+---+
+|1.0|0.0|
++---+---+
+|2.0|0.0|
++---+---+
+|0.0|3.0|
++---+---+
+|4.0|4.0|
++---+---+
+|5.0|5.0|
++---+---+
+
+And that's it!
\ No newline at end of file
diff --git a/optimus/df_analyzer.py b/optimus/df_analyzer.py
index 6ca260d81..b3875bf6e 100644
--- a/optimus/df_analyzer.py
+++ b/optimus/df_analyzer.py
@@ -503,11 +503,13 @@ def get_data_frame(self):
         """This function return the dataframe of the class"""
         return self._df
 
-    def show(self):
+    def show(self, n=10, truncate=True):
         """This function shows the dataframe of the class
+        :param n: number of rows to show
+        :param truncate: If set to True, truncate strings longer than 20 chars by default.
         :rtype: pyspark.sql.dataframe.DataFrame.show()
         """
-        return self._df.show()
+        return self._df.show(n, truncate)
 
     # Function to give general features of dataFrame:
     def general_description(self):
diff --git a/optimus/df_outliers.py b/optimus/df_outliers.py
index c4616ee4e..1f26c78d9 100644
--- a/optimus/df_outliers.py
+++ b/optimus/df_outliers.py
@@ -7,7 +7,12 @@ class OutlierDetector:
     """
     Outlier detection for pyspark dataframes.
    """
-    def __init__(self, df, column):
+    def __init__(self, df, column, threshold=2):
+        """
+        :param df: Spark Dataframe to analyze
+        :param column: Column in dataframe to get outliers
+        :param threshold: Threshold for MAD. Default = 2.
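+
+        Example (an illustrative sketch only; the column name "price" is hypothetical)::
+
+            detector = OutlierDetector(df, "price", threshold=3)
+            detector.show()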
+ """ self.spark = SparkSession.builder.enableHiveSupport().getOrCreate() self._df = df self._column = column @@ -22,7 +27,7 @@ def __init__(self, df, column): self.mad_value = median(absolute_deviation, column) - self.threshold = 2 + self.threshold = threshold self._limits = [] self._limits.append(round((self.median_value - self.threshold * self.mad_value), 2)) @@ -78,12 +83,39 @@ def get_data_frame(self): """ return self._df - def show(self, n=10): + def show(self, n=10, truncate=True): """This function shows the dataframe of the class :param n: number or rows to show + :param truncate: If set to True, truncate strings longer than 20 chars by default. :rtype: pyspark.sql.dataframe.DataFrame.show() """ - return self._df.show(n) + return self._df.show(n, truncate) + + def to_csv(self, path_name, header="true", mode="overwrite", sep=",", *args, **kargs): + """ + Write dataframe as CSV. + :param path_name: Path to write the DF and the name of the output CSV file. + :param header: True or False to include header + :param mode: Specifies the behavior of the save operation when data already exists. + "append": Append contents of this DataFrame to existing data. + "overwrite" (default case): Overwrite existing data. + "ignore": Silently ignore this operation if data already exists. + "error": Throw an exception if data already exists. + :param sep: sets the single character as a separator for each field and value. If None is set, + it uses the default value. + :return: Dataframe in a CSV format in the specified path. + """ + + assert isinstance(path_name, str), "Error: path_name argument must be a string." + + assert header == "true" or header == "false", "Error header must be 'true' or 'false'." + + if header == 'true': + header = True + else: + header = False + + return self._df.write.options(header=header).mode(mode).csv(path_name, sep=sep, *args, **kargs) def median(df, column): diff --git a/optimus/df_transformer.py b/optimus/df_transformer.py index 2693c42fb..c5a02b728 100644 --- a/optimus/df_transformer.py +++ b/optimus/df_transformer.py @@ -52,7 +52,8 @@ def _assert_cols_in_df(cls, columns_provided, columns_df): columns_provided: the list of columns to be process. columns_df: list of columns's dataFrames """ - col_not_valids = (set([column for column in columns_provided]).difference(set([column for column in columns_df]))) + col_not_valids = ( + set([column for column in columns_provided]).difference(set([column for column in columns_df]))) assert (col_not_valids == set()), 'Error: The following columns do not exits in dataFrame: %s' % col_not_valids def _add_transformation(self): @@ -75,12 +76,13 @@ def get_data_frame(self): """ return self._df - def show(self, n=10): + def show(self, n=10, truncate=True): """This function shows the dataframe of the class :param n: number or rows to show + :param truncate: If set to True, truncate strings longer than 20 chars by default. :rtype: pyspark.sql.dataframe.DataFrame.show() """ - return self._df.show(n) + return self._df.show(n, truncate) def lower_case(self, columns): """This function set all strings in columns of dataframe specified to lowercase. @@ -129,6 +131,40 @@ def impute(cols): return self + def replace_na(self, value, columns=None): + """ + Replace nulls with specified value. + :param columns: optional list of column names to consider. Columns specified in subset that do not have + matching data type are ignored. 
For example, if value is a string, and subset contains a non-string column, + then the non-string column is simply ignored. + :param value: Value to replace null values with. If the value is a dict, then subset is ignored and value + must be a mapping from column name (string) to replacement value. The replacement + value must be an int, long, float, or string. + :return: Transformer object (DF with columns with replaced null values). + """ + + if columns == "*": + columns = None + + # Columns to list + if isinstance(columns, str): + columns = [columns] + + if columns is not None: + assert isinstance(columns, list), "Error: columns argument must be a list" + + assert isinstance(value, (int, float, str, dict)), "Error: value argument must be an " \ + "int, long, float, string, or dict" + + def replace_it(val, col): + self._df = self._df.fillna(val, subset=col) + + replace_it(val=value, col=columns) + + self._add_transformation() + + return self + def check_point(self): """This method is a very useful function to break lineage of transformations. By default Spark uses the lazy evaluation approach in processing data: transformation functions are not computed into an action is called. @@ -144,11 +180,11 @@ def check_point(self): # Checkpointing of dataFrame. One question can be thought. Why not use cache() or persist() instead of # checkpoint. This is because cache() and persis() apparently do not break the lineage of operations, - print ("Saving changes at disk by checkpoint...") + print("Saving changes at disk by checkpoint...") self._df.checkpoint() self._df.count() self._df = self._sql_context.createDataFrame(self._df, self._df.schema) - print ("Done.") + print("Done.") execute = check_point @@ -249,7 +285,7 @@ def col_replace(columns): # Asserting change_to parameter is a string or a number assert isinstance(change_to, str) or isinstance(change_to, float) or isinstance(change_to, - int),\ + int), \ "Error: change_to parameter must be a number or string" # Asserting search and change_to have same type @@ -274,7 +310,7 @@ def col_replace(columns): assert ( col_not_valids == set()), 'Error: The following columns do not have same datatype argument provided: %s' % \ - col_not_valids + col_not_valids col_replace(columns) @@ -313,7 +349,7 @@ def set_col(self, columns, func, data_type): :return transformer object """ dict_types = {'string': StringType(), 'str': StringType(), 'integer': IntegerType(), - 'int': IntegerType(), 'float': FloatType(), 'double': DoubleType(), 'Double': DoubleType()} + 'int': IntegerType(), 'float': FloatType(), 'double': DoubleType(), 'Double': DoubleType()} types = {'string': 'string', 'str': 'string', 'String': 'string', 'integer': 'int', 'int': 'int', 'float': 'float', 'double': 'double', 'Double': 'double'} @@ -350,7 +386,7 @@ def col_set(columns, function): assert ( col_not_valids == set()), 'Error: The following columns do not have same datatype argument provided: %s' \ - % col_not_valids + % col_not_valids col_set(columns, function) @@ -415,7 +451,7 @@ def clear_accents(self, columns): assert ( col_not_valids == set()), 'Error: The following columns do not have same datatype argument provided: %s' \ - % col_not_valids + % col_not_valids # Receives a string as an argument def remove_accents(input_str): @@ -460,7 +496,7 @@ def remove_special_chars(self, columns): assert ( col_not_valids == set()), 'Error: The following columns do not have same datatype argument provided: %s' \ - % col_not_valids + % col_not_valids def rm_spec_chars(input_str): # Remove all 
punctuation and control characters @@ -507,7 +543,7 @@ def remove_special_chars_regex(self, columns, regex): assert ( col_not_valids == set()), 'Error: The following columns do not have same datatype argument provided: %s' \ - % col_not_valids + % col_not_valids def rm_spec_chars_regex(input_str, regex): for _ in set(input_str): @@ -515,7 +551,7 @@ def rm_spec_chars_regex(input_str, regex): return input_str # User define function that does operation in cells - function = udf(lambda cell: rm_spec_chars_regex(cell,regex) if cell is not None else cell, StringType()) + function = udf(lambda cell: rm_spec_chars_regex(cell, regex) if cell is not None else cell, StringType()) exprs = [function(c).alias(c) if (c in columns) and (c in valid_cols) else c for c in self._df.columns] @@ -535,7 +571,6 @@ def rename_col(self, columns): assert isinstance(columns, list) and isinstance(columns[0], tuple), \ "Error: Column argument must be a list of tuples" - col_not_valids = ( set([column[0] for column in columns]).difference(set([column for column in self._df.columns]))) @@ -570,7 +605,7 @@ def lookup(self, column, str_to_replace, list_str=None): # Asserting columns is string or list: assert isinstance(str_to_replace, (str, dict)), "Error: str_to_replace argument must be a string or a dict" - if isinstance(str_to_replace, dict): + if isinstance(str_to_replace, dict): assert (str_to_replace != {}), "Error, str_to_replace must be a string or a non empty python dictionary" assert ( list_str is None), "Error, If a python dictionary if specified, list_str argument must be None: list_str=None" @@ -627,10 +662,10 @@ def replace_from_dic(str_test): def move_col(self, column, ref_col, position): """This funcion change column position in dataFrame. - :param column Name of the column to be moved in dataFrame. column argument must be a string. - :param ref_col Name of reference column in dataFrame. This column will be a reference to place the + :param column: Name of the column to be moved in dataFrame. column argument must be a string. + :param ref_col: Name of reference column in dataFrame. This column will be a reference to place the column to be moved. - :param position Can be one of the following options: 'after' or 'before'. If 'after' is provided, column + :param position: Can be one of the following options: 'after' or 'before'. 
If 'after' is provided, column provided will be placed just after the ref_col selected.""" # Columns of dataFrame columns = self._df.columns @@ -738,7 +773,7 @@ def count_items(self, col_id, col_search, new_col_feature, search_string): df_mod = df_mod.drop(col_id + '_other').drop(col_search).withColumnRenamed('count', new_col_feature) \ .dropna("any") - print("Counting existing "+search_string + " in "+col_search) + print("Counting existing " + search_string + " in " + col_search) return df_mod.sort(col_id).drop_duplicates([col_id]) def date_transform(self, columns, current_format, output_format): @@ -818,7 +853,7 @@ def cast_func(self, cols_and_types): """ dict_types = {'string': StringType(), 'str': StringType(), 'integer': IntegerType(), - 'int': IntegerType(), 'float': FloatType(), 'double': DoubleType(), 'Double': DoubleType()} + 'int': IntegerType(), 'float': FloatType(), 'double': DoubleType(), 'Double': DoubleType()} types = {'string': 'string', 'str': 'string', 'String': 'string', 'integer': 'int', 'int': 'int', 'float': 'float', 'double': 'double', 'Double': 'double'} @@ -919,7 +954,7 @@ def check_data_type(value): "Error: Column %s specified as columnName argument does not exist in dataframe" % column # Checking if column has a valid datatype: assert (data_type in ['integer', 'float', 'string', - 'null']), \ + 'null']), \ "Error: data_type only can be one of the followings options: integer, float, string, null." # Checking if func parameters is func data_type or None assert isinstance(func, type(None)) or isinstance(func, type(lambda x: x)), \ @@ -958,7 +993,7 @@ def row_filter_by_type(self, column_name, type_to_delete): self._assert_type_str(type_to_delete, "type_to_delete") # Asserting if dataType argument has a valid type: assert (type_to_delete in ['integer', 'float', 'string', - 'null']), \ + 'null']), \ "Error: dataType only can be one of the followings options: integer, float, string, null." # Function for determine if register value is float or int or string: @@ -1135,7 +1170,7 @@ def remove_empty_rows(self, how="all"): assert isinstance(how, str), "Error, how argument provided must be a string." assert how == 'all' or ( - how == 'any'), "Error, how only can be 'all' or 'any'." + how == 'any'), "Error, how only can be 'all' or 'any'." self._df = self._df.dropna(how) @@ -1162,7 +1197,7 @@ def write_df_as_json(self, path): # outfile.write(str(json_cols).replace("'", "\"")) outfile.write(p) - def to_csv(self, path_name, header=True, mode="overwrite", sep=",", *args, **kargs): + def to_csv(self, path_name, header="true", mode="overwrite", sep=",", *args, **kargs): """ Write dataframe as CSV. :param path_name: Path to write the DF and the name of the output CSV file. @@ -1177,4 +1212,157 @@ def to_csv(self, path_name, header=True, mode="overwrite", sep=",", *args, **kar :return: Dataframe in a CSV format in the specified path. """ + self._assert_type_str(path_name, "path_name") + + assert header == "true" or header == "false", "Error header must be 'true' or 'false'" + + if header == 'true': + header = True + else: + header = False + return self._df.write.options(header=header).mode(mode).csv(path_name, sep=sep, *args, **kargs) + + def string_to_index(self, input_cols): + """ + Maps a string column of labels to an ML column of label indices. If the input column is + numeric, we cast it to string and index the string values. + :param input_cols: Columns to be indexed. + :return: Dataframe with indexed columns. 
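+
+        Example (an illustrative sketch only; the column name "product" is hypothetical)::
+
+            transformer = op.DataFrameTransformer(df)
+            transformer.string_to_index("product")  # appends a "product_index" column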
+ """ + + # Check if columns argument must be a string or list datatype: + self._assert_type_str_or_list(input_cols, "input_cols") + + if isinstance(input_cols, str): + input_cols = [input_cols] + + from pyspark.ml import Pipeline + from pyspark.ml.feature import StringIndexer + + indexers = [StringIndexer(inputCol=column, outputCol=column + "_index").fit(self._df) for column in + list(set(input_cols))] + + pipeline = Pipeline(stages=indexers) + self._df = pipeline.fit(self._df).transform(self._df) + + return self + + def index_to_string(self, input_cols): + """ + Maps a column of indices back to a new column of corresponding string values. The index-string mapping is + either from the ML attributes of the input column, or from user-supplied labels (which take precedence over + ML attributes). + :param input_cols: Columns to be indexed. + :return: Dataframe with indexed columns. + """ + + # Check if columns argument must be a string or list datatype: + self._assert_type_str_or_list(input_cols, "input_cols") + + if isinstance(input_cols, str): + input_cols = [input_cols] + + from pyspark.ml import Pipeline + from pyspark.ml.feature import IndexToString + + indexers = [IndexToString(inputCol=column, outputCol=column + "_string") for column in + list(set(input_cols))] + + pipeline = Pipeline(stages=indexers) + self._df = pipeline.fit(self._df).transform(self._df) + + return self + + def one_hot_encoder(self, input_cols): + """ + Maps a column of label indices to a column of binary vectors, with at most a single one-value. + :param input_cols: Columns to be encoded. + :return: Dataframe with encoded columns. + """ + + # Check if columns argument must be a string or list datatype: + self._assert_type_str_or_list(input_cols, "input_cols") + + if isinstance(input_cols, str): + input_cols = [input_cols] + + from pyspark.ml import Pipeline + from pyspark.ml.feature import OneHotEncoder + + encode = [OneHotEncoder(inputCol=column, outputCol=column + "_encoded") for column in + list(set(input_cols))] + + pipeline = Pipeline(stages=encode) + self._df = pipeline.fit(self._df).transform(self._df) + + return self + + def sql(self, sql_expression): + """ + Implements the transformations which are defined by SQL statement. Currently we only support + SQL syntax like "SELECT ... FROM __THIS__ ..." where "__THIS__" represents the + underlying table of the input dataframe. + :param sql_expression: SQL expression. + :return: Dataframe with columns changed by SQL statement. + """ + + self._assert_type_str(sql_expression, "sql_expression") + + from pyspark.ml.feature import SQLTransformer + + sql_trans = SQLTransformer(statement=sql_expression) + + self._df = sql_trans.transform(self._df) + + return self + + def vector_assembler(self, input_cols): + """ + Combines a given list of columns into a single vector column. + :param input_cols: Columns to be assembled. + :return: Dataframe with assembled column. + """ + + # Check if columns argument must be a string or list datatype: + self._assert_type_str_or_list(input_cols, "input_cols") + + if isinstance(input_cols, str): + input_cols = [input_cols] + + from pyspark.ml import Pipeline + + assembler = [VectorAssembler(inputCols=input_cols, outputCol="features")] + + pipeline = Pipeline(stages=assembler) + self._df = pipeline.fit(self._df).transform(self._df) + + return self + + def normalizer(self, input_cols, p=2.0): + """ + Transforms a dataset of Vector rows, normalizing each Vector to have unit norm. 
It takes parameter p, which + specifies the p-norm used for normalization. (p=2) by default. + :param input_cols: Columns to be normalized. + :param p: p-norm used for normalization. + :return: Dataframe with normalized columns. + """ + + # Check if columns argument must be a string or list datatype: + self._assert_type_str_or_list(input_cols, "input_cols") + + if isinstance(input_cols, str): + input_cols = [input_cols] + + assert isinstance(p, (float, int)), "Error: p argument must be a numeric value." + + from pyspark.ml import Pipeline + from pyspark.ml.feature import Normalizer + + normal = [Normalizer(inputCol=column, outputCol=column + "_normalized", p=p) for column in + list(set(input_cols))] + + pipeline = Pipeline(stages=normal) + self._df = pipeline.fit(self._df).transform(self._df) + + return self diff --git a/optimus/utilities.py b/optimus/utilities.py index eb2350fbe..1a9637901 100644 --- a/optimus/utilities.py +++ b/optimus/utilities.py @@ -26,6 +26,22 @@ def __init__(self): # Set empty container for url self.url = "" + def create_data_frame(self, data, names): + """ + Create a spark Dataframe from a list of tuples. This will infer the type for each column. + :param data: List of tuples with data + :param names: List of names for the columns + :return: Spark dataframe + """ + + assert isinstance(data, list) and isinstance(data[0], tuple), \ + "data should be a list of tuples" + + assert isinstance(names, list) and isinstance(names[0], str), \ + "names should be a list of strings" + + return self.spark.createDataFrame(data, names) + def read_csv(self, path, sep=',', header='true'): """This funcion read a dataset from a csv file. @@ -48,7 +64,7 @@ def read_csv(self, path, sep=',', header='true'): .options(inferSchema='true') \ .load(path) - def read_dataset_url(self, path=None, ty="csv"): + def read_url(self, path=None, ty="csv"): """ Reads dataset from URL. 
:param path: string for URL to read diff --git a/optimus/version.py b/optimus/version.py index a9ef56d2a..c8e1e0d33 100644 --- a/optimus/version.py +++ b/optimus/version.py @@ -4,5 +4,5 @@ def _safe_int(string): except ValueError: return string -__version__ = '1.0.4' +__version__ = '1.1.0' VERSION = tuple(_safe_int(x) for x in __version__.split('.')) diff --git a/setup.py b/setup.py index caa54997d..1c52c10fa 100644 --- a/setup.py +++ b/setup.py @@ -36,7 +36,7 @@ def readme(): author='Favio Vazquez', author_email='favio.vazquez@ironmussa.com', url='https://github.com/ironmussa/Optimus/', - download_url='https://github.com/ironmussa/Optimus/archive/1.0.4.tar.gz', + download_url='https://github.com/ironmussa/Optimus/archive/1.1.0.tar.gz', description=('Optimus is the missing framework for cleaning and preprocessing data in a distributed fashion with ' 'pyspark.'), long_description=readme(), diff --git a/tests/impute_data.csv b/tests/impute_data.csv new file mode 100644 index 000000000..ac38c7b37 --- /dev/null +++ b/tests/impute_data.csv @@ -0,0 +1,6 @@ +a,b +1.0, NaN +2.0, NaN +NaN, 3.0 +4.0, 4.0 +5.0, 5.0 \ No newline at end of file diff --git a/tests/tests.py b/tests/tests.py index c705fc886..bd9fd0384 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -1,5 +1,6 @@ import optimus as op from pyspark.sql.types import StringType, IntegerType, StructType, StructField +from pyspark.ml.linalg import Vectors from pyspark.sql.functions import col import pyspark import sys @@ -49,6 +50,39 @@ def create_other_df(spark_session): sys.exit(1) +def create_sql_df(spark_session): + try: + df = spark_session.createDataFrame([ + (0, 1.0, 3.0), + (2, 2.0, 5.0) + ], ["id", "v1", "v2"]) + return df + except RuntimeError: + sys.exit(1) + + +def create_vector_df(spark_session): + try: + df = spark_session.createDataFrame([ + (0, Vectors.dense([1.0, 0.5, -1.0]),), + (1, Vectors.dense([2.0, 1.0, 1.0]),), + (2, Vectors.dense([4.0, 10.0, 2.0]),) + ], ["id", "features"]) + return df + except RuntimeError: + sys.exit(1) + + +def create_assembler_df(spark_session): + try: + df = spark_session.createDataFrame( + [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)], + ["id", "hour", "mobile", "userFeatures", "clicked"]) + return df + except RuntimeError: + sys.exit(1) + + def create_another_df(spark_session): try: # Building a simple dataframe: @@ -224,3 +258,81 @@ def test_read_csv(): assert_spark_df(df) except RuntimeError: sys.exit(1) + + +def test_create_data_frame(): + try: + tools = op.Utilities() + data = [('Japan', 'Tokyo', 37800000), ('USA', 'New York', 19795791), ('France', 'Paris', 12341418), + ('Spain', 'Madrid', 6489162)] + names = ["country", "city", "population"] + df = tools.create_data_frame(data=data, names=names) + assert_spark_df(df) + except RuntimeError: + sys.exit(1) + + +def test_string_to_index(spark_session): + try: + transformer = op.DataFrameTransformer(create_df(spark_session)) + transformer.string_to_index(["city", "country"]) + assert_spark_df(transformer.get_data_frame) + except RuntimeError: + sys.exit(1) + + +def test_index_to_string(spark_session): + try: + transformer = op.DataFrameTransformer(create_df(spark_session)) + transformer.string_to_index(["city", "country"]) + transformer.index_to_string(["city_index", "country_index"]) + assert_spark_df(transformer.get_data_frame) + except RuntimeError: + sys.exit(1) + + +def test_one_hot_encoder(spark_session): + try: + transformer = op.DataFrameTransformer(create_sql_df(spark_session)) + transformer.one_hot_encoder(["id"]) + 
assert_spark_df(transformer.get_data_frame)
+    except RuntimeError:
+        sys.exit(1)
+
+
+def test_sql(spark_session):
+    try:
+        transformer = op.DataFrameTransformer(create_sql_df(spark_session))
+        transformer.sql("SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
+        assert_spark_df(transformer.get_data_frame)
+    except RuntimeError:
+        sys.exit(1)
+
+
+def test_assembler(spark_session):
+    try:
+        transformer = op.DataFrameTransformer(create_assembler_df(spark_session))
+        transformer.vector_assembler(["hour", "mobile", "userFeatures"])
+        assert_spark_df(transformer.get_data_frame)
+    except RuntimeError:
+        sys.exit(1)
+
+
+def test_normalizer(spark_session):
+    try:
+        transformer = op.DataFrameTransformer(create_vector_df(spark_session))
+        transformer.normalizer(["features"])
+        assert_spark_df(transformer.get_data_frame)
+    except RuntimeError:
+        sys.exit(1)
+
+
+def test_replace_na():
+    try:
+        tools = op.Utilities()
+        df = tools.read_csv("tests/impute_data.csv", header="true", sep=",")
+        transformer = op.DataFrameTransformer(df)
+        transformer.replace_na(10, columns="*")
+        assert_spark_df(transformer.get_data_frame)
+    except RuntimeError:
+        sys.exit(1)
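
The new DataFrameTransformer helpers exercised by the tests above (string_to_index, one_hot_encoder,
vector_assembler, normalizer) are not demonstrated together anywhere in the docs. The following is a minimal,
illustrative sketch only (it is not part of the diff; the column names and sample values are invented) of how they
could be chained for ML-style preprocessing, together with the new create_data_frame helper from Utilities:

.. code:: python

    import optimus as op

    tools = op.Utilities()

    # Small in-memory dataset built with the new create_data_frame helper
    df = tools.create_data_frame(
        data=[("pizza", 8, 2.0), ("taco", 3, 1.5), ("pasta", 9, 3.0)],
        names=["product", "price", "weight"])

    transformer = op.DataFrameTransformer(df)

    # Index the string column, one-hot encode the resulting index, assemble the
    # numeric columns into a single "features" vector and normalize it (p=2 by default).
    transformer.string_to_index("product") \
        .one_hot_encoder("product_index") \
        .vector_assembler(["price", "weight"]) \
        .normalizer("features")

    transformer.show()

Each method returns the transformer itself, which is why the calls can be chained as shown.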