Merge branch 'master' of https://github.com/A3Data/hermione

A3Data · Jun 1, 2020 · 6e01b16 · 6e01b16
2 parents 5d87395 + c13070b
commit 6e01b16
Show file tree

Hide file tree

Showing 9 changed files with 156 additions and 18 deletions.
diff --git a/README.md b/README.md
@@ -76,6 +76,8 @@ After installed Hermione:
 
 ![](https://cdn-images-1.medium.com/max/800/1*U3ToR5jDjQJihT9EnxeDdg.png)
 
+Do you want to create your **project from scratch**? Then click [here](tutorial_base.md) to check out a tutorial.
+
 
 ## Documentation
 This is the class structure diagram that Hermione relies on:

diff --git a/hermione/file_text/normalization.txt b/hermione/file_text/normalization.txt
@@ -22,7 +22,7 @@ class Normalizer:
         Normalization
         """
         self.norm_cols = norm_cols
-        self.col_nomes = [nome for norm in norm_cols for nome in norm_cols[norm]]
+        self.col_names = [name for norm in norm_cols for name in norm_cols[norm]]
         self.norms = {'min-max': MinMaxScaler, 
                       'standard': StandardScaler}
         self.fitted = False
@@ -40,11 +40,11 @@ class Normalizer:
         None
         """
         zip_cols = lambda result: zip(result.index.values, result.values)
-        self.col_min = {col: value for col, value in zip_cols(df[self.col_nomes].min())}
-        self.col_max = {col: value for col, value in zip_cols(df[self.col_nomes].max())}
-        self.col_std = {col: value for col, value in zip_cols(df[self.col_nomes].std())}
-        self.col_mean = {col: value for col, value in zip_cols(df[self.col_nomes].mean())}
-        self.col_median = {col: value for col, value in zip_cols(df[self.col_nomes].median())}
+        self.col_min = {col: value for col, value in zip_cols(df[self.col_names].min())}
+        self.col_max = {col: value for col, value in zip_cols(df[self.col_names].max())}
+        self.col_std = {col: value for col, value in zip_cols(df[self.col_names].std())}
+        self.col_mean = {col: value for col, value in zip_cols(df[self.col_names].mean())}
+        self.col_median = {col: value for col, value in zip_cols(df[self.col_names].median())}
 
     def __apply_func(self, X, normalization):
         """

diff --git a/hermione/file_text/text_vectorizer.txt b/hermione/file_text/text_vectorizer.txt
@@ -14,11 +14,10 @@ class TextVectorizer:
         vectorizer_cols : dict
                        Receives a dict with the name of the vectorizer to be 
                        performed and which are the columns
-                       Ex: vectorizer_cols = {'embedding_mediana': ['col'], 
-                                              'embedding_media': ['col'],
+                       Ex: vectorizer_cols = {'embedding_median': ['col'], 
+                                              'embedding_mean': ['col'],
                                               'tf_idf': ['col'],
-                                              'bag_of_words' : [col],
-                                              'embedding_media': ['col']}
+                                              'bag_of_words' : [col]}
     	Returns
     	-------
         Normalization
@@ -45,7 +44,7 @@ class TextVectorizer:
         """
         self.vectorizers_fitted = dict()
         for vectorizer in self.vectorizer_cols:
-            if vectorizer in ['index', 'embedding_mediana', 'embedding_media']:
+            if vectorizer in ['index', 'embedding_median', 'embedding_mean']:
                 continue
             for col in self.vectorizer_cols[vectorizer]:
                 self.vectorizers_fitted[vectorizer] = {}
@@ -72,10 +71,10 @@ class TextVectorizer:
             if vectorizer == 'index':
                 for col in self.vectorizer_cols[vectorizer]:
                     df.loc[:, col+"_"+vectorizer] = df[col].apply(lambda x: self.embedding(x, 3))
-            elif vectorizer == 'embedding_mediana':
+            elif vectorizer == 'embedding_median':
                 for col in self.vectorizer_cols[vectorizer]:
                     df.loc[:, col+"_"+vectorizer] = df[col].apply(lambda x: self.embedding(x, 1))
-            elif vectorizer == 'embedding_media':
+            elif vectorizer == 'embedding_mean':
                 for col in self.vectorizer_cols[vectorizer]:
                     df.loc[:, col+"_"+vectorizer] = df[col].apply(lambda x: self.embedding(x, 2))
             elif (vectorizer == 'bag_of_words') | (vectorizer == 'tf_idf'):
@@ -113,12 +112,12 @@ class TextVectorizer:
         elif typ_transform == 3: # indexação
             idx = self.word2vec.index2word
             set_idx = set(idx)
-            indices = [idx.index(token) for token in X.split() if token in set_idx]
-            indices = [self.index_ini_fim] + indices + [self.index_ini_fim]
+            indexes = [idx.index(token) for token in X.split() if token in set_idx]
+            indexes = [self.index_ini_fim] + indexes + [self.index_ini_fim]
             # Create vector
-            X_length = len(indices)
+            X_length = len(indexes)
             vector = np.zeros(X_length, dtype=np.int64)
-            vector[:len(indices)] = indices
+            vector[:len(indexes)] = indexes
         else:
             vector = []
         return vector

diff --git a/hermione/file_text/visualization.txt b/hermione/file_text/visualization.txt
@@ -120,7 +120,7 @@ class Visualization:
 
 
     @staticmethod
-    def correlation_analyze(df, fig_size=(5,4), path=None):
+    def correlation_analysis(df, fig_size=(5,4), path=None):
         """
         Correlation of variables in the dataframe
 

diff --git a/images/create.png b/images/create.png
diff --git a/images/env.png b/images/env.png
diff --git a/images/mlflow.png b/images/mlflow.png
diff --git a/images/requirements.png b/images/requirements.png
diff --git a/tutorial_base.md b/tutorial_base.md
@@ -0,0 +1,137 @@
+
+
+## Start your project from scratch with Hermione
+
+1. After you have installed Hermione, you need to create your project:
+
+![](images/create.png)
+
+2. Activate its environment:
+
+ ![](images/env.png)
+
+3. Now we are going to install the default libraries listed in the file ``requirements.txt``:
+
+![](images/requirements.png)
+
+4. When you create a project with Hermione, it already creates a configuration file, which can be found at ``src/config/config.json``. This file holds some project settings and, if you find it necessary, you can change it by adding new fields or modifying existing ones:
+
+```json
+{
+"project_name": "project_scratch",
+"env_path": "project_scratch/project_scratch_env",
+"files_path": "../data/raw/",
+"key": "<<<<key>>>>",
+"user": "<<<<user>>>>"
+}
+```
+5. The first step in creating our project is to load the dataset. In this tutorial we will use the [Titanic dataset](https://www.kaggle.com/c/titanic/data). To do that, we load it in the ``get_data`` method of the ``Spreadsheet`` class, as in the example below:
+
+```python
+def  get_data(self, path)->pd.DataFrame:
+	"""
+	Returns a flat table in Dataframe
+	Parameters
+	----------
+	arg : type
+	description
+	Returns
+	-------
+	pd.DataFrame
+	Dataframe with data
+	"""
+	return pd.read_csv(path)[['Survived', 'Pclass', 'Sex', 'Age']]
+```
+6. Then we need to apply pre-processing to our dataset. This is done in the ``process`` method of the ``Preprocessing`` class:
+```python
+def  process(self, df: pd.DataFrame):
+	"""
+	Perform data cleansing.
+	Parameters
+	----------
+	df : pd.Dataframe
+	Dataframe to be processed 
+
+	Returns
+	-------
+	pd.Dataframe
+	Cleaned Data Frame
+	"""
+	print("Cleaning data")
+	df_copy = df.copy()
+	df_copy['Pclass'] = df_copy.Pclass.astype('object')
+	df_copy = df_copy.dropna()
+	df_copy = pd.get_dummies(df_copy)
+	return df_copy
+```
+
+Here we apply three pre-processing steps: converting the column ``Pclass`` to the type ``object``, removing rows with missing values, and creating dummy variables.
+
+7. The next step is to define the algorithm that we will be training. If you are going to run your model with some sklearn algorithm, the ``TrainerSklearn`` class already has the implementation and you just need to call the ``train`` method, passing some parameters. The ``train`` method also supports training with cross validation or just dividing into training and testing (all parameterized).
+If you need to use another package, just implement your own class, inheriting from the ``Trainer`` class, similar to what was implemented in ``TrainerSklearn``:
+
+```python
+class  TrainerSklearn(Trainer):
+	pass
+```
+
+8. Now that we have loaded the data, implemented the pre-processing and already have the method to train, we need to join these steps, that is, set up our execution pipeline.
+In Hermione this process must be performed in the **script** ``train.py``. So let's get started!
+
+
+	8.1. Load the project name, defined in the config file (step 4).
+	```python
+	with  open('config/config.json', 'r') as  file:
+		project_name = json.load(file)['project_name']
+	```
+	8.2. Create an experiment in mlflow:
+	```python
+	mlflow.set_experiment(project_name)
+	```
+	8.3. Enter the path of the dataset to be loaded by the ``Spreadsheet`` class (step 5):
+	```python
+	df = Spreadsheet().get_data('../data/raw/train.csv')
+    ```
+	8.3. Apply the preprocessing defined in step 6:
+	```python
+	p = Preprocessing()
+	df = p.process(df)
+	```
+	8.4. Define features (X) and target (y):
+	```python
+	X = df.drop(columns=["Survived"])
+	y = df["Survived"]
+	```
+	8.5. Define the sklearn algorithms that we will apply:
+	```python
+	algos = [RandomForestClassifier, GradientBoostingClassifier, LogisticRegression]	
+	```
+	8.6. Now we will configure the execution of the algorithms, using the ``TrainerSklearn`` class (step 7). Here we train with mlflow so that the results can be stored and analyzed later:
+	```python
+	for algo in algos:
+		with mlflow.start_run() as run:
+			model = TrainerSklearn().train(X, y,
+										   classification=True,
+										   algorithm=algo,
+										   data_split=('cv', {'cv': 8}),
+										   preprocessing=p)
+		   mlflow.log_params({'algorithm': algo})
+		   mlflow.log_metrics(model.get_metrics())
+		   mlflow.sklearn.log_model(model.get_model(), 'model')
+	```
+	8.7. Once the ``train.py`` script has been built with the previous steps, you need to run it. You can do this in two ways:
+
+	 - Run the entire script on the python console
+	 - Run the ``hermione train`` command, in the ``src`` folder
+
+
+9. After executing step 8.7, the models, their parameters and metrics are logged in mlflow. To access them, simply run the command below at the command prompt from inside the ``src/`` folder:
+
+```sh
+mlflow ui
+``` 
+10. Open the URL returned by the previous command in your preferred browser, so that you can analyze the results:
+
+![](images/mlflow.png)
+
+Done! You have now built a project from scratch using Hermione.