Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/A3Data/hermione
Browse files Browse the repository at this point in the history
  • Loading branch information
Neylson Crepalde - A3 Data committed Jun 1, 2020
2 parents 5d87395 + c13070b commit 6e01b16
Show file tree
Hide file tree
Showing 9 changed files with 156 additions and 18 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ After installed Hermione:

![](https://cdn-images-1.medium.com/max/800/1*U3ToR5jDjQJihT9EnxeDdg.png)

Do you want to create your **project from scratch**? Click [here](tutorial_base.md) to check out a tutorial.


## Documentation
This is the class structure diagram that Hermione relies on:
Expand Down
12 changes: 6 additions & 6 deletions hermione/file_text/normalization.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class Normalizer:
Normalization
"""
self.norm_cols = norm_cols
self.col_nomes = [nome for norm in norm_cols for nome in norm_cols[norm]]
self.col_names = [name for norm in norm_cols for name in norm_cols[norm]]
self.norms = {'min-max': MinMaxScaler,
'standard': StandardScaler}
self.fitted = False
Expand All @@ -40,11 +40,11 @@ class Normalizer:
None
"""
zip_cols = lambda result: zip(result.index.values, result.values)
self.col_min = {col: value for col, value in zip_cols(df[self.col_nomes].min())}
self.col_max = {col: value for col, value in zip_cols(df[self.col_nomes].max())}
self.col_std = {col: value for col, value in zip_cols(df[self.col_nomes].std())}
self.col_mean = {col: value for col, value in zip_cols(df[self.col_nomes].mean())}
self.col_median = {col: value for col, value in zip_cols(df[self.col_nomes].median())}
self.col_min = {col: value for col, value in zip_cols(df[self.col_names].min())}
self.col_max = {col: value for col, value in zip_cols(df[self.col_names].max())}
self.col_std = {col: value for col, value in zip_cols(df[self.col_names].std())}
self.col_mean = {col: value for col, value in zip_cols(df[self.col_names].mean())}
self.col_median = {col: value for col, value in zip_cols(df[self.col_names].median())}

def __apply_func(self, X, normalization):
"""
Expand Down
21 changes: 10 additions & 11 deletions hermione/file_text/text_vectorizer.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,10 @@ class TextVectorizer:
vectorizer_cols : dict
Receives a dict with the name of the vectorizer to be
performed and which are the columns
Ex: vectorizer_cols = {'embedding_mediana': ['col'],
'embedding_media': ['col'],
Ex: vectorizer_cols = {'embedding_median': ['col'],
'embedding_mean': ['col'],
'tf_idf': ['col'],
'bag_of_words' : [col],
'embedding_media': ['col']}
'bag_of_words' : [col]}
Returns
-------
Normalization
Expand All @@ -45,7 +44,7 @@ class TextVectorizer:
"""
self.vectorizers_fitted = dict()
for vectorizer in self.vectorizer_cols:
if vectorizer in ['index', 'embedding_mediana', 'embedding_media']:
if vectorizer in ['index', 'embedding_median', 'embedding_mean']:
continue
for col in self.vectorizer_cols[vectorizer]:
self.vectorizers_fitted[vectorizer] = {}
Expand All @@ -72,10 +71,10 @@ class TextVectorizer:
if vectorizer == 'index':
for col in self.vectorizer_cols[vectorizer]:
df.loc[:, col+"_"+vectorizer] = df[col].apply(lambda x: self.embedding(x, 3))
elif vectorizer == 'embedding_mediana':
elif vectorizer == 'embedding_median':
for col in self.vectorizer_cols[vectorizer]:
df.loc[:, col+"_"+vectorizer] = df[col].apply(lambda x: self.embedding(x, 1))
elif vectorizer == 'embedding_media':
elif vectorizer == 'embedding_mean':
for col in self.vectorizer_cols[vectorizer]:
df.loc[:, col+"_"+vectorizer] = df[col].apply(lambda x: self.embedding(x, 2))
elif (vectorizer == 'bag_of_words') | (vectorizer == 'tf_idf'):
Expand Down Expand Up @@ -113,12 +112,12 @@ class TextVectorizer:
elif typ_transform == 3: # indexação
idx = self.word2vec.index2word
set_idx = set(idx)
indices = [idx.index(token) for token in X.split() if token in set_idx]
indices = [self.index_ini_fim] + indices + [self.index_ini_fim]
indexes = [idx.index(token) for token in X.split() if token in set_idx]
indexes = [self.index_ini_fim] + indexes + [self.index_ini_fim]
# Create vector
X_length = len(indices)
X_length = len(indexes)
vector = np.zeros(X_length, dtype=np.int64)
vector[:len(indices)] = indices
vector[:len(indexes)] = indexes
else:
vector = []
return vector
Expand Down
2 changes: 1 addition & 1 deletion hermione/file_text/visualization.txt
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ class Visualization:


@staticmethod
def correlation_analyze(df, fig_size=(5,4), path=None):
def correlation_analysis(df, fig_size=(5,4), path=None):
"""
Correlation of variables in the dataframe

Expand Down
Binary file added images/create.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/env.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/mlflow.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/requirements.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
137 changes: 137 additions & 0 deletions tutorial_base.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@


## Start your project from scratch with Hermione

1. After you have installed Hermione, you need to create your project:

![](images/create.png)

2. Activate its environment:

![](images/env.png)

3. Now we are going to install the default libraries listed in the ``requirements.txt`` file:

![](images/requirements.png)

4. When you create a project with Hermione, a configuration file is created automatically; it can be found at ``src/config/config.json``. This file holds some project settings, and if you find it necessary, you can change it by adding new fields or modifying existing ones:

```json
{
"project_name": "project_scratch",
"env_path": "project_scratch/project_scratch_env",
"files_path": "../data/raw/",
"key": "<<<<key>>>>",
"user": "<<<<user>>>>"
}
```
5. The first step in creating our project is to load the dataset. In this tutorial we will use the [Titanic dataset](https://www.kaggle.com/c/titanic/data). To do so, we load it in the ``get_data`` method of the ``Spreadsheet`` class, as in the example below:

```python
def get_data(self, path)->pd.DataFrame:
"""
Returns a flat table in Dataframe
Parameters
----------
arg : type
description
Returns
-------
pd.DataFrame
Dataframe with data
"""
return pd.read_csv(path)[['Survived', 'Pclass', 'Sex', 'Age']]
```
6. Then we need to apply pre-processing to our dataset. This is done in the ``process`` method of the ``Preprocessing`` class:
```python
def process(self, df: pd.DataFrame):
"""
Perform data cleansing.
Parameters
----------
df : pd.Dataframe
Dataframe to be processed
Returns
-------
pd.Dataframe
Cleaned Data Frame
"""
print("Cleaning data")
df_copy = df.copy()
df_copy['Pclass'] = df_copy.Pclass.astype('object')
df_copy = df_copy.dropna()
df_copy = pd.get_dummies(df_copy)
return df_copy
```

Here we apply three pre-processing steps: converting the ``Pclass`` column to the type ``object``, removing rows with missing values, and creating dummy variables.

7. The next step is to define the algorithm that we will be training. If you are going to run your model with some sklearn algorithm, the ``TrainerSklearn`` class already has the implementation and you just need to call the ``train`` method, passing some parameters. The ``train`` method also supports training with cross validation or just dividing into training and testing (all parameterized).
If you need to use another package, just implement your own class, inheriting from the ``Trainer`` class, similar to what was implemented in ``TrainerSklearn``:

```python
class TrainerSklearn(Trainer):
pass
```

8. Now that we have loaded the data, implemented the pre-processing and already have the method to train, we need to join these steps, that is, set up our execution pipeline.
In Hermione, this process must be performed in the ``train.py`` **script**. So let's go!


8.1. Load the project name, defined in the config file (step 4).
```python
with open('config/config.json', 'r') as file:
project_name = json.load(file)['project_name']
```
8.2. Create an experiment in mlflow:
```python
mlflow.set_experiment(project_name)
```
8.3. Enter the path of the dataset to be loaded by the ``Spreadsheet`` class (step 5):
```python
df = Spreadsheet().get_data('../data/raw/train.csv')
```
8.3. Apply the preprocessing defined in step 6:
```python
p = Preprocessing()
df = p.process(df)
```
8.4. Define features (X) and target (y):
```python
X = df.drop(columns=["Survived"])
y = df["Survived"]
```
8.5. Define the sklearn algorithms that we will apply:
```python
algos = [RandomForestClassifier, GradientBoostingClassifier, LogisticRegression]
```
8.6. Now we will configure the execution of the algorithms, using the ``TrainerSklearn`` class (step 7). Here we train with mlflow so that the results can be stored and analyzed later:
```python
for algo in algos:
with mlflow.start_run() as run:
model = TrainerSklearn().train(X, y,
classification=True,
algorithm=algo,
data_split=('cv', {'cv': 8}),
preprocessing=p)
mlflow.log_params({'algorithm': algo})
mlflow.log_metrics(model.get_metrics())
mlflow.sklearn.log_model(model.get_model(), 'model')
```
8.7. After the ``train.py`` script has been built with the previous steps, you need to run it. You can do this in two ways:

- Run the entire script on the python console
- Run the ``hermione train`` command, in the ``src`` folder


9. After executing step 8.7, the models, their parameters and metrics are logged in mlflow. To access them, simply execute the command below at the command prompt inside the ``src/`` folder:

```sh
mlflow ui
```
10. Open the URL returned by the previous command in your preferred browser, so you can analyze the results:

![](images/mlflow.png)

Done! Now you have built a project from scratch using Hermione.

0 comments on commit 6e01b16

Please sign in to comment.