Skip to content

Commit

Permalink
simplify input logic
Browse files Browse the repository at this point in the history
  • Loading branch information
浅梦 authored Oct 3, 2019
1 parent c8a04f9 commit caa12dd
Show file tree
Hide file tree
Showing 13 changed files with 73 additions and 110 deletions.
2 changes: 1 addition & 1 deletion deepctr_torch/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@
from . import models
from .utils import check_version

__version__ = '0.1.2'
__version__ = '0.1.3'
check_version(__version__)
73 changes: 18 additions & 55 deletions deepctr_torch/inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,70 +41,33 @@ def __new__(cls, name, dimension, maxlen, combiner="mean", use_hash=False, dtype
embedding_name, embedding)


def get_feature_names(feature_columns):
    """Return the ordered list of input feature names for ``feature_columns``.

    The order matches the column layout produced by ``build_input_features``,
    so callers can assemble the model input by indexing a frame/dict with
    these names in sequence.

    :param feature_columns: iterable of SparseFeat / DenseFeat / VarLenSparseFeat.
    :return: list of unique feature names in input-tensor order.
    """
    features = build_input_features(feature_columns)
    return list(features.keys())


def get_inputs_list(inputs):
    """Flatten the values of several mappings into one list, skipping ``None``.

    :param inputs: iterable of dict-like objects (or ``None`` placeholders).
    :return: list of all values from the non-None mappings, in order.
    """
    # NOTE: the stale get_varlen_feature_names helper was removed — it called
    # build_input_features with keyword arguments the new signature no longer
    # accepts, so keeping it would raise TypeError at call time.
    return list(chain.from_iterable(x.values() for x in inputs if x is not None))


def build_input_features(feature_columns, include_varlen=True, mask_zero=True, prefix='', include_fixlen=True):
input_features = OrderedDict()
def build_input_features(feature_columns):
    """Assign each feature column a ``(start, end)`` slice in the flat input.

    All features are packed side by side into one 2-D input tensor; this
    returns an ``OrderedDict`` mapping feature name to the half-open column
    range ``[start, end)`` it occupies:

    - ``SparseFeat``        -> width 1
    - ``DenseFeat``         -> width ``feat.dimension``
    - ``VarLenSparseFeat``  -> width ``feat.maxlen``

    Duplicate names (e.g. the same column appearing in both the linear and
    DNN column lists) are assigned only once, keeping first-seen order.

    :param feature_columns: iterable of SparseFeat / DenseFeat / VarLenSparseFeat.
    :return: OrderedDict of ``{name: (start, end)}``.
    :raises TypeError: if a column is not one of the supported types.
    """
    features = OrderedDict()

    start = 0
    for feat in feature_columns:
        feat_name = feat.name
        if feat_name in features:
            # Same feature shared between linear and dnn columns: keep the
            # first slice so both sides read the same input columns.
            continue
        if isinstance(feat, SparseFeat):
            features[feat_name] = (start, start + 1)
            start += 1
        elif isinstance(feat, DenseFeat):
            features[feat_name] = (start, start + feat.dimension)
            start += feat.dimension
        elif isinstance(feat, VarLenSparseFeat):
            features[feat_name] = (start, start + feat.maxlen)
            start += feat.maxlen
        else:
            raise TypeError("Invalid feature column type,got", type(feat))
    return features


Expand Down
19 changes: 14 additions & 5 deletions deepctr_torch/models/basemodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,8 @@ def fit(self, x=None,
shuffle=True, ):
"""
:param x: Numpy array of training data (if the model has a single input), or list of Numpy arrays (if the model has multiple inputs).
:param x: Numpy array of training data (if the model has a single input), or list of Numpy arrays (if the model has multiple inputs).If input layers in the model are named, you can also pass a
dictionary mapping input names to Numpy arrays.
:param y: Numpy array of target (label) data (if the model has a single output), or list of Numpy arrays (if the model has multiple outputs).
:param batch_size: Integer or `None`. Number of samples per gradient update. If unspecified, `batch_size` will default to 256.
:param epochs: Integer. Number of epochs to train the model. An epoch is an iteration over the entire `x` and `y` data provided. Note that in conjunction with `initial_epoch`, `epochs` is to be understood as "final epoch". The model is not trained for a number of iterations given by `epochs`, but merely until the epoch of index `epochs` is reached.
Expand All @@ -146,6 +147,8 @@ def fit(self, x=None,
:param shuffle: Boolean. Whether to shuffle the order of the batches at the beginning of each epoch.
"""
if isinstance(x,dict):
x = [x[feature] for feature in self.feature_index]
if validation_data:
if len(validation_data) == 2:
val_x, val_y = validation_data
Expand All @@ -160,6 +163,8 @@ def fit(self, x=None,
'or alternatively it could be a dataset or a '
'dataset or a dataset iterator. '
'However we received `validation_data=%s`' % validation_data)
if isinstance(val_x, dict):
val_x = [val_x[feature] for feature in self.feature_index]

elif validation_split and 0. < validation_split < 1.:
if hasattr(x[0], 'shape'):
Expand Down Expand Up @@ -191,16 +196,18 @@ def fit(self, x=None,
model = self.train()
loss_func = self.loss_func
optim = self.optim
print("Train on {0} samples, validate on {1} samples".format(
len(train_tensor_data), len(val_y)))

sample_num = len(train_tensor_data)
steps_per_epoch = (sample_num - 1) // batch_size + 1

print("Train on {0} samples, validate on {1} samples, {2} steps per epoch".format(
len(train_tensor_data), len(val_y),steps_per_epoch))
for epoch in range(initial_epoch, epochs):
start_time = time.time()
loss_epoch = 0
total_loss_epoch = 0
# if abs(loss_last - loss_now) < 0.0
sample_num = len(train_tensor_data)
train_result = {}
steps_per_epoch = (sample_num - 1) // batch_size + 1
try:
with tqdm(enumerate(train_loader), disable=verbose != 1) as t:
for index, (x_train, y_train) in t:
Expand Down Expand Up @@ -272,6 +279,8 @@ def predict(self, x, batch_size=256):
:return: Numpy array(s) of predictions.
"""
model = self.eval()
if isinstance(x, dict):
x = [x[feature] for feature in self.feature_index]
for i in range(len(x)):
if len(x[i].shape) == 1:
x[i] = np.expand_dims(x[i], axis=1)
Expand Down
27 changes: 13 additions & 14 deletions docs/source/Examples.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from deepctr_torch.models import *
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_fixlen_feature_names
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
import torch

if __name__ == "__main__":
Expand Down Expand Up @@ -59,14 +59,14 @@ if __name__ == "__main__":
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

fixlen_feature_names = get_fixlen_feature_names(
feature_names = get_feature_names(
linear_feature_columns + dnn_feature_columns)

# 3.generate input data for model

train, test = train_test_split(data, test_size=0.2)
train_model_input = [train[name] for name in fixlen_feature_names]
test_model_input = [test[name] for name in fixlen_feature_names]
train_model_input = {name:train[name] for name in feature_names}
test_model_input = {name:test[name] for name in feature_names}

# 4.Define Model,train,predict and evaluate

Expand Down Expand Up @@ -111,7 +111,7 @@ from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from deepctr_torch.models import DeepFM
from deepctr_torch.inputs import SparseFeat,get_fixlen_feature_names
from deepctr_torch.inputs import SparseFeat,get_feature_names

if __name__ == "__main__":

Expand All @@ -129,12 +129,12 @@ if __name__ == "__main__":
for feat in sparse_features]
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# 3.generate input data for model
train, test = train_test_split(data, test_size=0.2)
train_model_input = [train[name].values for name in fixlen_feature_names]
test_model_input = [test[name].values for name in fixlen_feature_names]
train_model_input = {name:train[name] for name in feature_names}
test_model_input = {name:test[name] for name in feature_names}
# 4.Define Model,train,predict and evaluate

device = 'cpu'
Expand Down Expand Up @@ -190,7 +190,7 @@ from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from deepctr_torch.models import DeepFM
from deepctr_torch.inputs import SparseFeat, VarLenSparseFeat,get_fixlen_feature_names,get_varlen_feature_names
from deepctr_torch.inputs import SparseFeat, VarLenSparseFeat,get_feature_names,get_varlen_feature_names


def split(x):
Expand Down Expand Up @@ -229,14 +229,13 @@ varlen_feature_columns = [VarLenSparseFeat('genres', len(

linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
varlen_feature_names = get_varlen_feature_names(linear_feature_columns+dnn_feature_columns)
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)


# 3.generate input data for model
fixlen_input = [data[name].values for name in fixlen_feature_names]
varlen_input = [genres_list]#varlen_feature_names[0]
model_input = fixlen_input + varlen_input # make sure the order is right
model_input = {name:data[name] for name in feature_names}
model_input['genres'] = genres_list


# 4.Define Model,compile and train
model = DeepFM(linear_feature_columns,dnn_feature_columns,task='regression')
Expand Down
7 changes: 3 additions & 4 deletions docs/source/FAQ.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,18 @@ model = torch.load('DeepFM.h5')
## 2. How to add a long dense feature vector as a input to the model?
```python
from deepctr_torch.models import DeepFM
from deepctr_torch.inputs import DenseFeat,SparseFeat,get_fixlen_feature_names
from deepctr_torch.inputs import DenseFeat,SparseFeat,get_feature_names
import numpy as np

feature_columns = [SparseFeat('user_id',120,),SparseFeat('item_id',60,),DenseFeat("pic_vec",5)]
fixlen_feature_names = get_fixlen_feature_names(feature_columns)
fixlen_feature_names = get_feature_names(feature_columns)

user_id = np.array([[1],[0],[1]])
item_id = np.array([[30],[20],[10]])
pic_vec = np.array([[0.1,0.5,0.4,0.3,0.2],[0.1,0.5,0.4,0.3,0.2],[0.1,0.5,0.4,0.3,0.2]])
label = np.array([1,0,1])

input_dict = {'user_id':user_id,'item_id':item_id,'pic_vec':pic_vec}
model_input = [input_dict[name] for name in fixlen_feature_names]
model_input = {'user_id':user_id,'item_id':item_id,'pic_vec':pic_vec}

model = DeepFM(feature_columns,feature_columns)
model.compile('adagrad','binary_crossentropy')
Expand Down
1 change: 1 addition & 0 deletions docs/source/History.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# History
- 10/03/2019 : [v0.1.3](https://github.com/shenweichen/DeepCTR-Torch/releases/tag/v0.1.3) released. Simplify the input logic.
- 09/28/2019 : [v0.1.2](https://github.com/shenweichen/DeepCTR-Torch/releases/tag/v0.1.2) released. Add [sequence(multi-value) input support](./Examples.html#multi-value-input-movielens).
- 09/24/2019 : [v0.1.1](https://github.com/shenweichen/DeepCTR-Torch/releases/tag/v0.1.1) released. Add [CCPM](./Features.html#ccpm-convolutional-click-prediction-model).
- 09/22/2019 : DeepCTR-Torch first version v0.1.0 is released on [PyPi](https://pypi.org/project/deepctr-torch/)
14 changes: 4 additions & 10 deletions docs/source/Quick-Start.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from deepctr_torch.models import DeepFM
from deepctr_torch.inputs import SparseFeat, DenseFeat,get_fixlen_feature_names
from deepctr_torch.inputs import SparseFeat, DenseFeat,get_feature_names

data = pd.read_csv('./criteo_sample.txt')

Expand Down Expand Up @@ -75,22 +75,16 @@ dense_feature_columns = [DenseFeat(feat, 1)
dnn_feature_columns = sparse_feature_columns + dense_feature_columns
linear_feature_columns = sparse_feature_columns + dense_feature_columns

feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

```
### Step 4: Generate the training samples and train the model

There are two rules here that we must follow

- The `SparseFeat` and `DenseFeat` are placed in front of the `VarlenSparseFeat`.
- The order of the feature we fit into the model must be consistent with the order of the feature config list.


```python
train, test = train_test_split(data, test_size=0.2)
train_model_input = [train[name] for name in feature_names]
train_model_input = {name:train[name] for name in feature_names}

test_model_input = [test[name] for name in feature_names]
test_model_input = {name:test[name] for name in feature_names}


device = 'cpu'
Expand Down
2 changes: 1 addition & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
# The short X.Y version
version = ''
# The full version, including alpha/beta/rc tags
release = '0.1.2'
release = '0.1.3'


# -- General configuration ---------------------------------------------------
Expand Down
4 changes: 2 additions & 2 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,12 @@ You can read the latest code at https://github.com/shenweichen/DeepCTR-Torch and

News
-----
10/03/2019 : Simplify the input logic(`examples <./Examples.html#classification-criteo>`_). `Changelog <https://github.com/shenweichen/DeepCTR-Torch/releases/tag/v0.1.3>`_

09/28/2019 : Add `sequence(multi-value) input support <./Examples.html#multi-value-input-movielens>`_ . `Changelog <https://github.com/shenweichen/DeepCTR-Torch/releases/tag/v0.1.2>`_

09/24/2019 : Add `CCPM <./Features.html#ccpm-convolutional-click-prediction-model>`_ . `Changelog <https://github.com/shenweichen/DeepCTR-Torch/releases/tag/v0.1.1>`_

09/22/2019 : DeepCTR-Torch first version v0.1.0 is released on `PyPi <https://pypi.org/project/deepctr-torch/>`_ !


.. toctree::
:maxdepth: 2
Expand Down
11 changes: 6 additions & 5 deletions examples/run_classification_criteo.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from deepctr_torch.models import *
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_fixlen_feature_names
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
import torch


Expand Down Expand Up @@ -34,14 +34,15 @@
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

fixlen_feature_names = get_fixlen_feature_names(
feature_names = get_feature_names(
linear_feature_columns + dnn_feature_columns)

# 3.generate input data for model

train, test = train_test_split(data, test_size=0.2)
train_model_input = [train[name] for name in fixlen_feature_names]
test_model_input = [test[name] for name in fixlen_feature_names]

train_model_input = {name:train[name] for name in feature_names}
test_model_input = {name:test[name] for name in feature_names}

# 4.Define Model,train,predict and evaluate

Expand All @@ -57,7 +58,7 @@
model.compile("adagrad", "binary_crossentropy",
metrics=["binary_crossentropy", "auc"],)
model.fit(train_model_input, train[target].values,
batch_size=32, epochs=10, validation_split=0.2, verbose=2)
batch_size=32, epochs=10, validation_split=0.0, verbose=2)

pred_ans = model.predict(test_model_input, 256)
print("")
Expand Down
11 changes: 4 additions & 7 deletions examples/run_multivalue_movielens.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from deepctr_torch.models import DeepFM
from deepctr_torch.inputs import SparseFeat, VarLenSparseFeat,get_fixlen_feature_names,get_varlen_feature_names
from deepctr_torch.inputs import SparseFeat, VarLenSparseFeat,get_feature_names


def split(x):
Expand Down Expand Up @@ -43,15 +43,12 @@ def split(x):

linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
varlen_feature_names = get_varlen_feature_names(linear_feature_columns+dnn_feature_columns)
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)


# 3.generate input data for model
fixlen_input = [data[name].values for name in fixlen_feature_names]
varlen_input = [genres_list]#varlen_feature_names[0]

model_input = fixlen_input + varlen_input # make sure the order is right
model_input = {name:data[name] for name in feature_names}
model_input['genres'] = genres_list

# 4.Define Model,compile and train
model = DeepFM(linear_feature_columns,dnn_feature_columns,task='regression')
Expand Down
Loading

0 comments on commit caa12dd

Please sign in to comment.