diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e32c0b6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,107 @@ +training/training_output/ +*-model-building-code.zip +# Hidden files +.DS_store + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# dotenv +.env + +# virtualenv +.venv +venv/ +ENV/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.idea/ diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..69d1179 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,14 @@ +language: python +python: + - 3.7 +services: + - docker +install: + - docker build -t max-rec . + - docker run -it -d -p 5000:5000 max-rec + - pip install pytest requests flake8 +before_script: + - flake8 . --max-line-length=127 --exclude training/training_code/dataset + - sleep 30 +script: + - pytest tests/test.py \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..9a40480 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,50 @@ +# +# Copyright 2018-2019 IBM Corp. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +FROM codait/max-base:v1.3.2 + +# Fill in these with a link to the bucket containing the model and the model file name +ARG model_bucket=https://max-cdn.cdn.appdomain.cloud/max-recommender/1.0.0 +ARG model_file=assets.tar.gz + +WORKDIR /workspace + +ARG use_pre_trained_model=true + +RUN if [ "$use_pre_trained_model" = "true" ] ; then\ + # download pre-trained model artifacts from Cloud Object Storage + wget -nv --show-progress --progress=bar:force:noscroll ${model_bucket}/${model_file} --output-document=assets/${model_file} &&\ + tar -x -C assets/ -f assets/${model_file} -v && rm assets/${model_file} ; \ + fi + +COPY requirements.txt /workspace +RUN pip install -r requirements.txt + +COPY . 
/workspace + +RUN if [ "$use_pre_trained_model" = "true" ] ; then \ + # validate downloaded pre-trained model assets + sha512sum -c sha512sums.txt ; \ + else \ + # rename the directory that contains the custom-trained model artifacts + if [ -d "./custom_assets/" ] ; then \ + rm -rf ./assets && ln -s ./custom_assets ./assets ; \ + fi \ + fi + +EXPOSE 5000 + +CMD python app.py diff --git a/README.md b/README.md index 7f5d2fe..9e3f8f8 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,175 @@ -# MAX-Recommender -Generate personalized recommendations +[![Build Status](https://travis-ci.com/IBM/MAX-Recommender.svg?branch=master)](https://travis-ci.com/IBM/MAX-Recommender) [![Website Status](https://img.shields.io/website/http/max-recommender.max.us-south.containers.appdomain.cloud/swagger.json.svg?label=api+demo)](http://max-recommender.max.us-south.containers.appdomain.cloud/) + +[](http://ibm.biz/max-to-ibm-cloud-tutorial) + +# IBM Developer Model Asset Exchange: MAX Recommender + +This repository contains code to instantiate and deploy a recommender model. +This model can be trained on a dataset containing users, items, ratings, and timestamps and make personalized item recommendations for a given user. Once trained, the input to the model is a user IDs and the output is a list of recommended item IDs sorted by probability in descending order. For demo purposes this model has been trained on a subset of the [MovieTweetings Dataset](https://github.com/sidooms/MovieTweetings), containing 457 users with their IDs mapped from 0 to 457 for convenience. + +The model is based on the [Neural Collaborative Filtering model]([https://github.com/microsoft/recommenders]). The model files are hosted on +[IBM Cloud Object Storage](https://max-cdn.cdn.appdomain.cloud/max-recommender/1.0.0/assets.tar.gz). +The code in this repository deploys the model as a web service in a Docker container. This repository was developed +as part of the [IBM Developer Model Asset Exchange](https://developer.ibm.com/exchanges/models/) and the public API is powered by [IBM Cloud](https://ibm.biz/Bdz2XM). + +## Model Metadata +| Domain | Application | Industry | Framework | Training Data | Input Data Format | +| ------------- | -------- | -------- | --------- | --------- | -------------- | +| Information Retrieval | Recommendations | Commerce | TensorFlow | [MovieTweetings](https://github.com/sidooms/MovieTweetings) | CSV | + +## References + + +* _X. He, L. Liao, H. Zhang, L. Nie, X. Hu, T. Chua_, ["Neural Collaborative Filtering"](https://arxiv.org/abs/1708.05031), WWW 2017. +* [Microsoft Recommender Systems GitHub Repo](https://github.com/microsoft/recommenders) + +## Licenses + +| Component | License | Link | +| ------------- | -------- | -------- | +| This repository | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | [LICENSE](LICENSE) | +| Model Weights | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | [LICENSE](LICENSE) | +| Model Code (3rd party) | [MIT](https://opensource.org/licenses/mit-license.html) | [Microsoft Recommender Systems GitHub Repo](https://github.com/microsoft/recommenders/blob/master/LICENSE) | + +## Pre-requisites: + +* `docker`: The [Docker](https://www.docker.com/) command-line interface. Follow the [installation instructions](https://docs.docker.com/install/) for your system. +* The minimum recommended resources for this model is 4GB Memory and 2 CPUs. + +# Steps + +1. [Deploy from Docker Hub](#deploy-from-docker-hub) +2. [Deploy on Kubernetes](#deploy-on-kubernetes) +3. 
[Deploy on Red Hat OpenShift](#deploy-on-red-hat-openshift) +4. [Run Locally](#run-locally) + +## Deploy from Docker Hub + +To run the docker image, which automatically starts the model serving API, run: + +``` +$ docker run -it -p 5000:5000 codait/max-recommender +``` + +This will pull a pre-built image from Docker Hub (or use an existing image if already cached locally) and run it. +If you'd rather checkout and build the model locally you can follow the [run locally](#run-locally) steps below. + +## Deploy on Kubernetes + +You can also deploy the model on Kubernetes using the latest docker image on Docker Hub. + +On your Kubernetes cluster, run the following commands: + +``` +$ kubectl apply -f https://github.com/IBM/MAX-Recommender/raw/master/max-recommender.yaml +``` + +The model will be available internally at port `5000`, but can also be accessed externally through the `NodePort`. + +A more elaborate tutorial on how to deploy this MAX model to production on [IBM Cloud](https://ibm.biz/Bdz2XM) can be found [here](http://ibm.biz/max-to-ibm-cloud-tutorial). + +## Deploy on Red Hat OpenShift: + + Follow the instructions for the OpenShift web console or the OpenShift Container Platform CLI in [this tutorial](https://developer.ibm.com/tutorials/deploy-a-model-asset-exchange-microservice-on-red-hat-openshift/) and specify `codait/max-recommender` as the image name. + +## Run Locally + +1. [Build the Model](#1-build-the-model) +2. [Deploy the Model](#2-deploy-the-model) +3. [Use the Model](#3-use-the-model) +4. [Development](#4-development) +5. [Cleanup](#5-cleanup) + + +### 1. Build the Model + +Clone this repository locally. In a terminal, run the following command: + +``` +$ git clone https://github.com/IBM/MAX-Recommender.git +``` + +Change directory into the repository base folder: + +``` +$ cd MAX-Recommender +``` + +To build the docker image locally, run: + +``` +$ docker build -t max-recommender . +``` + +All required model assets will be downloaded during the build process. _Note_ that currently this docker image is CPU only (we will add support for GPU images later). + + +### 2. Deploy the Model + +To run the docker image, which automatically starts the model serving API, run: + +``` +$ docker run -it -p 5000:5000 max-recommender +``` + +### 3. Use the Model + +The API server automatically generates an interactive Swagger documentation page. Go to `http://localhost:5000` to load it. From there you can explore the API and also create test requests. + +User the `model/predict` endpoint to retrieve recommendations for a user ID. The number of predictions returned can be specified with `num_results`, by default the model returns 5 predictions. + + +![SWAGGER UI SCREENSHOT](docs/swagger-screenshot.png) + +You can also test it on the command line, for example: + +``` +$ curl -X POST "http://localhost:5000/model/predict?user_id=1&num_results=5" -H "accept: application/json" +``` + +You should see a JSON response like that below: + +```json +{ + "status": "ok", + "predictions": [ + { + "user": "1", + "item": "1454468", + "prediction": 0.995230495929718 + }, + { + "user": "1", + "item": "1300854", + "prediction": 0.9938176274299622 + }, + { + "user": "1", + "item": "77413", + "prediction": 0.9930911064147949 + }, + { + "user": "1", + "item": "1731141", + "prediction": 0.9929673671722412 + }, + { + "user": "1", + "item": "363226", + "prediction": 0.9914621710777283 + } + ] +} +``` + +### 4. 
Development + +To run the Flask API app in debug mode, edit `config.py` to set `DEBUG = True` under the application settings. You will then need to rebuild the docker image (see [step 1](#1-build-the-model)). + +### 5. Cleanup + +To stop the Docker container, type `CTRL` + `C` in your terminal. + +## Train this Model on Watson Machine Learning + +This model supports both fine-tuning with transfer learning and training from scratch on a custom dataset. Please follow the steps listed under the [training readme](training/README.md) to retrain the model on [Watson Machine Learning](https://www.ibm.com/cloud/machine-learning), a deep learning as a service offering of [IBM Cloud](https://ibm.biz/Bdz2XM). diff --git a/api/__init__.py b/api/__init__.py new file mode 100644 index 0000000..57f0448 --- /dev/null +++ b/api/__init__.py @@ -0,0 +1,18 @@ +# +# Copyright 2018-2019 IBM Corp. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from .metadata import ModelMetadataAPI # noqa +from .predict import ModelPredictAPI # noqa diff --git a/api/metadata.py b/api/metadata.py new file mode 100644 index 0000000..81393fc --- /dev/null +++ b/api/metadata.py @@ -0,0 +1,26 @@ +# +# Copyright 2018-2019 IBM Corp. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from core.model import ModelWrapper +from maxfw.core import MAX_API, MetadataAPI, METADATA_SCHEMA + + +class ModelMetadataAPI(MetadataAPI): + + @MAX_API.marshal_with(METADATA_SCHEMA) + def get(self): + """Return the metadata associated with the model""" + return ModelWrapper.MODEL_META_DATA diff --git a/api/predict.py b/api/predict.py new file mode 100644 index 0000000..ca4da22 --- /dev/null +++ b/api/predict.py @@ -0,0 +1,57 @@ +# +# Copyright 2018-2019 IBM Corp. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from core.model import ModelWrapper +from maxfw.core import MAX_API, PredictAPI +from flask_restplus import fields + +# Set up parser for input data (http://flask-restplus.readthedocs.io/en/stable/parsing.html) +input_parser = MAX_API.parser() +input_parser.add_argument('user_id', type=str, required=True, help='User ID to generate recommendations for') +input_parser.add_argument('num_results', type=int, required=False, default=5, help='Number of items to return') + + +# Creating a JSON response model: https://flask-restplus.readthedocs.io/en/stable/marshalling.html#the-api-model-factory +item_prediction = MAX_API.model('ItemPrediction', { + 'user': fields.String(required=True, description='User ID'), + 'item': fields.String(required=True, description='Item ID'), + 'prediction': fields.Float(required=True, description='Predicted score') +}) + +predict_response = MAX_API.model('ModelPredictResponse', { + 'status': fields.String(required=True, description='Response status message'), + 'predictions': fields.List(fields.Nested(item_prediction), description='Recommended items and scores') +}) + + +class ModelPredictAPI(PredictAPI): + + model_wrapper = ModelWrapper() + + @MAX_API.doc('predict') + @MAX_API.expect(input_parser) + @MAX_API.marshal_with(predict_response) + def post(self): + """Make a prediction given input data""" + result = {'status': 'error'} + + args = input_parser.parse_args() + preds = self.model_wrapper.predict(args) + + result['predictions'] = preds + result['status'] = 'ok' + + return result diff --git a/app.py b/app.py new file mode 100644 index 0000000..afa203c --- /dev/null +++ b/app.py @@ -0,0 +1,24 @@ +# +# Copyright 2018-2019 IBM Corp. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from maxfw.core import MAXApp +from api import ModelMetadataAPI, ModelPredictAPI +from config import API_TITLE, API_DESC, API_VERSION + +max = MAXApp(API_TITLE, API_DESC, API_VERSION) +max.add_api(ModelMetadataAPI, '/metadata') +max.add_api(ModelPredictAPI, '/predict') +max.run() diff --git a/config.py b/config.py new file mode 100644 index 0000000..669f693 --- /dev/null +++ b/config.py @@ -0,0 +1,41 @@ +# +# Copyright 2018-2019 IBM Corp. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# Flask settings +DEBUG = False + +# Flask-restplus settings +RESTPLUS_MASK_SWAGGER = False +SWAGGER_UI_DOC_EXPANSION = 'none' + +# API metadata +API_TITLE = 'MAX Recommender' +API_DESC = 'Generate personalized recommendations' +API_VERSION = '1.0.0' + +# default model +MODEL_NAME = 'NCF' +DEFAULT_MODEL_PATH = 'assets/{}'.format(MODEL_NAME) + + +MODEL_META_DATA = { + 'id': '{}'.format(MODEL_NAME.lower()), + 'name': API_TITLE, + 'description': API_DESC, + 'type': 'recommendation', + 'source': 'https://developer.ibm.com/exchanges/models/all/max-recommender/', + 'license': 'Apache V2' +} diff --git a/core/NCF.py b/core/NCF.py new file mode 100644 index 0000000..56d470b --- /dev/null +++ b/core/NCF.py @@ -0,0 +1,405 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +import os +import numpy as np +import tensorflow as tf +from time import time +import logging + + +logger = logging.getLogger(__name__) + + +MODEL_CHECKPOINT = "model.ckpt" + + +class NCF: + """Neural Collaborative Filtering (NCF) implementation + + Reference: + He, Xiangnan, Lizi Liao, Hanwang Zhang, Liqiang Nie, Xia Hu, and Tat-Seng Chua. "Neural collaborative filtering." + In Proceedings of the 26th International Conference on World Wide Web, pp. 173-182. International World Wide Web + Conferences Steering Committee, 2017. + + Link: https://www.comp.nus.edu.sg/~xiangnan/papers/ncf.pdf + """ + + def __init__( + self, + n_users, + n_items, + model_type="NeuMF", + n_factors=8, + layer_sizes=[16, 8, 4], + n_epochs=50, + batch_size=64, + learning_rate=5e-3, + verbose=1, + seed=None, + ): + """Constructor + + Args: + n_users (int): Number of users in the dataset. + n_items (int): Number of items in the dataset. + model_type (str): Model type. + n_factors (int): Dimension of latent space. + layer_sizes (list): Number of layers for MLP. + n_epochs (int): Number of epochs for training. + batch_size (int): Batch size. + learning_rate (float): Learning rate. + verbose (int): Whether to show the training output or not. + seed (int): Seed. 
+ + """ + + # seed + tf.set_random_seed(seed) + np.random.seed(seed) + self.seed = seed + + self.n_users = n_users + self.n_items = n_items + self.model_type = model_type.lower() + self.n_factors = n_factors + self.layer_sizes = layer_sizes + self.n_epochs = n_epochs + self.verbose = verbose + self.batch_size = batch_size + self.learning_rate = learning_rate + + # check model type + model_options = ["gmf", "mlp", "neumf"] + if self.model_type not in model_options: + raise ValueError( + "Wrong model type, please select one of this list: {}".format( + model_options + ) + ) + + # ncf layer input size + self.ncf_layer_size = n_factors + layer_sizes[-1] + # create ncf model + self._create_model() + # set GPU use with demand growth + gpu_options = tf.GPUOptions(allow_growth=True) + # set TF Session + self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) + # parameters initialization + self.sess.run(tf.global_variables_initializer()) + + def _create_model(self,): + # reset graph + tf.reset_default_graph() + + with tf.variable_scope("input_data", reuse=tf.AUTO_REUSE): + + # input: index of users, items and ground truth + self.user_input = tf.placeholder(tf.int32, shape=[None, 1]) + self.item_input = tf.placeholder(tf.int32, shape=[None, 1]) + self.labels = tf.placeholder(tf.float32, shape=[None, 1]) + + with tf.variable_scope("embedding", reuse=tf.AUTO_REUSE): + + # set embedding table + self.embedding_gmf_P = tf.Variable( + tf.truncated_normal( + shape=[self.n_users, self.n_factors], mean=0.0, stddev=0.01, seed=self.seed, + ), + name="embedding_gmf_P", + dtype=tf.float32, + ) + + self.embedding_gmf_Q = tf.Variable( + tf.truncated_normal( + shape=[self.n_items, self.n_factors], mean=0.0, stddev=0.01, seed=self.seed, + ), + name="embedding_gmf_Q", + dtype=tf.float32, + ) + + # set embedding table + self.embedding_mlp_P = tf.Variable( + tf.truncated_normal( + shape=[self.n_users, int(self.layer_sizes[0] / 2)], + mean=0.0, + stddev=0.01, + seed=self.seed, + ), + name="embedding_mlp_P", + dtype=tf.float32, + ) + + self.embedding_mlp_Q = tf.Variable( + tf.truncated_normal( + shape=[self.n_items, int(self.layer_sizes[0] / 2)], + mean=0.0, + stddev=0.01, + seed=self.seed, + ), + name="embedding_mlp_Q", + dtype=tf.float32, + ) + + with tf.variable_scope("gmf", reuse=tf.AUTO_REUSE): + + # get user embedding p and item embedding q + self.gmf_p = tf.reduce_sum( + tf.nn.embedding_lookup(self.embedding_gmf_P, self.user_input), 1 + ) + self.gmf_q = tf.reduce_sum( + tf.nn.embedding_lookup(self.embedding_gmf_Q, self.item_input), 1 + ) + + # get gmf vector + self.gmf_vector = self.gmf_p * self.gmf_q + + with tf.variable_scope("mlp", reuse=tf.AUTO_REUSE): + + # get user embedding p and item embedding q + self.mlp_p = tf.reduce_sum( + tf.nn.embedding_lookup(self.embedding_mlp_P, self.user_input), 1 + ) + self.mlp_q = tf.reduce_sum( + tf.nn.embedding_lookup(self.embedding_mlp_Q, self.item_input), 1 + ) + + # concatenate user and item vector + output = tf.concat([self.mlp_p, self.mlp_q], 1) + + # MLP Layers + for layer_size in self.layer_sizes[1:]: + output = tf.contrib.layers.fully_connected( + output, + num_outputs=layer_size, + activation_fn=tf.nn.relu, + weights_initializer=tf.contrib.layers.xavier_initializer(seed=self.seed), + ) + self.mlp_vector = output + + # self.output = tf.sigmoid(tf.reduce_sum(self.mlp_vector, axis=1, keepdims=True)) + + with tf.variable_scope("ncf", reuse=tf.AUTO_REUSE): + + if self.model_type == "gmf": + # GMF only + output = tf.contrib.layers.fully_connected( + 
self.gmf_vector, + num_outputs=1, + activation_fn=None, + biases_initializer=None, + weights_initializer=tf.contrib.layers.xavier_initializer(seed=self.seed), + ) + self.output = tf.sigmoid(output) + + elif self.model_type == "mlp": + # MLP only + output = tf.contrib.layers.fully_connected( + self.mlp_vector, + num_outputs=1, + activation_fn=None, + biases_initializer=None, + weights_initializer=tf.contrib.layers.xavier_initializer(seed=self.seed), + ) + self.output = tf.sigmoid(output) + + elif self.model_type == "neumf": + # concatenate GMF and MLP vector + self.ncf_vector = tf.concat([self.gmf_vector, self.mlp_vector], 1) + # get predicted rating score + output = tf.contrib.layers.fully_connected( + self.ncf_vector, + num_outputs=1, + activation_fn=None, + biases_initializer=None, + weights_initializer=tf.contrib.layers.xavier_initializer(seed=self.seed), + ) + self.output = tf.sigmoid(output) + + with tf.variable_scope("loss", reuse=tf.AUTO_REUSE): + + # set loss function + self.loss = tf.losses.log_loss(self.labels, self.output) + + with tf.variable_scope("optimizer", reuse=tf.AUTO_REUSE): + + # set optimizer + self.optimizer = tf.train.AdamOptimizer( + learning_rate=self.learning_rate + ).minimize(self.loss) + + def save(self, dir_name): + """Save model parameters in `dir_name` + + Args: + dir_name (str): directory name, which should be a folder name instead of file name + we will create a new directory if not existing. + """ + # save trained model + if not os.path.exists(dir_name): + os.makedirs(dir_name) + saver = tf.train.Saver() + saver.save(self.sess, os.path.join(dir_name, MODEL_CHECKPOINT)) + + def load(self, gmf_dir=None, mlp_dir=None, neumf_dir=None, alpha=0.5): + """Load model parameters for further use. + GMF model --> load parameters in `gmf_dir` + MLP model --> load parameters in `mlp_dir` + NeuMF model --> load parameters in `neumf_dir` or in `gmf_dir` and `mlp_dir` + + Args: + gmf_dir (str): Directory name for GMF model. + mlp_dir (str): Directory name for MLP model. + neumf_dir (str): Directory name for neumf model. + alpha (float): the concatenation hyper-parameter for gmf and mlp output layer. + + Returns: + obj: Load parameters in this model. + """ + + # load pre-trained model + if self.model_type == "gmf" and gmf_dir is not None: + saver = tf.train.Saver() + saver.restore(self.sess, os.path.join(gmf_dir, MODEL_CHECKPOINT)) + + elif self.model_type == "mlp" and mlp_dir is not None: + saver = tf.train.Saver() + saver.restore(self.sess, os.path.join(mlp_dir, MODEL_CHECKPOINT)) + + elif self.model_type == "neumf" and neumf_dir is not None: + saver = tf.train.Saver() + saver.restore(self.sess, os.path.join(neumf_dir, MODEL_CHECKPOINT)) + + elif self.model_type == "neumf" and gmf_dir is not None and mlp_dir is not None: + # load neumf using gmf and mlp + self._load_neumf(gmf_dir, mlp_dir, alpha) + + else: + raise NotImplementedError + + def _load_neumf(self, gmf_dir, mlp_dir, alpha): + """Load gmf and mlp model parameters for further use in NeuMF. 
+ NeuMF model --> load parameters in `gmf_dir` and `mlp_dir` + """ + # load gmf part + variables = tf.global_variables() + # get variables with 'gmf' + var_flow_restore = [ + val for val in variables if "gmf" in val.name and "ncf" not in val.name + ] + # load 'gmf' variable + saver = tf.train.Saver(var_flow_restore) + # restore + saver.restore(self.sess, os.path.join(gmf_dir, MODEL_CHECKPOINT)) + + # load mlp part + variables = tf.global_variables() + # get variables with 'gmf' + var_flow_restore = [ + val for val in variables if "mlp" in val.name and "ncf" not in val.name + ] + # load 'gmf' variable + saver = tf.train.Saver(var_flow_restore) + # restore + saver.restore(self.sess, os.path.join(mlp_dir, MODEL_CHECKPOINT)) + + # concat pretrain h_from_gmf and h_from_mlp + vars_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="ncf") + + assert len(vars_list) == 1 + ncf_fc = vars_list[0] + + # get weight from gmf and mlp + gmf_fc = tf.contrib.framework.load_variable(gmf_dir, ncf_fc.name) + mlp_fc = tf.contrib.framework.load_variable(mlp_dir, ncf_fc.name) + + # load fc layer by tf.concat + assign_op = tf.assign( + ncf_fc, tf.concat([alpha * gmf_fc, (1 - alpha) * mlp_fc], axis=0) + ) + self.sess.run(assign_op) + + def fit(self, data): + """Fit model with training data + + Args: + data (NCFDataset): initialized Dataset in ./dataset.py + """ + + # get user and item mapping dict + self.user2id = data.user2id + self.item2id = data.item2id + self.id2user = data.id2user + self.id2item = data.id2item + + # loop for n_epochs + for epoch_count in range(1, self.n_epochs + 1): + + # negative sampling for training + train_begin = time() + data.negative_sampling() + + # initialize + train_loss = [] + + # calculate loss and update NCF parameters + for user_input, item_input, labels in data.train_loader(self.batch_size): + + user_input = np.array([self.user2id[x] for x in user_input]) + item_input = np.array([self.item2id[x] for x in item_input]) + labels = np.array(labels) + + feed_dict = { + self.user_input: user_input[..., None], + self.item_input: item_input[..., None], + self.labels: labels[..., None], + } + + # get loss and execute optimization + loss, _ = self.sess.run([self.loss, self.optimizer], feed_dict) + train_loss.append(loss) + train_time = time() - train_begin + + # output every self.verbose + if self.verbose and epoch_count % self.verbose == 0: + logger.info( + "Epoch %d [%.2fs]: train_loss = %.6f " + % (epoch_count, train_time, sum(train_loss) / len(train_loss)) + ) + + def predict(self, user_input, item_input, is_list=False): + """Predict function of this trained model + + Args: + user_input (list or element of list): userID or userID list + item_input (list or element of list): itemID or itemID list + is_list (bool): if true, the input is list type + noting that list-wise type prediction is faster than element-wise's. + + Returns: + list or float: list of predicted rating or predicted rating score. 
+ """ + + if is_list: + output = self._predict(user_input, item_input) + return list(output.reshape(-1)) + + else: + output = self._predict(np.array([user_input]), np.array([item_input])) + return float(output.reshape(-1)[0]) + + def _predict(self, user_input, item_input): + + # index converting + # user_input = np.array([self.user2id[x] for x in user_input]) + # item_input = np.array([self.item2id[x] for x in item_input]) + + # get feed dict + feed_dict = { + self.user_input: user_input[..., None], + self.item_input: item_input[..., None], + } + + # calculate predicted score + return self.sess.run(self.output, feed_dict) diff --git a/core/__init__.py b/core/__init__.py new file mode 100644 index 0000000..487277e --- /dev/null +++ b/core/__init__.py @@ -0,0 +1,15 @@ +# +# Copyright 2018-2019 IBM Corp. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/core/model.py b/core/model.py new file mode 100644 index 0000000..0f0d5f8 --- /dev/null +++ b/core/model.py @@ -0,0 +1,75 @@ +# +# Copyright 2018-2019 IBM Corp. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from maxfw.model import MAXModelWrapper + +import pickle +import pandas as pd +import numpy as np +import logging +from config import DEFAULT_MODEL_PATH, MODEL_META_DATA as model_meta + +from core.NCF import NCF + +logger = logging.getLogger() + + +class ModelWrapper(MAXModelWrapper): + + MODEL_META_DATA = model_meta + + def __init__(self, path=DEFAULT_MODEL_PATH): + logger.info('Loading model from: {}...'.format(path)) + + with open('assets/user_mapping.p', 'rb') as fp: + self.user_to_id_mapping = pickle.load(fp) + + with open('assets/item_mapping.p', 'rb') as fp: + self.item_to_id_mapping = pickle.load(fp) + + with open('assets/parameters.p', 'rb') as fp: + self.parameters = pickle.load(fp) + + self.users = [user for user in self.user_to_id_mapping] + self.items = [item for item in self.item_to_id_mapping] + self.item_ids = np.array([self.item_to_id_mapping[item] for item in self.items]) + self.len_item_ids = len(self.item_ids) + + # Load the graph + self.model = NCF( + n_users=self.parameters["n_users"], + n_items=self.parameters["n_items"], + model_type="NeuMF", + n_factors=self.parameters["factors"], + layer_sizes=[16, 8, 4] + ) + self.model.load(neumf_dir="assets", alpha=0.5) + + def _pre_process(self, inp): + return inp + + def _post_process(self, result): + return result + + def _predict(self, input_args): + user = input_args['user_id'] + user_id = self.user_to_id_mapping[user] + raw_preds = self.model.predict(np.tile(user_id, self.len_item_ids), self.item_ids, is_list=True) + predictions = [[user, i, p] for i, p in zip(self.items, raw_preds)] + predictions_sorted = pd.DataFrame(predictions, columns=['user', 'item', 'prediction']) \ + .nlargest(input_args['num_results'], 'prediction') \ + .to_dict('records') + return predictions_sorted diff --git a/docs/deploy-max-to-ibm-cloud-with-kubernetes-button.png b/docs/deploy-max-to-ibm-cloud-with-kubernetes-button.png new file mode 100644 index 0000000..0b54da3 Binary files /dev/null and b/docs/deploy-max-to-ibm-cloud-with-kubernetes-button.png differ diff --git a/docs/swagger-screenshot.png b/docs/swagger-screenshot.png new file mode 100644 index 0000000..3c10ebd Binary files /dev/null and b/docs/swagger-screenshot.png differ diff --git a/max-recommender.yaml b/max-recommender.yaml new file mode 100644 index 0000000..4dbd602 --- /dev/null +++ b/max-recommender.yaml @@ -0,0 +1,32 @@ +apiVersion: v1 +kind: Service +metadata: + name: max-recommender +spec: + selector: + app: max-recommender + ports: + - port: 5000 + type: NodePort +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: max-recommender + labels: + app: max-recommender +spec: + selector: + matchLabels: + app: max-recommender + replicas: 1 + template: + metadata: + labels: + app: max-recommender + spec: + containers: + - name: max-recommender + image: codait/max-recommender:latest + ports: + - containerPort: 5000 diff --git a/requirements-test.txt b/requirements-test.txt new file mode 100644 index 0000000..547de5c --- /dev/null +++ b/requirements-test.txt @@ -0,0 +1,2 @@ +pytest +requests diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..72c1f8c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +tensorflow==1.15 +pandas==0.25.0 +numpy==1.17.4 diff --git a/sha512sums.txt b/sha512sums.txt new file mode 100644 index 0000000..e681e30 --- /dev/null +++ b/sha512sums.txt @@ -0,0 +1,8 @@ +08602c93b366bd62d2ebe366fef5f4bbc4647577bf81e47f880bbe653fdf3273191138c35d9368f3ec274ec212cd971998972c40b0ce7a8af96e96c691917870 ./assets/checkpoint 
+d97bdebb5f41d81e4dd102065a80137baf60d53a28364ef02d97850a26313797b0c7a6ef41ce12d6a386f66cf24fbe32bb46f348c8464ea39dd586fbfddb5d8d ./assets/item_mapping.p +52c8d06e5c536bec98f1bf8fad0912edbb115936f4a9234c87336e4274d9bed89700ac4b0b4fb07abb508a0093f35d9d6ada3e94045fffab9be7898267b26a0c ./assets/model.ckpt.data-00000-of-00001 +a05fed09bcc97002a3da6f50486bed4a18208d55f498be5d69d25c34ee3e4dfdc1f26a7c9cb931f2f8a00aa9e553284afbcc5b7abd7d4f697bd12393c64170a0 ./assets/model.ckpt.index +f8b30239e3cde2204e559231bc1499c4ef252075ca8263c7e4d279379ff4c03eb8ad9881416e8d9fc5e90e63a2533830840d60b31741f87b4f6ac8bd42d4ea29 ./assets/model.ckpt.meta +1a051e17af9dfe74ce908d720294a5938af98e7aced89e54c9a4c0a1460e58db7144f7266b471eaec2284c6cd10ecace87ac519a3ec6ad36cc50f5be60d51849 ./assets/parameters.json +172a99b4504f5682532fa15c9586473012f6498dc66812ce95147c18a4744a415db0eed2c776dff5cc19a493c4947c7f2504af7d57a0e4ed7d179b271bee10df ./assets/parameters.p +364589df0ce40d9add6a215ba186d61aa3ecfeb11d101782cdef14319b605cf9f0f6de34151f00b6e3d180dc9fda160ed45cfbf053aee0fd42b044a8033a4353 ./assets/user_mapping.p \ No newline at end of file diff --git a/tests/test.py b/tests/test.py new file mode 100644 index 0000000..9b10295 --- /dev/null +++ b/tests/test.py @@ -0,0 +1,65 @@ +# +# Copyright 2018-2019 IBM Corp. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import pytest +import requests + + +def test_swagger(): + + model_endpoint = 'http://localhost:5000/swagger.json' + + r = requests.get(url=model_endpoint) + assert r.status_code == 200 + assert r.headers['Content-Type'] == 'application/json' + + json = r.json() + assert 'swagger' in json + assert json.get('info') and json.get('info').get('title') == 'MAX Recommender' + + +def test_metadata(): + + model_endpoint = 'http://localhost:5000/model/metadata' + + r = requests.get(url=model_endpoint) + assert r.status_code == 200 + + metadata = r.json() + assert metadata['id'] == 'ncf' + assert metadata['name'] == 'MAX Recommender' + assert metadata['description'] == 'Generate personalized recommendations' + assert metadata['license'] == 'Apache V2' + + +def test_response(): + model_endpoint = 'http://localhost:5000/model/predict' + + data = {'user_id': "1", + 'num_results': 5} + r = requests.post(url=model_endpoint, data=data) + + assert r.status_code == 200 + response = r.json() + assert len(response['predictions']) == 5 + + assert response['status'] == 'ok' + + # add sanity checks here + + +if __name__ == '__main__': + pytest.main([__file__]) diff --git a/training/README.md b/training/README.md new file mode 100644 index 0000000..c6d7c17 --- /dev/null +++ b/training/README.md @@ -0,0 +1,271 @@ +## Train the Model with Your Own Data + +This document provides instructions to train the model on Watson Machine Learning, an offering of IBM Cloud. The instructions in this document assume that you already have an IBM Cloud account. If not, please create an [IBM Cloud](https://ibm.biz/Bdz2XM) account. 
+ +- [Prepare Data for Training](#prepare-data-for-training) +- [Train the Model](#train-the-model) +- [Rebuild the Model Serving Microservice](#rebuild-the-model-serving-microservice) + +## Prepare Data for Training + +To prepare your data for training complete the steps listed in [data_preparation/README.md](data_preparation/README.md). + +## Train the Model + +- [Install Local Prerequisites](#install-local-prerequisites) +- [Customize Training](#customize-training) +- [Run the Setup Script](#run-the-setup-script) +- [Train the Model Using Watson Machine Learning](#train-the-model-using-watson-machine-learning) + +In this document `$MODEL_REPO_HOME_DIR` refers to the cloned MAX model repository directory, e.g. +`/users/hi_there/MAX-Recommender`. + +### Install Local Prerequisites + +Open a terminal window, change dir into `$MODEL_REPO_HOME_DIR/training` and install the Python prerequisites. (Model training requires Python 3.6 or above.) + + ``` + $ cd training/ + + $ pip install -r requirements.txt + ... + ``` + + The directory contains two Python scripts, `setup_max_model_training` and `train_max_model`, which you'll use to prepare your environment for model training and to perform model training on Watson Machine Learning. + +### Run the Setup Script + +To perform model training, you need access to a Watson Machine Learning service instance and a Cloud Object Storage service instance on IBM Cloud. The `setup_max_model_training.py` script prepares your IBM Cloud resources for model training and configures your local environment. + +#### Steps + +1. Open a terminal window. + +1. Locate the training configuration file. It is named `max-recommender-training-config.yaml`. + + ``` + + $ ls *.yaml + max-recommender-training-config.yaml + ``` + +2. Run `setup_max_model_training.py` and follow the prompts to configure model training. + + ``` + $ python setup_max_model_training.py max-recommender-training-config.yaml + ... + ------------------------------------------------------------------------------ + Model training setup is complete and your configuration file was updated. + ------------------------------------------------------------------------------ + Training data bucket name : max-recommender-sample-input + Local data directory : sample_training_data/ + Training results bucket name: max-recommender-sample-output + Compute configuration : k80 + ``` + + The setup script updates the training configuration file using the information you've provided. For security reasons, confidential information, such as API keys or passwords, are _not_ stored in this file. Instead the script displays a set of environment variables that you must define to make this information available to the training script. + +3. Once setup is completed, define the displayed environment variables. The model training script `train_max_model` uses those variables to access your training resources. + + MacOS/Linux example: + + ``` + $ export ML_APIKEY=... + $ export ML_INSTANCE=... + $ export ML_ENV=... + $ export AWS_ACCESS_KEY_ID=... + $ export AWS_SECRET_ACCESS_KEY=... + ``` + + Microsoft Windows: + + ``` + $ set ML_APIKEY=... + $ set ML_INSTANCE=... + $ set ML_ENV=... + $ set AWS_ACCESS_KEY_ID=... + $ set AWS_SECRET_ACCESS_KEY=... + ``` + + > If you re-run the setup script and select a different Watson Machine Learning service instance or Cloud Object Storage service instance the displayed values will change. 
The values do not change if you modify any other configuration setting, such as the input data bucket or the compute configuration. + + +#### Set up training command + +The command that will be run in Watson Machine Learning can be found in the `training_code/train-max-model.sh` script as the variable `TRAINING_CMD` found at the top of the file. The parameters this script accepts are listed below: + +| Parameter Name | Description | Default Value | Required | +|---|---|---|---| +| data | File name | N/A | Yes | +| epoch | Number of epochs to run | 100 | No | +| batch_size | Batch size | 128 | No | +| factors | Number of latent factors | 8 | No | +| learning_rate | Learning rate | 5e-3 | No | +| delimiter | Delimiter to use when reading data | "," | No | +| hpo | Run hyperparameter optimization on a set of parameters | False | No | + +For example: + +If you wish to train the model on data contained in the file `ratings.csv` for 50 epochs, you need to make sure `TRAINING_CMD` is set to `python train_ncf.py --data ratings.csv --epoch 50` + +### Run the Setup Script + + To perform model training, you need access to a Watson Machine Learning service instance and a Cloud Object Storage service instance on IBM Cloud. The `setup_max_model_training` Python script prepares your IBM Cloud resources for model training and configures your local environment. + + #### Steps + +1. Open a terminal window. + +2. Run `setup_max_model_training` and follow the prompts to configure model training. + + ``` + $ ./setup_max_model_training max-ncf-training-config.yaml + ... + ------------------------------------------------------------------------------ + Model training setup is complete and your configuration file was updated. + ------------------------------------------------------------------------------ + Training data bucket name : sample-input + Local data directory : sample_training_data/ + Training results bucket name: sample-output + Compute configuration : k80 + ``` + + > On Microsoft Windows run `python setup_max_model_training max-ncf-training-config.yaml`. + + The setup script updates the training configuration file using the information you've provided. For security reasons, confidential information, such as API keys or passwords, are _not_ stored in this file. Instead the script displays a set of environment variables that you must define to make this information available to the training script. + +3. Once setup is completed, define the displayed environment variables. The model training script `train_max_model` uses those variables to access your training resources. + + MacOS/Linux example: + + ``` + $ export ML_APIKEY=... + $ export ML_INSTANCE=... + $ export ML_ENV=... + $ export AWS_ACCESS_KEY_ID=... + $ export AWS_SECRET_ACCESS_KEY=... + ``` + + Microsoft Windows: + + ``` + $ set ML_APIKEY=... + $ set ML_INSTANCE=... + $ set ML_ENV=... + $ set AWS_ACCESS_KEY_ID=... + $ set AWS_SECRET_ACCESS_KEY=... + ``` + + > If you re-run the setup script and select a different Watson Machine Learning service instance or Cloud Object Storage service instance the displayed values will change. The values do not change if you modify any other configuration setting, such as the input data bucket or the compute configuration. 
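+
+Before launching training on Watson Machine Learning, you may want to double-check the `TRAINING_CMD` variable in `training_code/train-max-model.sh` described under [Set up training command](#set-up-training-command). As a hypothetical example that combines several of the documented flags (the flag names come from the parameter table above; the values are only illustrative):
+
+```
+TRAINING_CMD="python train_ncf.py --data ratings.csv --epoch 50 --batch_size 256 --learning_rate 5e-3"
+```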
+ + +### Train the Model Using Watson Machine Learning + +The `train_max_model` script verifies your configuration settings, packages the model training code, uploads it to Watson Machine Learning, launches the training run, monitors the training run, and downloads the trained model artifacts. + +Complete the following steps in the terminal window where the earlier mentioned environment variables are defined. + +#### Steps + +1. Verify that the training preparation steps complete successfully. + + ``` + $ python train_max_model.py max-ncf-training-config.yaml prepare + ... + # -------------------------------------------------------- + # Checking environment variables ... + # -------------------------------------------------------- + ... + ``` + + If preparation completed successfully: + + - Training data is present in the Cloud Object Storage bucket that WML will access during model training. + - Model training code is packaged `max-ncf-model-building-code.zip` + + +2. Start model training. + + ``` + $ python train_max_model.py max-ncf-training-config.yaml package + ... + # -------------------------------------------------------- + # Starting model training ... + # -------------------------------------------------------- + Training configuration summary: + Training run name : train-max-... + Training data bucket : ... + Results bucket : ... + Model-building archive: max-ncf-model-building-code.zip + Model training was started. Training id: model-... + ... + ``` + +3. Note the displayed `Training id`. It uniquely identifies your training run in Watson Machine Learning. + +4. Monitor the model training progress. + + ``` + ... + Checking model training status every 15 seconds. Press Ctrl+C once to stop monitoring or press Ctrl+C twice to cancel training. + Status - (p)ending (r)unning (e)rror (c)ompleted or canceled: + ppppprrrrrrr... + ``` + + To **stop** monitoring (but continue model training), press `Ctrl+C` once. + + To **restart** monitoring, run the following command, replacing `` with the id that was displayed when you started model training. + + ``` + python train_max_model.py max-ncf-training-config.yaml package + ``` + + To **cancel** the training run, press `Ctrl+C` twice. + + After training has completed the training log file `training-log.txt` is downloaded along with the trained model artifacts. + + ``` + ... + # -------------------------------------------------------- + # Downloading training log file "training-log.txt" ... + # -------------------------------------------------------- + Downloading "training-.../training-log.txt" from bucket "..." to "training_output/training-log.txt" + .. + # -------------------------------------------------------- + # Downloading trained model archive "model_training_output.tar.gz" ... + # -------------------------------------------------------- + Downloading "training-.../model_training_output.tar.gz" from bucket "..." to "training_output/model_training_output.tar.gz" + .................................................................................... + ``` + + If training was terminated early due to an error only the log file is downloaded. Inspect it to identify the problem. + + ``` + $ ls training_output/ + model_training_output.tar.gz + trained_model/ + training-log.txt + +5. Return to the parent directory `$MODEL_REPO_HOME_DIR`. + + ``` + $ cd .. + ``` + +## Rebuild the Model-Serving Microservice + +1. 
[Build the Docker image](https://docs.docker.com/engine/reference/commandline/build/): + + ``` + $ docker build -t --build-arg use_pre_trained_model=false . + ... + ``` + + > If the optional parameter `use_pre_trained_model` is set to `true` or if the parameter is not defined the Docker image will be configured to serve the pre-trained model. + +2. Once the Docker image build completes start the microservice by [running the container](https://docs.docker.com/engine/reference/commandline/run/): + + ``` + $ docker run -it -p 5000:5000 + ... + ``` diff --git a/training/data_preparation/README.md b/training/data_preparation/README.md new file mode 100644 index 0000000..cefd68e --- /dev/null +++ b/training/data_preparation/README.md @@ -0,0 +1,27 @@ +## How to prepare your data for training + +Follow the instructions in this document to prepare your data for model training. +- [Prerequisites](#prerequisites) +- [Preparing your data](#preparing-your-data) +- [Organize data directory](#organize-data-directory) + +## Preparing your data + +The model trains on CSV files with the following format: `User ID, Item ID, Rating, Timestamp` + +## Organize data directory + +A trainable model should adhere to the standard directory structure below: + +``` +|-- data_directory + |-- assets + |-- data + |-- initial_model +``` + +1. `assets` holds ancillary files required for training (typically these are generated during the data preparation phase). +2. `data` folder holds the data required for training. +3. `initial_model` folder holds the initial checkpoint files to initiate training. + +If a particular directory is not required, it can be omitted. diff --git a/training/max-recommender-training-config.yaml b/training/max-recommender-training-config.yaml new file mode 100644 index 0000000..dfab25b --- /dev/null +++ b/training/max-recommender-training-config.yaml @@ -0,0 +1,50 @@ +name: MAX Recommender +model_identifier: max-recommender +description: Generate personalized recommendations +author: + name: IBM CODAIT +framework: + name: tensorflow + version: "1.15" + runtimes: + name: python + version: 3.6 +train: + model_source: + initial_model: + data_store: cloud_training_datastore + bucket: + path: + initial_model_local: + path: training_code/ + model_training_results: + trained_model: + data_store: cloud_training_datastore + bucket: + trained_model_local: + path: ../custom_assets/ + data_source: + training_data: + data_store: cloud_training_datastore + bucket: + path: + training_data_local: + path: sample_training_data/ + + mount_type: mount_cos + execution: + command: chmod +x *.sh && ./train-max-model.sh + compute_configuration: + name: k80 + +process: + - name: training_process + params: + staging_dir: training_output/ + trained_model_path: trained_model/tensorflow/checkpoint/ + +data_stores: + - name: cloud_training_datastore + type: s3 + connection: + endpoint: https://s3.us.cloud-object-storage.appdomain.cloud diff --git a/training/requirements.txt b/training/requirements.txt new file mode 100644 index 0000000..f7426d8 --- /dev/null +++ b/training/requirements.txt @@ -0,0 +1,5 @@ +# +# Install requirements for MAX model training +# pip install -r requirements.txt --upgrade +# +max-training-framework>=0.1 diff --git a/training/sample_training_data/README.md b/training/sample_training_data/README.md new file mode 100644 index 0000000..70263a7 --- /dev/null +++ b/training/sample_training_data/README.md @@ -0,0 +1,3 @@ +## Sample training data + +If this directory contains any files (other than this 
README) you can test-drive the model training process. \ No newline at end of file diff --git a/training/setup_max_model_training.py b/training/setup_max_model_training.py new file mode 100755 index 0000000..189a1d3 --- /dev/null +++ b/training/setup_max_model_training.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python +# +# Copyright 2018-2019 IBM Corp. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import sys + +# If this import fails, make sure the max-training-framework package +# was pip installed. +from max_training_framework.wml_setup import do_setup + +if __name__ == '__main__': + sys.exit(do_setup()) diff --git a/training/train_max_model.py b/training/train_max_model.py new file mode 100755 index 0000000..7abb84c --- /dev/null +++ b/training/train_max_model.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python +# +# Copyright 2018-2019 IBM Corp. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import sys + +# If this import fails, make sure the max-training-framework package +# was pip installed. +from max_training_framework.wml_train import do_train + +if __name__ == '__main__': + sys.exit(do_train()) diff --git a/training/training_code/NCF.py b/training/training_code/NCF.py new file mode 100644 index 0000000..c4cc808 --- /dev/null +++ b/training/training_code/NCF.py @@ -0,0 +1,421 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +import os +import numpy as np +import tensorflow as tf +from time import time +import logging + + +logger = logging.getLogger(__name__) + + +MODEL_CHECKPOINT = "model.ckpt" + + +class NCF: + """Neural Collaborative Filtering (NCF) implementation + + Reference: + He, Xiangnan, Lizi Liao, Hanwang Zhang, Liqiang Nie, Xia Hu, and Tat-Seng Chua. "Neural collaborative filtering." + In Proceedings of the 26th International Conference on World Wide Web, pp. 173-182. International World Wide Web + Conferences Steering Committee, 2017. + + Link: https://www.comp.nus.edu.sg/~xiangnan/papers/ncf.pdf + """ + + def __init__( + self, + n_users, + n_items, + model_type="NeuMF", + n_factors=8, + layer_sizes=[16, 8, 4], + n_epochs=50, + batch_size=64, + learning_rate=5e-3, + verbose=1, + seed=None, + ): + """Constructor + + Args: + n_users (int): Number of users in the dataset. + n_items (int): Number of items in the dataset. + model_type (str): Model type. + n_factors (int): Dimension of latent space. + layer_sizes (list): Number of layers for MLP. + n_epochs (int): Number of epochs for training. 
+ batch_size (int): Batch size. + learning_rate (float): Learning rate. + verbose (int): Whether to show the training output or not. + seed (int): Seed. + + """ + + # seed + tf.set_random_seed(seed) + np.random.seed(seed) + self.seed = seed + + self.n_users = n_users + self.n_items = n_items + self.model_type = model_type.lower() + self.n_factors = n_factors + self.layer_sizes = layer_sizes + self.n_epochs = n_epochs + self.verbose = verbose + self.batch_size = batch_size + self.learning_rate = learning_rate + + # check model type + model_options = ["gmf", "mlp", "neumf"] + if self.model_type not in model_options: + raise ValueError( + "Wrong model type, please select one of this list: {}".format( + model_options + ) + ) + + # ncf layer input size + self.ncf_layer_size = n_factors + layer_sizes[-1] + # create ncf model + self._create_model() + # set GPU use with demand growth + gpu_options = tf.GPUOptions(allow_growth=True) + # set TF Session + self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) + # parameters initialization + self.sess.run(tf.global_variables_initializer()) + + def _create_model(self,): + # reset graph + tf.reset_default_graph() + + with tf.variable_scope("input_data", reuse=tf.AUTO_REUSE): + + # input: index of users, items and ground truth + self.user_input = tf.placeholder(tf.int32, shape=[None, 1]) + self.item_input = tf.placeholder(tf.int32, shape=[None, 1]) + self.labels = tf.placeholder(tf.float32, shape=[None, 1]) + + with tf.variable_scope("embedding", reuse=tf.AUTO_REUSE): + + # set embedding table + self.embedding_gmf_P = tf.Variable( + tf.truncated_normal( + shape=[self.n_users, self.n_factors], mean=0.0, stddev=0.01, seed=self.seed, + ), + name="embedding_gmf_P", + dtype=tf.float32, + ) + + self.embedding_gmf_Q = tf.Variable( + tf.truncated_normal( + shape=[self.n_items, self.n_factors], mean=0.0, stddev=0.01, seed=self.seed, + ), + name="embedding_gmf_Q", + dtype=tf.float32, + ) + + # set embedding table + self.embedding_mlp_P = tf.Variable( + tf.truncated_normal( + shape=[self.n_users, int(self.layer_sizes[0] / 2)], + mean=0.0, + stddev=0.01, + seed=self.seed, + ), + name="embedding_mlp_P", + dtype=tf.float32, + ) + + self.embedding_mlp_Q = tf.Variable( + tf.truncated_normal( + shape=[self.n_items, int(self.layer_sizes[0] / 2)], + mean=0.0, + stddev=0.01, + seed=self.seed, + ), + name="embedding_mlp_Q", + dtype=tf.float32, + ) + + with tf.variable_scope("gmf", reuse=tf.AUTO_REUSE): + + # get user embedding p and item embedding q + self.gmf_p = tf.reduce_sum( + tf.nn.embedding_lookup( + self.embedding_gmf_P, self.user_input), 1 + ) + self.gmf_q = tf.reduce_sum( + tf.nn.embedding_lookup( + self.embedding_gmf_Q, self.item_input), 1 + ) + + # get gmf vector + self.gmf_vector = self.gmf_p * self.gmf_q + + with tf.variable_scope("mlp", reuse=tf.AUTO_REUSE): + + # get user embedding p and item embedding q + self.mlp_p = tf.reduce_sum( + tf.nn.embedding_lookup( + self.embedding_mlp_P, self.user_input), 1 + ) + self.mlp_q = tf.reduce_sum( + tf.nn.embedding_lookup( + self.embedding_mlp_Q, self.item_input), 1 + ) + + # concatenate user and item vector + output = tf.concat([self.mlp_p, self.mlp_q], 1) + + # MLP Layers + for layer_size in self.layer_sizes[1:]: + output = tf.contrib.layers.fully_connected( + output, + num_outputs=layer_size, + activation_fn=tf.nn.relu, + weights_initializer=tf.contrib.layers.xavier_initializer( + seed=self.seed), + ) + self.mlp_vector = output + + # self.output = tf.sigmoid(tf.reduce_sum(self.mlp_vector, axis=1, 
keepdims=True)) + + with tf.variable_scope("ncf", reuse=tf.AUTO_REUSE): + + if self.model_type == "gmf": + # GMF only + output = tf.contrib.layers.fully_connected( + self.gmf_vector, + num_outputs=1, + activation_fn=None, + biases_initializer=None, + weights_initializer=tf.contrib.layers.xavier_initializer( + seed=self.seed), + ) + self.output = tf.sigmoid(output) + + elif self.model_type == "mlp": + # MLP only + output = tf.contrib.layers.fully_connected( + self.mlp_vector, + num_outputs=1, + activation_fn=None, + biases_initializer=None, + weights_initializer=tf.contrib.layers.xavier_initializer( + seed=self.seed), + ) + self.output = tf.sigmoid(output) + + elif self.model_type == "neumf": + # concatenate GMF and MLP vector + self.ncf_vector = tf.concat( + [self.gmf_vector, self.mlp_vector], 1) + # get predicted rating score + output = tf.contrib.layers.fully_connected( + self.ncf_vector, + num_outputs=1, + activation_fn=None, + biases_initializer=None, + weights_initializer=tf.contrib.layers.xavier_initializer( + seed=self.seed), + ) + self.output = tf.sigmoid(output) + + with tf.variable_scope("loss", reuse=tf.AUTO_REUSE): + + # set loss function + self.loss = tf.losses.log_loss(self.labels, self.output) + + with tf.variable_scope("optimizer", reuse=tf.AUTO_REUSE): + + # set optimizer + self.optimizer = tf.train.AdamOptimizer( + learning_rate=self.learning_rate + ).minimize(self.loss) + + def save(self, dir_name): + """Save model parameters in `dir_name` + + Args: + dir_name (str): directory name, which should be a folder name instead of file name + we will create a new directory if not existing. + """ + # save trained model + if not os.path.exists(dir_name): + os.makedirs(dir_name) + saver = tf.train.Saver() + saver.save(self.sess, os.path.join(dir_name, MODEL_CHECKPOINT)) + + def load(self, gmf_dir=None, mlp_dir=None, neumf_dir=None, alpha=0.5): + """Load model parameters for further use. + GMF model --> load parameters in `gmf_dir` + MLP model --> load parameters in `mlp_dir` + NeuMF model --> load parameters in `neumf_dir` or in `gmf_dir` and `mlp_dir` + + Args: + gmf_dir (str): Directory name for GMF model. + mlp_dir (str): Directory name for MLP model. + neumf_dir (str): Directory name for neumf model. + alpha (float): the concatenation hyper-parameter for gmf and mlp output layer. + + Returns: + obj: Load parameters in this model. + """ + + # load pre-trained model + if self.model_type == "gmf" and gmf_dir is not None: + saver = tf.train.Saver() + saver.restore(self.sess, os.path.join(gmf_dir, MODEL_CHECKPOINT)) + + elif self.model_type == "mlp" and mlp_dir is not None: + saver = tf.train.Saver() + saver.restore(self.sess, os.path.join(mlp_dir, MODEL_CHECKPOINT)) + + elif self.model_type == "neumf" and neumf_dir is not None: + saver = tf.train.Saver() + saver.restore(self.sess, os.path.join(neumf_dir, MODEL_CHECKPOINT)) + + elif self.model_type == "neumf" and gmf_dir is not None and mlp_dir is not None: + # load neumf using gmf and mlp + self._load_neumf(gmf_dir, mlp_dir, alpha) + + else: + raise NotImplementedError + + def _load_neumf(self, gmf_dir, mlp_dir, alpha): + """Load gmf and mlp model parameters for further use in NeuMF. 
+ NeuMF model --> load parameters in `gmf_dir` and `mlp_dir` + """ + # load gmf part + variables = tf.global_variables() + # get variables with 'gmf' + var_flow_restore = [ + val for val in variables if "gmf" in val.name and "ncf" not in val.name + ] + # load 'gmf' variable + saver = tf.train.Saver(var_flow_restore) + # restore + saver.restore(self.sess, os.path.join(gmf_dir, MODEL_CHECKPOINT)) + + # load mlp part + variables = tf.global_variables() + # get variables with 'gmf' + var_flow_restore = [ + val for val in variables if "mlp" in val.name and "ncf" not in val.name + ] + # load 'gmf' variable + saver = tf.train.Saver(var_flow_restore) + # restore + saver.restore(self.sess, os.path.join(mlp_dir, MODEL_CHECKPOINT)) + + # concat pretrain h_from_gmf and h_from_mlp + vars_list = tf.get_collection( + tf.GraphKeys.GLOBAL_VARIABLES, scope="ncf") + + assert len(vars_list) == 1 + ncf_fc = vars_list[0] + + # get weight from gmf and mlp + gmf_fc = tf.contrib.framework.load_variable(gmf_dir, ncf_fc.name) + mlp_fc = tf.contrib.framework.load_variable(mlp_dir, ncf_fc.name) + + # load fc layer by tf.concat + assign_op = tf.assign( + ncf_fc, tf.concat([alpha * gmf_fc, (1 - alpha) * mlp_fc], axis=0) + ) + self.sess.run(assign_op) + + def fit(self, data): + """Fit model with training data + + Args: + data (NCFDataset): initilized Dataset in ./dataset.py + """ + + # get user and item mapping dict + self.user2id = data.user2id + self.item2id = data.item2id + self.id2user = data.id2user + self.id2item = data.id2item + + # loop for n_epochs + for epoch_count in range(1, self.n_epochs + 1): + + # negative sampling for training + train_begin = time() + data.negative_sampling() + + # initialize + train_loss = [] + + # calculate loss and update NCF parameters + for user_input, item_input, labels in data.train_loader(self.batch_size): + + user_input = np.array([self.user2id[x] for x in user_input]) + item_input = np.array([self.item2id[x] for x in item_input]) + labels = np.array(labels) + + feed_dict = { + self.user_input: user_input[..., None], + self.item_input: item_input[..., None], + self.labels: labels[..., None], + } + + # get loss and execute optimization + loss, _ = self.sess.run([self.loss, self.optimizer], feed_dict) + train_loss.append(loss) + train_time = time() - train_begin + + # output every self.verbose + if self.verbose and epoch_count % self.verbose == 0: + logger.info( + "Epoch %d [%.2fs]: train_loss = %.6f " + % (epoch_count, train_time, sum(train_loss) / len(train_loss)) + ) + + def predict(self, user_input, item_input, is_list=False, is_mapped=True): + """Predict function of this trained model + + Args: + user_input (list or element of list): userID or userID list + item_input (list or element of list): itemID or itemID list + is_list (bool): if true, the input is list type + noting that list-wise type prediction is faster than element-wise's. + + Returns: + list or float: list of predicted rating or predicted rating score. 
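+        Examples:
+            Illustrative sketch only -- assumes the model has already been fit on
+            a Dataset whose raw IDs include user 1 and items 10 and 11, so the
+            user2id / item2id mappings exist:
+
+            >>> model.predict(1, 10)                            # single pair -> float in (0, 1)
+            >>> model.predict([1, 1], [10, 11], is_list=True)   # batched -> list of floats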
+ """ + + if is_list: + output = self._predict(user_input, item_input, is_mapped) + return list(output.reshape(-1)) + + else: + output = self._predict( + np.array([user_input]), np.array([item_input]), is_mapped) + return float(output.reshape(-1)[0]) + + def _predict(self, user_input, item_input, is_mapped): + + # index converting + if (is_mapped): + user_input = np.array([self.user2id[x] for x in user_input]) + item_input = np.array([self.item2id[x] for x in item_input]) + + else: + user_input = np.array(user_input) + item_input = np.array(item_input) + + # get feed dict + feed_dict = { + self.user_input: user_input[..., None], + self.item_input: item_input[..., None], + } + + # calculate predicted score + return self.sess.run(self.output, feed_dict) diff --git a/training/training_code/README.md b/training/training_code/README.md new file mode 100644 index 0000000..ec6e91f --- /dev/null +++ b/training/training_code/README.md @@ -0,0 +1,8 @@ +## Overview + +The content of this directory is uploaded to a Watson Machine Learning service instance in the IBM Cloud. + +Directory content: +- `train-max-model.sh`: Main entry point. WML executes this script to train the model. +- `training_requirements.txt`: Defines that packages that will be installed before training is started. + diff --git a/training/training_code/dataset/__init__.py b/training/training_code/dataset/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/training/training_code/dataset/constants.py b/training/training_code/dataset/constants.py new file mode 100644 index 0000000..f36696c --- /dev/null +++ b/training/training_code/dataset/constants.py @@ -0,0 +1,20 @@ +# Default column names +DEFAULT_USER_COL = "userID" +DEFAULT_ITEM_COL = "itemID" +DEFAULT_RATING_COL = "rating" +DEFAULT_LABEL_COL = "label" +DEFAULT_TIMESTAMP_COL = "timestamp" +DEFAULT_PREDICTION_COL = "prediction" +COL_DICT = { + "col_user": DEFAULT_USER_COL, + "col_item": DEFAULT_ITEM_COL, + "col_rating": DEFAULT_RATING_COL, + "col_prediction": DEFAULT_PREDICTION_COL +} + +# Filtering variables +DEFAULT_K = 10 +DEFAULT_THRESHOLD = 10 + +# Other +SEED = 42 \ No newline at end of file diff --git a/training/training_code/dataset/dataset.py b/training/training_code/dataset/dataset.py new file mode 100644 index 0000000..f0b0435 --- /dev/null +++ b/training/training_code/dataset/dataset.py @@ -0,0 +1,332 @@ +import random +import numpy as np +import pandas as pd +import warnings + +from dataset.constants import ( + DEFAULT_ITEM_COL, + DEFAULT_USER_COL, + DEFAULT_RATING_COL, + DEFAULT_TIMESTAMP_COL, +) + + +class Dataset(object): + """Dataset class for NCF""" + + def __init__( + self, + train, + test=None, + n_neg=4, + n_neg_test=100, + col_user=DEFAULT_USER_COL, + col_item=DEFAULT_ITEM_COL, + col_rating=DEFAULT_RATING_COL, + col_timestamp=DEFAULT_TIMESTAMP_COL, + binary=True, + seed=None, + ): + """Constructor + + Args: + train (pd.DataFrame): Training data with at least columns (col_user, col_item, col_rating). + test (pd.DataFrame): Test data with at least columns (col_user, col_item, col_rating). test can be None, + if so, we only process the training data. + n_neg (int): Number of negative samples for training set. + n_neg_test (int): Number of negative samples for test set. + col_user (str): User column name. + col_item (str): Item column name. + col_rating (str): Rating column name. + col_timestamp (str): Timestamp column name. + binary (bool): If true, set rating > 0 to rating = 1. + seed (int): Seed. 
+ + """ + # initialize user and item index + self.user_idx = None + self.item_idx = None + # set negative sampling for training and test + self.n_neg = n_neg + self.n_neg_test = n_neg_test + # get col name of user, item and rating + self.col_user = col_user + self.col_item = col_item + self.col_rating = col_rating + self.col_timestamp = col_timestamp + # data preprocessing for training and test data + self.train, self.test = self._data_processing(train, test, binary) + # initialize negative sampling for training and test data + self._init_train_data() + self._init_test_data() + # set random seed + random.seed(seed) + + def _data_processing(self, train, test, binary): + """Process the dataset to reindex userID and itemID, also set rating as binary feedback + + Args: + train (pd.DataFrame): Training data with at least columns (col_user, col_item, col_rating). + test (pd.DataFrame): Test data with at least columns (col_user, col_item, col_rating) + test can be None, if so, we only process the training data. + binary (bool): If true, set rating>0 to rating = 1. + + Returns: + list: train and test pd.DataFrame Dataset, which have been reindexed. + + """ + # If testing dataset is None + df = train if test is None else train.append(test) + + # Reindex user and item index + if self.user_idx is None: + # Map user id + user_idx = df[[self.col_user]].drop_duplicates().reindex() + user_idx[self.col_user + "_idx"] = np.arange(len(user_idx)) + self.n_users = len(user_idx) + self.user_idx = user_idx + + self.user2id = dict( + zip(user_idx[self.col_user], user_idx[self.col_user + "_idx"]) + ) + self.id2user = {self.user2id[k]: k for k in self.user2id} + + if self.item_idx is None: + # Map item id + item_idx = df[[self.col_item]].drop_duplicates() + item_idx[self.col_item + "_idx"] = np.arange(len(item_idx)) + self.n_items = len(item_idx) + self.item_idx = item_idx + + self.item2id = dict( + zip(item_idx[self.col_item], item_idx[self.col_item + "_idx"]) + ) + self.id2item = {self.item2id[k]: k for k in self.item2id} + + return self._reindex(train, binary), self._reindex(test, binary) + + def _reindex(self, df, binary): + """Process dataset to reindex userID and itemID, also set rating as binary feedback + + Args: + df (pandas.DataFrame): dataframe with at least columns (col_user, col_item, col_rating) + binary (bool): if true, set rating>0 to rating = 1 + + Returns: + list: train and test pandas.DataFrame Dataset, which have been reindexed. 
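+        Note:
+            Illustrative example of the binary conversion: with binary=True a
+            rating of 4.0 becomes 1.0 and a rating of 0 becomes 0.0, since
+            ratings are mapped via float(x > 0).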
+ + """ + + # If testing dataset is None + if df is None: + return None + + # Map user_idx and item_idx + df = pd.merge(df, self.user_idx, on=self.col_user, how="left") + df = pd.merge(df, self.item_idx, on=self.col_item, how="left") + + # If binary feedback, set rating as 1.0 or 0.0 + if binary: + df[self.col_rating] = df[self.col_rating].apply( + lambda x: float(x > 0)) + + # Select relevant columns + df_reindex = df[ + [self.col_user + "_idx", self.col_item + "_idx", self.col_rating] + ] + df_reindex.columns = [self.col_user, self.col_item, self.col_rating] + + return df_reindex + + def _init_train_data(self): + """Return all negative items (in train dataset) and store them in self.interact_status[self.col_item + '_negative'] + store train dataset in self.users, self.items and self.ratings + + """ + + self.item_pool = set(self.train[self.col_item].unique()) + self.interact_status = ( + self.train.groupby(self.col_user)[self.col_item] + .apply(set) + .reset_index() + .rename(columns={self.col_item: self.col_item + "_interacted"}) + ) + self.interact_status[self.col_item + + "_negative"] = self.interact_status[self.col_item + + "_interacted"].apply(lambda x: self.item_pool - + x) + + self.users, self.items, self.ratings = [], [], [] + + # sample n_neg negative samples for training + for row in self.train.itertuples(): + self.users.append(int(getattr(row, self.col_user))) + self.items.append(int(getattr(row, self.col_item))) + self.ratings.append(float(getattr(row, self.col_rating))) + + self.users = np.array(self.users) + self.items = np.array(self.items) + self.ratings = np.array(self.ratings) + + def _init_test_data(self): + """Initialize self.test using 'leave-one-out' evaluation protocol in + paper https://www.comp.nus.edu.sg/~xiangnan/papers/ncf.pdf + """ + if self.test is not None: + # get test positive set for every user + test_interact_status = ( + self.test.groupby( + self.col_user)[ + self.col_item] .apply(set) .reset_index() .rename( + columns={ + self.col_item: self.col_item + + "_interacted_test"})) + + # get negative pools for every user based on training and test + # interactions + test_interact_status = pd.merge( + test_interact_status, + self.interact_status, + on=self.col_user, + how="left") + test_interact_status[ + self.col_item + "_negative" + ] = test_interact_status.apply( + lambda row: row[self.col_item + "_negative"] + - row[self.col_item + "_interacted_test"], + axis=1, + ) + test_ratings = pd.merge( + self.test, + test_interact_status[[self.col_user, + self.col_item + "_negative"]], + on=self.col_user, + how="left", + ) + + # sample n_neg_test negative samples for testing + try: + test_ratings[self.col_item + "_negative"] = test_ratings[ + self.col_item + "_negative" + ].apply(lambda x: random.sample(x, self.n_neg_test)) + + except BaseException: + min_num = min( + map(len, list(test_ratings[self.col_item + "_negative"]))) + warnings.warn( + "n_neg_test is larger than negative items set size! 
We will set n_neg as the smallest size: %d" % + min_num) + test_ratings[self.col_item + "_negative"] = test_ratings[ + self.col_item + "_negative" + ].apply(lambda x: random.sample(x, min_num)) + + self.test_data = [] + + # generate test data + for row in test_ratings.itertuples(): + self.test_users, self.test_items, self.test_ratings = [], [], [] + + self.test_users.append(int(getattr(row, self.col_user))) + self.test_items.append(int(getattr(row, self.col_item))) + self.test_ratings.append(float(getattr(row, self.col_rating))) + + for i in getattr(row, self.col_item + "_negative"): + self.test_users.append(int(getattr(row, self.col_user))) + self.test_items.append(int(i)) + self.test_ratings.append(float(0)) + + self.test_data.append( + [ + [self.id2user[x] for x in self.test_users], + [self.id2item[x] for x in self.test_items], + self.test_ratings, + ] + ) + + def negative_sampling(self): + """Sample n_neg negative items per positive item, this function should be called every epoch.""" + self.users, self.items, self.ratings = [], [], [] + + # sample n_neg negative samples for training + train_ratings = pd.merge( + self.train, + self.interact_status[[self.col_user, self.col_item + "_negative"]], + on=self.col_user, + ) + + try: + train_ratings[self.col_item + "_negative"] = train_ratings[ + self.col_item + "_negative" + ].apply(lambda x: random.sample(x, self.n_neg)) + except BaseException: + min_num = min( + map(len, list(train_ratings[self.col_item + "_negative"]))) + warnings.warn( + "n_neg is larger than negative items set size! We will set n_neg as the smallest size: %d" % + min_num) + train_ratings[self.col_item + "_negative"] = train_ratings[ + self.col_item + "_negative" + ].apply(lambda x: random.sample(x, min_num)) + + # generate training data + for row in train_ratings.itertuples(): + self.users.append(int(getattr(row, self.col_user))) + self.items.append(int(getattr(row, self.col_item))) + self.ratings.append(float(getattr(row, self.col_rating))) + for i in getattr(row, self.col_item + "_negative"): + self.users.append(int(getattr(row, self.col_user))) + self.items.append(int(i)) + self.ratings.append(float(0)) + + self.users = np.array(self.users) + self.items = np.array(self.items) + self.ratings = np.array(self.ratings) + + def train_loader(self, batch_size, shuffle=True): + """Feed train data every batch + + Args: + batch_size (int): Batch size. + shuffle (bool): Ff true, train data will be shuffled. + + Returns: + list: userID list, itemID list, rating list. + public data loader return the userID, itemID consistent with raw data + + """ + + # yield batch of training data with `shuffle` + indices = np.arange(len(self.users)) + if shuffle: + random.shuffle(indices) + for i in range(len(indices) // batch_size): + begin_idx = i * batch_size + end_idx = (i + 1) * batch_size + batch_indices = indices[begin_idx:end_idx] + + # train_loader() could be called and used by our users in other situations, + # who expect the not re-indexed data. So we convert id --> original user and item + # when returning batch + + yield [ + [self.id2user[x] for x in self.users[batch_indices]], + [self.id2item[x] for x in self.items[batch_indices]], + self.ratings[batch_indices], + ] + + def test_loader(self): + """Feed leave-one-out data every user + + Generate test batch by every positive test instance, + (eg. \[1, 2, 1\] is a positive user & item pair in test set + (\[userID, itemID, rating\] for this tuple). This function + returns like \[\[1, 2, 1\], \[1, 3, 0\], \[1,6, 0\], ...\], + ie. 
following our *leave-one-out* evaluation protocol. + + Returns: + list: userID list, itemID list, rating list. + public data loader return the userID, itemID consistent with raw data + the first (userID, itemID, rating) is the positive one + """ + for test in self.test_data: + yield test diff --git a/training/training_code/dataset/download_utils.py b/training/training_code/dataset/download_utils.py new file mode 100644 index 0000000..3f0f08f --- /dev/null +++ b/training/training_code/dataset/download_utils.py @@ -0,0 +1,79 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +import os +import logging +import requests +import math +from contextlib import contextmanager +from tempfile import TemporaryDirectory +from tqdm import tqdm + +log = logging.getLogger(__name__) + + +def maybe_download(url, filename=None, work_directory=".", expected_bytes=None): + """Download a file if it is not already downloaded. + + Args: + filename (str): File name. + work_directory (str): Working directory. + url (str): URL of the file to download. + expected_bytes (int): Expected file size in bytes. + Returns: + str: File path of the file downloaded. + """ + if filename is None: + filename = url.split("/")[-1] + filepath = os.path.join(work_directory, filename) + if not os.path.exists(filepath): + + r = requests.get(url, stream=True) + total_size = int(r.headers.get("content-length", 0)) + block_size = 1024 + num_iterables = math.ceil(total_size / block_size) + + with open(filepath, "wb") as file: + for data in tqdm( + r.iter_content(block_size), + total=num_iterables, + unit="KB", + unit_scale=True, + ): + file.write(data) + else: + log.debug("File {} already downloaded".format(filepath)) + if expected_bytes is not None: + statinfo = os.stat(filepath) + if statinfo.st_size != expected_bytes: + os.remove(filepath) + raise IOError("Failed to verify {}".format(filepath)) + + return filepath + + +@contextmanager +def download_path(path=None): + """Return a path to download data. If `path=None`, then it yields a temporal path that is eventually deleted, + otherwise the real path of the input. + + Args: + path (str): Path to download data. + + Returns: + str: Real path where the data is stored. + + Examples: + >>> with download_path() as path: + >>> ... maybe_download(url="http://example.com/file.zip", work_directory=path) + + """ + if path is None: + tmp_dir = TemporaryDirectory() + try: + yield tmp_dir.name + finally: + tmp_dir.cleanup() + else: + path = os.path.realpath(path) + yield path diff --git a/training/training_code/dataset/pandas_df_utils.py b/training/training_code/dataset/pandas_df_utils.py new file mode 100644 index 0000000..8b5f285 --- /dev/null +++ b/training/training_code/dataset/pandas_df_utils.py @@ -0,0 +1,503 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +from functools import lru_cache, wraps +import logging + +import pandas as pd +import numpy as np + +from dataset.constants import ( + DEFAULT_USER_COL, + DEFAULT_ITEM_COL, + DEFAULT_RATING_COL, + DEFAULT_LABEL_COL, +) + + +logger = logging.getLogger(__name__) + + +def user_item_pairs( + user_df, + item_df, + user_col=DEFAULT_USER_COL, + item_col=DEFAULT_ITEM_COL, + user_item_filter_df=None, + shuffle=True, + seed=None, +): + """Get all pairs of users and items data. + + Args: + user_df (pd.DataFrame): User data containing unique user ids and maybe their features. 
+ item_df (pd.DataFrame): Item data containing unique item ids and maybe their features. + user_col (str): User id column name. + item_col (str): Item id column name. + user_item_filter_df (pd.DataFrame): User-item pairs to be used as a filter. + shuffle (bool): If True, shuffles the result. + seed (int): Random seed for shuffle + + Returns: + pd.DataFrame: All pairs of user-item from user_df and item_df, excepting the pairs in user_item_filter_df + """ + + # Get all user-item pairs + user_df["key"] = 1 + item_df["key"] = 1 + users_items = user_df.merge(item_df, on="key") + + user_df.drop("key", axis=1, inplace=True) + item_df.drop("key", axis=1, inplace=True) + users_items.drop("key", axis=1, inplace=True) + + # Filter + if user_item_filter_df is not None: + users_items = filter_by( + users_items, user_item_filter_df, [user_col, item_col]) + + if shuffle: + users_items = users_items.sample(frac=1, random_state=seed).reset_index( + drop=True + ) + + return users_items + + +def filter_by(df, filter_by_df, filter_by_cols): + """From the input DataFrame (df), remove the records whose target column (filter_by_cols) values are + exist in the filter-by DataFrame (filter_by_df) + + Args: + df (pd.DataFrame): Source dataframe. + filter_by_df (pd.DataFrame): Filter dataframe. + filter_by_cols (iterable of str): Filter columns. + + Returns: + pd.DataFrame: Dataframe filtered by filter_by_df on filter_by_cols + """ + + return df.loc[ + ~df.set_index(filter_by_cols).index.isin( + filter_by_df.set_index(filter_by_cols).index + ) + ] + + +class LibffmConverter(object): + """Converts an input Dataframe (df) to another Dataframe (df) in libffm format. A text file of the converted + Dataframe is optionally generated. + + Note: + The input dataframe is expected to represent the feature data in the following schema + |field-1|field-2|...|field-n|rating| + |feature-1-1|feature-2-1|...|feature-n-1|1| + |feature-1-2|feature-2-2|...|feature-n-2|0| + ... + |feature-1-i|feature-2-j|...|feature-n-k|0| + Where + 1. each "field-*" is the column name of the dataframe (column of lable/rating is excluded), and + 2. "feature-*-*" can be either a string or a numerical value, representing the categorical variable or + actual numerical variable of the feature value in the field, respectively. + 3. If there are ordinal variables represented in int types, users should make sure these columns + are properly converted to string type. + + The above data will be converted to the libffm format by following the convention as explained in + https://www.csie.ntu.edu.tw/~r01922136/slides/ffm.pdf + + i.e., ::1 or ::, depending on + the data type of the features in the original dataframe. + + Examples: + >>> import pandas as pd + >>> df_feature = pd.DataFrame({ + 'rating': [1, 0, 0, 1, 1], + 'field1': ['xxx1', 'xxx2', 'xxx4', 'xxx4', 'xxx4'], + 'field2': [3, 4, 5, 6, 7], + 'field3': [1.0, 2.0, 3.0, 4.0, 5.0], + 'field4': ['1', '2', '3', '4', '5'] + }) + >>> converter = LibffmConveter().fit(df_feature, col_rating='rating') + >>> df_out = converter.transform(df_feature) + >>> df_out + rating field1 field2 field3 field4 + 0 1 1:1:1 2:4:3 3:5:1.0 4:4:1 + 1 0 1:2:1 2:4:4 3:5:2.0 4:5:1 + 2 0 1:3:1 2:4:5 3:5:3.0 4:6:1 + 3 1 1:3:1 2:4:6 3:5:4.0 4:7:1 + 4 1 1:3:1 2:4:7 3:5:5.0 4:8:1 + + Args: + filepath (str): path to save the converted data. 
+ + Attributes: + field_count (int): count of field in the libffm format data + feature_count (int): count of feature in the libffm format data + filepath (str or None): file path where the output is stored - it can be None or a string + """ + + def __init__(self, filepath=None): + self.filepath = filepath + self.col_rating = None + self.field_names = None + self.field_count = None + self.feature_count = None + + def fit(self, df, col_rating=DEFAULT_RATING_COL): + """Fit the dataframe for libffm format. + This method does nothing but check the validity of the input columns + + Args: + df (pd.DataFrame): input Pandas dataframe. + col_rating (str): rating of the data. + + Return: + obj: the instance of the converter + """ + + # Check column types. + types = df.dtypes + if not all( + [ + x == object or np.issubdtype(x, np.integer) or x == np.float + for x in types + ] + ): + raise TypeError( + "Input columns should be only object and/or numeric types.") + + if col_rating not in df.columns: + raise TypeError( + "Column of {} is not in input dataframe columns".format( + col_rating) + ) + + self.col_rating = col_rating + self.field_names = list(df.drop(col_rating, axis=1).columns) + + return self + + def transform(self, df): + """Tranform an input dataset with the same schema (column names and dtypes) to libffm format + by using the fitted converter. + + Args: + df (pd.DataFrame): input Pandas dataframe. + + Return: + pd.DataFrame: output libffm format dataframe. + """ + if self.col_rating not in df.columns: + raise ValueError( + "Input dataset does not contain the label column {} in the fitting dataset".format( + self.col_rating + ) + ) + + if not all([x in df.columns for x in self.field_names]): + raise ValueError( + "Not all columns in the input dataset appear in the fitting dataset" + ) + + # Encode field-feature. + idx = 1 + self.field_feature_dict = {} + for field in self.field_names: + for feature in df[field].values: + # Check whether (field, feature) tuple exists in the dict or not. + # If not, put them into the key-values of the dict and count the index. + if (field, feature) not in self.field_feature_dict: + self.field_feature_dict[(field, feature)] = idx + if df[field].dtype == object: + idx += 1 + if df[field].dtype != object: + idx += 1 + + self.field_count = len(self.field_names) + self.feature_count = idx - 1 + + def _convert(field, feature, field_index, field_feature_index_dict): + field_feature_index = field_feature_index_dict[(field, feature)] + if isinstance(feature, str): + feature = 1 + return "{}:{}:{}".format(field_index, field_feature_index, feature) + + for col_index, col in enumerate(self.field_names): + df[col] = df[col].apply( + lambda x: _convert(col, x, col_index + 1, + self.field_feature_dict) + ) + + # Move rating column to the first. + column_names = self.field_names[:] + column_names.insert(0, self.col_rating) + df = df[column_names] + + if self.filepath is not None: + np.savetxt(self.filepath, df.values, delimiter=" ", fmt="%s") + + return df + + def fit_transform(self, df, col_rating=DEFAULT_RATING_COL): + """Do fit and transform in a row + + Args: + df (pd.DataFrame): input Pandas dataframe. + col_rating (str): rating of the data. + + Return: + pd.DataFrame: output libffm format dataframe. + """ + return self.fit(df, col_rating=col_rating).transform(df) + + def get_params(self): + """Get parameters (attributes) of the libffm converter + + Return: + dict: parameters field count, feature count, and file path. 
+ """ + return { + "field count": self.field_count, + "feature count": self.feature_count, + "file path": self.filepath, + } + + +def negative_feedback_sampler( + df, + col_user=DEFAULT_USER_COL, + col_item=DEFAULT_ITEM_COL, + col_label=DEFAULT_LABEL_COL, + ratio_neg_per_user=1, + seed=42, +): + """Utility function to sample negative feedback from user-item interaction dataset. + + This negative sampling function will take the user-item interaction data to create + binarized feedback, i.e., 1 and 0 indicate positive and negative feedback, + respectively. + + Negative sampling is used in the literature frequently to generate negative samples + from a user-item interaction data. + See for example the neural collaborative filtering paper + https://www.comp.nus.edu.sg/~xiangnan/papers/ncf.pdf + + Examples: + >>> import pandas as pd + >>> df = pd.DataFrame({ + 'userID': [1, 2, 3], + 'itemID': [1, 2, 3], + 'rating': [5, 5, 5] + }) + >>> df_neg_sampled = negative_feedback_sampler( + df, col_user='userID', col_item='itemID', ratio_neg_per_user=1 + ) + >>> df_neg_sampled + userID itemID feedback + 1 1 1 + 1 2 0 + 2 2 1 + 2 1 0 + 3 3 1 + 3 1 0 + + Args: + df (pandas.DataFrame): input data that contains user-item tuples. + col_user (str): user id column name. + col_item (str): item id column name. + col_label (str): label column name. It is used for the generated columns where labels + of positive and negative feedback, i.e., 1 and 0, respectively, in the output dataframe. + ratio_neg_per_user (int): ratio of negative feedback w.r.t to the number of positive feedback for each user. + If the samples exceed the number of total possible negative feedback samples, it will be reduced to the number + of all the possible samples. + seed (int): seed for the random state of the sampling function. + + Returns: + pandas.DataFrame: data with negative feedback + """ + # Get all of the users and items. + users = df[col_user].unique() + items = df[col_item].unique() + + # Create a dataframe for all user-item pairs + df_neg = user_item_pairs( + pd.DataFrame(users, columns=[col_user]), + pd.DataFrame(items, columns=[col_item]), + user_item_filter_df=df, + ) + df_neg[col_label] = 0 + + df_pos = df.copy() + df_pos[col_label] = 1 + + df_all = pd.concat([df_pos, df_neg], ignore_index=True, sort=True) + df_all = df_all[[col_user, col_item, col_label]] + + # Sample negative feedback from the combined dataframe. 
+ df_sample = ( + df_all.groupby(col_user) + .apply( + lambda x: pd.concat( + [ + x[x[col_label] == 1], + x[x[col_label] == 0].sample( + min( + max( + round(len(x[x[col_label] == 1]) + * ratio_neg_per_user), 1 + ), + len(x[x[col_label] == 0]), + ), + random_state=seed, + replace=False, + ) + if len(x[x[col_label] == 0] > 0) + else pd.DataFrame({}, columns=[col_user, col_item, col_label]), + ], + ignore_index=True, + sort=True, + ) + ) + .reset_index(drop=True) + .sort_values(col_user) + ) + + return df_sample + + +def has_columns(df, columns): + """Check if DataFrame has necessary columns + + Args: + df (pd.DataFrame): DataFrame + columns (list(str): columns to check for + + Returns: + bool: True if DataFrame has specified columns + """ + + result = True + for column in columns: + if column not in df.columns: + logger.error("Missing column: {} in DataFrame".format(column)) + result = False + + return result + + +def has_same_base_dtype(df_1, df_2, columns=None): + """Check if specified columns have the same base dtypes across both DataFrames + + Args: + df_1 (pd.DataFrame): first DataFrame + df_2 (pd.DataFrame): second DataFrame + columns (list(str)): columns to check, None checks all columns + + Returns: + bool: True if DataFrames columns have the same base dtypes + """ + + if columns is None: + if any(set(df_1.columns).symmetric_difference(set(df_2.columns))): + logger.error( + "Cannot test all columns because they are not all shared across DataFrames" + ) + return False + columns = df_1.columns + + if not ( + has_columns(df=df_1, columns=columns) and has_columns( + df=df_2, columns=columns) + ): + return False + + result = True + for column in columns: + if df_1[column].dtype.type.__base__ != df_2[column].dtype.type.__base__: + logger.error( + "Columns {} do not have the same base datatype".format(column)) + result = False + + return result + + +class PandasHash: + """Wrapper class to allow pandas objects (DataFrames or Series) to be hashable""" + + # reserve space just for a single pandas object + __slots__ = "pandas_object" + + def __init__(self, pandas_object): + """Initialize class + Args: + pandas_object (pd.DataFrame|pd.Series): pandas object + """ + + if not isinstance(pandas_object, (pd.DataFrame, pd.Series)): + raise TypeError("Can only wrap pandas DataFrame or Series objects") + self.pandas_object = pandas_object + + def __eq__(self, other): + """Overwrite equality comparison + Args: + other (pd.DataFrame|pd.Series): pandas object to compare + + Returns: + bool: whether other object is the same as this one + """ + + return hash(self) == hash(other) + + def __hash__(self): + """Overwrite hash operator for use with pandas objects + + Returns: + int: hashed value of object + """ + + hashable = tuple(self.pandas_object.values.tobytes()) + if isinstance(self.pandas_object, pd.DataFrame): + hashable += tuple(self.pandas_object.columns) + else: + hashable += tuple(self.pandas_object.name) + return hash(hashable) + + +def lru_cache_df(maxsize, typed=False): + """Least-recently-used cache decorator + + Args: + maxsize (int|None): max size of cache, if set to None cache is boundless + typed (bool): arguments of different types are cached separately + """ + + def to_pandas_hash(val): + """Return PandaHash object if input is a DataFrame otherwise return input unchanged""" + return PandasHash(val) if isinstance(val, pd.DataFrame) else val + + def from_pandas_hash(val): + """Extract DataFrame if input is PandaHash object otherwise return input unchanged""" + return val.pandas_object if 
isinstance(val, PandasHash) else val + + def decorating_function(user_function): + @wraps(user_function) + def wrapper(*args, **kwargs): + # convert DataFrames in args and kwargs to PandaHash objects + args = tuple([to_pandas_hash(a) for a in args]) + kwargs = {k: to_pandas_hash(v) for k, v in kwargs.items()} + return cached_wrapper(*args, **kwargs) + + @lru_cache(maxsize=maxsize, typed=typed) + def cached_wrapper(*args, **kwargs): + # get DataFrames from PandaHash objects in args and kwargs + args = tuple([from_pandas_hash(a) for a in args]) + kwargs = {k: from_pandas_hash(v) for k, v in kwargs.items()} + return user_function(*args, **kwargs) + + # retain lru_cache attributes + wrapper.cache_info = cached_wrapper.cache_info + wrapper.cache_clear = cached_wrapper.cache_clear + + return wrapper + + return decorating_function diff --git a/training/training_code/dataset/python_evaluation.py b/training/training_code/dataset/python_evaluation.py new file mode 100644 index 0000000..36b155b --- /dev/null +++ b/training/training_code/dataset/python_evaluation.py @@ -0,0 +1,681 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +import numpy as np +import pandas as pd +from functools import wraps +from sklearn.metrics import ( + mean_squared_error, + mean_absolute_error, + r2_score, + explained_variance_score, + roc_auc_score, + log_loss, +) + +from dataset.constants import ( + DEFAULT_USER_COL, + DEFAULT_ITEM_COL, + DEFAULT_RATING_COL, + DEFAULT_PREDICTION_COL, + DEFAULT_K, + DEFAULT_THRESHOLD, +) +from dataset.pandas_df_utils import ( + has_columns, + has_same_base_dtype, + lru_cache_df, +) + + +def check_column_dtypes(func): + """Checks columns of DataFrame inputs + + This includes the checks on + 1. whether the input columns exist in the input DataFrames + 2. whether the data types of col_user as well as col_item are matched in the two input DataFrames. 
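+    Typical use is as a decorator on functions that take (rating_true, rating_pred,
+    col_user, col_item, col_rating, col_prediction) keyword arguments, as done for
+    merge_rating_true_pred and merge_ranking_true_pred below.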
+ + Args: + func (function): function that will be wrapped + """ + + @wraps(func) + def check_column_dtypes_wrapper( + rating_true, + rating_pred, + col_user=DEFAULT_USER_COL, + col_item=DEFAULT_ITEM_COL, + col_rating=DEFAULT_RATING_COL, + col_prediction=DEFAULT_PREDICTION_COL, + *args, + **kwargs + ): + """Check columns of DataFrame inputs + + Args: + rating_true (pd.DataFrame): True data + rating_pred (pd.DataFrame): Predicted data + col_user (str): column name for user + col_item (str): column name for item + col_rating (str): column name for rating + col_prediction (str): column name for prediction + """ + + if not has_columns(rating_true, [col_user, col_item, col_rating]): + raise ValueError("Missing columns in true rating DataFrame") + if not has_columns(rating_pred, [col_user, col_item, col_prediction]): + raise ValueError("Missing columns in predicted rating DataFrame") + if not has_same_base_dtype( + rating_true, rating_pred, columns=[col_user, col_item] + ): + raise ValueError( + "Columns in provided DataFrames are not the same datatype") + + return func( + rating_true=rating_true, + rating_pred=rating_pred, + col_user=col_user, + col_item=col_item, + col_rating=col_rating, + col_prediction=col_prediction, + *args, + **kwargs + ) + + return check_column_dtypes_wrapper + + +@check_column_dtypes +@lru_cache_df(maxsize=1) +def merge_rating_true_pred( + rating_true, + rating_pred, + col_user=DEFAULT_USER_COL, + col_item=DEFAULT_ITEM_COL, + col_rating=DEFAULT_RATING_COL, + col_prediction=DEFAULT_PREDICTION_COL, +): + """Join truth and prediction data frames on userID and itemID and return the true + and predicted rated with the correct index. + + Args: + rating_true (pd.DataFrame): True data + rating_pred (pd.DataFrame): Predicted data + col_user (str): column name for user + col_item (str): column name for item + col_rating (str): column name for rating + col_prediction (str): column name for prediction + + Returns: + np.array: Array with the true ratings + np.array: Array with the predicted ratings + + """ + + # pd.merge will apply suffixes to columns which have the same name across both dataframes + suffixes = ["_true", "_pred"] + rating_true_pred = pd.merge( + rating_true, rating_pred, on=[col_user, col_item], suffixes=suffixes + ) + if col_rating in rating_pred.columns: + col_rating = col_rating + suffixes[0] + if col_prediction in rating_true.columns: + col_prediction = col_prediction + suffixes[1] + return rating_true_pred[col_rating], rating_true_pred[col_prediction] + + +def rmse( + rating_true, + rating_pred, + col_user=DEFAULT_USER_COL, + col_item=DEFAULT_ITEM_COL, + col_rating=DEFAULT_RATING_COL, + col_prediction=DEFAULT_PREDICTION_COL, +): + """Calculate Root Mean Squared Error + + Args: + rating_true (pd.DataFrame): True data. There should be no duplicate (userID, itemID) pairs + rating_pred (pd.DataFrame): Predicted data. 
There should be no duplicate (userID, itemID) pairs + col_user (str): column name for user + col_item (str): column name for item + col_rating (str): column name for rating + col_prediction (str): column name for prediction + + Returns: + float: Root mean squared error + """ + + y_true, y_pred = merge_rating_true_pred( + rating_true=rating_true, + rating_pred=rating_pred, + col_user=col_user, + col_item=col_item, + col_rating=col_rating, + col_prediction=col_prediction, + ) + return np.sqrt(mean_squared_error(y_true, y_pred)) + + +def mae( + rating_true, + rating_pred, + col_user=DEFAULT_USER_COL, + col_item=DEFAULT_ITEM_COL, + col_rating=DEFAULT_RATING_COL, + col_prediction=DEFAULT_PREDICTION_COL, +): + """Calculate Mean Absolute Error. + + Args: + rating_true (pd.DataFrame): True data. There should be no duplicate (userID, itemID) pairs + rating_pred (pd.DataFrame): Predicted data. There should be no duplicate (userID, itemID) pairs + col_user (str): column name for user + col_item (str): column name for item + col_rating (str): column name for rating + col_prediction (str): column name for prediction + + Returns: + float: Mean Absolute Error. + """ + + y_true, y_pred = merge_rating_true_pred( + rating_true=rating_true, + rating_pred=rating_pred, + col_user=col_user, + col_item=col_item, + col_rating=col_rating, + col_prediction=col_prediction, + ) + return mean_absolute_error(y_true, y_pred) + + +def rsquared( + rating_true, + rating_pred, + col_user=DEFAULT_USER_COL, + col_item=DEFAULT_ITEM_COL, + col_rating=DEFAULT_RATING_COL, + col_prediction=DEFAULT_PREDICTION_COL, +): + """Calculate R squared + + Args: + rating_true (pd.DataFrame): True data. There should be no duplicate (userID, itemID) pairs + rating_pred (pd.DataFrame): Predicted data. There should be no duplicate (userID, itemID) pairs + col_user (str): column name for user + col_item (str): column name for item + col_rating (str): column name for rating + col_prediction (str): column name for prediction + + Returns: + float: R squared (min=0, max=1). + """ + + y_true, y_pred = merge_rating_true_pred( + rating_true=rating_true, + rating_pred=rating_pred, + col_user=col_user, + col_item=col_item, + col_rating=col_rating, + col_prediction=col_prediction, + ) + return r2_score(y_true, y_pred) + + +def exp_var( + rating_true, + rating_pred, + col_user=DEFAULT_USER_COL, + col_item=DEFAULT_ITEM_COL, + col_rating=DEFAULT_RATING_COL, + col_prediction=DEFAULT_PREDICTION_COL, +): + """Calculate explained variance. + + Args: + rating_true (pd.DataFrame): True data. There should be no duplicate (userID, itemID) pairs + rating_pred (pd.DataFrame): Predicted data. There should be no duplicate (userID, itemID) pairs + col_user (str): column name for user + col_item (str): column name for item + col_rating (str): column name for rating + col_prediction (str): column name for prediction + + Returns: + float: Explained variance (min=0, max=1). + """ + + y_true, y_pred = merge_rating_true_pred( + rating_true=rating_true, + rating_pred=rating_pred, + col_user=col_user, + col_item=col_item, + col_rating=col_rating, + col_prediction=col_prediction, + ) + return explained_variance_score(y_true, y_pred) + + +def auc( + rating_true, + rating_pred, + col_user=DEFAULT_USER_COL, + col_item=DEFAULT_ITEM_COL, + col_rating=DEFAULT_RATING_COL, + col_prediction=DEFAULT_PREDICTION_COL, +): + """Calculate the Area-Under-Curve metric for implicit feedback typed + recommender, where rating is binary and prediction is float number ranging + from 0 to 1. 
+ + https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve + + Note: + The evaluation does not require a leave-one-out scenario. + This metric does not calculate group-based AUC which considers the AUC scores + averaged across users. It is also not limited to k. Instead, it calculates the + scores on the entire prediction results regardless the users. + + Args: + rating_true (pd.DataFrame): True data + rating_pred (pd.DataFrame): Predicted data + col_user (str): column name for user + col_item (str): column name for item + col_rating (str): column name for rating + col_prediction (str): column name for prediction + + Returns: + float: auc_score (min=0, max=1) + """ + + y_true, y_pred = merge_rating_true_pred( + rating_true=rating_true, + rating_pred=rating_pred, + col_user=col_user, + col_item=col_item, + col_rating=col_rating, + col_prediction=col_prediction, + ) + return roc_auc_score(y_true, y_pred) + + +def logloss( + rating_true, + rating_pred, + col_user=DEFAULT_USER_COL, + col_item=DEFAULT_ITEM_COL, + col_rating=DEFAULT_RATING_COL, + col_prediction=DEFAULT_PREDICTION_COL, +): + """Calculate the logloss metric for implicit feedback typed + recommender, where rating is binary and prediction is float number ranging + from 0 to 1. + + https://en.wikipedia.org/wiki/Loss_functions_for_classification#Cross_entropy_loss_(Log_Loss) + + Args: + rating_true (pd.DataFrame): True data + rating_pred (pd.DataFrame): Predicted data + col_user (str): column name for user + col_item (str): column name for item + col_rating (str): column name for rating + col_prediction (str): column name for prediction + + Returns: + float: log_loss_score (min=-inf, max=inf) + """ + + y_true, y_pred = merge_rating_true_pred( + rating_true=rating_true, + rating_pred=rating_pred, + col_user=col_user, + col_item=col_item, + col_rating=col_rating, + col_prediction=col_prediction, + ) + return log_loss(y_true, y_pred) + + +@check_column_dtypes +@lru_cache_df(maxsize=1) +def merge_ranking_true_pred( + rating_true, + rating_pred, + col_user, + col_item, + col_rating, + col_prediction, + relevancy_method, + k=DEFAULT_K, + threshold=DEFAULT_THRESHOLD, +): + """Filter truth and prediction data frames on common users + + Args: + rating_true (pd.DataFrame): True DataFrame + rating_pred (pd.DataFrame): Predicted DataFrame + col_user (str): column name for user + col_item (str): column name for item + col_rating (str): column name for rating + col_prediction (str): column name for prediction + relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold'] + k (int): number of top k items per user (optional) + threshold (float): threshold of top items per user (optional) + + Returns: + pd.DataFrame, pd.DataFrame, int: + DataFrame of recommendation hits + DataFrmae of hit counts vs actual relevant items per user + number of unique user ids + """ + + # Make sure the prediction and true data frames have the same set of users + common_users = set(rating_true[col_user]).intersection( + set(rating_pred[col_user])) + rating_true_common = rating_true[rating_true[col_user].isin(common_users)] + rating_pred_common = rating_pred[rating_pred[col_user].isin(common_users)] + n_users = len(common_users) + + # Return hit items in prediction data frame with ranking information. This is used for calculating NDCG and MAP. + # Use first to generate unique ranking values for each item. 
This is to align with the implementation in + # Spark evaluation metrics, where index of each recommended items (the indices are unique to items) is used + # to calculate penalized precision of the ordered items. + if relevancy_method == "top_k": + top_k = k + elif relevancy_method == "by_threshold": + top_k = threshold + else: + raise NotImplementedError("Invalid relevancy_method") + df_hit = get_top_k_items( + dataframe=rating_pred_common, + col_user=col_user, + col_rating=col_prediction, + k=top_k, + ) + df_hit["rank"] = df_hit.groupby(col_user)[col_prediction].rank( + method="first", ascending=False + ) + df_hit = pd.merge(df_hit, rating_true_common, on=[col_user, col_item])[ + [col_user, col_item, "rank"] + ] + + # count the number of hits vs actual relevant items per user + df_hit_count = pd.merge( + df_hit.groupby(col_user, as_index=False)[ + col_user].agg({"hit": "count"}), + rating_true_common.groupby(col_user, as_index=False)[col_user].agg( + {"actual": "count"} + ), + on=col_user, + ) + + return df_hit, df_hit_count, n_users + + +def precision_at_k( + rating_true, + rating_pred, + col_user=DEFAULT_USER_COL, + col_item=DEFAULT_ITEM_COL, + col_rating=DEFAULT_RATING_COL, + col_prediction=DEFAULT_PREDICTION_COL, + relevancy_method="top_k", + k=DEFAULT_K, + threshold=DEFAULT_THRESHOLD, +): + """Precision at K. + + Note: + We use the same formula to calculate precision@k as that in Spark. + More details can be found at + http://spark.apache.org/docs/2.1.1/api/python/pyspark.mllib.html#pyspark.mllib.evaluation.RankingMetrics.precisionAt + In particular, the maximum achievable precision may be < 1, if the number of items for a + user in rating_pred is less than k. + + Args: + rating_true (pd.DataFrame): True DataFrame + rating_pred (pd.DataFrame): Predicted DataFrame + col_user (str): column name for user + col_item (str): column name for item + col_rating (str): column name for rating + col_prediction (str): column name for prediction + relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold'] + k (int): number of top k items per user + threshold (float): threshold of top items per user (optional) + + Returns: + float: precision at k (min=0, max=1) + """ + + df_hit, df_hit_count, n_users = merge_ranking_true_pred( + rating_true=rating_true, + rating_pred=rating_pred, + col_user=col_user, + col_item=col_item, + col_rating=col_rating, + col_prediction=col_prediction, + relevancy_method=relevancy_method, + k=k, + threshold=threshold, + ) + + if df_hit.shape[0] == 0: + return 0.0 + + return (df_hit_count["hit"] / k).sum() / n_users + + +def recall_at_k( + rating_true, + rating_pred, + col_user=DEFAULT_USER_COL, + col_item=DEFAULT_ITEM_COL, + col_rating=DEFAULT_RATING_COL, + col_prediction=DEFAULT_PREDICTION_COL, + relevancy_method="top_k", + k=DEFAULT_K, + threshold=DEFAULT_THRESHOLD, +): + """Recall at K. + + Args: + rating_true (pd.DataFrame): True DataFrame + rating_pred (pd.DataFrame): Predicted DataFrame + col_user (str): column name for user + col_item (str): column name for item + col_rating (str): column name for rating + col_prediction (str): column name for prediction + relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold'] + k (int): number of top k items per user + threshold (float): threshold of top items per user (optional) + + Returns: + float: recall at k (min=0, max=1). The maximum value is 1 even when fewer than + k items exist for a user in rating_true. 
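+    Examples:
+        Illustrative only (made-up data, default column names):
+
+        >>> import pandas as pd
+        >>> true = pd.DataFrame({"userID": [1, 1, 1], "itemID": [1, 2, 3], "rating": [5, 4, 3]})
+        >>> pred = pd.DataFrame({"userID": [1, 1, 1], "itemID": [3, 10, 12], "prediction": [0.9, 0.7, 0.6]})
+        >>> recall_at_k(true, pred, k=10)   # -> 1/3: one of the three relevant items was retrieved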
+ """ + + df_hit, df_hit_count, n_users = merge_ranking_true_pred( + rating_true=rating_true, + rating_pred=rating_pred, + col_user=col_user, + col_item=col_item, + col_rating=col_rating, + col_prediction=col_prediction, + relevancy_method=relevancy_method, + k=k, + threshold=threshold, + ) + + if df_hit.shape[0] == 0: + return 0.0 + + return (df_hit_count["hit"] / df_hit_count["actual"]).sum() / n_users + + +def ndcg_at_k( + rating_true, + rating_pred, + col_user=DEFAULT_USER_COL, + col_item=DEFAULT_ITEM_COL, + col_rating=DEFAULT_RATING_COL, + col_prediction=DEFAULT_PREDICTION_COL, + relevancy_method="top_k", + k=DEFAULT_K, + threshold=DEFAULT_THRESHOLD, +): + """Normalized Discounted Cumulative Gain (nDCG). + + Info: https://en.wikipedia.org/wiki/Discounted_cumulative_gain + + Args: + rating_true (pd.DataFrame): True DataFrame + rating_pred (pd.DataFrame): Predicted DataFrame + col_user (str): column name for user + col_item (str): column name for item + col_rating (str): column name for rating + col_prediction (str): column name for prediction + relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold'] + k (int): number of top k items per user + threshold (float): threshold of top items per user (optional) + + Returns: + float: nDCG at k (min=0, max=1). + """ + + df_hit, df_hit_count, n_users = merge_ranking_true_pred( + rating_true=rating_true, + rating_pred=rating_pred, + col_user=col_user, + col_item=col_item, + col_rating=col_rating, + col_prediction=col_prediction, + relevancy_method=relevancy_method, + k=k, + threshold=threshold, + ) + + if df_hit.shape[0] == 0: + return 0.0 + + # calculate discounted gain for hit items + df_dcg = df_hit.copy() + # relevance in this case is always 1 + df_dcg["dcg"] = 1 / np.log1p(df_dcg["rank"]) + # sum up discount gained to get discount cumulative gain + df_dcg = df_dcg.groupby(col_user, as_index=False).agg({"dcg": "sum"}) + # calculate ideal discounted cumulative gain + df_ndcg = pd.merge(df_dcg, df_hit_count, on=[col_user]) + df_ndcg["idcg"] = df_ndcg["actual"].apply( + lambda x: sum(1 / np.log1p(range(1, min(x, k) + 1))) + ) + + # DCG over IDCG is the normalized DCG + return (df_ndcg["dcg"] / df_ndcg["idcg"]).sum() / n_users + + +def map_at_k( + rating_true, + rating_pred, + col_user=DEFAULT_USER_COL, + col_item=DEFAULT_ITEM_COL, + col_rating=DEFAULT_RATING_COL, + col_prediction=DEFAULT_PREDICTION_COL, + relevancy_method="top_k", + k=DEFAULT_K, + threshold=DEFAULT_THRESHOLD, +): + """Mean Average Precision at k + The implementation of MAP is referenced from Spark MLlib evaluation metrics. + https://spark.apache.org/docs/2.3.0/mllib-evaluation-metrics.html#ranking-systems + + A good reference can be found at: + http://web.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf + + Note: + 1. The evaluation function is named as 'MAP is at k' because the evaluation class takes top k items for + the prediction items. The naming is different from Spark. + 2. The MAP is meant to calculate Avg. Precision for the relevant items, so it is normalized by the number of + relevant items in the ground truth data, instead of k. 
+ + Args: + rating_true (pd.DataFrame): True DataFrame + rating_pred (pd.DataFrame): Predicted DataFrame + col_user (str): column name for user + col_item (str): column name for item + col_rating (str): column name for rating + col_prediction (str): column name for prediction + relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold'] + k (int): number of top k items per user + threshold (float): threshold of top items per user (optional) + + Returns: + float: MAP at k (min=0, max=1). + """ + + df_hit, df_hit_count, n_users = merge_ranking_true_pred( + rating_true=rating_true, + rating_pred=rating_pred, + col_user=col_user, + col_item=col_item, + col_rating=col_rating, + col_prediction=col_prediction, + relevancy_method=relevancy_method, + k=k, + threshold=threshold, + ) + + if df_hit.shape[0] == 0: + return 0.0 + + # calculate reciprocal rank of items for each user and sum them up + df_hit_sorted = df_hit.sort_values([col_user, "rank"]) + df_hit_sorted["rr"] = (df_hit.groupby( + col_user).cumcount() + 1) / df_hit["rank"] + df_hit_sorted = df_hit_sorted.groupby( + col_user).agg({"rr": "sum"}).reset_index() + + df_merge = pd.merge(df_hit_sorted, df_hit_count, on=col_user) + return (df_merge["rr"] / df_merge["actual"]).sum() / n_users + + +def get_top_k_items( + dataframe, col_user=DEFAULT_USER_COL, col_rating=DEFAULT_RATING_COL, k=DEFAULT_K +): + """Get the input customer-item-rating tuple in the format of Pandas + DataFrame, output a Pandas DataFrame in the dense format of top k items + for each user. + Note: + if it is implicit rating, just append a column of constants to be + ratings. + + Args: + dataframe (pandas.DataFrame): DataFrame of rating data (in the format + customerID-itemID-rating) + col_user (str): column name for user + col_rating (str): column name for rating + k (int): number of items for each user + + Returns: + pd.DataFrame: DataFrame of top k items for each user + """ + + return ( + dataframe.groupby(col_user, as_index=False) + .apply(lambda x: x.nlargest(k, col_rating)) + .reset_index(drop=True) + ) + + +"""Function name and function mapper. +Useful when we have to serialize evaluation metric names +and call the functions based on deserialized names""" +metrics = { + rmse.__name__: rmse, + mae.__name__: mae, + rsquared.__name__: rsquared, + exp_var.__name__: exp_var, + precision_at_k.__name__: precision_at_k, + recall_at_k.__name__: recall_at_k, + ndcg_at_k.__name__: ndcg_at_k, + map_at_k.__name__: map_at_k, +} diff --git a/training/training_code/dataset/python_splitters.py b/training/training_code/dataset/python_splitters.py new file mode 100644 index 0000000..9e60be6 --- /dev/null +++ b/training/training_code/dataset/python_splitters.py @@ -0,0 +1,279 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +import numpy as np +import pandas as pd +from sklearn.model_selection import train_test_split as sk_split + +from dataset.constants import ( + DEFAULT_ITEM_COL, + DEFAULT_USER_COL, + DEFAULT_TIMESTAMP_COL, +) +from dataset.split_utils import ( + process_split_ratio, + min_rating_filter_pandas, + split_pandas_data_with_ratios, +) + + +def python_random_split(data, ratio=0.75, seed=42): + """Pandas random splitter + The splitter randomly splits the input data. + + Args: + data (pd.DataFrame): Pandas DataFrame to be split. + ratio (float or list): Ratio for splitting data. 
If it is a single float number + it splits data into two halves and the ratio argument indicates the ratio + of training data set; if it is a list of float numbers, the splitter splits + data into several portions corresponding to the split ratios. If a list is + provided and the ratios are not summed to 1, they will be normalized. + seed (int): Seed. + + Returns: + list: Splits of the input data as pd.DataFrame. + """ + multi_split, ratio = process_split_ratio(ratio) + + if multi_split: + splits = split_pandas_data_with_ratios(data, ratio, shuffle=True, seed=seed) + splits_new = [x.drop('split_index', axis=1) for x in splits] + + return splits_new + else: + return sk_split(data, test_size=None, train_size=ratio, random_state=seed) + + +def _do_stratification( + data, + ratio=0.75, + min_rating=1, + filter_by="user", + is_random=True, + seed=42, + col_user=DEFAULT_USER_COL, + col_item=DEFAULT_ITEM_COL, + col_timestamp=DEFAULT_TIMESTAMP_COL, +): + # A few preliminary checks. + if not (filter_by == "user" or filter_by == "item"): + raise ValueError("filter_by should be either 'user' or 'item'.") + + if min_rating < 1: + raise ValueError("min_rating should be integer and larger than or equal to 1.") + + if col_user not in data.columns: + raise ValueError("Schema of data not valid. Missing User Col") + + if col_item not in data.columns: + raise ValueError("Schema of data not valid. Missing Item Col") + + if not is_random: + if col_timestamp not in data.columns: + raise ValueError("Schema of data not valid. Missing Timestamp Col") + + multi_split, ratio = process_split_ratio(ratio) + + split_by_column = col_user if filter_by == "user" else col_item + + ratio = ratio if multi_split else [ratio, 1 - ratio] + + if min_rating > 1: + data = min_rating_filter_pandas( + data, + min_rating=min_rating, + filter_by=filter_by, + col_user=col_user, + col_item=col_item, + ) + + # Split by each group and aggregate splits together. + splits = [] + + # If it is for chronological splitting, the split will be performed in a random way. + df_grouped = ( + data.sort_values(col_timestamp).groupby(split_by_column) + if is_random is False + else data.groupby(split_by_column) + ) + + for name, group in df_grouped: + group_splits = split_pandas_data_with_ratios( + df_grouped.get_group(name), ratio, shuffle=is_random, seed=seed + ) + + # Concatenate the list of split dataframes. + concat_group_splits = pd.concat(group_splits) + + splits.append(concat_group_splits) + + # Concatenate splits for all the groups together. + splits_all = pd.concat(splits) + + # Take split by split_index + splits_list = [ + splits_all[splits_all["split_index"] == x].drop("split_index", axis=1) + for x in range(len(ratio)) + ] + + return splits_list + + +def python_chrono_split( + data, + ratio=0.75, + min_rating=1, + filter_by="user", + col_user=DEFAULT_USER_COL, + col_item=DEFAULT_ITEM_COL, + col_timestamp=DEFAULT_TIMESTAMP_COL, +): + """Pandas chronological splitter + This function splits data in a chronological manner. That is, for each user / item, the + split function takes proportions of ratings which is specified by the split ratio(s). + The split is stratified. + + Args: + data (pd.DataFrame): Pandas DataFrame to be split. + ratio (float or list): Ratio for splitting data. If it is a single float number + it splits data into two halves and the ratio argument indicates the ratio of + training data set; if it is a list of float numbers, the splitter splits + data into several portions corresponding to the split ratios. 
If a list is + provided and the ratios are not summed to 1, they will be normalized. + seed (int): Seed. + min_rating (int): minimum number of ratings for user or item. + filter_by (str): either "user" or "item", depending on which of the two is to + filter with min_rating. + col_user (str): column name of user IDs. + col_item (str): column name of item IDs. + col_timestamp (str): column name of timestamps. + + Returns: + list: Splits of the input data as pd.DataFrame. + """ + return _do_stratification( + data, + ratio=ratio, + min_rating=min_rating, + filter_by=filter_by, + col_user=col_user, + col_item=col_item, + col_timestamp=col_timestamp, + is_random=False + ) + + +def python_stratified_split( + data, + ratio=0.75, + min_rating=1, + filter_by="user", + col_user=DEFAULT_USER_COL, + col_item=DEFAULT_ITEM_COL, + seed=42, +): + """Pandas stratified splitter + For each user / item, the split function takes proportions of ratings which is + specified by the split ratio(s). The split is stratified. + + Args: + data (pd.DataFrame): Pandas DataFrame to be split. + ratio (float or list): Ratio for splitting data. If it is a single float number + it splits data into two halves and the ratio argument indicates the ratio of + training data set; if it is a list of float numbers, the splitter splits + data into several portions corresponding to the split ratios. If a list is + provided and the ratios are not summed to 1, they will be normalized. + seed (int): Seed. + min_rating (int): minimum number of ratings for user or item. + filter_by (str): either "user" or "item", depending on which of the two is to + filter with min_rating. + col_user (str): column name of user IDs. + col_item (str): column name of item IDs. + + Returns: + list: Splits of the input data as pd.DataFrame. + """ + return _do_stratification( + data, + ratio=ratio, + min_rating=min_rating, + filter_by=filter_by, + col_user=col_user, + col_item=col_item, + is_random=True, + seed=seed + ) + +def numpy_stratified_split(X, ratio=0.75, seed=42): + + """ + Split the user/item affinity matrix (sparse matrix) into train and test set matrices while maintaining + local (i.e. per user) ratios. + + Args: + X (np.array, int): a sparse matrix to be split + ratio (float): fraction of the entire dataset to constitute the train set + seed (int): random seed + + Returns: + Xtr (np.array, int): train set user/item affinity matrix + Xtst (np.array, int): test set user/item affinity matrix + + Basic mechanics: + Main points : + + 1. In a typical recommender problem, different users rate a different number of items, + and therefore the user/affinity matrix has a sparse structure with variable number + of zeroes (unrated items) per row (user). Cutting a total amount of ratings will + result in a non-homogeneous distribution between train and test set, i.e. some test + users may have many ratings while other very little if none. + + 2. In an unsupervised learning problem, no explicit answer is given. For this reason + the split needs to be implemented in a different way then in supervised learningself. + In the latter, one typically split the dataset by rows (by examples), ending up with + the same number of features but different number of examples in the train/test setself. + This scheme does not work in the unsupervised case, as part of the rated items needs to + be used as a test set for fixed number of users. + + Solution: + + 1. Instead of cutting a total percentage, for each user we cut a relative ratio of the rated + items. 
For example, if user1 has rated 4 items and user2 10, cutting 25% will correspond to + 1 and 2.6 ratings in the test set, approximated as 1 and 3 according to the round() function. + In this way, the 0.75 ratio is satisfied both locally and globally, preserving the original + distribution of ratings across the train and test set. + + 2. It is easy (and fast) to satisfy this requirements by creating the test via element subtraction + from the original dataset X. We first create two copies of X; for each user we select a random + sample of local size ratio (point 1) and erase the remaining ratings, obtaining in this way the + train set matrix Xtst. The train set matrix is obtained in the opposite way. + """ + + np.random.seed(seed) # set the random seed + test_cut = int((1 - ratio) * 100) # percentage of ratings to go in the test set + + # initialize train and test set matrices + Xtr = X.copy() + Xtst = X.copy() + + # find the number of rated movies per user + rated = np.sum(Xtr != 0, axis=1) + + # for each user, cut down a test_size% for the test set + tst = np.around((rated * test_cut) / 100).astype(int) + + for u in range(X.shape[0]): + # For each user obtain the index of rated movies + idx = np.asarray(np.where(Xtr[u] != 0))[0].tolist() + + # extract a random subset of size n from the set of rated movies without repetition + idx_tst = np.random.choice(idx, tst[u], replace=False) + idx_train = list(set(idx).difference(set(idx_tst))) + + # change the selected rated movies to unrated in the train set + Xtr[u, idx_tst] = 0 + # set the movies that appear already in the train set as 0 + Xtst[u, idx_train] = 0 + + del idx, idx_train, idx_tst + + return Xtr, Xtst diff --git a/training/training_code/dataset/spark_evaluation.py b/training/training_code/dataset/spark_evaluation.py new file mode 100644 index 0000000..e2e5fa7 --- /dev/null +++ b/training/training_code/dataset/spark_evaluation.py @@ -0,0 +1,483 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + + +try: + from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics + from pyspark.sql import Window, DataFrame + from pyspark.sql.functions import col, row_number, expr + import pyspark.sql.functions as F +except ImportError: + pass # skip this import if we are in pure python environment + +from dataset.constants import ( + DEFAULT_PREDICTION_COL, + DEFAULT_USER_COL, + DEFAULT_ITEM_COL, + DEFAULT_RATING_COL, + DEFAULT_TIMESTAMP_COL, + DEFAULT_K, + DEFAULT_THRESHOLD, +) + + +class SparkRatingEvaluation: + """Spark Rating Evaluator""" + + def __init__( + self, + rating_true, + rating_pred, + col_user=DEFAULT_USER_COL, + col_item=DEFAULT_ITEM_COL, + col_rating=DEFAULT_RATING_COL, + col_prediction=DEFAULT_PREDICTION_COL, + ): + """Initializer. + + This is the Spark version of rating metrics evaluator. + The methods of this class, calculate rating metrics such as root mean squared error, mean absolute error, + R squared, and explained variance. + + Args: + rating_true (spark.DataFrame): True labels. + rating_pred (spark.DataFrame): Predicted labels. + col_user (str): column name for user. + col_item (str): column name for item. + col_rating (str): column name for rating. + col_prediction (str): column name for prediction. + """ + self.rating_true = rating_true + self.rating_pred = rating_pred + self.col_user = col_user + self.col_item = col_item + self.col_rating = col_rating + self.col_prediction = col_prediction + + # Check if inputs are Spark DataFrames. 
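+        # (Note: pandas DataFrames are rejected by these checks; convert them
+        # first, e.g. with spark.createDataFrame(df), before building the evaluator.)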
+ if not isinstance(self.rating_true, DataFrame): + raise TypeError( + "rating_true should be but is not a Spark DataFrame" + ) # pragma : No Cover + + if not isinstance(self.rating_pred, DataFrame): + raise TypeError( + "rating_pred should be but is not a Spark DataFrame" + ) # pragma : No Cover + + # Check if columns exist. + true_columns = self.rating_true.columns + pred_columns = self.rating_pred.columns + + if rating_true.count() == 0: + raise ValueError("Empty input dataframe") + if rating_pred.count() == 0: + raise ValueError("Empty input dataframe") + + if self.col_user not in true_columns: + raise ValueError( + "Schema of rating_true not valid. Missing User Col") + if self.col_item not in true_columns: + raise ValueError( + "Schema of rating_true not valid. Missing Item Col") + if self.col_rating not in true_columns: + raise ValueError( + "Schema of rating_true not valid. Missing Rating Col") + + if self.col_user not in pred_columns: + raise ValueError( + "Schema of rating_pred not valid. Missing User Col" + ) # pragma : No Cover + if self.col_item not in pred_columns: + raise ValueError( + "Schema of rating_pred not valid. Missing Item Col" + ) # pragma : No Cover + if self.col_prediction not in pred_columns: + raise ValueError( + "Schema of rating_pred not valid. Missing Prediction Col") + + self.rating_true = self.rating_true.select( + col(self.col_user).cast("double"), + col(self.col_item).cast("double"), + col(self.col_rating).cast("double").alias("label"), + ) + self.rating_pred = self.rating_pred.select( + col(self.col_user).cast("double"), + col(self.col_item).cast("double"), + col(self.col_prediction).cast("double").alias("prediction"), + ) + + self.y_pred_true = ( + self.rating_true.join( + self.rating_pred, [self.col_user, self.col_item], "inner" + ) + .drop(self.col_user) + .drop(self.col_item) + ) + + self.metrics = RegressionMetrics( + self.y_pred_true.rdd.map(lambda x: (x.prediction, x.label)) + ) + + def rmse(self): + """Calculate Root Mean Squared Error. + + Returns: + float: Root mean squared error. + """ + return self.metrics.rootMeanSquaredError + + def mae(self): + """Calculate Mean Absolute Error. + + Returns: + float: Mean Absolute Error. + """ + return self.metrics.meanAbsoluteError + + def rsquared(self): + """Calculate R squared + Returns: + float: R squared + """ + return self.metrics.r2 + + def exp_var(self): + """Calculate explained variance. + + NOTE: + Spark MLLib's implementation is buggy (can lead to values > 1), hence we use var(). + + Returns: + float: Explained variance (min=0, max=1). + """ + var1 = self.y_pred_true.selectExpr("variance(label - prediction)").collect()[0][ + 0 + ] + var2 = self.y_pred_true.selectExpr("variance(label)").collect()[0][0] + return 1 - var1 / var2 + + +class SparkRankingEvaluation: + """SparkRankingEvaluation""" + + def __init__( + self, + rating_true, + rating_pred, + k=DEFAULT_K, + relevancy_method="top_k", + col_user=DEFAULT_USER_COL, + col_item=DEFAULT_ITEM_COL, + col_rating=DEFAULT_RATING_COL, + col_prediction=DEFAULT_PREDICTION_COL, + threshold=DEFAULT_THRESHOLD, + ): + """Initialization. + This is the Spark version of ranking metrics evaluator. + The methods of this class, calculate ranking metrics such as precision@k, recall@k, ndcg@k, and mean average + precision. + + The implementations of precision@k, ndcg@k, and mean average precision are referenced from Spark MLlib, which + can be found at `here `_. 
+ + Args: + rating_true (spark.DataFrame): DataFrame of true rating data (in the + format of customerID-itemID-rating tuple). + rating_pred (spark.DataFrame): DataFrame of predicted rating data (in + the format of customerID-itemID-rating tuple). + col_user (str): column name for user. + col_item (str): column name for item. + col_rating (str): column name for rating. + col_prediction (str): column name for prediction. + k (int): number of items to recommend to each user. + relevancy_method (str): method for determining relevant items. Possible + values are "top_k", "by_time_stamp", and "by_threshold". + threshold (float): threshold for determining the relevant recommended items. + This is used for the case that predicted ratings follow a known + distribution. NOTE: this option is only activated if relevancy_method is + set to "by_threshold". + """ + self.rating_true = rating_true + self.rating_pred = rating_pred + self.col_user = col_user + self.col_item = col_item + self.col_rating = col_rating + self.col_prediction = col_prediction + self.threshold = threshold + + # Check if inputs are Spark DataFrames. + if not isinstance(self.rating_true, DataFrame): + raise TypeError( + "rating_true should be but is not a Spark DataFrame" + ) # pragma : No Cover + + if not isinstance(self.rating_pred, DataFrame): + raise TypeError( + "rating_pred should be but is not a Spark DataFrame" + ) # pragma : No Cover + + # Check if columns exist. + true_columns = self.rating_true.columns + pred_columns = self.rating_pred.columns + + if self.col_user not in true_columns: + raise ValueError( + "Schema of rating_true not valid. Missing User Col: " + + str(true_columns) + ) + if self.col_item not in true_columns: + raise ValueError( + "Schema of rating_true not valid. Missing Item Col") + if self.col_rating not in true_columns: + raise ValueError( + "Schema of rating_true not valid. Missing Rating Col") + + if self.col_user not in pred_columns: + raise ValueError( + "Schema of rating_pred not valid. Missing User Col" + ) # pragma : No Cover + if self.col_item not in pred_columns: + raise ValueError( + "Schema of rating_pred not valid. Missing Item Col" + ) # pragma : No Cover + if self.col_prediction not in pred_columns: + raise ValueError( + "Schema of rating_pred not valid. 
Missing Prediction Col") + + self.k = k + + relevant_func = { + "top_k": _get_top_k_items, + "by_time_stamp": _get_relevant_items_by_timestamp, + "by_threshold": _get_relevant_items_by_threshold, + } + + if relevancy_method not in relevant_func: + raise ValueError( + "relevancy_method should be one of {}".format( + list(relevant_func.keys()) + ) + ) + + self.rating_pred = ( + relevant_func[relevancy_method]( + dataframe=self.rating_pred, + col_user=self.col_user, + col_item=self.col_item, + col_rating=self.col_prediction, + threshold=self.threshold, + ) + if relevancy_method == "by_threshold" + else relevant_func[relevancy_method]( + dataframe=self.rating_pred, + col_user=self.col_user, + col_item=self.col_item, + col_rating=self.col_prediction, + k=self.k, + ) + ) + + self._metrics = self._calculate_metrics() + + def _calculate_metrics(self): + """Calculate ranking metrics.""" + self._items_for_user_pred = self.rating_pred + + self._items_for_user_true = ( + self.rating_true.groupBy(self.col_user) + .agg(expr("collect_list(" + self.col_item + ") as ground_truth")) + .select(self.col_user, "ground_truth") + ) + + self._items_for_user_all = self._items_for_user_pred.join( + self._items_for_user_true, on=self.col_user + ).drop(self.col_user) + + return RankingMetrics(self._items_for_user_all.rdd) + + def precision_at_k(self): + """Get precision@k. + + NOTE: + More details can be found `here `_. + + Return: + float: precision at k (min=0, max=1) + """ + precision = self._metrics.precisionAt(self.k) + + return precision + + def recall_at_k(self): + """Get recall@K. + + NOTE: + More details can be found `here `_. + + Return: + float: recall at k (min=0, max=1). + """ + recall = self._items_for_user_all.rdd.map( + lambda x: float( + len(set(x[0]).intersection(set(x[1])))) / float(len(x[1])) + ).mean() + + return recall + + def ndcg_at_k(self): + """Get Normalized Discounted Cumulative Gain (NDCG) + + NOTE: + More details can be found `here `_. + + Return: + float: nDCG at k (min=0, max=1). + """ + ndcg = self._metrics.ndcgAt(self.k) + + return ndcg + + def map_at_k(self): + """Get mean average precision at k. + + NOTE: + More details can be found `here `_. + + Return: + float: MAP at k (min=0, max=1). + """ + maprecision = self._metrics.meanAveragePrecision + + return maprecision + + +def _get_top_k_items( + dataframe, + col_user=DEFAULT_USER_COL, + col_item=DEFAULT_ITEM_COL, + col_rating=DEFAULT_RATING_COL, + col_prediction=DEFAULT_PREDICTION_COL, + k=DEFAULT_K, +): + """Get the input customer-item-rating tuple in the format of Spark + DataFrame, output a Spark DataFrame in the dense format of top k items + for each user. + + NOTE: + if it is implicit rating, just append a column of constants to be ratings. + + Args: + dataframe (spark.DataFrame): DataFrame of rating data (in the format of + customerID-itemID-rating tuple). + col_user (str): column name for user. + col_item (str): column name for item. + col_rating (str): column name for rating. + col_prediction (str): column name for prediction. + k (int): number of items for each user. + + Return: + spark.DataFrame: DataFrame of top k items for each user. + """ + window_spec = Window.partitionBy(col_user).orderBy(col(col_rating).desc()) + + # this does not work for rating of the same value. 
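+    # (row_number() breaks ties arbitrarily, so items with identical ratings may
+    # be ordered differently between runs; rank() would preserve ties but can
+    # return more than k items per user)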
+ items_for_user = ( + dataframe.select( + col_user, col_item, col_rating, row_number().over(window_spec).alias("rank") + ) + .where(col("rank") <= k) + .groupby(col_user) + .agg(F.collect_list(col_item).alias(col_prediction)) + ) + + return items_for_user + + +def _get_relevant_items_by_threshold( + dataframe, + col_user=DEFAULT_USER_COL, + col_item=DEFAULT_ITEM_COL, + col_rating=DEFAULT_RATING_COL, + col_prediction=DEFAULT_PREDICTION_COL, + threshold=DEFAULT_THRESHOLD, +): + """Get relevant items for each customer in the input rating data. + + Relevant items are defined as those having ratings above certain threshold. + The threshold is defined as a statistical measure of the ratings for a + user, e.g., median. + + Args: + dataframe: Spark DataFrame of customerID-itemID-rating tuples. + col_user (str): column name for user. + col_item (str): column name for item. + col_rating (str): column name for rating. + col_prediction (str): column name for prediction. + threshold (float): threshold for determining the relevant recommended items. + This is used for the case that predicted ratings follow a known + distribution. + + Return: + spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant + items. + """ + items_for_user = ( + dataframe.orderBy(col_rating, ascending=False) + .where(col_rating + " >= " + str(threshold)) + .select(col_user, col_item, col_rating) + .withColumn( + col_prediction, F.collect_list(col_item).over( + Window.partitionBy(col_user)) + ) + .select(col_user, col_prediction) + .dropDuplicates() + ) + + return items_for_user + + +def _get_relevant_items_by_timestamp( + dataframe, + col_user=DEFAULT_USER_COL, + col_item=DEFAULT_ITEM_COL, + col_rating=DEFAULT_RATING_COL, + col_timestamp=DEFAULT_TIMESTAMP_COL, + col_prediction=DEFAULT_PREDICTION_COL, + k=DEFAULT_K, +): + """Get relevant items for each customer defined by timestamp. + + Relevant items are defined as k items that appear mostly recently + according to timestamps. + + Args: + dataframe (spark.DataFrame): A Spark DataFrame of customerID-itemID-rating-timeStamp + tuples. + col_user (str): column name for user. + col_item (str): column name for item. + col_rating (str): column name for rating. + col_timestamp (str): column name for timestamp. + col_prediction (str): column name for prediction. + k: number of relevent items to be filtered by the function. + + Return: + spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant items. + """ + window_spec = Window.partitionBy( + col_user).orderBy(col(col_timestamp).desc()) + + items_for_user = ( + dataframe.select( + col_user, col_item, col_rating, row_number().over(window_spec).alias("rank") + ) + .where(col("rank") <= k) + .withColumn( + col_prediction, F.collect_list(col_item).over( + Window.partitionBy(col_user)) + ) + .select(col_user, col_prediction) + .dropDuplicates([col_user, col_prediction]) + ) + + return items_for_user diff --git a/training/training_code/dataset/sparse.py b/training/training_code/dataset/sparse.py new file mode 100644 index 0000000..e87a860 --- /dev/null +++ b/training/training_code/dataset/sparse.py @@ -0,0 +1,197 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+ +""" +Generate the user/item affinity matrix from a pandas dataframe and vice versa +""" + +import pandas as pd +import numpy as np +import itertools + +from scipy.sparse import coo_matrix +import logging + +# import default parameters +from reco_utils.common.constants import ( + DEFAULT_USER_COL, + DEFAULT_ITEM_COL, + DEFAULT_RATING_COL, + DEFAULT_PREDICTION_COL, +) + +# for logging +log = logging.getLogger(__name__) + + +class AffinityMatrix: + # initialize class parameters + def __init__( + self, + DF, + col_user=DEFAULT_USER_COL, + col_item=DEFAULT_ITEM_COL, + col_rating=DEFAULT_RATING_COL, + col_pred=DEFAULT_PREDICTION_COL, + save_path=None, + ): + """Generate the user/item affinity matrix from a pandas dataframe and vice versa + + Args: + DF (pd.DataFrame): a dataframe containing the data + col_user (str): default name for user column + col_item (str): default name for item column + col_rating (str): default name for rating columns + save_path (str): default path to save item/user maps + + """ + self.df = DF # dataframe + + # pandas DF parameters + self.col_item = col_item + self.col_user = col_user + self.col_rating = col_rating + self.col_pred = col_pred + + # Options to save the model for future use + self.save_path = save_path + + def _gen_index(self): + """ + Generate the user/item index: + map_users, map_items: dictionaries mapping the original user/item index to matrix indices + map_back_users, map_back_items: dictionaries to map back the matrix elements to the original + dataframe indices + + Basic mechanics: + As a first step we retieve the unique elements in the dataset. In this way we can take care + of either completely missing rows (a user with no ratings) or completely missing columns + (an item that has not being reviewed by anyone). The original indices in the dataframe are + then mapped to an ordered, contiguous integer series to generate a compact matrix representation. + + Functions to map back to the original indices are also provided and can be saved in order to use + a pretrained model. + + """ + # sort entries by user index + self.df_ = self.df.sort_values(by=[self.col_user]) + + # find unique user and item index + unique_users = self.df_[self.col_user].unique() + unique_items = self.df_[self.col_item].unique() + + self.Nusers = len(unique_users) + self.Nitems = len(unique_items) + + # create a dictionary to map unique users/items to hashed values to generate the matrix + self.map_users = {x: i for i, x in enumerate(unique_users)} + self.map_items = {x: i for i, x in enumerate(unique_items)} + + # map back functions used to get back the original dataframe + self.map_back_users = {i: x for i, x in enumerate(unique_users)} + self.map_back_items = {i: x for i, x in enumerate(unique_items)} + + self.df_.loc[:, "hashedItems"] = self.df_[ + self.col_item].map(self.map_items) + self.df_.loc[:, "hashedUsers"] = self.df_[ + self.col_user].map(self.map_users) + + # optionally save the inverse dictionary to work with trained models + if self.save_path is not None: + + np.save(self.save_path + "/user_dict", self.map_users) + np.save(self.save_path + "/item_dict", self.map_items) + + np.save(self.save_path + "/user_back_dict", self.map_back_users) + np.save(self.save_path + "/item_back_dict", self.map_back_items) + + def gen_affinity_matrix(self): + """ + Generate the user/item affinity matrix + + Returns: + AM: user-affinity matrix of dimensions (Nusers, Nitems) in numpy format. Unrated movies + are assigned a value of 0. 
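+            (Illustrative example: three users rating two items would produce a
+            3 x 2 dense array, with 0 wherever a user has not rated an item.)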
+ + Basic mechanics: + As a first step, two new columns are added to the input DF, containing the index maps + generated by the gen_index() method. The new indices, together with the ratings, are + then used to generate the user/item affinity matrix using scipy's sparse matrix method + coo_matrix; for reference see: + + https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.coo_matrix.html + + The input format is: coo_matrix((data, (rows, columns)), shape=(rows, columns)) + """ + + log.info("Generating the user/item affinity matrix...") + + self._gen_index() + + ratings = self.df_[self.col_rating] # ratings + itm_id = self.df_["hashedItems"] # itm_id serving as columns + usr_id = self.df_["hashedUsers"] # usr_id serving as rows + + # generate a sparse matrix representation using scipy's coo_matrix and convert to array format + self.AM = coo_matrix( + (ratings, (usr_id, itm_id)), shape=(self.Nusers, self.Nitems) + ).toarray() + + # ---------------------print the degree of sparsness of the matrix------------------------------ + + zero = (self.AM == 0).sum() # number of unrated items + # number of elements in the matrix + total = self.AM.shape[0] * self.AM.shape[1] + sparsness = zero / total * 100 # Percentage of zeros in the matrix + + log.info("Matrix generated, sparseness percentage: %d" % sparsness) + + return self.AM + + def map_back_sparse(self, X, kind): + """ + Map back the user/affinity matrix to a pd dataframe + + Args: + X (np.array, int32): user/item affinity matrix + kind (string): specify if the output values are ratings or predictions + + Returns: + out_df (pandas dataframe): the generated pandas dataframe + + """ + + m, n = X.shape + + # 1) Create a DF from a sparse matrix + # obtain the non zero items + items = [np.asanyarray(np.where(X[i, :] != 0)).flatten() + for i in range(m)] + ratings = [X[i, items[i]] + for i in range(m)] # obtain the non-zero ratings + + # Creates user ids following the DF format + userids = [] + for i in range(0, m): + userids.extend([i] * len(items[i])) + + # Flatten the lists to follow the DF input format + items = list(itertools.chain.from_iterable(items)) + ratings = list(itertools.chain.from_iterable(ratings)) + + if kind == "ratings": + col_out = self.col_rating + else: + col_out = self.col_pred + + # create a df + out_df = pd.DataFrame.from_dict( + {self.col_user: userids, self.col_item: items, col_out: ratings} + ) + + # 2) map back user/item ids to their original value + + out_df[self.col_user] = out_df[self.col_user].map(self.map_back_users) + out_df[self.col_item] = out_df[self.col_item].map(self.map_back_items) + + return out_df diff --git a/training/training_code/dataset/split_utils.py b/training/training_code/dataset/split_utils.py new file mode 100644 index 0000000..1864bcb --- /dev/null +++ b/training/training_code/dataset/split_utils.py @@ -0,0 +1,167 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +import numpy as np + +from dataset.constants import DEFAULT_ITEM_COL, DEFAULT_USER_COL + +try: + from pyspark.sql.functions import col, broadcast +except ImportError: + pass # so the environment without spark doesn't break + + +def process_split_ratio(ratio): + """Generate split ratio lists + + Args: + ratio (float or list): a float number that indicates split ratio or a list of float + numbers that indicate split ratios (if it is a multi-split). + + Returns: + tuple: a tuple containing + bool: A boolean variable multi that indicates if the splitting is multi or single. 
+ list: A list of normalized split ratios. + """ + if isinstance(ratio, float): + if ratio <= 0 or ratio >= 1: + raise ValueError("Split ratio has to be between 0 and 1") + + multi = False + elif isinstance(ratio, list): + if any([x <= 0 for x in ratio]): + raise ValueError( + "All split ratios in the ratio list should be larger than 0." + ) + + # normalize split ratios if they are not summed to 1 + if sum(ratio) != 1.0: + ratio = [x / sum(ratio) for x in ratio] + + multi = True + else: + raise TypeError("Split ratio should be either float or a list of floats.") + + return multi, ratio + + +def min_rating_filter_pandas( + data, + min_rating=1, + filter_by="user", + col_user=DEFAULT_USER_COL, + col_item=DEFAULT_ITEM_COL, +): + """Filter rating DataFrame for each user with minimum rating. + Filter rating data frame with minimum number of ratings for user/item is usually useful to + generate a new data frame with warm user/item. The warmth is defined by min_rating argument. For + example, a user is called warm if he has rated at least 4 items. + + Args: + data (pd.DataFrame): DataFrame of user-item tuples. Columns of user and item + should be present in the DataFrame while other columns like rating, + timestamp, etc. can be optional. + min_rating (int): minimum number of ratings for user or item. + filter_by (str): either "user" or "item", depending on which of the two is to + filter with min_rating. + col_user (str): column name of user ID. + col_item (str): column name of item ID. + + Returns: + pd.DataFrame: DataFrame with at least columns of user and item that has been + filtered by the given specifications. + """ + split_by_column, _ = _check_min_rating_filter( + filter_by, min_rating, col_user, col_item + ) + rating_filtered = data.groupby(split_by_column).filter( + lambda x: len(x) >= min_rating + ) + return rating_filtered + + +def min_rating_filter_spark( + data, + min_rating=1, + filter_by="user", + col_user=DEFAULT_USER_COL, + col_item=DEFAULT_ITEM_COL, +): + """Filter rating DataFrame for each user with minimum rating. + Filter rating data frame with minimum number of ratings for user/item is usually useful to + generate a new data frame with warm user/item. The warmth is defined by min_rating argument. For + example, a user is called warm if he has rated at least 4 items. + + Args: + data (spark.DataFrame): DataFrame of user-item tuples. Columns of user and item + should be present in the DataFrame while other columns like rating, + timestamp, etc. can be optional. + min_rating (int): minimum number of ratings for user or item. + filter_by (str): either "user" or "item", depending on which of the two is to + filter with min_rating. + col_user (str): column name of user ID. + col_item (str): column name of item ID. + + Returns: + spark.DataFrame: DataFrame with at least columns of user and item that has been + filtered by the given specifications. 
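+
+    Example (illustrative, assuming a Spark DataFrame `ratings` that uses the
+    default user/item column names):
+        warm_users = min_rating_filter_spark(ratings, min_rating=5, filter_by="user")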
+ """ + split_by_column, split_with_column = _check_min_rating_filter( + filter_by, min_rating, col_user, col_item + ) + rating_temp = ( + data.groupBy(split_by_column) + .agg({split_with_column: "count"}) + .withColumnRenamed("count(" + split_with_column + ")", "n" + split_with_column) + .where(col("n" + split_with_column) >= min_rating) + ) + + rating_filtered = data.join(broadcast(rating_temp), split_by_column).drop( + "n" + split_with_column + ) + return rating_filtered + + +def _check_min_rating_filter(filter_by, min_rating, col_user, col_item): + if not (filter_by == "user" or filter_by == "item"): + raise ValueError("filter_by should be either 'user' or 'item'.") + + if min_rating < 1: + raise ValueError("min_rating should be integer and larger than or equal to 1.") + + split_by_column = col_user if filter_by == "user" else col_item + split_with_column = col_item if filter_by == "user" else col_user + return split_by_column, split_with_column + + +def split_pandas_data_with_ratios(data, ratios, seed=42, shuffle=False): + """Helper function to split pandas DataFrame with given ratios + + Note: + Implementation referenced from + https://stackoverflow.com/questions/38250710/how-to-split-data-into-3-sets-train-validation-and-test + + Args: + data (pd.DataFrame): Pandas data frame to be split. + ratios (list of floats): list of ratios for split. The ratios have to sum to 1. + seed (int): random seed. + shuffle (bool): whether data will be shuffled when being split. + + Returns: + list: List of pd.DataFrame split by the given specifications. + """ + if sum(ratios) != 1.0: + raise ValueError("The ratios have to sum to 1") + + split_index = np.cumsum(ratios).tolist()[:-1] + + if shuffle: + data = data.sample(frac=1, random_state=seed) + + splits = np.split(data, [round(x * len(data)) for x in split_index]) + + # Add split index (this makes splitting by group more efficient). + for i in range(len(ratios)): + splits[i]["split_index"] = i + + return splits diff --git a/training/training_code/evaluate.py b/training/training_code/evaluate.py new file mode 100644 index 0000000..d864875 --- /dev/null +++ b/training/training_code/evaluate.py @@ -0,0 +1,145 @@ +# +# Copyright 2018-2019 IBM Corp. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + + +import pickle + +import pandas as pd +from pyspark.sql import SparkSession + +from dataset.python_splitters import python_chrono_split +from dataset.python_evaluation import (map_at_k, ndcg_at_k,) +from dataset.spark_evaluation import SparkRankingEvaluation +from NCF import NCF + + +def create_dataset(data_path, checkpoint_path): + + # header = ("itemID", "userID", "rating", "timestamp") + # df = pd.read_csv( + # data_path, + # engine="python", + # names=header, + # header=1, + # dtype={'itemID': str, 'userID':str, 'rating':int, 'timestamp':str} + # ) + + header = ("userID", "itemID", "rating", "timestamp") + df = pd.read_csv( + data_path, + engine="python", + names=header, + header=None + ) + + with open(checkpoint_path + '/user_mapping.p', 'rb') as fp: + user2id = pickle.load(fp) + + with open(checkpoint_path + '/item_mapping.p', 'rb') as fp: + item2id = pickle.load(fp) + + # Converting items/users to IDs to store ints instead of str objects + df['itemID'] = df['itemID'].apply(lambda item: item2id[item]) + df['userID'] = df['userID'].apply(lambda user: user2id[user]) + + train, test = python_chrono_split(df, 0.80) + return (train, test) + + +def load_model(data, checkpoint_path): + + with open(checkpoint_path + '/parameters.p', 'rb') as fp: + parameters = pickle.load(fp) + + print(parameters) + + model = NCF( + n_users=parameters["n_users"], + n_items=parameters["n_items"], + model_type="NeuMF", + n_factors=parameters["factors"], + layer_sizes=[16, 8, 4] + ) + + model.load(neumf_dir=checkpoint_path, alpha=0.5) + + with open(checkpoint_path + '/user_mapping.p', 'rb') as fp: + model.user2id = pickle.load(fp) + + with open(checkpoint_path + '/item_mapping.p', 'rb') as fp: + model.item2id = pickle.load(fp) + + return model + + +def get_predictions(model, train, test): + # Columns not needed, dropping to save memory + # try: + # train = train.drop(['timestamp'], axis=1) + # test = test.drop(['timestamp'], axis=1) + # except AnalysisException: + # pass + + users, items, preds = [], [], [] + item = list(train.itemID.unique()) + for user in train.userID.unique(): + user = [user] * len(item) + users.extend(user) + items.extend(item) + preds.extend(list(model.predict(user, item, is_list=True, is_mapped=False))) + + all_predictions = pd.DataFrame(data={"userID": users, "itemID": items, "prediction": preds}) + merged = pd.merge(train, all_predictions, on=["userID", "itemID"], how="outer") + all_predictions = merged[merged.rating.isnull()].drop('rating', axis=1) + + return all_predictions + + +def evaluate_model(model, train, test): + + all_predictions = get_predictions(model, train, test) + merged = pd.merge(train, all_predictions, on=["userID", "itemID"], how="outer") + all_predictions = merged[merged.rating.isnull()].drop('rating', axis=1) + + TOP_K = 10 + eval_map = map_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K) + eval_ndcg = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K) + + print("MAP:\t%f" % eval_map) + print("NDCG:\t%f" % eval_ndcg) + + +def evaluate_model_spark(model, train, test): + + all_predictions = get_predictions(model, train, test) + + spark = SparkSession.builder \ + .config("spark.driver.memory", '32g') \ + .config("spark.executor.memory", '32g') \ + .getOrCreate() + spark.conf.set("spark.sql.execution.arrow.enabled", "true") + + test_df = spark.createDataFrame(test) + predictions_df = spark.createDataFrame(all_predictions) + + TOP_K = 10 + evaluations = SparkRankingEvaluation(test_df, predictions_df, k=TOP_K) + eval_map = 
evaluations.map_at_k() + eval_ndcg = evaluations.ndcg_at_k() + + print("MAP:\t%f" % eval_map) + print("NDCG:\t%f" % eval_ndcg) + return(eval_map, eval_ndcg) diff --git a/training/training_code/grid_search.py b/training/training_code/grid_search.py new file mode 100644 index 0000000..94d77be --- /dev/null +++ b/training/training_code/grid_search.py @@ -0,0 +1,60 @@ +# +# Copyright 2018-2019 IBM Corp. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from sklearn.model_selection import ParameterGrid + + +class GridSearch: + + def __init__(self, model_fn, param_grid, scoring_fn): + self.model_fn = model_fn + self.param_grid = param_grid + self.scoring_fn = scoring_fn + + def run(self, data): + # TODO Verify data has a test split + # start_time = 0 + param_defs = ParameterGrid(self.param_grid) + results = [] + max_score = 0.0 + max_params = None + + # Create models with all definitions and save results + for params in param_defs: + params["n_users"] = data.n_users + params["n_items"] = data.n_items + model = self.model_fn(**params) + scores = self._fit_and_score(model, data) + results.append((params, scores)) + + for result in results: + # TODO Pass in key for metric we want to maximize + print(result[0]) + print("\t Result: " + str(result[1])) + if result[1][0] > max_score: + max_score = result[1][0] + max_params = result[0] + + print("Training complete, best performing parameters:") + del max_params["n_users"] + del max_params["n_items"] + print(max_params) + return max_params + + def _fit_and_score(self, model, data): + model.fit(data) + scores = self.scoring_fn(model, data.train, data.test) + return scores diff --git a/training/training_code/train-max-model.sh b/training/training_code/train-max-model.sh new file mode 100755 index 0000000..90f9a1d --- /dev/null +++ b/training/training_code/train-max-model.sh @@ -0,0 +1,257 @@ +#!/bin/bash +# +# Copyright 2018-2019 IBM Corp. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# uncomment to enable debug output +#set -x + +# -------------------------------------------------------------------- +# Standard training wrapper script for Model Asset Exchange models +# Complete the following IBM customization steps and remove the TODO +# comments. 
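+#
+# Illustrative local invocation (paths are placeholders; when run under WML,
+# DATA_DIR and RESULT_DIR are provided for you):
+#   DATA_DIR=/path/to/data RESULT_DIR=/path/to/output ./train-max-model.sh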
+# -------------------------------------------------------------------- + +SUCCESS_RETURN_CODE=0 +TRAINING_FAILED_RETURN_CODE=1 +POST_PROCESSING_FAILED=2 +PACKAGING_FAILED_RETURN_CODE=3 +CUSTOMIZATION_ERROR_RETURN_CODE=4 +ENV_ERROR_RETURN_CODE=5 +PIP_FAILED_RETURN_CODE=6 + +# -------------------------------------------------------------------- +# Verify that the required environment variables are defined +# -------------------------------------------------------------------- + +# DATA_DIR identifies the directory where the training data is located. +# The specified directory must exist and be readable. +if [ -z ${DATA_DIR+x} ]; then + echo "Error. Environment variable DATA_DIR is not defined." + exit $ENV_ERROR_RETURN_CODE +fi + +if [ ! -d ${DATA_DIR} ]; then + echo "Error. Environment variable DATA_DIR (\"$DATA_DIR\") does not identify an existing directory." + exit $ENV_ERROR_RETURN_CODE +fi + +# RESULT_DIR identifies the directory where the training output is stored. +# The specified directory must exist and be writable. +if [ -z ${RESULT_DIR+x} ]; then + echo "Error. Environment variable RESULT_DIR is not defined." + exit $ENV_ERROR_RETURN_CODE +fi + +if [ ! -d ${RESULT_DIR} ]; then + echo "Error. Environment variable RESULT_DIR (\"$RESULT_DIR\") does not identify an existing directory." + exit $ENV_ERROR_RETURN_CODE +fi + +# --------------------------------------------------------------- +# Perform pre-training tasks +# (1) Verify that environment variables are defined +# (2) Install prerequisite packages +# --------------------------------------------------------------- + +echo "# ************************************************************" +echo "# Preparing for model training" +echo "# ************************************************************" + +# Prior to launching this script, WML copies the training data from +# Cloud Object Storage to the $DATA_DIR directory. Use this environment +# variable to access the data. +echo "Training data is stored in $DATA_DIR" + +# The WML stores work files in the $RESULT_DIR. +echo "Training work files and results will be stored in $RESULT_DIR" + +# Install prerequisite packages +# IBM TODO: add required packages to the file +# +echo "Installing the following prerequisite packages ..." +cat training_requirements.txt + +pip install --user --no-deps -r training_requirements.txt + +RETURN_CODE=$? + +# display installed packages to aid with troubleshooting +echo "Installed Python packages:" +pip freeze +echo "----------------------------------------------------" + +if [ $RETURN_CODE -gt 0 ]; then + # pip install returned an error; exit with PIP_FAILED_RETURN_CODE + echo "Error: pip install exited with status code $RETURN_CODE" + exit $PIP_FAILED_RETURN_CODE +fi + + +# --------------------------------------------------------------- +# Perform model training tasks +# --------------------------------------------------------------- + +# Important: Trained model artifacts must be stored in ${RESULT_DIR}/model +# Make sure the directory exists +mkdir -p ${RESULT_DIR}/model + +echo "# ************************************************************" +echo "# Training model ..." +echo "# ************************************************************" + +TRAINING_CMD="python train_ncf.py" + +# display training command +echo "Running training command \"$TRAINING_CMD\"" + +# run training command +$TRAINING_CMD + +# capture return code +RETURN_CODE=$? 
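+# ($? holds the exit status of the most recently executed command, i.e. the
+# training run above)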
+if [ $RETURN_CODE -gt 0 ]; then + # the training script returned an error; exit with TRAINING_FAILED_RETURN_CODE + echo "Error: Training run exited with status code $RETURN_CODE" + exit $TRAINING_FAILED_RETURN_CODE +fi + +echo "Training completed. Output is stored in $RESULT_DIR." + +# --------------------------------------------------------------- +# IBM TODO: +# Add post processing code as necessary; for example +# - patch the TensorFlow checkpoint file (if applicable) +# - convert the trained model into other formats +# - ... +# --------------------------------------------------------------- + +echo "# ************************************************************" +echo "# Post processing ..." +echo "# ************************************************************" + +# according to WML coding guidelines the trained model should be +# saved in ${RESULT_DIR}/model +cd ${RESULT_DIR}/model + +# +# Post processing for serialized TensorFlow models: +# If the output of the training run is a TensorFlow checkpoint, patch it. +# + +if [ -d ${RESULT_DIR}/model/checkpoint ]; then + # the training run created a directory named checkpoint + if [ -f ${RESULT_DIR}/model/checkpoint/checkpoint ]; then + # this directory contains a checkpoint file; patch it + mv ${RESULT_DIR}/model/checkpoint/checkpoint ${RESULT_DIR}/model/checkpoint/checkpoint.bak + sed 's:/.*/::g' ${RESULT_DIR}/model/checkpoint/checkpoint.bak > ${RESULT_DIR}/model/checkpoint/checkpoint + if [ $? -gt 0 ]; then + echo "[Post processing] Warning. Patch of TensorFlow checkpoint file failed. " + mv ${RESULT_DIR}/model/checkpoint/checkpoint.bak ${RESULT_DIR}/model/checkpoint/checkpoint + else + echo "[Post processing] TensorFlow checkpoint file was successfully patched." + rm ${RESULT_DIR}/model/checkpoint/checkpoint.bak + fi + fi +fi + +# +# TODO: add custom code if required; e.g. to convert the +# trained model into other formats ... +# + +# --------------------------------------------------------------- +# Prepare for packaging +# (1) create the staging directory structure +# (2) copy the trained model artifacts +# --------------------------------------------------------------- + +cd ${RESULT_DIR} + +BASE_STAGING_DIR=${RESULT_DIR}/output +# subdirectory where trained model artifacts will be stored +TRAINING_STAGING_DIR=${BASE_STAGING_DIR}/trained_model + +# +# 1. make the directories +# +mkdir -p $TRAINING_STAGING_DIR + +MODEL_ARTIFACT_TARGET_PATH=${TRAINING_STAGING_DIR}/tensorflow/checkpoint + + +if [ -z ${MODEL_ARTIFACT_TARGET_PATH+x} ]; + then "Error. This script was not correctly customized." + exit $CUSTOMIZATION_ERROR_RETURN_CODE +fi +mkdir -p $MODEL_ARTIFACT_TARGET_PATH + +if [ -d ${RESULT_DIR}/model/checkpoint ]; then + cp -R ${RESULT_DIR}/model/checkpoint ${TRAINING_STAGING_DIR}/tensorflow/ +fi + + +# The following files should now be present in BASE_STAGING_DIR +# trained_model///file1 +# trained_model//subdirectory/file2 +# trained_model//file3 +# trained_model//file4 +# ... 
+# Example: +# trained_model/tensorflow/checkpoint/checkpoint +# trained_model/tensorflow/checkpoint/DCGAN.model-21.meta +# trained_model/tensorflow/checkpoint/DCGAN.model-21.index +# trained_model/tensorflow/checkpoint/DCGAN.model-21.data-00000-of-00001 +# trained_model/tensorflow/frozen_graph_def/frozen_inference_graph.pb + +# ---------------------------------------------------------------------- +# Create a compressed TAR archive containing files from $BASE_STAGING_DIR +# NO CODE CUSTOMIZATION SHOULD BE REQUIRED BEYOND THIS POINT +# ---------------------------------------------------------------------- + +echo "# ************************************************************" +echo "# Packaging artifacts" +echo "# ************************************************************" + +# standardized archive name; do NOT change +OUTPUT_ARCHIVE=${RESULT_DIR}/model_training_output.tar.gz + +CWD=`pwd` +cd $BASE_STAGING_DIR +# Create compressed archive from $BASE_STAGING_DIR +echo "Creating downloadable archive \"$OUTPUT_ARCHIVE\"." +tar cvfz ${OUTPUT_ARCHIVE} . +RETURN_CODE=$? +if [ $RETURN_CODE -gt 0 ]; then + # the tar command returned an error; exit with PACKAGING_FAILED_RETURN_CODE + echo "Error: Packaging command exited with status code $RETURN_CODE." + exit $PACKAGING_FAILED_RETURN_CODE +fi +cd $CWD + +# remove the staging directory +rm -rf $BASE_STAGING_DIR + +echo "Model training and packaging completed." +exit $SUCCESS_RETURN_CODE + +# +# Expected result: +# - $OUTPUT_ARCHIVE contains +# trained_model///file1 +# trained_model//subdirectory/file2 +# trained_model//file3 +# trained_model//file4 +# ... diff --git a/training/training_code/train_ncf.py b/training/training_code/train_ncf.py new file mode 100644 index 0000000..7bbd609 --- /dev/null +++ b/training/training_code/train_ncf.py @@ -0,0 +1,127 @@ +# +# Copyright 2018-2019 IBM Corp. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import os +import json +import pickle + +import pandas as pd +import tensorflow as tf + +from NCF import NCF +from dataset.dataset import Dataset +from dataset.python_splitters import python_chrono_split +from evaluate import evaluate_model_spark +from grid_search import GridSearch + +flags = tf.app.flags +flags.DEFINE_string("data", ".", "Path to data file") +flags.DEFINE_integer("epoch", 100, "Epoch to train [100]") +flags.DEFINE_integer("batch_size", 128, "The size of batch [128]") +flags.DEFINE_integer("factors", 8, "The number of latent factors [8]") +flags.DEFINE_float("learning_rate", 5e-3, "The learning rate [5e-3]") +flags.DEFINE_boolean( + "hpo", False, "Enable hyperparameter optimization [False]") +flags.DEFINE_string("delimiter", ",", "") + +header = ("userID", "itemID", "rating", "timestamp") + + +def create_dataset(data_path, split=0.0): + df = pd.read_csv( + data_path, + engine="python", + names=header, + header=1, + sep=flags.FLAGS.delimiter + ) + + if split == 0.0: + return Dataset(df) + else: + train, test = python_chrono_split(df, split) + return Dataset(train, test) + + +def train_model(data, checkpoint_path, model_type="NeuMF", n_factors=flags.FLAGS.factors, layer_sizes=[16, 8, 4], + n_epochs=flags.FLAGS.epoch, batch_size=flags.FLAGS.batch_size, learning_rate=flags.FLAGS.learning_rate,): + + parameters = flags.FLAGS.flag_values_dict() + parameters["n_users"] = data.n_users + parameters["n_items"] = data.n_items + + model = NCF( + n_users=data.n_users, + n_items=data.n_items, + model_type=model_type, + n_factors=n_factors, + layer_sizes=layer_sizes, + n_epochs=n_epochs, + batch_size=batch_size, + learning_rate=learning_rate + ) + + model.fit(data) + model.save(dir_name=checkpoint_path) + + # Save ID mapping + with open(checkpoint_path + '/user_mapping.p', 'wb') as fp: + pickle.dump(model.user2id, fp, protocol=pickle.HIGHEST_PROTOCOL) + + with open(checkpoint_path + '/item_mapping.p', 'wb') as fp: + pickle.dump(model.item2id, fp, protocol=pickle.HIGHEST_PROTOCOL) + + # Save parameters + with open(checkpoint_path + '/parameters.p', 'wb') as fp: + pickle.dump(parameters, fp, protocol=pickle.HIGHEST_PROTOCOL) + + with open(checkpoint_path + '/parameters.json', 'w') as fp: + json.dump(parameters, fp) + + return model + + +def main(): + + model_path = "" + data_path = "" + + if "RESULT_DIR" in os.environ: + model_path = os.environ["RESULT_DIR"] + if "DATA_DIR" in os.environ: + data_path = os.environ["DATA_DIR"] + + checkpoint_path = os.path.join(model_path, "model", "checkpoint") + data_path = os.path.join(data_path, flags.FLAGS.data) + + data = create_dataset(data_path, split=0.8) + + if flags.FLAGS.hpo: + # Check if HPO flags set + print("Running hyperparameter optimization") + params = {"learning_rate": [1e-3, 5e-3, 1e-2], + "n_factors": [8, 16, 32], "n_epochs": [50, 100]} + grid = GridSearch(model_fn=NCF, param_grid=params, + scoring_fn=evaluate_model_spark) + optimized_params = grid.run(data) + full_data = create_dataset(data_path) + train_model(full_data, checkpoint_path, **optimized_params) + + else: + train_model(data, checkpoint_path) + + +if __name__ == "__main__": + main() diff --git a/training/training_code/training_requirements.txt b/training/training_code/training_requirements.txt new file mode 100644 index 0000000..d6b5676 --- /dev/null +++ b/training/training_code/training_requirements.txt @@ -0,0 +1,4 @@ +pandas>=0.23.4 +numpy>=1.13.3 +pyspark>=2.3.1 +pyarrow>=0.8.0 \ No newline at end of file