Motokolearn is a Motoko package meant to facilitate on-chain training and inference of machine learning models where having a large GPU is not a requirement.
-
Web3 services using these on-chain trained models can inherit security and verification capabilities from the underlying Internet Computer protocol
-
Overall dapp architecture can be simplified by eliminating dependencies on external web2 providers and/or avoiding the use of pre-compiled WASM modules of pre-trained machine learning models
-
Small to medium size data problems involving heterogeneous "tabular" data are often better solved with ensembles of boosted trees
-
From personal experience, 1) many Kaggle challenges (including those I won) are better solved with ensembles of trees; and 2) last year alone, I consulted with three medium-sized startups, and all projects involved databases below 100 megabytes; none of them required the use of large neural networks or GPUs.
- Install mops package manager: https://docs.mops.one/quick-start
- Install the Motoko Base library in your project using mops:
mops add base
- Install fuzz package:
mops add fuzz
- Clone this repository in the base of your project:
cd <yourproject>
git clone https://github.com/ildefons/motokolearn.git
import Iter "mo:base/Iter";
import mtkl "../motokolearn/src/Mtklearn/Mtklearn";
import data "../motokolearn/src/Mtklearn/Datasets";

// Example: fit a single classification tree on the bundled wine dataset.
// `Iter` comes from the Motoko Base library installed with `mops add base`.
actor {
  let seed = 123456789;
  let nsamples: Nat = 100;
  let alldata = data.wine_data;

  // Pick `nsamples` row positions in [0, alldata.size()-1]; the selected rows
  // become the training set and the remaining rows the test set.
  // NOTE(review): the `false` flag presumably means "without replacement" - confirm.
  let pos_vec = mtkl.randomSample(0, alldata.size()-1, nsamples, false, seed);
  let train = mtkl.rows(pos_vec, alldata);
  let test = mtkl.removeRows(pos_vec, alldata);

  // All columns except the last one are features; the last column is the target.
  let xcols = Iter.toArray(Iter.range(0, mtkl.transpose(train).size()-2));
  let ycol = mtkl.transpose(train).size()-1;

  // Training features and target; the target column is converted to a text vector.
  let xtrain = mtkl.cols(xcols, train);
  let yaux = mtkl.transpose(mtkl.cols([ycol], train))[0];
  let ytrain = mtkl.dataMemberVectorToTextVector(yaux);

  // Same split for the held-out rows (used by the inference examples).
  let xtest = mtkl.cols(xcols, test);
  let yauxtest = mtkl.transpose(mtkl.cols([ycol], test))[0];
  let ytest = mtkl.dataMemberVectorToTextVector(yauxtest);

  // Only the successful conversion is handled here.
  switch(ytrain) {
    case (#ok(yvec)) {
      // Distinct class labels found in the training target.
      let y_uniques = mtkl.uniquesText(yvec);
      // Positions of the feature columns inside `xtrain`.
      let myiter = Iter.range(0, xcols.size()-1);
      let col_ids = Iter.toArray(myiter);
      // NOTE(review): by analogy with the regression example, `3` and `10` are
      // presumably min_num_samples and max_depth - confirm against the library.
      let ret_tree = mtkl.fitClassification(xtrain, yvec, 0, y_uniques, 3, 10, col_ids, seed);
    };
  };
};
import Iter "mo:base/Iter";
import mtkl "../motokolearn/src/Mtklearn/Mtklearn";
import data "../motokolearn/src/Mtklearn/Datasets";

// Example: fit a single regression tree on the bundled diabetes dataset.
// `Iter` comes from the Motoko Base library installed with `mops add base`.
actor {
  let seed = 123456789;
  let max_depth: Nat = 10;
  let min_num_samples: Nat = 5;
  let nsamples: Nat = 300;
  let alldata = data.diabetes_data;

  // Pick `nsamples` row positions in [0, alldata.size()-1]; the selected rows
  // become the training set and the remaining rows the test set.
  let pos_vec = mtkl.randomSample(0, alldata.size()-1, nsamples, false, seed);
  let train = mtkl.rows(pos_vec, alldata);
  let test = mtkl.removeRows(pos_vec, alldata);

  // All columns except the last one are features; the last column is the target.
  let xcols = Iter.toArray(Iter.range(0, mtkl.transpose(train).size()-2));
  let ycol = mtkl.transpose(train).size()-1;

  let xtrain = mtkl.cols(xcols, train);
  let yaux = mtkl.transpose(mtkl.cols([ycol], train))[0];
  // NOTE(review): the random-forest regression example converts the target with
  // dataMemberVectorToFloatVector and passes no `y_uniques`; this example uses a
  // text vector and `y_uniques` - confirm against the fitRegression signature.
  let ytrain = mtkl.dataMemberVectorToTextVector(yaux);

  // Same split for the held-out rows (used by the inference examples).
  let xtest = mtkl.cols(xcols, test);
  let yauxtest = mtkl.transpose(mtkl.cols([ycol], test))[0];
  let ytest = mtkl.dataMemberVectorToTextVector(yauxtest);

  // Only the successful conversion is handled here.
  switch(ytrain) {
    case (#ok(yvec)) {
      let y_uniques = mtkl.uniquesText(yvec);
      // Positions of the feature columns inside `xtrain`.
      let myiter = Iter.range(0, xcols.size()-1);
      let col_ids = Iter.toArray(myiter);
      let ret_tree = mtkl.fitRegression(xtrain, yvec, 0, y_uniques, min_num_samples, max_depth, col_ids, seed);
    };
  };
};
import Iter "mo:base/Iter";
import mtkl "../motokolearn/src/Mtklearn/Mtklearn";
import data "../motokolearn/src/Mtklearn/Datasets";

// Example: train a random-forest classifier on the bundled digit dataset and
// keep the fitted trees in canister state.
actor {
  // Fitted forest; starts as a single empty tree until training succeeds.
  var rf_classifier_vec: [mtkl.BinTree] = [mtkl.nilTree()];

  // Trains the forest and stores it in `rf_classifier_vec`.
  // Declared without a return type, exactly as in the original example.
  public func doRFClassifier() {
    let seed = 123456789;
    let ntrees = 100;
    let max_depth: Nat = 10;
    let min_num_samples: Nat = 5;
    let pct_train: Float = 0.99;
    let nsamples: Nat = 1000;
    let alldata = data.digit_data;

    // Selected rows become the training set, the remaining rows the test set.
    let pos_vec = mtkl.randomSample(0, alldata.size()-1, nsamples, false, seed);
    let train = mtkl.rows(pos_vec, alldata);
    let test = mtkl.removeRows(pos_vec, alldata);

    // All columns except the last one are features; the last column is the target.
    let xcols = Iter.toArray(Iter.range(0, mtkl.transpose(train).size()-2));
    let ycol = mtkl.transpose(train).size()-1;

    let xtrain = mtkl.cols(xcols, train);
    let yaux = mtkl.transpose(mtkl.cols([ycol], train))[0];
    let ytrain = mtkl.dataMemberVectorToTextVector(yaux);

    // Same split for the held-out rows (used by the inference examples).
    let xtest = mtkl.cols(xcols, test);
    let yauxtest = mtkl.transpose(mtkl.cols([ycol], test))[0];
    let ytest = mtkl.dataMemberVectorToTextVector(yauxtest);

    switch(ytrain) {
      case (#ok(yvec)) {
        // Distinct class labels found in the training target.
        let y_uniques = mtkl.uniquesText(yvec);
        // Positions of the feature columns inside `xtrain`.
        let myiter = Iter.range(0, xcols.size()-1);
        let col_ids = Iter.toArray(myiter);
        // The forest is fitted with a seed different from the sampling seed.
        let rfreturn = await mtkl.fitRandomForestClassifier(xtrain,
                                                            yvec,
                                                            y_uniques,
                                                            ntrees,
                                                            0,
                                                            min_num_samples,
                                                            max_depth,
                                                            col_ids,
                                                            pct_train,
                                                            seed+1);
        switch(rfreturn) {
          case (#ok(tree_vec)) {
            rf_classifier_vec := tree_vec;
          };
          case (_) {
            // Training failed: the previously stored forest is left untouched.
          };
        };
      };
      case (_) {
        // Target conversion failed: nothing is trained.
      };
    };
  };
};
import Iter "mo:base/Iter";
import mtkl "../motokolearn/src/Mtklearn/Mtklearn";
import data "../motokolearn/src/Mtklearn/Datasets";

// Example: train a random-forest regressor on the bundled diabetes dataset and
// keep the fitted trees in canister state.
actor {
  // Fitted forest; starts as a single empty tree until training succeeds.
  var rf_regression_vec: [mtkl.BinTree] = [mtkl.nilTree()];

  // Trains the forest and stores it in `rf_regression_vec`.
  public func doRFRegression() : async () {
    let seed = 123456789;
    let ntrees = 100;
    let max_depth: Nat = 10;
    let min_num_samples: Nat = 5;
    let pct_train = 0.9;
    let nsamples: Nat = 300;
    let alldata = data.diabetes_data;

    // Selected rows become the training set, the remaining rows the test set.
    let pos_vec = mtkl.randomSample(0, alldata.size()-1, nsamples, false, seed);
    let train = mtkl.rows(pos_vec, alldata);
    let test = mtkl.removeRows(pos_vec, alldata);

    // All columns except the last one are features; the last column is the target.
    let xcols = Iter.toArray(Iter.range(0, mtkl.transpose(train).size()-2));
    let ycol = mtkl.transpose(train).size()-1;

    // The regression target is converted to a float vector.
    let xtrain = mtkl.cols(xcols, train);
    let yaux = mtkl.transpose(mtkl.cols([ycol], train))[0];
    let ytrain = mtkl.dataMemberVectorToFloatVector(yaux);

    // Same split for the held-out rows (used by the inference examples).
    let xtest = mtkl.cols(xcols, test);
    let yauxtest = mtkl.transpose(mtkl.cols([ycol], test))[0];
    let ytest = mtkl.dataMemberVectorToFloatVector(yauxtest);

    switch(ytrain) {
      case (#ok(yvec)) {
        // Positions of the feature columns inside `xtrain`.
        let myiter = Iter.range(0, xcols.size()-1);
        let col_ids = Iter.toArray(myiter);
        // The forest is fitted with a seed different from the sampling seed.
        let rfreturn = await mtkl.fitRandomForestRegression(xtrain,
                                                            yvec,
                                                            ntrees,
                                                            0,
                                                            min_num_samples,
                                                            max_depth,
                                                            col_ids,
                                                            pct_train,
                                                            seed+1);
        switch(rfreturn) {
          case (#ok(tree_vec)) {
            rf_regression_vec := tree_vec;
          };
          case (_) {
            // Training failed: the previously stored forest is left untouched.
          };
        };
      };
      case (_) {
        // Target conversion failed: nothing is trained.
      };
    };
  };
};
// Run one held-out sample through a fitted classification tree and print
// "correct" when the predicted label equals the expected one.
let i = 1;
let sample: [mtkl.dataMember] = xtest[i];
let vec = mtkl.predictTreeClassification(sample, mytree);
// Locate the position of the largest entry of the tree output.
let best = mtkl.max(vec);
let xindex: Nat = switch (Array.indexOf<Float>(best, vec, Float.equal)) {
  case (?pos) pos;
  // No position found: fall back to index 10, as in the original example.
  case null 10;
};
let predicted = y_uniques[xindex];
if (Text.equal(predicted, yvectest[i])) {
  Debug.print("correct");
}
// Run one held-out sample through a fitted regression tree and compare the
// first entry of the tree output with the expected value.
let i = 1;
let sample: [mtkl.dataMember] = xtest[i];
let prediction = mtkl.predictTreeRegression(sample, mytree);
let y_hat = prediction[0];
let sample_rmse = mtkl.rmse(y_hat, yvectest[i]);
// Run one held-out sample through the fitted random-forest classifier and
// print "correct" when the predicted label equals the expected one.
let i = 1;
let sample: [mtkl.dataMember] = xtest[i];
let vec = mtkl.predictRFClassification(sample,rf_classifier_vec);
// Locate the position of the largest entry of the forest output.
let best = mtkl.max(vec);
let xindex: Nat = switch (Array.indexOf<Float>(best, vec, Float.equal)) {
  case (?pos) pos;
  // No position found: fall back to index 10, as in the original example.
  case null 10;
};
// Textual rendering of the sample (not used further in this snippet).
let text_sample = mtkl.printSample(sample);
let predicted = y_uniques[xindex];
if (Text.equal(predicted, yvectest[i])) {
  Debug.print("correct");
}
// Run one held-out sample through the fitted random-forest regressor and
// compare the first entry of the forest output with the expected value.
let i = 0;
let sample: [mtkl.dataMember] = xtest[i];
let prediction = mtkl.predictRFRegression(sample, rf_regression_vec);
let y_hat = prediction[0];
let sample_rmse = mtkl.rmse(y_hat, yvectest[i]);
// Build a small tree by hand: two leaves and a root that points to them.
// Each node is an optional 5-tuple; the last two components are the child subtrees.
// NOTE(review): the first two components look like (feature index, threshold) and the
// third like the value stored at the node - confirm against the mtkl.BinTree definition.
// Leaves carry no split (null, null) and have empty subtrees as children.
let leftLeaf: mtkl.BinTree = ?(null, null, #symbol([0.05,0.9,0.05]), mtkl.nilTree(), mtkl.nilTree());
let rightLeaf: mtkl.BinTree = ?(null, null, #symbol([0.9,0.1,0.0]), mtkl.nilTree(), mtkl.nilTree());
let treeRoot: mtkl.BinTree = ?(?2, ?0.3, #symbol([0.33,0.33,0.33]), leftLeaf, rightLeaf); // root node: per the author, evaluates a sample on its 2nd feature against the threshold value 0.3
- You need a canister method able to receive data in the right format, as well as a canister state variable to store it:
// Canister state holding the training data: an array of rows, each row an array
// of mtkl.dataMember values. It is initialised with four rows of two #number
// entries followed by one #symbol entry.
var actor_data: [[mtkl.dataMember]] = [[#number(1), #number(3), #symbol("1")],
[#number(2), #number(2), #symbol("2")],
[#number(3), #number(3), #symbol("3")],
[#number(4), #number(2), #symbol("4")]];
// Replaces the stored training data with the rows supplied by the caller.
public func setTrainingData(data: [[mtkl.dataMember]]) : async () {
actor_data := data;
};
- You can now call this method using dfx:
dfx canister call motokolearn_backend setTrainingData '(vec { vec {variant {number=1}; variant {number=11}; variant {symbol="1"};}; vec {variant {number=2}; variant {number=21}; variant {symbol="2"};}; vec {variant {number=3}; variant {number=31}; variant {symbol="3"};};})'
If you have a large dataset in CSV format, you can use the Python notebook to read it and convert it into a Motoko-compatible dataset. Then you can copy/paste the result into your canister code: <yourproject>/motokolearn/notebooks/sklearn_ds_generation.ipynb