diff --git a/.flake8.log b/.flake8.log new file mode 100644 index 0000000000..6ec680a993 --- /dev/null +++ b/.flake8.log @@ -0,0 +1,1983 @@ +src/python/turicreate/toolkits/activity_classifier/_activity_classifier.py:13:1: F401 'numpy as _np' imported but unused +import numpy as _np +^ +src/python/turicreate/toolkits/activity_classifier/_activity_classifier.py:17:1: F401 'turicreate.SArray as _SArray' imported but unused +from turicreate import SArray as _SArray, SFrame as _SFrame +^ +src/python/turicreate/toolkits/activity_classifier/_activity_classifier.py:18:1: F401 'turicreate.aggregate as _agg' imported but unused +from turicreate import aggregate as _agg +^ +src/python/turicreate/toolkits/activity_classifier/_activity_classifier.py:24:1: F401 'turicreate.toolkits.evaluation as _evaluation' imported but unused +from turicreate.toolkits import evaluation as _evaluation +^ +src/python/turicreate/toolkits/activity_classifier/_activity_classifier.py:26:1: F401 'turicreate.toolkits._model.CustomModel as _CustomModel' imported but unused +from turicreate.toolkits._model import CustomModel as _CustomModel +^ +src/python/turicreate/toolkits/activity_classifier/_activity_classifier.py:28:1: F401 'turicreate.toolkits._model.PythonProxy as _PythonProxy' imported but unused +from turicreate.toolkits._model import PythonProxy as _PythonProxy +^ +src/python/turicreate/toolkits/activity_classifier/_activity_classifier.py:30:1: F401 '.util.random_split_by_session as _random_split_by_session' imported but unused +from .util import random_split_by_session as _random_split_by_session +^ +src/python/turicreate/toolkits/activity_classifier/_activity_classifier.py:31:1: F401 '.util._MIN_NUM_SESSIONS_FOR_SPLIT' imported but unused +from .util import _MIN_NUM_SESSIONS_FOR_SPLIT +^ +src/python/turicreate/toolkits/activity_classifier/_activity_classifier.py:34:1: C901 'create' is too complex (11) +def create( +^ +src/python/turicreate/toolkits/activity_classifier/_activity_classifier.py:162:84: F821 undefined name 'x' + raise TypeError("Invalid feature %s: Feature names must be of type str." % x) + ^ +src/python/turicreate/toolkits/activity_classifier/_activity_classifier.py:166:5: F841 local variable 'start_time' is assigned to but never used + start_time = _time.time() + ^ +src/python/turicreate/test/test_external_memory_tree.py:11:1: F401 'array.array' imported but unused +from array import array +^ +scripts/run_cpp_tests.py:38:1: C901 'If 38' is too complex (22) +if __name__ == "__main__": +^ +src/python/turicreate/toolkits/activity_classifier/_tf_model_architecture.py:364:5: C901 'ActivityTensorFlowModel.export_weights' is too complex (12) + def export_weights(self): + ^ +src/python/turicreate/toolkits/recommender/item_content_recommender.py:19:1: C901 'create' is too complex (19) +def create( +^ +src/python/turicreate/util/_config.py:26:5: C901 'TuriConfig.__init__' is too complex (14) + def __init__(self, server_addr=None): + ^ +src/python/turicreate/toolkits/object_detector/_tf_image_augmenter.py:173:1: C901 'padding_augmenter' is too complex (11) +def padding_augmenter( +^ +src/python/turicreate/toolkits/object_detector/_tf_image_augmenter.py:304:1: C901 'crop_augmenter' is too complex (13) +def crop_augmenter( +^ +src/python/turicreate/toolkits/_feature_engineering/_transformer_chain.py:190:29: F821 undefined name 'self' + for name, tr in self._transformers: + ^ +src/python/turicreate/toolkits/_feature_engineering/_transformer_chain.py:191:17: F821 undefined name 'model_fields' + model_fields.append( + ^ +src/python/turicreate/toolkits/_feature_engineering/_transformer_chain.py:192:47: F821 undefined name 'self' + (name, _precomputed_field(self._compact_class_repr(tr))) + ^ +src/python/turicreate/toolkits/_feature_engineering/_transformer_chain.py:195:25: F821 undefined name 'model_fields' + steps, [model_fields], width=8, section_titles=["Steps"] + ^ +src/python/turicreate/test/test_knn_classifier.py:99:9: F841 local variable 'm' is assigned to but never used + m = tc.nearest_neighbor_classifier.create( + ^ +src/python/turicreate/test/test_knn_classifier.py:127:13: F841 local variable 'm' is assigned to but never used + m = tc.nearest_neighbor_classifier.create( + ^ +src/python/turicreate/test/test_knn_classifier.py:131:5: C901 'KnnClassifierCreateTest.test_distances' is too complex (13) + def test_distances(self): + ^ +src/python/turicreate/test/test_knn_classifier.py:409:13: F841 local variable 'ans' is assigned to but never used + ans = str(self.model) + ^ +src/python/turicreate/test/test_knn_classifier.py:506:13: F841 local variable 'ystar' is assigned to but never used + ystar = self.model.predict_topk(tc.SFrame()) + ^ +src/python/turicreate/test/test_knn_classifier.py:594:9: F841 local variable 'ystar' is assigned to but never used + ystar = self.model.classify(self.sf, verbose=False) + ^ +src/python/turicreate/util/lambda_closure_capture.py:148:9: F841 local variable 't' is assigned to but never used + t = self.visit(ast_node) + ^ +src/python/turicreate/util/lambda_closure_capture.py:161:5: C901 'lambda_closure_visitor.visit_Call' is too complex (13) + def visit_Call(self, node): + ^ +src/python/turicreate/util/lambda_closure_capture.py:284:1: C901 'translate' is too complex (11) +def translate(fn): +^ +src/python/turicreate/toolkits/object_detector/object_detector.py:13:1: F401 'itertools as _itertools' imported but unused +import itertools as _itertools +^ +src/python/turicreate/toolkits/object_detector/object_detector.py:14:1: F401 'datetime.datetime as _datetime' imported but unused +from datetime import datetime as _datetime +^ +src/python/turicreate/toolkits/object_detector/object_detector.py:18:1: F401 'numpy as _np' imported but unused +import numpy as _np +^ +src/python/turicreate/toolkits/object_detector/object_detector.py:19:1: F401 'threading.Thread as _Thread' imported but unused +from threading import Thread as _Thread +^ +src/python/turicreate/toolkits/object_detector/object_detector.py:20:1: F401 'six.moves.queue.Queue as _Queue' imported but unused +from six.moves.queue import Queue as _Queue +^ +src/python/turicreate/toolkits/object_detector/object_detector.py:22:1: F401 'turicreate.toolkits._model.CustomModel as _CustomModel' imported but unused +from turicreate.toolkits._model import CustomModel as _CustomModel +^ +src/python/turicreate/toolkits/object_detector/object_detector.py:26:1: F401 'turicreate.toolkits._model.PythonProxy as _PythonProxy' imported but unused +from turicreate.toolkits._model import PythonProxy as _PythonProxy +^ +src/python/turicreate/toolkits/object_detector/object_detector.py:32:1: F401 'turicreate.config as _tc_config' imported but unused +from turicreate import config as _tc_config +^ +src/python/turicreate/toolkits/object_detector/object_detector.py:35:1: F401 '._evaluation.average_precision as _average_precision' imported but unused +from ._evaluation import average_precision as _average_precision +^ +src/python/turicreate/toolkits/object_detector/object_detector.py:36:1: F401 '.._mps_utils.use_mps as _use_mps' imported but unused +from .._mps_utils import ( +^ +src/python/turicreate/toolkits/object_detector/object_detector.py:36:1: F401 '.._mps_utils.mps_device_memory_limit as _mps_device_memory_limit' imported but unused +from .._mps_utils import ( +^ +src/python/turicreate/toolkits/object_detector/object_detector.py:36:1: F401 '.._mps_utils.MpsGraphMode as _MpsGraphMode' imported but unused +from .._mps_utils import ( +^ +src/python/turicreate/toolkits/object_detector/object_detector.py:58:5: F841 local variable 'c_view' is assigned to but never used + c_view = c_in + ^ +src/python/turicreate/toolkits/object_detector/object_detector.py:59:5: F841 local variable 'h_view' is assigned to but never used + h_view = h_in + ^ +src/python/turicreate/toolkits/object_detector/object_detector.py:60:5: F841 local variable 'w_view' is assigned to but never used + w_view = w_in + ^ +src/python/turicreate/toolkits/object_detector/object_detector.py:211:5: F841 local variable 'start_time' is assigned to but never used + start_time = _time.time() + ^ +src/python/turicreate/toolkits/object_detector/object_detector.py:230:5: F841 local variable 'is_annotations_list' is assigned to but never used + is_annotations_list = dataset[annotations].dtype == list + ^ +src/python/turicreate/toolkits/object_detector/object_detector.py:235:5: F841 local variable 'ref_model' is assigned to but never used + ref_model = _pre_trained_models.OBJECT_DETECTION_BASE_MODELS[base_model]() + ^ +src/python/turicreate/toolkits/object_detector/object_detector.py:297:5: F401 'turicreate.toolkits.libtctensorflow' imported but unused + import turicreate.toolkits.libtctensorflow + ^ +src/python/turicreate/test/test_style_transfer.py:15:1: F401 'platform' imported but unused +import platform +^ +src/python/turicreate/test/test_style_transfer.py:16:1: F401 'sys' imported but unused +import sys +^ +src/python/turicreate/test/test_style_transfer.py:20:1: F401 'coremltools' imported but unused +import coremltools +^ +src/python/turicreate/test/test_style_transfer.py:306:9: F811 redefinition of unused 'coremltools' from line 20 + import coremltools + ^ +src/python/turicreate/test/test_style_transfer.py:307:9: F811 redefinition of unused 'platform' from line 15 + import platform + ^ +src/python/turicreate/toolkits/drawing_classifier/drawing_classifier.py:11:1: F401 'numpy as _np' imported but unused +import numpy as _np +^ +src/python/turicreate/toolkits/drawing_classifier/drawing_classifier.py:12:1: F401 'time as _time' imported but unused +import time as _time +^ +src/python/turicreate/toolkits/drawing_classifier/drawing_classifier.py:14:1: F401 'turicreate.toolkits._model.CustomModel as _CustomModel' imported but unused +from turicreate.toolkits._model import CustomModel as _CustomModel +^ +src/python/turicreate/toolkits/drawing_classifier/drawing_classifier.py:16:1: F401 'turicreate.toolkits._model.PythonProxy as _PythonProxy' imported but unused +from turicreate.toolkits._model import PythonProxy as _PythonProxy +^ +src/python/turicreate/toolkits/drawing_classifier/drawing_classifier.py:17:1: F401 'turicreate.toolkits.evaluation as _evaluation' imported but unused +from turicreate.toolkits import evaluation as _evaluation +^ +src/python/turicreate/toolkits/drawing_classifier/drawing_classifier.py:21:1: F401 'turicreate.extensions as _extensions' imported but unused +from turicreate import extensions as _extensions +^ +src/python/turicreate/toolkits/drawing_classifier/drawing_classifier.py:23:1: F401 'six.moves.reduce as _reduce' imported but unused +from six.moves import reduce as _reduce +^ +src/python/turicreate/toolkits/drawing_classifier/drawing_classifier.py:61:1: C901 'create' is too complex (15) +def create( +^ +src/python/turicreate/toolkits/drawing_classifier/drawing_classifier.py:175:24: F821 undefined name 'params' + set_keys = set(params.keys()) + ^ +src/python/turicreate/toolkits/drawing_classifier/drawing_classifier.py:180:9: F821 undefined name 'params' + params.update(kwargs["_advanced_parameters"]) + ^ +src/python/turicreate/toolkits/drawing_classifier/drawing_classifier.py:202:5: F401 'turicreate.toolkits.libtctensorflow' imported but unused + import turicreate.toolkits.libtctensorflow + ^ +src/python/turicreate/test/test_decision_tree.py:11:1: F401 'math' imported but unused +import math +^ +src/python/turicreate/test/test_decision_tree.py:18:1: F401 'numpy as np' imported but unused +import numpy as np +^ +src/python/turicreate/test/test_decision_tree.py:259:9: F841 local variable 'pred_lst' is assigned to but never used + pred_lst = model.predict(list(test)) + ^ +src/python/turicreate/test/test_decision_tree.py:771:9: F811 redefinition of unused 'np' from line 18 + import numpy as np + ^ +src/python/turicreate/__init__.py:17:1: F401 'turicreate.version_info.__version__' imported but unused +from turicreate.version_info import __version__ +^ +src/python/turicreate/__init__.py:19:1: F401 'turicreate.data_structures.sgraph.Vertex' imported but unused +from turicreate.data_structures.sgraph import Vertex, Edge +^ +src/python/turicreate/__init__.py:19:1: F401 'turicreate.data_structures.sgraph.Edge' imported but unused +from turicreate.data_structures.sgraph import Vertex, Edge +^ +src/python/turicreate/__init__.py:20:1: F401 'turicreate.data_structures.sgraph.SGraph' imported but unused +from turicreate.data_structures.sgraph import SGraph +^ +src/python/turicreate/__init__.py:21:1: F401 'turicreate.data_structures.sarray.SArray' imported but unused +from turicreate.data_structures.sarray import SArray +^ +src/python/turicreate/__init__.py:22:1: F401 'turicreate.data_structures.sframe.SFrame' imported but unused +from turicreate.data_structures.sframe import SFrame +^ +src/python/turicreate/__init__.py:23:1: F401 'turicreate.data_structures.sketch.Sketch' imported but unused +from turicreate.data_structures.sketch import Sketch +^ +src/python/turicreate/__init__.py:24:1: F401 'turicreate.data_structures.image.Image' imported but unused +from turicreate.data_structures.image import Image +^ +src/python/turicreate/__init__.py:25:1: F401 '.data_structures.sarray_builder.SArrayBuilder' imported but unused +from .data_structures.sarray_builder import SArrayBuilder +^ +src/python/turicreate/__init__.py:26:1: F401 '.data_structures.sframe_builder.SFrameBuilder' imported but unused +from .data_structures.sframe_builder import SFrameBuilder +^ +src/python/turicreate/__init__.py:28:1: F401 'turicreate.data_structures.sgraph.load_sgraph' imported but unused +from turicreate.data_structures.sgraph import load_sgraph +^ +src/python/turicreate/__init__.py:32:1: F401 'turicreate.toolkits.clustering' imported but unused +import turicreate.toolkits.clustering as clustering +^ +src/python/turicreate/__init__.py:33:1: F401 'turicreate.toolkits.distances' imported but unused +import turicreate.toolkits.distances as distances +^ +src/python/turicreate/__init__.py:35:1: F401 'turicreate.toolkits.text_analytics' imported but unused +import turicreate.toolkits.text_analytics as text_analytics +^ +src/python/turicreate/__init__.py:36:1: F401 'turicreate.toolkits.graph_analytics' imported but unused +import turicreate.toolkits.graph_analytics as graph_analytics +^ +src/python/turicreate/__init__.py:38:1: F401 'turicreate.toolkits.graph_analytics.connected_components' imported but unused +from turicreate.toolkits.graph_analytics import connected_components +^ +src/python/turicreate/__init__.py:39:1: F401 'turicreate.toolkits.graph_analytics.shortest_path' imported but unused +from turicreate.toolkits.graph_analytics import shortest_path +^ +src/python/turicreate/__init__.py:40:1: F401 'turicreate.toolkits.graph_analytics.kcore' imported but unused +from turicreate.toolkits.graph_analytics import kcore +^ +src/python/turicreate/__init__.py:41:1: F401 'turicreate.toolkits.graph_analytics.pagerank' imported but unused +from turicreate.toolkits.graph_analytics import pagerank +^ +src/python/turicreate/__init__.py:42:1: F401 'turicreate.toolkits.graph_analytics.graph_coloring' imported but unused +from turicreate.toolkits.graph_analytics import graph_coloring +^ +src/python/turicreate/__init__.py:43:1: F401 'turicreate.toolkits.graph_analytics.triangle_counting' imported but unused +from turicreate.toolkits.graph_analytics import triangle_counting +^ +src/python/turicreate/__init__.py:44:1: F401 'turicreate.toolkits.graph_analytics.degree_counting' imported but unused +from turicreate.toolkits.graph_analytics import degree_counting +^ +src/python/turicreate/__init__.py:45:1: F401 'turicreate.toolkits.graph_analytics.label_propagation' imported but unused +from turicreate.toolkits.graph_analytics import label_propagation +^ +src/python/turicreate/__init__.py:47:1: F401 'turicreate.toolkits.recommender' imported but unused +import turicreate.toolkits.recommender as recommender +^ +src/python/turicreate/__init__.py:48:1: F401 'turicreate.toolkits.recommender.popularity_recommender' imported but unused +from turicreate.toolkits.recommender import popularity_recommender +^ +src/python/turicreate/__init__.py:49:1: F401 'turicreate.toolkits.recommender.item_similarity_recommender' imported but unused +from turicreate.toolkits.recommender import item_similarity_recommender +^ +src/python/turicreate/__init__.py:50:1: F401 'turicreate.toolkits.recommender.ranking_factorization_recommender' imported but unused +from turicreate.toolkits.recommender import ranking_factorization_recommender +^ +src/python/turicreate/__init__.py:51:1: F401 'turicreate.toolkits.recommender.item_content_recommender' imported but unused +from turicreate.toolkits.recommender import item_content_recommender +^ +src/python/turicreate/__init__.py:52:1: F401 'turicreate.toolkits.recommender.factorization_recommender' imported but unused +from turicreate.toolkits.recommender import factorization_recommender +^ +src/python/turicreate/__init__.py:54:1: F401 'turicreate.toolkits.regression' imported but unused +import turicreate.toolkits.regression as regression +^ +src/python/turicreate/__init__.py:55:1: F401 'turicreate.toolkits.regression.boosted_trees_regression' imported but unused +from turicreate.toolkits.regression import boosted_trees_regression +^ +src/python/turicreate/__init__.py:56:1: F401 'turicreate.toolkits.regression.random_forest_regression' imported but unused +from turicreate.toolkits.regression import random_forest_regression +^ +src/python/turicreate/__init__.py:57:1: F401 'turicreate.toolkits.regression.decision_tree_regression' imported but unused +from turicreate.toolkits.regression import decision_tree_regression +^ +src/python/turicreate/__init__.py:58:1: F401 'turicreate.toolkits.regression.linear_regression' imported but unused +from turicreate.toolkits.regression import linear_regression +^ +src/python/turicreate/__init__.py:60:1: F401 'turicreate.toolkits.classifier' imported but unused +import turicreate.toolkits.classifier as classifier +^ +src/python/turicreate/__init__.py:61:1: F401 'turicreate.toolkits.classifier.svm_classifier' imported but unused +from turicreate.toolkits.classifier import svm_classifier +^ +src/python/turicreate/__init__.py:62:1: F401 'turicreate.toolkits.classifier.logistic_classifier' imported but unused +from turicreate.toolkits.classifier import logistic_classifier +^ +src/python/turicreate/__init__.py:63:1: F401 'turicreate.toolkits.classifier.boosted_trees_classifier' imported but unused +from turicreate.toolkits.classifier import boosted_trees_classifier +^ +src/python/turicreate/__init__.py:64:1: F401 'turicreate.toolkits.classifier.random_forest_classifier' imported but unused +from turicreate.toolkits.classifier import random_forest_classifier +^ +src/python/turicreate/__init__.py:65:1: F401 'turicreate.toolkits.classifier.decision_tree_classifier' imported but unused +from turicreate.toolkits.classifier import decision_tree_classifier +^ +src/python/turicreate/__init__.py:66:1: F401 'turicreate.toolkits.classifier.nearest_neighbor_classifier' imported but unused +from turicreate.toolkits.classifier import nearest_neighbor_classifier +^ +src/python/turicreate/__init__.py:69:1: F401 'turicreate.toolkits.nearest_neighbors' imported but unused +import turicreate.toolkits.nearest_neighbors as nearest_neighbors +^ +src/python/turicreate/__init__.py:70:1: F401 'turicreate.toolkits.clustering.kmeans' imported but unused +from turicreate.toolkits.clustering import kmeans +^ +src/python/turicreate/__init__.py:71:1: F401 'turicreate.toolkits.clustering.dbscan' imported but unused +from turicreate.toolkits.clustering import dbscan +^ +src/python/turicreate/__init__.py:72:1: F401 'turicreate.toolkits.topic_model.topic_model' imported but unused +from turicreate.toolkits.topic_model import topic_model +^ +src/python/turicreate/__init__.py:74:1: F401 'turicreate.toolkits.image_analysis.image_analysis' imported but unused +from turicreate.toolkits.image_analysis import image_analysis +^ +src/python/turicreate/__init__.py:75:1: F401 'turicreate.toolkits.text_classifier' imported but unused +import turicreate.toolkits.text_classifier as text_classifier +^ +src/python/turicreate/__init__.py:76:1: F401 'turicreate.toolkits.image_classifier' imported but unused +import turicreate.toolkits.image_classifier as image_classifier +^ +src/python/turicreate/__init__.py:77:1: F401 'turicreate.toolkits.image_similarity' imported but unused +import turicreate.toolkits.image_similarity as image_similarity +^ +src/python/turicreate/__init__.py:78:1: F401 'turicreate.toolkits.object_detector' imported but unused +import turicreate.toolkits.object_detector as object_detector +^ +src/python/turicreate/__init__.py:79:1: F401 'turicreate.toolkits.one_shot_object_detector' imported but unused +import turicreate.toolkits.one_shot_object_detector as one_shot_object_detector +^ +src/python/turicreate/__init__.py:80:1: F401 'turicreate.toolkits.style_transfer' imported but unused +import turicreate.toolkits.style_transfer as style_transfer +^ +src/python/turicreate/__init__.py:81:1: F401 'turicreate.toolkits.sound_classifier.sound_classifier' imported but unused +import turicreate.toolkits.sound_classifier.sound_classifier as sound_classifier +^ +src/python/turicreate/__init__.py:82:1: F401 'turicreate.toolkits.activity_classifier' imported but unused +import turicreate.toolkits.activity_classifier as activity_classifier +^ +src/python/turicreate/__init__.py:83:1: F401 'turicreate.toolkits.drawing_classifier' imported but unused +import turicreate.toolkits.drawing_classifier as drawing_classifier +^ +src/python/turicreate/__init__.py:85:1: F401 'turicreate.toolkits.image_analysis.image_analysis.load_images' imported but unused +from turicreate.toolkits.image_analysis.image_analysis import load_images +^ +src/python/turicreate/__init__.py:86:1: F401 'turicreate.toolkits.audio_analysis.audio_analysis.load_audio' imported but unused +from turicreate.toolkits.audio_analysis.audio_analysis import load_audio +^ +src/python/turicreate/__init__.py:88:1: F401 'turicreate.toolkits.evaluation' imported but unused +from turicreate.toolkits import evaluation +^ +src/python/turicreate/__init__.py:94:1: F401 'turicreate.data_structures.sframe.load_sframe' imported but unused +from turicreate.data_structures.sframe import load_sframe +^ +src/python/turicreate/__init__.py:95:1: F401 'turicreate.data_structures.sarray.load_sarray' imported but unused +from turicreate.data_structures.sarray import load_sarray +^ +src/python/turicreate/__init__.py:96:1: F401 'turicreate.toolkits._model.load_model' imported but unused +from turicreate.toolkits._model import load_model +^ +src/python/turicreate/__init__.py:124:1: F401 '.visualization.plot' imported but unused +from .visualization import plot, show +^ +src/python/turicreate/__init__.py:124:1: F401 '.visualization.show' imported but unused +from .visualization import plot, show +^ +src/python/turicreate/test/test_json.py:124:5: C901 'JSONTest._assertEqual' is too complex (11) + def _assertEqual(self, x, y): + ^ +src/python/turicreate/meta/bytecodetools/instruction.py:62:5: C901 'Instruction.__str__' is too complex (11) + def __str__(self): + ^ +src/python/turicreate/toolkits/text_classifier/__init__.py:13:1: F401 '._text_classifier.create' imported but unused +from ._text_classifier import create +^ +src/python/turicreate/toolkits/text_classifier/__init__.py:14:1: F401 '._text_classifier.TextClassifier' imported but unused +from ._text_classifier import TextClassifier +^ +src/python/turicreate/test/test_image_classifier.py:10:1: F401 'os' imported but unused +import os +^ +src/python/turicreate/test/test_image_classifier.py:42:9: F402 import '_' from line 8 shadowed by loop variable + for _ in range(5): + ^ +src/python/turicreate/test/test_image_classifier.py:207:13: F841 local variable 'predictions' is assigned to but never used + predictions = model.classify("more junk") + ^ +src/python/turicreate/test/test_sarray.py:29:1: F401 'warnings' imported but unused +import warnings +^ +src/python/turicreate/test/test_sarray.py:32:1: F401 'sys' imported but unused +import sys +^ +src/python/turicreate/test/test_sarray.py:788:5: C901 'SArrayTest.test_clip' is too complex (11) + def test_clip(self): + ^ +src/python/turicreate/test/test_sarray.py:1012:54: F821 undefined name 'long' + self.assertTrue(type(t) == int or type(t) == long) + ^ +src/python/turicreate/test/test_sarray.py:1383:5: C901 'SArrayTest.test_floodiv_corner' is too complex (14) + def test_floodiv_corner(self): + ^ +src/python/turicreate/test/test_sarray.py:1701:13: F841 local variable 'sa3' is assigned to but never used + sa3 = sa1.append(sa2) + ^ +src/python/turicreate/test/test_sarray.py:2037:9: F841 local variable 'g' is assigned to but never used + g = SArray(["123", u"\u2019"]) + ^ +src/python/turicreate/test/test_sarray.py:3003:13: F841 local variable 'sa' is assigned to but never used + sa = SArray([[1], [1, 1], [1], [1]]).cumulative_sum() + ^ +src/python/turicreate/test/test_sarray.py:3043:13: F841 local variable 'sa' is assigned to but never used + sa = SArray([[1], [1, 1], [1], [1]]).cumulative_mean() + ^ +src/python/turicreate/test/test_sarray.py:3087:13: F841 local variable 'sa' is assigned to but never used + sa = SArray([[1], [1], [1], [1]]).cumulative_min() + ^ +src/python/turicreate/test/test_sarray.py:3117:13: F841 local variable 'sa' is assigned to but never used + sa = SArray([[1], [1], [1], [1]]).cumulative_max() + ^ +src/python/turicreate/test/test_sarray.py:3149:13: F841 local variable 'sa' is assigned to but never used + sa = SArray([[1], [1], [1], [1]]).cumulative_std() + ^ +src/python/turicreate/test/test_sarray.py:3222:13: F841 local variable 'sa' is assigned to but never used + sa = SArray([[1], [1], [1], [1]]).cumulative_var() + ^ +src/python/turicreate/test/test_fast_path_prediction.py:12:1: F401 'turicreate.toolkits.evaluation' imported but unused +from turicreate.toolkits import evaluation +^ +src/python/turicreate/test/test_fast_path_prediction.py:238:9: F841 local variable 'sf' is assigned to but never used + sf = self.sf + ^ +src/python/turicreate/toolkits/image_similarity/image_similarity.py:24:1: C901 'create' is too complex (11) +def create( +^ +src/python/turicreate/toolkits/image_similarity/image_similarity.py:240:19: F821 undefined name 'ToolkitError' + raise ToolkitError( + ^ +src/python/turicreate/data_structures/sframe.py:22:1: F401 '..util._pytype_to_printf' imported but unused +from ..util import _get_module_from_object, _pytype_to_printf +^ +src/python/turicreate/data_structures/sframe.py:92:1: C901 '_get_global_dbapi_info' is too complex (13) +def _get_global_dbapi_info(dbapi_module, conn): +^ +src/python/turicreate/data_structures/sframe.py:128:9: F841 local variable 'e' is assigned to but never used + except AttributeError as e: + ^ +src/python/turicreate/data_structures/sframe.py:154:5: F841 local variable 'e' is assigned to but never used + except TypeError as e: + ^ +src/python/turicreate/data_structures/sframe.py:732:5: C901 'SFrame.__init__' is too complex (31) + def __init__(self, data=None, format="auto", _proxy=None): + ^ +src/python/turicreate/data_structures/sframe.py:743:45: F821 undefined name 'unicode' + if six.PY2 and isinstance(data, unicode): + ^ +src/python/turicreate/data_structures/sframe.py:749:69: F821 undefined name 'unicode' + sys.version_info.major < 3 and isinstance(data, unicode) + ^ +src/python/turicreate/data_structures/sframe.py:841:5: C901 'SFrame._infer_column_types_from_lines' is too complex (13) + @staticmethod + ^ +src/python/turicreate/data_structures/sframe.py:891:5: C901 'SFrame._read_csv_impl' is too complex (33) + @classmethod + ^ +src/python/turicreate/data_structures/sframe.py:1696:5: C901 'SFrame.from_sql' is too complex (29) + @classmethod + ^ +src/python/turicreate/data_structures/sframe.py:1990:13: F811 redefinition of unused '_pytype_to_printf' from line 22 + _pytype_to_printf = lambda x: "s" + ^ +src/python/turicreate/data_structures/sframe.py:2088:5: C901 'SFrame.__get_pretty_tables__' is too complex (27) + def __get_pretty_tables__( + ^ +src/python/turicreate/data_structures/sframe.py:2156:25: F821 undefined name 'unicode' + u = unicode(s, "utf-8", errors="replace") + ^ +src/python/turicreate/data_structures/sframe.py:2170:28: F821 undefined name 'unicode' + return unicode(s, "utf-8", errors="replace") + ^ +src/python/turicreate/data_structures/sframe.py:2190:28: F821 undefined name 'unicode' + return unicode(ret, "utf-8", errors="replace") + ^ +src/python/turicreate/data_structures/sframe.py:3772:5: C901 'SFrame.__getitem__' is too complex (20) + def __getitem__(self, key): + ^ +src/python/turicreate/data_structures/sframe.py:3796:41: F821 undefined name 'unicode' + if six.PY2 and type(key) == unicode: + ^ +src/python/turicreate/data_structures/sframe.py:3863:5: C901 'SFrame.__setitem__' is too complex (11) + def __setitem__(self, key, value): + ^ +src/python/turicreate/data_structures/sframe.py:4016:5: C901 'SFrame.groupby' is too complex (36) + def groupby(self, key_column_names, operations, *args): + ^ +src/python/turicreate/data_structures/sframe.py:4443:5: C901 'SFrame.join' is too complex (17) + def join(self, right, on=None, how="inner", alter_name=None): + ^ +src/python/turicreate/data_structures/sframe.py:4887:5: C901 'SFrame.pack_columns' is too complex (21) + def pack_columns( + ^ +src/python/turicreate/data_structures/sframe.py:5248:5: C901 'SFrame.unpack' is too complex (11) + def unpack( + ^ +src/python/turicreate/data_structures/sframe.py:5437:5: C901 'SFrame.stack' is too complex (21) + def stack( + ^ +src/python/turicreate/data_structures/sframe.py:5798:5: C901 'SFrame.sort' is too complex (12) + def sort(self, key_column_names, ascending=True): + ^ +src/python/turicreate/meta/bytecodetools/__init__.py:15:1: F401 '.instruction.Instruction' imported but unused +from .instruction import Instruction +^ +src/python/turicreate/meta/bytecodetools/__init__.py:17:1: F401 '.disassembler_.disassembler' imported but unused +from .disassembler_ import disassembler +^ +src/python/turicreate/meta/bytecodetools/__init__.py:19:1: F401 '.bytecode_consumer.ByteCodeConsumer' imported but unused +from .bytecode_consumer import ByteCodeConsumer, StackedByteCodeConsumer +^ +src/python/turicreate/meta/bytecodetools/__init__.py:19:1: F401 '.bytecode_consumer.StackedByteCodeConsumer' imported but unused +from .bytecode_consumer import ByteCodeConsumer, StackedByteCodeConsumer +^ +src/python/turicreate/toolkits/object_detector/_evaluation.py:24:21: F402 import '_' from line 8 shadowed by loop variable + for index, (_, row) in enumerate(pred_sorted.iterrows()): + ^ +src/python/turicreate/test/test_io.py:172:12: F632 use ==/!= to compare str, bytes, and int literals + if status is 0: + ^ +src/python/turicreate/test/test_io.py:236:23: F632 use ==/!= to compare str, bytes, and int literals + self.has_s3 = status is 0 + ^ +src/python/turicreate/test/test_io.py:265:12: F632 use ==/!= to compare str, bytes, and int literals + if status is not 0: + ^ +src/python/turicreate/_sys_util.py:127:1: C901 'test_pylambda_worker' is too complex (11) +def test_pylambda_worker(): +^ +src/python/turicreate/meta/asttools/tests/__init__.py:13:1: F401 '...asttools.visitors.graph_visitor.GraphGen' imported but unused +from ...asttools.visitors.graph_visitor import GraphGen +^ +src/python/turicreate/meta/asttools/tests/__init__.py:65:5: F401 'networkx' imported but unused + import networkx + ^ +src/python/turicreate/test/test_supervised_learning_missing_value_actions.py:96:13: F841 local variable 'model' is assigned to but never used + model = create_fun(train_sf_with_na, self.target, validation_set=None) + ^ +src/python/turicreate/test/test_supervised_learning_missing_value_actions.py:112:13: F841 local variable 'model' is assigned to but never used + model = create_fun(train_sf_with_na, self.target, validation_set=None) + ^ +src/python/turicreate/test/test_supervised_learning_missing_value_actions.py:133:13: F841 local variable 'pred_missing' is assigned to but never used + pred_missing = model.predict(test_sf_with_na) + ^ +src/python/turicreate/test/test_supervised_learning_missing_value_actions.py:146:13: F841 local variable 'pred' is assigned to but never used + pred = model.predict(test_sf) + ^ +src/python/turicreate/test/test_supervised_learning_missing_value_actions.py:170:13: F841 local variable 'pred_missing' is assigned to but never used + pred_missing = model.extract_features(test_sf_with_na) + ^ +src/python/turicreate/test/test_supervised_learning_missing_value_actions.py:185:13: F841 local variable 'pred' is assigned to but never used + pred = model.extract_features(test_sf) + ^ +src/python/turicreate/test/test_supervised_learning_missing_value_actions.py:200:9: F841 local variable 'eval' is assigned to but never used + eval = model.evaluate(test_sf, missing_value_action="error") + ^ +src/python/turicreate/test/test_supervised_learning_missing_value_actions.py:206:13: F841 local variable 'eval_missing' is assigned to but never used + eval_missing = model.evaluate(test_sf_with_na) + ^ +src/python/turicreate/toolkits/topic_model/topic_model.py:34:15: F821 undefined name 'xrange' + _xrange = xrange + ^ +src/python/turicreate/test/test_python_decision_tree.py:701:13: F841 local variable 'score' is assigned to but never used + score = tree.to_json(-1) + ^ +src/python/turicreate/test/test_python_decision_tree.py:709:9: F841 local variable 'out_2' is assigned to but never used + out_2 = tree.get_prediction_score(5) + ^ +src/python/turicreate/test/test_python_decision_tree.py:717:13: F841 local variable 'score' is assigned to but never used + score = tree.get_prediction_score(-1) + ^ +src/python/turicreate/test/test_python_decision_tree.py:733:13: F841 local variable 'score' is assigned to but never used + score = tree.get_prediction_path(-1) + ^ +src/python/turicreate/toolkits/regression/decision_tree_regression.py:22:1: F401 'turicreate.toolkits._internal_utils._map_unity_proxy_to_object' imported but unused +from turicreate.toolkits._internal_utils import _map_unity_proxy_to_object +^ +src/python/turicreate/toolkits/_internal_utils.py:722:83: F821 undefined name 'x' + raise TypeError("Invalid feature %s: Feature names must be of type str" % x) + ^ +src/python/turicreate/toolkits/audio_analysis/audio_analysis.py:21:1: C901 'load_audio' is too complex (12) +def load_audio( +^ +src/python/turicreate/toolkits/audio_analysis/audio_analysis.py:70:24: F402 import '_' from line 10 shadowed by loop variable + for (dir_path, _, file_names) in _os.walk(path): + ^ +src/python/turicreate/meta/decompiler/tests/test_decompiler.py:15:1: F401 '...testing.py2' imported but unused +from ...testing import py2, py2only +^ +src/python/turicreate/meta/decompiler/tests/test_decompiler.py:15:1: F401 '...testing.py2only' imported but unused +from ...testing import py2, py2only +^ +src/python/turicreate/meta/decompiler/tests/test_decompiler.py:375:9: F841 local variable 'stmnt' is assigned to but never used + stmnt = """ + ^ +src/python/turicreate/toolkits/nearest_neighbors/_nearest_neighbors.py:80:1: C901 'create' is too complex (28) +def create( +^ +src/python/turicreate/toolkits/nearest_neighbors/_nearest_neighbors.py:808:5: C901 'NearestNeighborsModel.query' is too complex (13) + def query(self, dataset, label=None, k=5, radius=None, verbose=True): + ^ +src/python/turicreate/toolkits/classifier/boosted_trees_classifier.py:12:1: F401 'turicreate as _turicreate' imported but unused +import turicreate as _turicreate +^ +src/python/turicreate/toolkits/classifier/boosted_trees_classifier.py:15:1: F401 'turicreate.toolkits._main as _toolkits_main' imported but unused +import turicreate.toolkits._main as _toolkits_main +^ +src/python/turicreate/toolkits/classifier/boosted_trees_classifier.py:20:1: F401 'turicreate.toolkits._internal_utils._raise_error_if_column_exists' imported but unused +from turicreate.toolkits._internal_utils import _raise_error_if_column_exists +^ +src/python/turicreate/test/test_tree_tracking_metrics.py:54:16: F632 use ==/!= to compare str, bytes, and int literals + if metric is "auto": + ^ +scripts/fix_headers.py:6:1: F401 'os.path.split' imported but unused +from os.path import join, split, normpath, exists +^ +scripts/fix_headers.py:6:1: F401 'os.path.normpath' imported but unused +from os.path import join, split, normpath, exists +^ +scripts/fix_headers.py:96:5: F841 local variable 'proc' is assigned to but never used + proc = subprocess.check_call(cmd, shell=True) + ^ +scripts/fix_headers.py:96:34: F821 undefined name 'cmd' + proc = subprocess.check_call(cmd, shell=True) + ^ +scripts/fix_headers.py:190:5: F841 local variable 'rhl' is assigned to but never used + rhl = [h for h in raw_header_list if match_regex.match(h)] + ^ +src/python/turicreate/test/test_logistic_classifier.py:15:1: F403 'from sklearn.metrics import *' used; unable to detect undefined names +from sklearn.metrics import * +^ +src/python/turicreate/test/test_logistic_classifier.py:78:24: F841 local variable 'table' is assigned to but never used + cls.sm_cf_matrix = table = np.histogram2d(target, cls.yhat_class, bins=2)[0] + ^ +src/python/turicreate/test/test_logistic_classifier.py:123:21: F405 'accuracy_score' may be undefined, or defined from star imports: sklearn.metrics + "accuracy": accuracy_score(target, list(cls.yhat_class)), + ^ +src/python/turicreate/test/test_logistic_classifier.py:124:16: F405 'roc_auc_score' may be undefined, or defined from star imports: sklearn.metrics + "auc": roc_auc_score(target, list(cls.yhat_prob)), + ^ +src/python/turicreate/test/test_logistic_classifier.py:126:21: F405 'f1_score' may be undefined, or defined from star imports: sklearn.metrics + "f1_score": f1_score(target, list(cls.yhat_class)), + ^ +src/python/turicreate/test/test_logistic_classifier.py:127:21: F405 'log_loss' may be undefined, or defined from star imports: sklearn.metrics + "log_loss": log_loss(target, list(cls.yhat_prob)), + ^ +src/python/turicreate/test/test_logistic_classifier.py:128:22: F405 'precision_score' may be undefined, or defined from star imports: sklearn.metrics + "precision": precision_score(target, list(cls.yhat_class)), + ^ +src/python/turicreate/test/test_logistic_classifier.py:129:19: F405 'recall_score' may be undefined, or defined from star imports: sklearn.metrics + "recall": recall_score(target, list(cls.yhat_class)), + ^ +src/python/turicreate/test/test_logistic_classifier.py:218:21: F405 'accuracy_score' may be undefined, or defined from star imports: sklearn.metrics + "accuracy": accuracy_score(target, list(cls.yhat_class)), + ^ +src/python/turicreate/test/test_logistic_classifier.py:223:21: F405 'f1_score' may be undefined, or defined from star imports: sklearn.metrics + "f1_score": f1_score(target, list(cls.yhat_class), average="macro"), + ^ +src/python/turicreate/test/test_logistic_classifier.py:224:21: F405 'log_loss' may be undefined, or defined from star imports: sklearn.metrics + "log_loss": log_loss(target, list(raw_predictions)), + ^ +src/python/turicreate/test/test_logistic_classifier.py:225:22: F405 'precision_score' may be undefined, or defined from star imports: sklearn.metrics + "precision": precision_score(target, list(cls.yhat_class), average="macro"), + ^ +src/python/turicreate/test/test_logistic_classifier.py:226:19: F405 'recall_score' may be undefined, or defined from star imports: sklearn.metrics + "recall": recall_score(target, list(cls.yhat_class), average="macro"), + ^ +src/python/turicreate/test/test_logistic_classifier.py:726:13: F841 local variable 'model' is assigned to but never used + model = tc.logistic_classifier.create( + ^ +src/python/turicreate/test/test_logistic_classifier.py:961:9: F841 local variable 'pred' is assigned to but never used + pred = model.predict(sf) + ^ +src/python/turicreate/test/test_logistic_classifier.py:969:9: F841 local variable 'eval1' is assigned to but never used + eval1 = model.evaluate(sf) + ^ +src/python/turicreate/test/test_logistic_classifier.py:971:9: F841 local variable 'eval2' is assigned to but never used + eval2 = model.evaluate(sf) + ^ +src/python/turicreate/test/test_logistic_classifier.py:986:9: F841 local variable 'model' is assigned to but never used + model = tc.logistic_classifier.create(sf, self.target) + ^ +src/python/turicreate/test/test_logistic_classifier.py:1006:13: F841 local variable 'model' is assigned to but never used + model = tc.logistic_classifier.create(sf, self.target) + ^ +src/python/turicreate/test/test_logistic_classifier.py:1493:9: F841 local variable 'test_case' is assigned to but never used + test_case = "solver = {}, opts = {}".format(solver, opts) + ^ +src/python/turicreate/test/test_logistic_classifier.py:1515:9: F841 local variable 'test_case' is assigned to but never used + test_case = "solver = {}, opts = {}".format(solver, opts) + ^ +src/python/turicreate/test/test_logistic_classifier.py:1586:13: F841 local variable 'm' is assigned to but never used + m = tc.logistic_classifier.create(sf, "y") + ^ +src/python/turicreate/test/test_boosted_trees_checkpoint.py:103:9: F841 local variable 'm' is assigned to but never used + m = self.model.create( + ^ +src/python/turicreate/test/test_boosted_trees_checkpoint.py:116:13: F841 local variable 'm_resume' is assigned to but never used + m_resume = self.model.create( + ^ +src/python/turicreate/toolkits/_feature_engineering/_internal_utils.py:12:1: F401 'os as _os' imported but unused +import os as _os +^ +src/python/turicreate/toolkits/_feature_engineering/_internal_utils.py:13:1: F401 'six.moves.urllib.request.urlretrieve' imported but unused +from six.moves.urllib.request import urlretrieve +^ +src/python/turicreate/toolkits/_feature_engineering/_internal_utils.py:14:1: F401 'zipfile' imported but unused +import zipfile +^ +src/python/turicreate/meta/asttools/tests/test_remove_trivial.py:19:1: F401 '...asttools.visitors.graph_visitor.GraphGen' imported but unused +from ...asttools.visitors.graph_visitor import GraphGen +^ +src/python/turicreate/test/test_recommender.py:35:1: F401 'turicreate.toolkits.recommender.popularity_recommender.PopularityRecommender' imported but unused +from turicreate.toolkits.recommender.popularity_recommender import PopularityRecommender +^ +src/python/turicreate/test/test_recommender.py:41:1: F401 'subprocess.Popen as _Popen' imported but unused +from subprocess import Popen as _Popen +^ +src/python/turicreate/test/test_recommender.py:42:1: F401 'subprocess.PIPE as _PIPE' imported but unused +from subprocess import PIPE as _PIPE +^ +src/python/turicreate/test/test_recommender.py:123:5: C901 'RecommenderTestBase._get_trained_model' is too complex (21) + def _get_trained_model( + ^ +src/python/turicreate/test/test_recommender.py:814:9: F841 local variable 'item' is assigned to but never used + item = data["item"][0] + ^ +src/python/turicreate/test/test_recommender.py:825:13: F841 local variable 'm' is assigned to but never used + m = mod.create(data, "user", "item", "target") + ^ +src/python/turicreate/test/test_recommender.py:1412:9: F841 local variable 'r5' is assigned to but never used + r5 = m1.recommend(users=user_query_2, exclude_known=False) + ^ +src/python/turicreate/test/test_recommender.py:2486:13: F841 local variable 'm' is assigned to but never used + m = tc.recommender.create( + ^ +src/python/turicreate/test/test_recommender.py:2784:5: C901 'FactorizationTests.test_retrieve_factors' is too complex (12) + def test_retrieve_factors(self): + ^ +src/python/turicreate/toolkits/style_transfer/style_transfer.py:9:1: F401 'time as _time' imported but unused +import time as _time +^ +src/python/turicreate/toolkits/style_transfer/style_transfer.py:10:1: F401 'datetime.datetime as _datetime' imported but unused +from datetime import datetime as _datetime +^ +src/python/turicreate/toolkits/style_transfer/style_transfer.py:14:1: F401 'turicreate.toolkits._internal_utils._mac_ver' imported but unused +from turicreate.toolkits._internal_utils import _raise_error_if_not_sframe, _mac_ver +^ +src/python/turicreate/toolkits/style_transfer/style_transfer.py:15:1: F401 '._utils._seconds_as_string' imported but unused +from ._utils import _seconds_as_string +^ +src/python/turicreate/toolkits/style_transfer/style_transfer.py:17:1: F401 'turicreate.toolkits._model.CustomModel as _CustomModel' imported but unused +from turicreate.toolkits._model import CustomModel as _CustomModel +^ +src/python/turicreate/toolkits/style_transfer/style_transfer.py:20:1: F401 'turicreate.toolkits._model.PythonProxy as _PythonProxy' imported but unused +from turicreate.toolkits._model import PythonProxy as _PythonProxy +^ +src/python/turicreate/toolkits/style_transfer/style_transfer.py:22:1: F401 'numpy as _np' imported but unused +import numpy as _np +^ +src/python/turicreate/toolkits/style_transfer/style_transfer.py:23:1: F401 'math as _math' imported but unused +import math as _math +^ +src/python/turicreate/toolkits/style_transfer/style_transfer.py:24:1: F401 'six as _six' imported but unused +import six as _six +^ +src/python/turicreate/toolkits/style_transfer/style_transfer.py:25:1: F401 '.._mps_utils.use_mps as _use_mps' imported but unused +from .._mps_utils import ( +^ +src/python/turicreate/toolkits/style_transfer/style_transfer.py:25:1: F401 '.._mps_utils.mps_device_memory_limit as _mps_device_memory_limit' imported but unused +from .._mps_utils import ( +^ +src/python/turicreate/toolkits/style_transfer/style_transfer.py:25:1: F401 '.._mps_utils.MpsGraphAPI as _MpsGraphAPI' imported but unused +from .._mps_utils import ( +^ +src/python/turicreate/toolkits/style_transfer/style_transfer.py:25:1: F401 '.._mps_utils.MpsGraphNetworkType as _MpsGraphNetworkType' imported but unused +from .._mps_utils import ( +^ +src/python/turicreate/toolkits/style_transfer/style_transfer.py:25:1: F401 '.._mps_utils.MpsGraphMode as _MpsGraphMode' imported but unused +from .._mps_utils import ( +^ +src/python/turicreate/toolkits/style_transfer/style_transfer.py:45:5: F841 local variable 'c_view' is assigned to but never used + c_view = c_in + ^ +src/python/turicreate/toolkits/style_transfer/style_transfer.py:46:5: F841 local variable 'h_view' is assigned to but never used + h_view = h_in + ^ +src/python/turicreate/toolkits/style_transfer/style_transfer.py:47:5: F841 local variable 'w_view' is assigned to but never used + w_view = w_in + ^ +src/python/turicreate/toolkits/style_transfer/style_transfer.py:63:1: C901 'create' is too complex (15) +def create( +^ +src/python/turicreate/toolkits/style_transfer/style_transfer.py:344:5: C901 'StyleTransfer.stylize' is too complex (14) + def stylize(self, images, style=None, verbose=True, max_size=800, batch_size=4): + ^ +src/python/turicreate/test/test_kmeans.py:130:13: F841 local variable 'ans' is assigned to but never used + ans = str(self.model) + ^ +src/python/turicreate/test/test_kmeans.py:227:9: F841 local variable 'm' is assigned to but never used + m = tc.kmeans.create( + ^ +src/python/turicreate/test/test_kmeans.py:258:13: F841 local variable 'm' is assigned to but never used + m = tc.kmeans.create( + ^ +src/python/turicreate/test/test_kmeans.py:293:17: F841 local variable 'm' is assigned to but never used + m = tc.kmeans.create( + ^ +src/python/turicreate/data_structures/sarray.py:395:5: C901 'SArray.__init__' is too complex (23) + def __init__(self, data=[], dtype=None, ignore_cast_failure=False, _proxy=None): + ^ +src/python/turicreate/data_structures/sarray.py:467:69: F821 undefined name 'unicode' + sys.version_info.major < 3 and isinstance(data, unicode) + ^ +src/python/turicreate/data_structures/sarray.py:486:66: F821 undefined name 'unicode' + sys.version_info.major <= 2 and isinstance(data, unicode) + ^ +src/python/turicreate/data_structures/sarray.py:867:26: F821 undefined name 'unicode' + headln = unicode( + ^ +src/python/turicreate/data_structures/sarray.py:1363:5: C901 'SArray.__getitem__' is too complex (16) + def __getitem__(self, other): + ^ +src/python/turicreate/data_structures/sarray.py:3140:5: C901 'SArray.split_datetime' is too complex (12) + def split_datetime(self, column_name_prefix="X", limit=None, timezone=False): + ^ +src/python/turicreate/data_structures/sarray.py:3241:52: F821 undefined name 'unicode' + if six.PY2 and type(column_name_prefix) == unicode: + ^ +src/python/turicreate/data_structures/sarray.py:3357:5: C901 'SArray.unpack' is too complex (22) + def unpack( + ^ +src/python/turicreate/test/test_one_shot_object_detector.py:15:1: F401 'platform' imported but unused +import platform +^ +src/python/turicreate/test/test_one_shot_object_detector.py:16:1: F811 redefinition of unused 'pytest' from line 10 +import pytest +^ +src/python/turicreate/test/test_one_shot_object_detector.py:17:1: F401 'sys' imported but unused +import sys +^ +src/python/turicreate/test/test_one_shot_object_detector.py:18:1: F401 'os' imported but unused +import os +^ +src/python/turicreate/test/test_one_shot_object_detector.py:20:1: F401 'turicreate.toolkits._internal_utils._read_env_var_cpp' imported but unused +from turicreate.toolkits._internal_utils import ( +^ +src/python/turicreate/test/test_one_shot_object_detector.py:25:1: F401 'coremltools' imported but unused +import coremltools +^ +src/python/turicreate/test/test_one_shot_object_detector.py:56:5: F841 local variable 'max_num_boxes_per_image' is assigned to but never used + max_num_boxes_per_image = 10 + ^ +src/python/turicreate/test/test_one_shot_object_detector.py:57:5: F841 local variable 'classes' is assigned to but never used + classes = _CLASSES + ^ +src/python/turicreate/test/test_one_shot_object_detector.py:60:9: F402 import '_' from line 8 shadowed by loop variable + for _ in range(num_examples): + ^ +src/python/turicreate/test/test_one_shot_object_detector.py:129:9: F841 local variable 'data' is assigned to but never used + data = tc.one_shot_object_detector.util.preview_synthetic_training_data( + ^ +src/python/turicreate/test/test_one_shot_object_detector.py:135:9: F841 local variable 'model' is assigned to but never used + model = tc.one_shot_object_detector.create( + ^ +src/python/turicreate/test/test_one_shot_object_detector.py:218:9: F811 redefinition of unused 'coremltools' from line 25 + import coremltools + ^ +src/python/turicreate/test/test_one_shot_object_detector.py:219:9: F811 redefinition of unused 'platform' from line 15 + import platform + ^ +src/python/turicreate/visualization/_plot.py:203:5: C901 'Plot.save' is too complex (11) + def save(self, filepath): + ^ +src/python/turicreate/test/test_svm_classifier.py:11:1: F401 'sys' imported but unused +import sys +^ +src/python/turicreate/test/test_svm_classifier.py:12:1: F401 'operator as op' imported but unused +import operator as op +^ +src/python/turicreate/test/test_svm_classifier.py:19:1: F403 'from sklearn.metrics import *' used; unable to detect undefined names +from sklearn.metrics import * +^ +src/python/turicreate/test/test_svm_classifier.py:22:1: F401 'os as _os' imported but unused +import os as _os +^ +src/python/turicreate/test/test_svm_classifier.py:80:9: F841 local variable 'target_name' is assigned to but never used + target_name = self.target + ^ +src/python/turicreate/test/test_svm_classifier.py:81:9: F841 local variable 'feature_names' is assigned to but never used + feature_names = self.features + ^ +src/python/turicreate/test/test_svm_classifier.py:89:9: F841 local variable 'classes' is assigned to but never used + classes = predictions + ^ +src/python/turicreate/test/test_svm_classifier.py:95:25: F405 'accuracy_score' may be undefined, or defined from star imports: sklearn.metrics + "accuracy": accuracy_score(target, list(self.yhat_class)), + ^ +src/python/turicreate/test/test_svm_classifier.py:99:25: F405 'f1_score' may be undefined, or defined from star imports: sklearn.metrics + "f1_score": f1_score(target, list(self.yhat_class)), + ^ +src/python/turicreate/test/test_svm_classifier.py:100:26: F405 'precision_score' may be undefined, or defined from star imports: sklearn.metrics + "precision": precision_score(target, list(self.yhat_class)), + ^ +src/python/turicreate/test/test_svm_classifier.py:101:23: F405 'recall_score' may be undefined, or defined from star imports: sklearn.metrics + "recall": recall_score(target, list(self.yhat_class)), + ^ +src/python/turicreate/test/test_svm_classifier.py:169:9: F841 local variable 'coef_list' is assigned to but never used + coef_list = list(coefs["value"]) + ^ +src/python/turicreate/test/test_svm_classifier.py:208:9: F841 local variable 'model' is assigned to but never used + model = self.model + ^ +src/python/turicreate/test/test_svm_classifier.py:293:9: F841 local variable 'target_name' is assigned to but never used + target_name = self.target + ^ +src/python/turicreate/test/test_svm_classifier.py:317:9: F841 local variable 'test_case' is assigned to but never used + test_case = "solver = {}, kwargs = {}".format(solver, kwargs) + ^ +src/python/turicreate/test/test_svm_classifier.py:361:13: F841 local variable 'model' is assigned to but never used + model = tc.svm_classifier.create( + ^ +src/python/turicreate/test/test_svm_classifier.py:430:9: F841 local variable 'target_name' is assigned to but never used + target_name = self.target + ^ +src/python/turicreate/test/test_svm_classifier.py:447:9: F841 local variable 'coef_list' is assigned to but never used + coef_list = list(coefs["value"]) + ^ +src/python/turicreate/test/test_svm_classifier.py:454:9: F841 local variable 'test_label' is assigned to but never used + test_label = "solver: {}\tkwargs: {}".format(solver, kwargs) + ^ +src/python/turicreate/test/test_svm_classifier.py:510:9: F841 local variable 'target_name' is assigned to but never used + target_name = self.target + ^ +src/python/turicreate/test/test_svm_classifier.py:526:9: F841 local variable 'coef_list' is assigned to but never used + coef_list = list(coefs["value"]) + ^ +src/python/turicreate/test/test_svm_classifier.py:533:9: F841 local variable 'test_label' is assigned to but never used + test_label = "solver: {}\tkwargs: {}".format(solver, kwargs) + ^ +src/python/turicreate/test/test_svm_classifier.py:564:9: F841 local variable 'pred' is assigned to but never used + pred = model.evaluate(self.sf) + ^ +src/python/turicreate/test/test_svm_classifier.py:582:9: F841 local variable 'pred' is assigned to but never used + pred = model.evaluate(self.sf) + ^ +src/python/turicreate/test/test_svm_classifier.py:610:13: F841 local variable 'model' is assigned to but never used + model = tc.svm_classifier.create(sf, self.target) + ^ +src/python/turicreate/test/test_svm_classifier.py:633:13: F841 local variable 'model' is assigned to but never used + model = tc.svm_classifier.create(sf, self.target) + ^ +src/python/turicreate/test/test_svm_classifier.py:668:9: F841 local variable 'target_name' is assigned to but never used + target_name = self.target + ^ +src/python/turicreate/test/test_svm_classifier.py:669:9: F841 local variable 'feature_names' is assigned to but never used + feature_names = self.features + ^ +src/python/turicreate/test/test_svm_classifier.py:681:9: F841 local variable 'coef_list' is assigned to but never used + coef_list = list(coefs["value"]) + ^ +src/python/turicreate/test/test_svm_classifier.py:688:9: F841 local variable 'test_case' is assigned to but never used + test_case = "solver = {solver}, kwargs = {kwargs}".format( + ^ +src/python/turicreate/test/test_svm_classifier.py:741:9: F841 local variable 'target_name' is assigned to but never used + target_name = self.target + ^ +src/python/turicreate/test/test_svm_classifier.py:742:9: F841 local variable 'feature_names' is assigned to but never used + feature_names = self.features + ^ +src/python/turicreate/test/test_svm_classifier.py:753:9: F841 local variable 'coef_list' is assigned to but never used + coef_list = list(coefs["value"]) + ^ +src/python/turicreate/test/test_svm_classifier.py:761:9: F841 local variable 'test_case' is assigned to but never used + test_case = "solver = {solver}, kwargs = {kwargs}".format( + ^ +src/python/turicreate/test/test_svm_classifier.py:834:9: F841 local variable 'predictions' is assigned to but never used + predictions = model.predict(self.sf) + ^ +src/python/turicreate/test/test_svm_classifier.py:836:9: F841 local variable 'results' is assigned to but never used + results = model.evaluate(self.sf) + ^ +src/python/turicreate/test/test_dbscan.py:23:1: F401 'os as _os' imported but unused +import os as _os +^ +src/python/turicreate/test/test_dbscan.py:92:9: F841 local variable 'local_model' is assigned to but never used + local_model = tc.dbscan.create( + ^ +src/python/turicreate/test/test_dbscan.py:210:5: C901 'CreateTest.test_distances' is too complex (13) + def test_distances(self): + ^ +src/python/turicreate/test/test_dbscan.py:423:13: F841 local variable 'ans' is assigned to but never used + ans = str(self.model) + ^ +src/python/turicreate/extensions.py:406:1: C901 '_publish' is too complex (13) +def _publish(): +^ +src/python/turicreate/extensions.py:772:1: C901 '_build_native_function_call' is too complex (13) +def _build_native_function_call(fn): +^ +src/python/turicreate/test/test_evaluation.py:190:9: F841 local variable 'log_loss' is assigned to but never used + log_loss = turicreate.toolkits.evaluation.log_loss(y, yhat, index_map=index_map) + ^ +src/python/turicreate/test/test_evaluation.py:191:9: F841 local variable 'auc' is assigned to but never used + auc = turicreate.toolkits.evaluation.auc(y, yhat, index_map=index_map) + ^ +src/python/turicreate/test/test_evaluation.py:192:9: F841 local variable 'roc_curve' is assigned to but never used + roc_curve = turicreate.toolkits.evaluation.roc_curve( + ^ +src/python/turicreate/test/test_evaluation.py:201:9: F841 local variable 'log_loss' is assigned to but never used + log_loss = turicreate.toolkits.evaluation.log_loss(y, yhat) + ^ +src/python/turicreate/test/test_evaluation.py:202:9: F841 local variable 'auc' is assigned to but never used + auc = turicreate.toolkits.evaluation.auc(y, yhat) + ^ +src/python/turicreate/test/test_evaluation.py:203:9: F841 local variable 'roc_curve' is assigned to but never used + roc_curve = turicreate.toolkits.evaluation.roc_curve(y, yhat) + ^ +src/python/turicreate/test/test_evaluation.py:221:13: F841 local variable 'log_loss' is assigned to but never used + log_loss = turicreate.toolkits.evaluation.log_loss(y, yhat) + ^ +src/python/turicreate/test/test_evaluation.py:223:13: F841 local variable 'auc' is assigned to but never used + auc = turicreate.toolkits.evaluation.auc(y, yhat) + ^ +src/python/turicreate/test/test_evaluation.py:225:13: F841 local variable 'roc_curve' is assigned to but never used + roc_curve = turicreate.toolkits.evaluation.roc_curve(y, yhat) + ^ +src/python/turicreate/test/test_evaluation.py:556:9: F841 local variable 'points' is assigned to but never used + points = res[["fpr", "tpr"]].unique().sort(["fpr", "tpr"]) + ^ +src/python/turicreate/test/test_evaluation.py:1332:13: F841 local variable 'score' is assigned to but never used + score = turicreate.toolkits.evaluation.log_loss( + ^ +src/python/turicreate/toolkits/sound_classifier/_audio_feature_extractor.py:101:5: C901 'VGGishFeatureExtractor._extract_features' is too complex (11) + def _extract_features(self, preprocessed_data, verbose=True): + ^ +src/python/turicreate/test/test_supervised_learning_string_targets.py:12:1: F401 'turicreate.toolkits._main.ToolkitError' imported but unused +from turicreate.toolkits._main import ToolkitError +^ +src/python/turicreate/test/test_supervised_learning_string_targets.py:68:13: F841 local variable 'model' is assigned to but never used + model = create_fun(train_sf, self.target, validation_set=None) + ^ +src/python/turicreate/test/test_supervised_learning_string_targets.py:78:9: F841 local variable 'ev_train' is assigned to but never used + ev_train = model.evaluate(self.sf) + ^ +src/python/turicreate/test/test_supervised_learning_string_targets.py:79:9: F841 local variable 'ev_test' is assigned to but never used + ev_test = model.evaluate(test_sf) + ^ +src/python/turicreate/test/test_supervised_learning_string_targets.py:80:9: F841 local variable 'ev_test_one' is assigned to but never used + ev_test_one = model.evaluate(test_sf[0:1]) + ^ +src/python/turicreate/test/test_supervised_learning_string_targets.py:136:13: F841 local variable 'model' is assigned to but never used + model = create_fun(train_sf, self.target, validation_set=None) + ^ +src/python/turicreate/test/test_supervised_learning_string_targets.py:146:9: F841 local variable 'ev_train' is assigned to but never used + ev_train = model.evaluate(self.sf) + ^ +src/python/turicreate/test/test_supervised_learning_string_targets.py:147:9: F841 local variable 'ev_test' is assigned to but never used + ev_test = model.evaluate(test_sf) + ^ +src/python/turicreate/test/test_supervised_learning_string_targets.py:148:9: F841 local variable 'ev_test_one' is assigned to but never used + ev_test_one = model.evaluate(test_sf[0:1]) + ^ +src/python/turicreate/test/test_coreml_export.py:12:1: F401 'math' imported but unused +import math +^ +src/python/turicreate/test/test_coreml_export.py:13:1: F401 'uuid' imported but unused +import uuid +^ +src/python/turicreate/test/test_coreml_export.py:14:1: F401 'random' imported but unused +import random +^ +src/python/turicreate/test/test_coreml_export.py:15:1: F401 'copy' imported but unused +import copy +^ +src/python/turicreate/test/test_coreml_export.py:16:1: F401 'turicreate.toolkits.evaluation' imported but unused +from turicreate.toolkits import evaluation +^ +src/python/turicreate/test/test_coreml_export.py:17:1: F401 'turicreate.toolkits._main.ToolkitError' imported but unused +from turicreate.toolkits._main import ToolkitError +^ +src/python/turicreate/test/test_coreml_export.py:19:1: F401 'shutil' imported but unused +import shutil +^ +src/python/turicreate/test/test_coreml_export.py:22:1: F401 'sys' imported but unused +import sys +^ +src/python/turicreate/test/test_coreml_export.py:24:1: F401 'array' imported but unused +import array +^ +src/python/turicreate/test/test_coreml_export.py:68:5: C901 'CoreMLExportTest._test_coreml_export' is too complex (11) + def _test_coreml_export( + ^ +src/python/turicreate/test/test_coreml_export.py:96:17: F811 redefinition of unused 'array' from line 24 + import array + ^ +src/python/turicreate/test/test_coreml_export.py:98:17: F811 redefinition of unused 'copy' from line 15 + import copy + ^ +src/python/turicreate/test/test_util.py:13:1: F401 'uuid' imported but unused +import uuid +^ +src/python/turicreate/test/test_util.py:14:1: F401 'sys as _sys' imported but unused +import sys as _sys +^ +src/python/turicreate/test/test_util.py:66:9: F841 local variable 'd' is assigned to but never used + d = SFrame( + ^ +src/python/turicreate/meta/bytecodetools/disassembler_.py:61:1: C901 'disassembler' is too complex (14) +def disassembler(co, lasti=-1): +^ +src/python/turicreate/toolkits/_pre_trained_models.py:42:1: C901 '_download_and_checksum_files' is too complex (18) +def _download_and_checksum_files(urls, dirname, delete=False): +^ +src/python/turicreate/test/test_boosted_trees.py:11:1: F401 'math' imported but unused +import math +^ +src/python/turicreate/test/test_boosted_trees.py:14:1: F401 'sys' imported but unused +import sys +^ +src/python/turicreate/test/test_boosted_trees.py:19:1: F401 'numpy as np' imported but unused +import numpy as np +^ +src/python/turicreate/test/test_boosted_trees.py:317:9: F841 local variable 'y1' is assigned to but never used + y1 = self.model.predict(new_test) + ^ +src/python/turicreate/test/test_boosted_trees.py:330:9: F841 local variable 'model' is assigned to but never used + model = tc.boosted_trees_regression.create(train, target="label", **self.param) + ^ +src/python/turicreate/test/test_boosted_trees.py:331:9: F841 local variable 'y' is assigned to but never used + y = self.model.predict(test) + ^ +src/python/turicreate/test/test_boosted_trees.py:819:9: F841 local variable 'pred' is assigned to but never used + pred = model.predict(test, output_type="class") + ^ +src/python/turicreate/test/test_boosted_trees.py:830:9: F841 local variable 'y1' is assigned to but never used + y1 = self.model.predict(new_test) + ^ +src/python/turicreate/test/test_boosted_trees.py:845:9: F841 local variable 'model' is assigned to but never used + model = tc.boosted_trees_classifier.create(train, target="label", **self.param) + ^ +src/python/turicreate/test/test_boosted_trees.py:846:9: F841 local variable 'y' is assigned to but never used + y = self.model.predict(test) + ^ +src/python/turicreate/test/test_boosted_trees.py:872:9: F811 redefinition of unused 'np' from line 19 + import numpy as np + ^ +src/python/turicreate/test/test_tree_extract_features.py:11:1: F401 'math' imported but unused +import math +^ +src/python/turicreate/test/test_tree_extract_features.py:12:1: F401 'uuid' imported but unused +import uuid +^ +src/python/turicreate/test/test_tree_extract_features.py:14:1: F401 'copy' imported but unused +import copy +^ +src/python/turicreate/test/test_tree_extract_features.py:15:1: F401 'turicreate.toolkits.evaluation' imported but unused +from turicreate.toolkits import evaluation +^ +src/python/turicreate/test/test_tree_extract_features.py:16:1: F401 'turicreate.toolkits._main.ToolkitError' imported but unused +from turicreate.toolkits._main import ToolkitError +^ +src/python/turicreate/test/test_tree_extract_features.py:17:1: F401 'shutil' imported but unused +import shutil +^ +src/python/turicreate/test/test_tree_extract_features.py:18:1: F401 'numpy as np' imported but unused +import numpy as np +^ +src/python/turicreate/test/test_tree_extract_features.py:19:1: F401 'array.array' imported but unused +from array import array +^ +src/python/turicreate/toolkits/image_classifier/_evaluation.py:211:5: F841 local variable 'conf_metric' is assigned to but never used + conf_metric = evaluation._data["confidence_metric_for_threshold"] + ^ +src/python/turicreate/toolkits/image_classifier/_evaluation.py:235:5: F841 local variable 'conf_metric' is assigned to but never used + conf_metric = evaluation._data["confidence_metric_for_threshold"] + ^ +src/python/turicreate/meta/__init__.py:11:1: F401 '.asttools.visitors.pysourcegen.dump_python_source' imported but unused +from .asttools.visitors.pysourcegen import dump_python_source +^ +src/python/turicreate/test/test_random_forest.py:11:1: F401 'math' imported but unused +import math +^ +src/python/turicreate/test/test_random_forest.py:14:1: F401 'sys' imported but unused +import sys +^ +src/python/turicreate/test/test_random_forest.py:19:1: F401 'numpy as np' imported but unused +import numpy as np +^ +src/python/turicreate/test/test_random_forest.py:281:9: F841 local variable 'pred_lst' is assigned to but never used + pred_lst = model.predict(list(test)) + ^ +src/python/turicreate/test/test_random_forest.py:802:9: F811 redefinition of unused 'np' from line 19 + import numpy as np + ^ +src/python/turicreate/meta/decompiler/tests/__init__.py:10:1: F401 'sys' imported but unused +import sys +^ +src/python/turicreate/meta/decompiler/tests/__init__.py:14:1: F401 '...testing.py2only' imported but unused +from ...testing import py2, py2only +^ +src/python/turicreate/test/test_graph.py:62:9: F841 local variable 'g3' is assigned to but never used + g3 = SGraph( + ^ +src/python/turicreate/_gl_pickle.py:421:5: C901 'GLUnpickler.__init__' is too complex (11) + def __init__(self, filename): + ^ +src/python/turicreate/test/test_linear_regression.py:223:9: F841 local variable 'rmse' is assigned to but never used + rmse = model.evaluate(self.sf, metric="rmse") + ^ +src/python/turicreate/test/test_linear_regression.py:226:9: F841 local variable 'max_error' is assigned to but never used + max_error = model.evaluate(self.sf, metric="max_error") + ^ +src/python/turicreate/test/test_linear_regression.py:421:13: F841 local variable 'model' is assigned to but never used + model = tc.linear_regression.create(sf, self.target) + ^ +src/python/turicreate/test/test_linear_regression.py:444:13: F841 local variable 'model' is assigned to but never used + model = tc.linear_regression.create(sf, self.target) + ^ +src/python/turicreate/test/test_linear_regression.py:1036:9: F841 local variable 'pred' is assigned to but never used + pred = model.predict(X_sf) + ^ +src/python/turicreate/test/test_linear_regression.py:1050:9: F841 local variable 'pred' is assigned to but never used + pred = model.evaluate(X_sf) + ^ +src/python/turicreate/meta/asttools/mutators/prune_mutator.py:100:9: F841 local variable 'can_remove_test' is assigned to but never used + can_remove_test = self.visit(node.test) + ^ +src/python/turicreate/meta/asttools/mutators/prune_mutator.py:126:9: F841 local variable 'discard_test' is assigned to but never used + discard_test = self.visit(node.test) + ^ +src/python/turicreate/test/test_nearest_neighbors.py:30:1: F401 'os as _os' imported but unused +import os as _os +^ +src/python/turicreate/test/test_nearest_neighbors.py:320:13: F841 local variable 'm' is assigned to but never used + m = tc.nearest_neighbors.create( + ^ +src/python/turicreate/test/test_nearest_neighbors.py:646:9: F841 local variable 'm' is assigned to but never used + m = tc.nearest_neighbors.create( + ^ +src/python/turicreate/test/test_nearest_neighbors.py:678:17: F841 local variable 'm' is assigned to but never used + m = tc.nearest_neighbors.create(sf[["x0", "x1", ftr]], verbose=False) + ^ +src/python/turicreate/test/test_nearest_neighbors.py:720:13: F841 local variable 'm' is assigned to but never used + m = tc.nearest_neighbors.create(sf_empty, self.label) + ^ +src/python/turicreate/test/test_nearest_neighbors.py:723:13: F841 local variable 'knn' is assigned to but never used + knn = tc.nearest_neighbors.create(sf_empty, self.label) + ^ +src/python/turicreate/test/test_nearest_neighbors.py:744:13: F841 local variable 'm' is assigned to but never used + m = tc.nearest_neighbors.create(self.refs, distance=[]) + ^ +src/python/turicreate/test/test_nearest_neighbors.py:924:16: F632 use ==/!= to compare str, bytes, and int literals + assert self.model.summary() is not "" + ^ +src/python/turicreate/test/test_nearest_neighbors.py:930:13: F841 local variable 'ans' is assigned to but never used + ans = str(self.model) + ^ +src/python/turicreate/test/test_nearest_neighbors.py:954:17: F841 local variable 'g' is assigned to but never used + g = self.model.similarity_graph() + ^ +src/python/turicreate/test/test_nearest_neighbors.py:1102:16: F632 use ==/!= to compare str, bytes, and int literals + assert self.model.summary() is not "" + ^ +src/python/turicreate/test/test_nearest_neighbors.py:1108:13: F841 local variable 'ans' is assigned to but never used + ans = str(self.model) + ^ +src/python/turicreate/test/test_nearest_neighbors.py:1254:16: F632 use ==/!= to compare str, bytes, and int literals + assert self.model.summary() is not "" + ^ +src/python/turicreate/test/test_nearest_neighbors.py:1260:13: F841 local variable 'ans' is assigned to but never used + ans = str(self.model) + ^ +src/python/turicreate/test/test_nearest_neighbors.py:1505:9: F841 local variable 'knn' is assigned to but never used + knn = m.similarity_graph(k=5, radius=None, verbose=False) + ^ +src/python/turicreate/test/test_nearest_neighbors.py:1778:9: F841 local variable 'k' is assigned to but never used + k = 5 + ^ +src/python/turicreate/test/test_nearest_neighbors.py:1879:9: F841 local variable 'n_query' is assigned to but never used + n_query = 2 + ^ +src/python/turicreate/test/test_nearest_neighbors.py:1916:23: F821 undefined name 'dot_product' + ans = dot_product(query, ref) + ^ +src/python/turicreate/test/test_nearest_neighbors.py:2246:9: F841 local variable 'test_ftrs' is assigned to but never used + test_ftrs = ["query_label", "distance", "rank"] + ^ +src/python/turicreate/test/test_nearest_neighbors.py:2354:9: F841 local variable 'str_list' is assigned to but never used + str_list = random_list_of_str(10, length=5) + ^ +src/python/turicreate/test/test_nearest_neighbors.py:2369:13: F841 local variable 'observed' is assigned to but never used + observed = self.check_for_numeric_fixed_length_lists(lst) + ^ +src/python/turicreate/meta/decompiler/instructions.py:19:1: F401 '..utils.py3' imported but unused +from ..utils import py3, py3op, py2op +^ +src/python/turicreate/meta/decompiler/instructions.py:159:1: C901 'make_function' is too complex (17) +@make_function.py3op +^ +src/python/turicreate/meta/decompiler/instructions.py:491:13: F841 local variable '_doc' is assigned to but never used + _doc = code.pop(1) + ^ +src/python/turicreate/meta/decompiler/instructions.py:493:13: F841 local variable '_name' is assigned to but never used + _name = code.pop(0) + ^ +src/python/turicreate/meta/decompiler/instructions.py:528:9: F841 local variable 'doc' is assigned to but never used + doc = pop_doc(code) + ^ +src/python/turicreate/toolkits/sound_classifier/sound_classifier.py:257:1: C901 'create' is too complex (34) +def create( +^ +src/python/turicreate/toolkits/sound_classifier/sound_classifier.py:693:5: C901 'SoundClassifier.evaluate' is too complex (15) + def evaluate(self, dataset, metric="auto", verbose=True, batch_size=64): + ^ +src/python/turicreate/toolkits/sound_classifier/sound_classifier.py:999:5: C901 'SoundClassifier.predict' is too complex (16) + def predict(self, dataset, output_type="class", verbose=True, batch_size=64): + ^ +src/python/turicreate/toolkits/recommender/__init__.py:100:1: F401 '.item_content_recommender' imported but unused +from . import item_content_recommender +^ +src/python/turicreate/meta/decompiler/disassemble.py:31:1: C901 'print_code' is too complex (18) +def print_code(co, lasti=-1, level=0): +^ +src/python/turicreate/toolkits/regression/_regression.py:12:1: F401 'turicreate._cython.cy_server.QuietProgress' imported but unused +from turicreate._cython.cy_server import QuietProgress +^ +src/python/turicreate/test/test_image_type.py:19:1: F401 '.._deps.numpy as _np' imported but unused +from .._deps import numpy as _np, HAS_NUMPY +^ +src/python/turicreate/test/test_image_type.py:19:1: F401 '.._deps.HAS_NUMPY' imported but unused +from .._deps import numpy as _np, HAS_NUMPY +^ +src/python/turicreate/test/test_image_type.py:147:21: F841 local variable 'cmyk_image' is assigned to but never used + cmyk_image = image.Image(path=t.name, format="JPG") + ^ +src/python/turicreate/toolkits/regression/random_forest_regression.py:12:1: F401 'turicreate as _turicreate' imported but unused +import turicreate as _turicreate +^ +src/python/turicreate/toolkits/regression/random_forest_regression.py:19:1: F401 'turicreate.toolkits._internal_utils._raise_error_if_column_exists' imported but unused +from turicreate.toolkits._internal_utils import _raise_error_if_column_exists +^ +src/python/turicreate/toolkits/_feature_engineering/__init__.py:32:1: F401 '._feature_engineering.Transformer as _Transformer' imported but unused +from ._feature_engineering import Transformer as _Transformer +^ +src/python/turicreate/toolkits/_feature_engineering/__init__.py:33:1: F401 '._feature_engineering._SampleTransformer' imported but unused +from ._feature_engineering import _SampleTransformer +^ +src/python/turicreate/toolkits/_feature_engineering/__init__.py:37:1: F401 '._tokenizer.Tokenizer' imported but unused +from ._tokenizer import Tokenizer +^ +src/python/turicreate/toolkits/_feature_engineering/__init__.py:38:1: F401 '._tfidf.TFIDF' imported but unused +from ._tfidf import TFIDF +^ +src/python/turicreate/toolkits/_feature_engineering/__init__.py:39:1: F401 '._bm25.BM25' imported but unused +from ._bm25 import BM25 +^ +src/python/turicreate/toolkits/_feature_engineering/__init__.py:40:1: F401 '._word_counter.WordCounter' imported but unused +from ._word_counter import WordCounter +^ +src/python/turicreate/toolkits/_feature_engineering/__init__.py:41:1: F401 '._ngram_counter.NGramCounter' imported but unused +from ._ngram_counter import NGramCounter +^ +src/python/turicreate/toolkits/_feature_engineering/__init__.py:42:1: F401 '._word_trimmer.RareWordTrimmer' imported but unused +from ._word_trimmer import RareWordTrimmer +^ +src/python/turicreate/toolkits/_feature_engineering/__init__.py:43:1: F401 '._transform_to_flat_dictionary.TransformToFlatDictionary' imported but unused +from ._transform_to_flat_dictionary import TransformToFlatDictionary +^ +src/python/turicreate/toolkits/_feature_engineering/__init__.py:44:1: F401 '._autovectorizer.AutoVectorizer' imported but unused +from ._autovectorizer import AutoVectorizer +^ +src/python/turicreate/test/test_file_util.py:11:1: F401 'tempfile' imported but unused +import tempfile +^ +src/python/turicreate/meta/asttools/__init__.py:15:1: F401 '_ast' imported but unused +import _ast +^ +src/python/turicreate/meta/asttools/__init__.py:18:1: F401 '..asttools.visitors.dont_visit' imported but unused +from ..asttools.visitors import dont_visit, visit_children, Visitor +^ +src/python/turicreate/meta/asttools/__init__.py:18:1: F401 '..asttools.visitors.visit_children' imported but unused +from ..asttools.visitors import dont_visit, visit_children, Visitor +^ +src/python/turicreate/meta/asttools/__init__.py:18:1: F401 '..asttools.visitors.Visitor' imported but unused +from ..asttools.visitors import dont_visit, visit_children, Visitor +^ +src/python/turicreate/meta/asttools/__init__.py:58:1: F401 '..asttools.visitors.print_visitor.print_ast' imported but unused +from ..asttools.visitors.print_visitor import print_ast, dump_ast as str_ast +^ +src/python/turicreate/meta/asttools/__init__.py:58:1: F401 '..asttools.visitors.print_visitor.dump_ast as str_ast' imported but unused +from ..asttools.visitors.print_visitor import print_ast, dump_ast as str_ast +^ +src/python/turicreate/meta/asttools/__init__.py:59:1: F401 '..asttools.visitors.pysourcegen.python_source' imported but unused +from ..asttools.visitors.pysourcegen import python_source, dump_python_source +^ +src/python/turicreate/meta/asttools/__init__.py:59:1: F401 '..asttools.visitors.pysourcegen.dump_python_source' imported but unused +from ..asttools.visitors.pysourcegen import python_source, dump_python_source +^ +src/python/turicreate/meta/asttools/__init__.py:60:1: F401 '..asttools.visitors.cond_symbol_visitor.lhs' imported but unused +from ..asttools.visitors.cond_symbol_visitor import lhs, rhs +^ +src/python/turicreate/meta/asttools/__init__.py:60:1: F401 '..asttools.visitors.cond_symbol_visitor.rhs' imported but unused +from ..asttools.visitors.cond_symbol_visitor import lhs, rhs +^ +src/python/turicreate/meta/asttools/__init__.py:61:1: F401 '..asttools.visitors.cond_symbol_visitor.conditional_lhs' imported but unused +from ..asttools.visitors.cond_symbol_visitor import conditional_lhs, conditional_symbols +^ +src/python/turicreate/meta/asttools/__init__.py:61:1: F401 '..asttools.visitors.cond_symbol_visitor.conditional_symbols' imported but unused +from ..asttools.visitors.cond_symbol_visitor import conditional_lhs, conditional_symbols +^ +src/python/turicreate/meta/asttools/__init__.py:62:1: F401 '..asttools.visitors.symbol_visitor.get_symbols' imported but unused +from ..asttools.visitors.symbol_visitor import get_symbols +^ +src/python/turicreate/meta/asttools/__init__.py:63:1: F401 '..asttools.visitors.graph_visitor.make_graph' imported but unused +from ..asttools.visitors.graph_visitor import make_graph +^ +src/python/turicreate/test/test_sarray_sketch.py:13:1: F401 'numpy as np' imported but unused +import numpy as np +^ +src/python/turicreate/test/test_sarray_sketch.py:15:1: F401 'random' imported but unused +import random +^ +src/python/turicreate/test/test_sarray_sketch.py:16:1: F401 'copy' imported but unused +import copy +^ +src/python/turicreate/test/test_sarray_sketch.py:17:1: F401 'os' imported but unused +import os +^ +src/python/turicreate/test/test_sarray_sketch.py:19:1: F401 'array' imported but unused +import array +^ +src/python/turicreate/test/test_sarray_sketch.py:20:1: F401 'time' imported but unused +import time +^ +src/python/turicreate/test/test_sarray_sketch.py:313:9: F841 local variable 'sk' is assigned to but never used + sk = sa.summary() + ^ +src/python/turicreate/util/_cloudpickle.py:192:1: C901 'If 192' is too complex (11) +if sys.version_info < (3, 4): +^ +src/python/turicreate/util/_cloudpickle.py:280:18: F821 undefined name 'buffer' + dispatch[buffer] = save_buffer + ^ +src/python/turicreate/util/_cloudpickle.py:358:5: C901 'CloudPickler.save_function' is too complex (16) + def save_function(self, obj, name=None): + ^ +src/python/turicreate/util/_cloudpickle.py:720:5: C901 'CloudPickler.save_inst' is too complex (11) + def save_inst(self, obj): + ^ +src/python/turicreate/util/_cloudpickle.py:834:5: C901 'CloudPickler.save_reduce' is too complex (13) + def save_reduce( + ^ +src/python/turicreate/util/_cloudpickle.py:906:5: C901 'CloudPickler.save_file' is too complex (12) + def save_file(self, obj): + ^ +src/python/turicreate/util/_cloudpickle.py:962:18: F821 undefined name 'file' + dispatch[file] = save_file + ^ +src/python/turicreate/toolkits/one_shot_object_detector/one_shot_object_detector.py:9:1: F401 'turicreate.extensions as _extensions' imported but unused +from turicreate import extensions as _extensions +^ +src/python/turicreate/toolkits/style_transfer/_sframe_loader.py:9:1: F401 'numpy as _np' imported but unused +import numpy as _np +^ +src/python/turicreate/toolkits/style_transfer/_sframe_loader.py:11:1: F401 'turicreate.toolkits._main.ToolkitError as _ToolkitError' imported but unused +from turicreate.toolkits._main import ToolkitError as _ToolkitError +^ +src/python/turicreate/test/util.py:17:1: F401 'turicreate as tc' imported but unused +import turicreate as tc +^ +src/python/turicreate/test/util.py:88:16: F821 undefined name 'unicode' + return unicode(self).encode("utf-8") + ^ +src/python/turicreate/data_structures/sgraph.py:508:5: C901 'SGraph.get_edges' is too complex (11) + def get_edges(self, src_ids=[], dst_ids=[], fields={}, format="sframe"): + ^ +src/python/turicreate/data_structures/sgraph.py:900:5: C901 'SGraph.triple_apply' is too complex (13) + def triple_apply(self, triple_apply_fn, mutated_fields, input_fields=None): + ^ +src/python/turicreate/data_structures/sgraph.py:1447:1: C901 '_edge_data_to_sframe' is too complex (12) +def _edge_data_to_sframe(data, src_field, dst_field): +^ +src/python/turicreate/test/test_image_similarity.py:10:1: F401 'pytest' imported but unused +import pytest +^ +src/python/turicreate/test/test_image_similarity.py:31:9: F402 import '_' from line 8 shadowed by loop variable + for _ in range(5): + ^ +src/python/turicreate/test/test_image_similarity.py:247:9: F841 local variable 'expected_result' is assigned to but never used + expected_result = ( + ^ +src/python/turicreate/toolkits/image_classifier/__init__.py:12:1: F403 'from .image_classifier import *' used; unable to detect undefined names +from .image_classifier import * +^ +src/python/turicreate/toolkits/image_classifier/__init__.py:12:1: F401 '.image_classifier.*' imported but unused +from .image_classifier import * +^ +src/python/turicreate/toolkits/image_classifier/__init__.py:13:1: F401 '._annotate.annotate' imported but unused +from ._annotate import annotate, recover_annotation +^ +src/python/turicreate/toolkits/image_classifier/__init__.py:13:1: F401 '._annotate.recover_annotation' imported but unused +from ._annotate import annotate, recover_annotation +^ +src/python/turicreate/test/test_topic_model.py:11:1: F401 'shutil' imported but unused +import shutil +^ +src/python/turicreate/test/test_topic_model.py:20:1: F401 'time' imported but unused +import time +^ +src/python/turicreate/test/test_topic_model.py:29:1: C901 'generate_bar_example' is too complex (15) +def generate_bar_example( +^ +src/python/turicreate/test/test_topic_model.py:55:5: F841 local variable 'vocab_size' is assigned to but never used + vocab_size = width * width + ^ +src/python/turicreate/test/test_topic_model.py:386:13: F841 local variable 'pr' is assigned to but never used + pr = m.predict(bad3) + ^ +src/python/turicreate/test/test_topic_model.py:399:17: F841 local variable 'perp2' is assigned to but never used + perp2 = m.validation_perplexity + ^ +src/python/turicreate/test/test_topic_model.py:498:9: F841 local variable 'prob_2_d' is assigned to but never used + prob_2_d = 0 + ^ +src/python/turicreate/toolkits/_feature_engineering/_autovectorizer.py:21:1: F401 'turicreate.toolkits._internal_utils._check_categorical_option_type' imported but unused +from turicreate.toolkits._internal_utils import _check_categorical_option_type +^ +src/python/turicreate/toolkits/_feature_engineering/_autovectorizer.py:94:9: F841 local variable 'features' is assigned to but never used + features = state["features"] + ^ +src/python/turicreate/toolkits/_feature_engineering/_autovectorizer.py:95:9: F841 local variable 'excluded_features' is assigned to but never used + excluded_features = state["excluded_features"] + ^ +src/python/turicreate/toolkits/_feature_engineering/_autovectorizer.py:797:9: F841 local variable '_features' is assigned to but never used + _features = _precomputed_field(_internal_utils.pretty_print_list(self.features)) + ^ +src/python/turicreate/toolkits/_feature_engineering/_autovectorizer.py:798:9: F841 local variable '_exclude' is assigned to but never used + _exclude = _precomputed_field( + ^ +src/python/turicreate/toolkits/_feature_engineering/_autovectorizer.py:875:9: F841 local variable 'features' is assigned to but never used + features = state["features"] + ^ +src/python/turicreate/toolkits/_feature_engineering/_autovectorizer.py:876:9: F841 local variable 'excluded_features' is assigned to but never used + excluded_features = state["excluded_features"] + ^ +src/python/turicreate/test/test_activity_classifier.py:18:1: F401 'turicreate.toolkits._internal_utils._read_env_var_cpp' imported but unused +from turicreate.toolkits._internal_utils import _mac_ver, _read_env_var_cpp +^ +src/python/turicreate/test/test_activity_classifier.py:185:9: F841 local variable 'predictions' is assigned to but never used + predictions = model.predict(self.data) + ^ +src/python/turicreate/test/test_activity_classifier.py:195:9: F841 local variable 'predictions' is assigned to but never used + predictions = model.predict(self.data) + ^ +src/python/turicreate/test/test_activity_classifier.py:214:9: F841 local variable 'predictions' is assigned to but never used + predictions = model.predict(self.data) + ^ +src/python/turicreate/test/test_activity_classifier.py:220:9: F841 local variable 'predictions' is assigned to but never used + predictions = model.predict(self.data) + ^ +src/python/turicreate/test/test_activity_classifier.py:227:13: F841 local variable 'model' is assigned to but never used + model = tc.activity_classifier.create( + ^ +src/python/turicreate/test/test_activity_classifier.py:261:9: F841 local variable 'predictions' is assigned to but never used + predictions = model.predict(self.data) + ^ +src/python/turicreate/test/test_activity_classifier.py:498:32: F841 local variable 'random_labels' is assigned to but never used + dataset[self.target] = random_labels = [ + ^ +src/python/turicreate/test/test_activity_classifier.py:545:13: F841 local variable 'pred' is assigned to but never used + pred = self.model.classify(data) + ^ +src/python/turicreate/test/test_activity_classifier.py:576:13: F841 local variable 'preds' is assigned to but never used + preds = model.predict_topk(self.data, k=[]) + ^ +src/python/turicreate/test/test_flexible_type.py:22:1: F401 '.._cython.cy_flexible_type._all_convertable' imported but unused +from .._cython.cy_flexible_type import _get_inferred_column_type, _all_convertable +^ +src/python/turicreate/test/test_flexible_type.py:198:1: C901 'verify_inference' is too complex (13) +def verify_inference(values, expected_type): +^ +src/python/turicreate/test/test_flexible_type.py:343:13: F841 local variable 'reconverted_result' is assigned to but never used + reconverted_result = _tr_flex_list(result, inferred_type) + ^ +src/python/turicreate/toolkits/distances/__init__.py:126:1: F401 '._distances.euclidean' imported but unused +from ._distances import euclidean, squared_euclidean, manhattan +^ +src/python/turicreate/toolkits/distances/__init__.py:126:1: F401 '._distances.squared_euclidean' imported but unused +from ._distances import euclidean, squared_euclidean, manhattan +^ +src/python/turicreate/toolkits/distances/__init__.py:126:1: F401 '._distances.manhattan' imported but unused +from ._distances import euclidean, squared_euclidean, manhattan +^ +src/python/turicreate/toolkits/distances/__init__.py:127:1: F401 '._distances.cosine' imported but unused +from ._distances import ( +^ +src/python/turicreate/toolkits/distances/__init__.py:127:1: F401 '._distances.dot_product' imported but unused +from ._distances import ( +^ +src/python/turicreate/toolkits/distances/__init__.py:127:1: F401 '._distances.transformed_dot_product' imported but unused +from ._distances import ( +^ +src/python/turicreate/toolkits/distances/__init__.py:127:1: F401 '._distances.jaccard' imported but unused +from ._distances import ( +^ +src/python/turicreate/toolkits/distances/__init__.py:127:1: F401 '._distances.weighted_jaccard' imported but unused +from ._distances import ( +^ +src/python/turicreate/toolkits/distances/__init__.py:127:1: F401 '._distances.gaussian_kernel' imported but unused +from ._distances import ( +^ +src/python/turicreate/toolkits/distances/__init__.py:135:1: F401 '._distances.levenshtein' imported but unused +from ._distances import levenshtein +^ +src/python/turicreate/toolkits/distances/__init__.py:138:1: F401 '._util.compute_composite_distance' imported but unused +from ._util import compute_composite_distance +^ +src/python/turicreate/toolkits/distances/__init__.py:139:1: F401 '._util.build_address_distance' imported but unused +from ._util import build_address_distance +^ +src/python/turicreate/toolkits/activity_classifier/_mps_model_architecture.py:62:1: C901 '_fit_model_mps' is too complex (16) +def _fit_model_mps(model, data_iter, valid_iter, max_iterations, verbose): +^ +src/python/turicreate/toolkits/distances/_util.py:26:1: C901 'compute_composite_distance' is too complex (15) +def compute_composite_distance(distance, x, y): +^ +src/python/turicreate/toolkits/distances/_util.py:136:1: C901 '_validate_composite_distance' is too complex (15) +def _validate_composite_distance(distance): +^ +src/python/turicreate/toolkits/drawing_classifier/_tf_drawing_classifier.py:242:9: F841 local variable 'one_hot_labels' is assigned to but never used + one_hot_labels = _np.zeros((int(self.batch_size), self.num_classes)) + ^ +src/python/turicreate/test/test_sframe.py:35:1: F401 'sqlite3' imported but unused +import sqlite3 +^ +src/python/turicreate/test/test_sframe.py:1707:9: F841 local variable 't' is assigned to but never used + t = sf_um.to_dataframe() + ^ +src/python/turicreate/test/test_sframe.py:2326:5: C901 'SFrameTest.test_big_composite_join' is too complex (12) + def test_big_composite_join(self): + ^ +src/python/turicreate/test/test_sframe.py:4246:9: F841 local variable 'none_sf' is assigned to but never used + none_sf = SFrame( + ^ +src/python/turicreate/test/test_sframe.py:4503:69: F821 undefined name 'unicode' + sf2 = sf2.add_columns(SFrame({"long": [12], "unicode": [unicode("foo")]})) + ^ +src/python/turicreate/test/test_sframe.py:4560:5: F811 redefinition of unused 'sys' from line 33 + import sys + ^ +src/python/turicreate/toolkits/_model.py:47:1: C901 'load_model' is too complex (21) +def load_model(location): +^ +src/python/turicreate/toolkits/_model.py:136:21: F811 redefinition of unused 'turicreate' from line 129 + import turicreate.toolkits.libtctensorflow + ^ +src/python/turicreate/toolkits/_model.py:143:21: F811 redefinition of unused 'turicreate' from line 136 + import turicreate.toolkits.libtctensorflow + ^ +src/python/turicreate/toolkits/_model.py:150:21: F811 redefinition of unused 'turicreate' from line 143 + import turicreate.toolkits.libtctensorflow + ^ +src/python/turicreate/toolkits/_model.py:157:21: F811 redefinition of unused 'turicreate' from line 150 + import turicreate.toolkits.libtctensorflow + ^ +src/python/turicreate/toolkits/_model.py:157:21: F401 'turicreate.toolkits.libtctensorflow' imported but unused + import turicreate.toolkits.libtctensorflow + ^ +src/python/turicreate/util/__init__.py:11:1: F401 'urllib as _urllib' imported but unused +import urllib as _urllib +^ +src/python/turicreate/util/__init__.py:18:1: F401 '._sframe_generation.generate_random_sframe' imported but unused +from ._sframe_generation import generate_random_sframe +^ +src/python/turicreate/util/__init__.py:19:1: F401 '._sframe_generation.generate_random_regression_sframe' imported but unused +from ._sframe_generation import generate_random_regression_sframe +^ +src/python/turicreate/util/__init__.py:20:1: F401 '._sframe_generation.generate_random_classification_sframe' imported but unused +from ._sframe_generation import generate_random_classification_sframe +^ +src/python/turicreate/util/__init__.py:21:1: F401 '._type_checks._raise_error_if_not_of_type' imported but unused +from ._type_checks import _raise_error_if_not_of_type +^ +src/python/turicreate/util/__init__.py:22:1: F401 '._type_checks._is_non_string_iterable' imported but unused +from ._type_checks import _is_non_string_iterable +^ +src/python/turicreate/util/__init__.py:23:1: F401 '._progress_table_printer.ProgressTablePrinter as _ProgressTablePrinter' imported but unused +from ._progress_table_printer import ProgressTablePrinter as _ProgressTablePrinter +^ +src/python/turicreate/util/__init__.py:276:1: C901 '_assert_sframe_equal' is too complex (15) +def _assert_sframe_equal( +^ +src/python/turicreate/test/test_drawing_classifier.py:10:1: F401 'os as _os' imported but unused +import os as _os +^ +src/python/turicreate/test/test_drawing_classifier.py:17:1: F401 'copy.copy as _copy' imported but unused +from copy import copy as _copy +^ +src/python/turicreate/test/test_drawing_classifier.py:22:1: F401 'pytest' imported but unused +import pytest +^ +src/python/turicreate/toolkits/classifier/random_forest_classifier.py:12:1: F401 'turicreate as _turicreate' imported but unused +import turicreate as _turicreate +^ +src/python/turicreate/test/test_graph_compute.py:11:1: F401 'time' imported but unused +import time +^ +src/python/turicreate/test/test_graph_compute.py:26:12: F632 use ==/!= to compare str, bytes, and int literals + if edge_dir is "in" or edge_dir is "all": + ^ +src/python/turicreate/test/test_graph_compute.py:26:32: F632 use ==/!= to compare str, bytes, and int literals + if edge_dir is "in" or edge_dir is "all": + ^ +src/python/turicreate/test/test_graph_compute.py:28:12: F632 use ==/!= to compare str, bytes, and int literals + if edge_dir is "out" or edge_dir is "all": + ^ +src/python/turicreate/test/test_graph_compute.py:28:33: F632 use ==/!= to compare str, bytes, and int literals + if edge_dir is "out" or edge_dir is "all": + ^ +src/python/turicreate/test/test_audio_functionality.py:24:1: F401 'sys as _sys' imported but unused +import sys as _sys +^ +src/python/turicreate/test/test_audio_functionality.py:182:13: F841 local variable 'model' is assigned to but never used + model = tc.sound_classifier.create( + ^ +src/python/turicreate/test/test_audio_functionality.py:208:13: F841 local variable 'model' is assigned to but never used + model = tc.sound_classifier.create( + ^ +src/python/turicreate/test/test_audio_functionality.py:219:13: F841 local variable 'model' is assigned to but never used + model = tc.sound_classifier.create( + ^ +src/python/turicreate/test/test_audio_functionality.py:411:13: F841 local variable 'pred' is assigned to but never used + pred = self.model.predict_topk(self.data, k={}) + ^ +src/python/turicreate/toolkits/object_detector/_sframe_loader.py:9:1: F401 'numpy as _np' imported but unused +import numpy as _np +^ +src/python/turicreate/toolkits/object_detector/_sframe_loader.py:11:1: F401 'six.moves.queue.Queue as _Queue' imported but unused +from six.moves.queue import Queue as _Queue +^ +src/python/turicreate/toolkits/object_detector/_sframe_loader.py:12:1: F401 'threading.Thread as _Thread' imported but unused +from threading import Thread as _Thread +^ +src/python/turicreate/toolkits/object_detector/_sframe_loader.py:13:1: F401 'turicreate.toolkits._main.ToolkitError as _ToolkitError' imported but unused +from turicreate.toolkits._main import ToolkitError as _ToolkitError +^ +src/python/turicreate/toolkits/object_detector/_sframe_loader.py:14:1: F401 '._detection.yolo_boxes_to_yolo_map as _yolo_boxes_to_yolo_map' imported but unused +from ._detection import yolo_boxes_to_yolo_map as _yolo_boxes_to_yolo_map +^ +src/python/turicreate/meta/decompiler/simple_instructions.py:15:1: F403 'from opcode import *' used; unable to detect undefined names +from opcode import * +^ +src/python/turicreate/meta/decompiler/simple_instructions.py:15:1: F401 'opcode.*' imported but unused +from opcode import * +^ +src/python/turicreate/meta/decompiler/simple_instructions.py:17:1: F401 'sys' imported but unused +import sys +^ +src/python/turicreate/meta/decompiler/simple_instructions.py:20:1: F401 '..asttools.visitors.print_visitor.print_ast' imported but unused +from ..asttools.visitors.print_visitor import print_ast, dump_ast +^ +src/python/turicreate/meta/decompiler/simple_instructions.py:20:1: F401 '..asttools.visitors.print_visitor.dump_ast' imported but unused +from ..asttools.visitors.print_visitor import print_ast, dump_ast +^ +src/python/turicreate/meta/decompiler/simple_instructions.py:178:13: F402 import '_' from line 13 shadowed by loop variable + for _ in range(nkwargs): + ^ +src/python/turicreate/meta/decompiler/simple_instructions.py:349:9: F841 local variable 'hmm' is assigned to but never used + hmm = self.ast_stack.pop() + ^ +src/python/turicreate/toolkits/_image_feature_extractor.py:92:5: C901 'TensorFlowFeatureExtractor.extract_features' is too complex (15) + def extract_features(self, dataset, feature, batch_size=64, verbose=False): + ^ +src/python/turicreate/toolkits/image_classifier/_annotate.py:20:1: C901 'annotate' is too complex (14) +def annotate(data, image_column=None, annotation_column="annotations"): +^ +src/python/turicreate/test/test_tree_json_dump.py:15:1: F401 'os as _os' imported but unused +import os as _os +^ +src/python/turicreate/test/test_extensions.py:85:5: C901 'VariantCheckTest.test_stress' is too complex (12) + def test_stress(self): + ^ +src/python/turicreate/test/test_object_detector.py:15:1: F401 'platform' imported but unused +import platform +^ +src/python/turicreate/test/test_object_detector.py:16:1: F811 redefinition of unused 'pytest' from line 10 +import pytest +^ +src/python/turicreate/test/test_object_detector.py:20:1: F401 'turicreate.toolkits._internal_utils._read_env_var_cpp' imported but unused +from turicreate.toolkits._internal_utils import ( +^ +src/python/turicreate/test/test_object_detector.py:276:9: F841 local variable 'model' is assigned to but never used + model = tc.object_detector.create( + ^ +src/python/turicreate/test/test_object_detector.py:308:9: F841 local variable 'pred' is assigned to but never used + pred = dict_model.predict(sf_copy) + ^ +src/python/turicreate/test/test_object_detector.py:309:9: F841 local variable 'metrics' is assigned to but never used + metrics = dict_model.evaluate(sf_copy) + ^ +src/python/turicreate/test/test_object_detector.py:310:9: F841 local variable 'annotated_img' is assigned to but never used + annotated_img = tc.object_detector.util.draw_bounding_boxes( + ^ +src/python/turicreate/test/test_object_detector.py:332:13: F841 local variable 'pred' is assigned to but never used + pred = model.predict(self.sf) + ^ +src/python/turicreate/test/test_object_detector.py:356:9: F841 local variable 'pred' is assigned to but never used + pred = self.model.predict(sf) + ^ +src/python/turicreate/test/test_object_detector.py:450:9: F811 redefinition of unused 'platform' from line 15 + import platform + ^ +src/python/turicreate/test/test_lambda.py:101:9: F401 'time' imported but unused + import time, sys + ^ +src/python/turicreate/test/test_lambda.py:101:9: F401 'sys' imported but unused + import time, sys + ^ +src/python/turicreate/toolkits/_supervised_learning.py:341:1: C901 'create_classification_with_model_selector' is too complex (19) +def create_classification_with_model_selector( +^ +src/python/turicreate/meta/asttools/tests/test_depgraph.py:17:1: F401 '...asttools.visitors.graph_visitor.DiGraph' imported but unused +from ...asttools.visitors.graph_visitor import DiGraph +^ +src/python/turicreate/toolkits/image_classifier/image_classifier.py:25:1: F811 redefinition of unused '_coreml_utils' from line 18 +from turicreate.toolkits import _coreml_utils +^ +src/python/turicreate/toolkits/image_classifier/image_classifier.py:35:1: C901 'create' is too complex (11) +def create( +^ +src/python/turicreate/toolkits/image_classifier/image_classifier.py:376:19: F821 undefined name 'ToolkitError' + raise ToolkitError( + ^ +src/python/turicreate/toolkits/image_classifier/image_classifier.py:846:5: C901 'ImageClassifier.export_coreml' is too complex (12) + def export_coreml(self, filename): + ^ +src/python/turicreate/test/test_json_export.py:21:1: F401 'struct' imported but unused +import struct +^ +src/python/turicreate/test/test_json_export.py:48:17: F841 local variable 'loaded' is assigned to but never used + loaded = json.load(json_data) + ^ +src/python/turicreate/test/test_json_export.py:58:17: F841 local variable 'loaded' is assigned to but never used + loaded = json.load(json_data) + ^ +src/python/turicreate/meta/asttools/visitors/pysourcegen.py:134:5: C901 'ExprSourceGen.visitarguments' is too complex (13) + @visitarguments.py3op + ^ +src/python/turicreate/meta/asttools/visitors/pysourcegen.py:391:5: F811 redefinition of unused 'visitMod' from line 241 + visitMod = simple_string("%") + ^ +src/python/turicreate/meta/asttools/visitors/pysourcegen.py:817:5: C901 'SourceGen.visitClassDef' is too complex (13) + @visitClassDef.py3op + ^ +src/python/turicreate/toolkits/regression/boosted_trees_regression.py:12:1: F401 'turicreate as _turicreate' imported but unused +import turicreate as _turicreate +^ +src/python/turicreate/toolkits/regression/boosted_trees_regression.py:17:1: F401 'turicreate.toolkits._main as _toolkits_main' imported but unused +import turicreate.toolkits._main as _toolkits_main +^ +src/python/turicreate/toolkits/regression/boosted_trees_regression.py:23:1: F401 'turicreate.toolkits._internal_utils._map_unity_proxy_to_object' imported but unused +from turicreate.toolkits._internal_utils import _map_unity_proxy_to_object +^ +src/python/turicreate/_scripts/_pylambda_worker.py:22:1: C901 'setup_environment' is too complex (16) +def setup_environment(info_log_function=None, error_log_function=None): +^ +src/python/turicreate/_scripts/_pylambda_worker.py:30:17: F841 local variable 'e' is assigned to but never used + except Exception as e: + ^ +src/python/turicreate/_scripts/_pylambda_worker.py:104:1: C901 'If 104' is too complex (20) +if __name__ == "__main__": +^ +src/python/turicreate/test/test_dataframe.py:32:36: F821 undefined name 'unicode' + expected["unicode"] = [unicode(i) for i in range(10)] + ^ +src/python/turicreate/meta/decompiler/__init__.py:18:1: F401 'struct' imported but unused +import struct +^ +src/python/turicreate/meta/decompiler/__init__.py:19:1: F401 'time' imported but unused +import time +^ +src/python/turicreate/test/test_text_classifier.py:87:9: F841 local variable 'ans' is assigned to but never used + ans = str(self.model) + ^ +src/python/turicreate/test/test_text_classifier.py:135:13: F841 local variable 'loaded_model' is assigned to but never used + loaded_model = tc.load_model(f) + ^ +src/python/turicreate/test/test_graph_analytics.py:17:1: F401 'turicreate.toolkits._main.ToolkitError' imported but unused +from turicreate.toolkits._main import ToolkitError +^ +src/python/turicreate/test/test_gl_pickler.py:13:1: F401 'sys' imported but unused +import sys +^ +src/python/turicreate/test/test_gl_pickler.py:22:1: F401 'os as _os' imported but unused +import os as _os +^ +src/python/turicreate/data_structures/sketch.py:174:5: C901 'Sketch.__repr__' is too complex (14) + def __repr__(self): + ^ +src/python/turicreate/meta/decompiler/control_flow_instructions.py:17:1: F401 '..asttools.visitors.print_visitor.print_ast' imported but unused +from ..asttools.visitors.print_visitor import print_ast +^ +src/python/turicreate/meta/decompiler/control_flow_instructions.py:663:9: F841 local variable 'loop_block_map' is assigned to but never used + loop_block_map = {instr.i: instr.op for instr in loop_block} + ^ +src/python/turicreate/meta/decompiler/control_flow_instructions.py:680:17: F841 local variable 'const_else' is assigned to but never used + const_else = False + ^ +src/python/turicreate/toolkits/_decision_tree.py:409:5: C901 'DecisionTree.get_prediction_path' is too complex (14) + def get_prediction_path(self, node_id, missing_id=[]): + ^ +src/python/turicreate/toolkits/recommender/util.py:25:1: C901 '_create' is too complex (16) +def _create( +^ +src/python/turicreate/toolkits/recommender/util.py:634:5: C901 '_Recommender._get_summary_struct' is too complex (12) + def _get_summary_struct(self): + ^ +src/python/turicreate/toolkits/recommender/util.py:1078:5: C901 '_Recommender.recommend' is too complex (30) + def recommend( + ^ +src/python/turicreate/toolkits/recommender/util.py:1365:5: C901 '_Recommender.recommend_from_interactions' is too complex (12) + def recommend_from_interactions( + ^ +src/python/turicreate/meta/decompiler/recompile.py:14:1: F401 'os' imported but unused +import os +^ +src/python/turicreate/meta/decompiler/recompile.py:28:1: F401 'py_compile.PyCompileError' imported but unused +from py_compile import PyCompileError, wr_long +^ +src/python/turicreate/meta/decompiler/recompile.py:28:1: F401 'py_compile.wr_long' imported but unused +from py_compile import PyCompileError, wr_long +^ +88 C901 'If 38' is too complex (22) +285 F401 'os.path.split' imported but unused +6 F402 import '_' from line 13 shadowed by loop variable +4 F403 'from opcode import *' used; unable to detect undefined names +15 F405 'accuracy_score' may be undefined, or defined from star imports: sklearn.metrics +11 F632 use ==/!= to compare str, bytes, and int literals +20 F811 redefinition of unused '_pytype_to_printf' from line 22 +29 F821 undefined name 'cmd' +200 F841 local variable 'proc' is assigned to but never used diff --git a/scripts/fix_headers.py b/scripts/fix_headers.py index a60d37c41c..5b49bb109a 100755 --- a/scripts/fix_headers.py +++ b/scripts/fix_headers.py @@ -8,80 +8,111 @@ import argparse from collections import defaultdict -parser = argparse.ArgumentParser(description= -""" +parser = argparse.ArgumentParser( + description=""" Script to fix invalid headers in the repository source files after file moves. Must be run from the root of the repository. -Examples: +Examples: To fix one header file in all source files. ./scripts/fix_headers.py -To fix all references to a collection of header files (note regex match): - -./scripts/fix_headers.py --all-match='src/]*/%s\>|#include <%s>|g' % (header_name, true_header), - r's|\#include[ ]*\<%s\>|#include <%s>|g' % (header_name, true_header), - r's|\#include[ ]*\"[^\"]*/%s\"|#include <%s>|g' % (header_name, true_header), - r's|\#include[ ]*\"[^\"]*/%s\"|#include <%s>|g' % (header_name, true_header), - r's|cdef extern from \"\<[^\>]*%s\>\"|cdef extern from "<%s>"|g' % (header_name, true_header), - r's|\#include_boost_|\#include \]*/%s\>|#include <%s>|g" % (header_name, true_header), + r"s|\#include[ ]*\<%s\>|#include <%s>|g" % (header_name, true_header), + r"s|\#include[ ]*\"[^\"]*/%s\"|#include <%s>|g" % (header_name, true_header), + r"s|\#include[ ]*\"[^\"]*/%s\"|#include <%s>|g" % (header_name, true_header), + r's|cdef extern from \"\<[^\>]*%s\>\"|cdef extern from "<%s>"|g' + % (header_name, true_header), + r"s|\#include_boost_|\#include \ 1: - error_out("Multiple matches for file ", filename, " found. Please disambiguate by providing part of the path.\n" - "Found: \n" - + "\n".join(header_files)) + error_out( + "Multiple matches for file ", + filename, + " found. Please disambiguate by providing part of the path.\n" + "Found: \n" + "\n".join(header_files), + ) new_file = header_files[0] - assert new_file.startswith( ("%s/" % header_root).replace("//", "/") ), new_file + assert new_file.startswith(("%s/" % header_root).replace("//", "/")), new_file - new_file = new_file[len(header_root) + 1:] + new_file = new_file[len(header_root) + 1 :] - repls.append( (filename, new_file) ) + repls.append((filename, new_file)) if len(repls) > 100: print("Fixing header locations for %d headers." % len(repls)) else: - print("Fixing header locations for headers: \n" + "\n".join(" %s -> %s" % (h, fl) for h, fl in repls)) + print( + "Fixing header locations for headers: \n" + + "\n".join(" %s -> %s" % (h, fl) for h, fl in repls) + ) shell_cmd = "\n".join( - "{2} || echo 'ERROR fixing {0}; ignoring.' && echo 'Fixed {0} (True = {1}). ' \n".format(header, new_file, fix_headers_cmd(header, new_file)) - for header, new_file in repls) + "{2} || echo 'ERROR fixing {0}; ignoring.' && echo 'Fixed {0} (True = {1}). ' \n".format( + header, new_file, fix_headers_cmd(header, new_file) + ) + for header, new_file in repls + ) open("run_all.out", "w").write(shell_cmd) import tempfile - with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp: + + with tempfile.NamedTemporaryFile(mode="w", delete=False) as temp: temp.write(shell_cmd) temp.flush() subprocess.check_call(["bash", temp.name]) @@ -134,31 +179,33 @@ def print_usage_and_exit(): print("Usage: %s [args...]" % sys.argv[0]) print("Commands: ") - print(" --fix-headers Fixes all import paths of a unique header.") + print( + " --fix-headers Fixes all import paths of a unique header." + ) sys.exit(1) + def all_headers_matching(match_regex): rhl = [h for h in raw_header_list if match_regex.match(h)] - - # Sometimes filenames may have multiple versions, which we need to handle. - # We can do this splitting the list into two components -- those that are - # unique, and those that need additional path information to distinguish + # Sometimes filenames may have multiple versions, which we need to handle. + # We can do this splitting the list into two components -- those that are + # unique, and those that need additional path information to distinguish # which one is being included. The latter we then expand out the directories # until all files are correctly included def separate_unique(raw_headers, n): lookup = defaultdict(lambda: []) - + for h in raw_headers: - fn = '/'.join(h.split('/')[-n:]) + fn = "/".join(h.split("/")[-n:]) lookup[fn].append(h) header_list = [] for fn, hl in lookup.items(): if len(hl) == 1: - header_list.append( (fn, hl[0]) ) + header_list.append((fn, hl[0])) else: # use recursion on this sublist to find distinguishing suffixes header_list += separate_unique(hl, n + 1) @@ -175,8 +222,8 @@ def separate_unique(raw_headers, n): return ret -if __name__ == "__main__": +if __name__ == "__main__": if args.fix_all: filter_regex = re.compile(args.fix_all) diff --git a/scripts/run_cpp_tests.py b/scripts/run_cpp_tests.py index 4644618a9e..2ae16d378c 100755 --- a/scripts/run_cpp_tests.py +++ b/scripts/run_cpp_tests.py @@ -10,175 +10,229 @@ import subprocess # The build image version that will be used for testing -SCRIPT_DIR=os.path.dirname(__file__) -WORKSPACE=os.path.abspath(os.path.join(SCRIPT_DIR, '..')) -TC_BUILD_IMAGE_COS6=subprocess.check_output(['bash', os.path.join(WORKSPACE, 'scripts', 'get_docker_image.sh'), '--centos=6']).strip() - -def run_in_docker(cmd, workdir='/build'): - if not(isinstance(cmd, list)): - cmd = [cmd] - subprocess.check_call(['docker', 'run', '--rm', '-m=4g', - '--mount', 'type=bind,source=' + WORKSPACE + ',target=/build,consistency=delegated', - '-w=%s' % workdir, - TC_BUILD_IMAGE_COS6] + cmd) - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='Run cxxtests (optionally caching those that have not been modified).') - - parser.add_argument('--cache', dest='cache', action='store_true') - parser.add_argument('--no-cache', dest='cache', action='store_false') - parser.set_defaults(cache=True) - - parser.add_argument('--schedule-random', dest='schedule_random', action='store_true') - parser.add_argument('--no-schedule-random', dest='schedule_random', action='store_false') - parser.set_defaults(schedule_random=True) - - parser.add_argument('--skip-expensive', default=False, action='store_true') - parser.add_argument('--only-expensive', default=False, action='store_true') - - parser.add_argument('--cache-file', type=str, default='cache_for_cxxtests.pkl', - help='Filename where the cache should be saved') - parser.add_argument('-j', type=int, default=4, - help='Number of processes to use for ctest command.') - parser.add_argument('--dry-run', action='store_true', - help='If present, the ctest command is printed rather than run.') - parser.add_argument('--docker', action='store_true', - help='Run the C++ tests inside of Docker on Centos 6.') - parser.add_argument('--configure-args', dest='configure_args', type=str, default='', help="Space-separated arguments to pass to the configure command.") - - args = parser.parse_args() - - if args.docker: - print('Docker run requested! Proceeding to run inside Docker.') - - # create docker images if needed - subprocess.check_call(['bash', os.path.join(WORKSPACE, 'scripts/create_docker_images.sh')]) - - # make tests if needed - run_in_docker(['bash', 'configure', '--no-python'] + [s.strip() for s in args.configure_args.split(' ') if s.strip()]) - run_in_docker(['bash', '-c', 'make -j%d 2>&1 | grep -v \'ld: warning: direct access\'' % args.j], '/build/debug/test') - - # run tests - # TODO pass through other arguments - run_in_docker(['python', '/build/scripts/run_cpp_tests.py', '-j%d' % args.j], '/build/debug/test') - - # exit if successful (if failed, it will have thrown above) - sys.exit(0) - - expensive_tests = [ - 'boosted_trees_classifier_tests.cxxtest', - 'worker_pool_test.cxxtest', - 'sframe_test.cxxtest', - 'shuffle_test.cxxtest', - 'sarray_file_format_v2_test.cxxtest', - 'parallel_sframe_iterator.cxxtest', - 'optimizations.cxxtest', - 'sorting_and_blocks.cxxtest', - 'test_brute_force_all_pairs.cxxtest', - 'gl_string.cxxtest', - 'process_launch_test.cxxtest', - ] - - # Get all cxxtests in directory - matches = [] - for root, dirnames, filenames in os.walk('.'): - for filename in fnmatch.filter(filenames, '*.cxxtest*'): - if args.skip_expensive and (filename in expensive_tests): - continue - if args.only_expensive and (filename not in expensive_tests): - continue - matches.append(os.path.join(root, filename)) - print('Found {} tests.'.format(len(matches))) - - if args.cache: - # Load the previous cache if it exists - if os.path.exists(args.cache_file): - cache = pickle.load(open(args.cache_file, 'rb')) +SCRIPT_DIR = os.path.dirname(__file__) +WORKSPACE = os.path.abspath(os.path.join(SCRIPT_DIR, "..")) +TC_BUILD_IMAGE_COS6 = subprocess.check_output( + ["bash", os.path.join(WORKSPACE, "scripts", "get_docker_image.sh"), "--centos=6"] +).strip() + + +def run_in_docker(cmd, workdir="/build"): + if not (isinstance(cmd, list)): + cmd = [cmd] + subprocess.check_call( + [ + "docker", + "run", + "--rm", + "-m=4g", + "--mount", + "type=bind,source=" + WORKSPACE + ",target=/build,consistency=delegated", + "-w=%s" % workdir, + TC_BUILD_IMAGE_COS6, + ] + + cmd + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Run cxxtests (optionally caching those that have not been modified)." + ) + + parser.add_argument("--cache", dest="cache", action="store_true") + parser.add_argument("--no-cache", dest="cache", action="store_false") + parser.set_defaults(cache=True) + + parser.add_argument( + "--schedule-random", dest="schedule_random", action="store_true" + ) + parser.add_argument( + "--no-schedule-random", dest="schedule_random", action="store_false" + ) + parser.set_defaults(schedule_random=True) + + parser.add_argument("--skip-expensive", default=False, action="store_true") + parser.add_argument("--only-expensive", default=False, action="store_true") + + parser.add_argument( + "--cache-file", + type=str, + default="cache_for_cxxtests.pkl", + help="Filename where the cache should be saved", + ) + parser.add_argument( + "-j", type=int, default=4, help="Number of processes to use for ctest command." + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="If present, the ctest command is printed rather than run.", + ) + parser.add_argument( + "--docker", + action="store_true", + help="Run the C++ tests inside of Docker on Centos 6.", + ) + parser.add_argument( + "--configure-args", + dest="configure_args", + type=str, + default="", + help="Space-separated arguments to pass to the configure command.", + ) + + args = parser.parse_args() + + if args.docker: + print("Docker run requested! Proceeding to run inside Docker.") + + # create docker images if needed + subprocess.check_call( + ["bash", os.path.join(WORKSPACE, "scripts/create_docker_images.sh")] + ) + + # make tests if needed + run_in_docker( + ["bash", "configure", "--no-python"] + + [s.strip() for s in args.configure_args.split(" ") if s.strip()] + ) + run_in_docker( + [ + "bash", + "-c", + "make -j%d 2>&1 | grep -v 'ld: warning: direct access'" % args.j, + ], + "/build/debug/test", + ) + + # run tests + # TODO pass through other arguments + run_in_docker( + ["python", "/build/scripts/run_cpp_tests.py", "-j%d" % args.j], + "/build/debug/test", + ) + + # exit if successful (if failed, it will have thrown above) + sys.exit(0) + + expensive_tests = [ + "boosted_trees_classifier_tests.cxxtest", + "worker_pool_test.cxxtest", + "sframe_test.cxxtest", + "shuffle_test.cxxtest", + "sarray_file_format_v2_test.cxxtest", + "parallel_sframe_iterator.cxxtest", + "optimizations.cxxtest", + "sorting_and_blocks.cxxtest", + "test_brute_force_all_pairs.cxxtest", + "gl_string.cxxtest", + "process_launch_test.cxxtest", + ] + + # Get all cxxtests in directory + matches = [] + for root, dirnames, filenames in os.walk("."): + for filename in fnmatch.filter(filenames, "*.cxxtest*"): + if args.skip_expensive and (filename in expensive_tests): + continue + if args.only_expensive and (filename not in expensive_tests): + continue + matches.append(os.path.join(root, filename)) + print("Found {} tests.".format(len(matches))) + + if args.cache: + # Load the previous cache if it exists + if os.path.exists(args.cache_file): + cache = pickle.load(open(args.cache_file, "rb")) + else: + cache = set() + + if type(cache) != set: + print("Invalid cache contents. Resetting cache") + cache = set() + + print("Found {} files in cache.".format(len(cache))) + + # Hash each test binary. + new_tests = {} + start_time = time.time() + for test_file in matches: + hasher = hashlib.md5() + with open(test_file, "rb") as afile: + buf = afile.read() + hasher.update(buf) + new_tests[test_file] = hasher.hexdigest() + elapsed = time.time() - start_time + print("Hashed {0} files in {1} seconds.".format(len(new_tests), elapsed)) + + # Make a list of tests whose hash does not appear in the cache + tests = [] + for test_file in new_tests.keys(): + if new_tests[test_file] not in cache: + tests.append(test_file) else: - cache = set() - - if type(cache) != set: - print("Invalid cache contents. Resetting cache") - cache = set() - - print('Found {} files in cache.'.format(len(cache))) - - # Hash each test binary. - new_tests = {} - start_time = time.time() - for test_file in matches: - hasher = hashlib.md5() - with open(test_file, 'rb') as afile: - buf = afile.read() - hasher.update(buf) - new_tests[test_file] = hasher.hexdigest() - elapsed = time.time() - start_time - print('Hashed {0} files in {1} seconds.'.format(len(new_tests), elapsed)) - - # Make a list of tests whose hash does not appear in the cache - tests = [] - for test_file in new_tests.keys(): - if new_tests[test_file] not in cache: - tests.append(test_file) - else: - tests = list(matches) - - print('Ready to test {} files.'.format(len(tests))) - - # If there are no tests, pick a random one. - if len(tests) == 0: - print("For annoying reasons, we cannot handle running 0 tests because jenkins will complain. So we are running the first test") - tests = [matches[0]] - - # Get basename and use .cxx rather than .cxxtest - runtests = [test.split('/')[-1].replace('cxxtest','cxx') for test in tests] - - # Make the command to run - cmd = [ - 'ctest', - '--output-on-failure', - '-j', - str(args.j), - ] - - if args.schedule_random: - cmd.append('--schedule-random') - - cmd += [ - '-R', - '({})'.format('|'.join(runtests)), - ] - - if args.dry_run: - print('Dry run requested! Proposed ctest command:', ' '.join(cmd)) - exit() - - exit_code = 0 - - print("Running: %s" % cmd) - ctest_process = subprocess.Popen(cmd, stdout=subprocess.PIPE) - - lines = [] - while True: - line = ctest_process.stdout.readline() - if len(line) == 0: - break - sys.stdout.write(line.decode()) - sys.stdout.flush() - lines.append(line) - - out, err = ctest_process.communicate() - - if args.cache: - # go through all the tests and see if we have a "Passed" line matching it - for i in range(len(tests)): - for line in lines: - if ('Passed' in line.decode()) and ((" " + runtests[i] + " ") in line.decode()): - # pass! - cache.add(new_tests[tests[i]]) - - # Save to cache - pickle.dump(cache, open(args.cache_file, "wb")) - - exit(ctest_process.returncode) + tests = list(matches) + + print("Ready to test {} files.".format(len(tests))) + + # If there are no tests, pick a random one. + if len(tests) == 0: + print( + "For annoying reasons, we cannot handle running 0 tests because jenkins will complain. So we are running the first test" + ) + tests = [matches[0]] + + # Get basename and use .cxx rather than .cxxtest + runtests = [test.split("/")[-1].replace("cxxtest", "cxx") for test in tests] + + # Make the command to run + cmd = [ + "ctest", + "--output-on-failure", + "-j", + str(args.j), + ] + + if args.schedule_random: + cmd.append("--schedule-random") + + cmd += [ + "-R", + "({})".format("|".join(runtests)), + ] + + if args.dry_run: + print("Dry run requested! Proposed ctest command:", " ".join(cmd)) + exit() + + exit_code = 0 + + print("Running: %s" % cmd) + ctest_process = subprocess.Popen(cmd, stdout=subprocess.PIPE) + + lines = [] + while True: + line = ctest_process.stdout.readline() + if len(line) == 0: + break + sys.stdout.write(line.decode()) + sys.stdout.flush() + lines.append(line) + + out, err = ctest_process.communicate() + + if args.cache: + # go through all the tests and see if we have a "Passed" line matching it + for i in range(len(tests)): + for line in lines: + if ("Passed" in line.decode()) and ( + (" " + runtests[i] + " ") in line.decode() + ): + # pass! + cache.add(new_tests[tests[i]]) + + # Save to cache + pickle.dump(cache, open(args.cache_file, "wb")) + + exit(ctest_process.returncode) diff --git a/src/python/doc/source/conf.py b/src/python/doc/source/conf.py index 77e980364c..3bff9aba62 100644 --- a/src/python/doc/source/conf.py +++ b/src/python/doc/source/conf.py @@ -20,71 +20,79 @@ # Import turicreate and several submodules so that sphinx can find them. # For example, it can now find turicreate.recommender.PopularityModel. import turicreate + for m in [ - 'activity_classifier', - 'boosted_trees_classifier', - 'boosted_trees_regression', - 'classifier', - 'connected_components', - 'dbscan', - 'degree_counting', - 'distances', - 'drawing_classifier', - 'evaluation', - 'extensions', - 'graph_coloring', - 'image_analysis', - 'kcore', - 'kmeans', - 'label_propagation', - 'linear_regression', - 'logistic_classifier', - 'load_sarray', - 'load_sframe', - 'load_sgraph', - 'load_model', - 'nearest_neighbors', - 'nearest_neighbor_classifier', - 'pagerank', - 'recommender', - 'random_forest_classifier', - 'random_forest_regression', - 'decision_tree_classifier', - 'decision_tree_regression', - 'regression', - 'shortest_path', - 'text_classifier', - 'sound_classifier', - 'svm_classifier', - 'style_transfer', - 'text_analytics', - 'object_detector', - 'image_classifier', - 'image_similarity', - 'topic_model', - 'triangle_counting', - 'visualization', - 'one_shot_object_detector' - ]: - module_name = 'turicreate.' + m + "activity_classifier", + "boosted_trees_classifier", + "boosted_trees_regression", + "classifier", + "connected_components", + "dbscan", + "degree_counting", + "distances", + "drawing_classifier", + "evaluation", + "extensions", + "graph_coloring", + "image_analysis", + "kcore", + "kmeans", + "label_propagation", + "linear_regression", + "logistic_classifier", + "load_sarray", + "load_sframe", + "load_sgraph", + "load_model", + "nearest_neighbors", + "nearest_neighbor_classifier", + "pagerank", + "recommender", + "random_forest_classifier", + "random_forest_regression", + "decision_tree_classifier", + "decision_tree_regression", + "regression", + "shortest_path", + "text_classifier", + "sound_classifier", + "svm_classifier", + "style_transfer", + "text_analytics", + "object_detector", + "image_classifier", + "image_similarity", + "topic_model", + "triangle_counting", + "visualization", + "one_shot_object_detector", +]: + module_name = "turicreate." + m sys.modules[module_name] = eval(module_name) # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -sys.path.insert(0, os.path.abspath('.')) -print 'CURRENT PATH', sys.path -#sys.path.insert(0, os.path.abspath('../venv/lib/python2.7/site-packages/')) +sys.path.insert(0, os.path.abspath(".")) +print "CURRENT PATH", sys.path +# sys.path.insert(0, os.path.abspath('../venv/lib/python2.7/site-packages/')) # -- General configuration ----------------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' +# needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ['sphinx.ext.autodoc', 'numpydoc', 'sphinx.ext.coverage', 'sphinx.ext.mathjax', - 'sphinx.ext.inheritance_diagram', 'sphinx.ext.autosummary', 'sphinx_turicreate_ext.autorun'] +extensions = [ + "sphinx.ext.autodoc", + "numpydoc", + "sphinx.ext.coverage", + "sphinx.ext.mathjax", + "sphinx.ext.inheritance_diagram", + "sphinx.ext.autosummary", + "sphinx_turicreate_ext.autorun", +] autosummary_generate = True @@ -92,20 +100,20 @@ mathjax_path = "https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML" # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix of source filenames. -source_suffix = '.rst' +source_suffix = ".rst" # The encoding of source files. -#source_encoding = 'utf-8-sig' +# source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' #'turicreate' +master_doc = "index" #'turicreate' # General information about the project. -project = u'Turi Create API' -copyright = u'2017 Apple Inc.' +project = u"Turi Create API" +copyright = u"2017 Apple Inc." # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -117,71 +125,79 @@ # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. -#language = None +# language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: -#today = '' +# today = '' # Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' +# today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ['*demo*', '*test*', 'test_*', '*cython*'] +exclude_patterns = ["*demo*", "*test*", "test_*", "*cython*"] numpydoc_show_class_members = False # The reST default role (used for this markup: `text`) to use for all documents. -#default_role = None +# default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True +# add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). -#add_module_names = True +# add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. -#show_authors = False +# show_authors = False # The name of the Pygments (syntax highlighting) style to use. -#pygments_style = 'sphinx' +# pygments_style = 'sphinx' # A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] +# modindex_common_prefix = [] # -- Customizations ------------------- -autodoc_default_flags = [] #'members'] - #'private-members', - #'special-members', - #'show-inheritance'] +autodoc_default_flags = [] #'members'] +#'private-members', +#'special-members', +#'show-inheritance'] + def autodoc_skip_member(app, what, name, obj, skip, options): - exclusions = ('__weakref__', # special-members - '__doc__', '__module__', '__dict__', # undoc-members - ) + exclusions = ( + "__weakref__", # special-members + "__doc__", + "__module__", + "__dict__", # undoc-members + ) exclude = name in exclusions return skip or exclude + def setup(app): - app.connect('autodoc-skip-member', autodoc_skip_member) + app.connect("autodoc-skip-member", autodoc_skip_member) + # -- Options for HTML output --------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. html_theme = "sphinx_rtd_theme" -html_theme_path = ["_themes", ] +html_theme_path = [ + "_themes", +] html_theme_options = { #'theme_globaltoc_depth': 3 } html_context = { - # The same as the default but without jquery (it is already loaded) - # https://github.com/sphinx-doc/sphinx/blob/795190950648277a39cbb13e51a2ae1bdd244cca/sphinx/builders/html.py#L91 - 'script_files': ['_static/underscore.js', '_static/doctools.js'] + # The same as the default but without jquery (it is already loaded) + # https://github.com/sphinx-doc/sphinx/blob/795190950648277a39cbb13e51a2ae1bdd244cca/sphinx/builders/html.py#L91 + "script_files": ["_static/underscore.js", "_static/doctools.js"] } @@ -193,54 +209,54 @@ def setup(app): # documentation. # Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] +# html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". -#html_title = None +# html_title = None # A shorter title for the navigation bar. Default is the same as html_title. html_short_title = "Turi Create" -org_url="https://github.com/apple/turicreate" +org_url = "https://github.com/apple/turicreate" # The name of an image file (relative to this directory) to place at the top # of the sidebar. -html_logo = '../images/tc_logo_white.png' +html_logo = "../images/tc_logo_white.png" # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -html_favicon = '../images/favicon.ico' +html_favicon = "../images/favicon.ico" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['../images'] +html_static_path = ["../images"] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -html_last_updated_fmt = '%b %d, %Y' +html_last_updated_fmt = "%b %d, %Y" # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -#html_use_smartypants = True +# html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -#html_sidebars = {'**': ['sidebartoc.html']} +# html_sidebars = {'**': ['sidebartoc.html']} # Additional templates that should be rendered to pages, maps page names to # template names. -#html_additional_pages = {} +# html_additional_pages = {} # If false, no module index is generated. -#html_domain_indices = True +# html_domain_indices = True # If false, no index is generated. -#html_use_index = True +# html_use_index = True # If true, the index is split into individual pages for each letter. -#html_split_index = False +# html_split_index = False # If true, links to the reST sources are added to the pages. html_show_sourcelink = False @@ -254,66 +270,60 @@ def setup(app): # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. -#html_use_opensearch = '' +# html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). # html_file_suffix = '.html' # Output file base name for HTML help builder. -htmlhelp_basename = 'TuriCreatedoc' +htmlhelp_basename = "TuriCreatedoc" # -- Options for LaTeX output -------------------------------------------------- latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. -#'preamble': '', + # The paper size ('letterpaper' or 'a4paper'). + #'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + #'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + #'preamble': '', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). latex_documents = [ - ('index', 'TuriCreate.tex', u'Turi Create Documentation', - u'Turi', 'manual'), + ("index", "TuriCreate.tex", u"Turi Create Documentation", u"Turi", "manual"), ] # The name of an image file (relative to this directory) to place at the top of # the title page. -#latex_logo = None +# latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. -#latex_use_parts = False +# latex_use_parts = False # If true, show page references after internal links. -#latex_show_pagerefs = False +# latex_show_pagerefs = False # If true, show URL addresses after external links. -#latex_show_urls = False +# latex_show_urls = False # Documents to append as an appendix to all manuals. -#latex_appendices = [] +# latex_appendices = [] # If false, no module index is generated. -#latex_domain_indices = True +# latex_domain_indices = True # -- Options for manual page output -------------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - ('index', 'turicreate', u'Turi Create Documentation', - [u'Turi'], 1) -] +man_pages = [("index", "turicreate", u"Turi Create Documentation", [u"Turi"], 1)] # If true, show URL addresses after external links. -#man_show_urls = False +# man_show_urls = False # -- Options for Texinfo output ------------------------------------------------ @@ -322,16 +332,22 @@ def setup(app): # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'TuriCreate', u'Turi Create Documentation', - u'Turi', 'TuriCreate', 'One line description of project.', - 'Miscellaneous'), + ( + "index", + "TuriCreate", + u"Turi Create Documentation", + u"Turi", + "TuriCreate", + "One line description of project.", + "Miscellaneous", + ), ] # Documents to append as an appendix to all manuals. -#texinfo_appendices = [] +# texinfo_appendices = [] # If false, no module index is generated. -#texinfo_domain_indices = True +# texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' +# texinfo_show_urls = 'footnote' diff --git a/src/python/doc/source/sphinx_turicreate_ext/autorun.py b/src/python/doc/source/sphinx_turicreate_ext/autorun.py index 500f2f0c0f..5b54016180 100644 --- a/src/python/doc/source/sphinx_turicreate_ext/autorun.py +++ b/src/python/doc/source/sphinx_turicreate_ext/autorun.py @@ -13,7 +13,7 @@ """ import os -from subprocess import Popen,PIPE +from subprocess import Popen, PIPE from docutils import nodes from sphinx.util.compat import Directive @@ -22,81 +22,79 @@ class RunBlockError(SphinxError): - category = 'runblock error' + category = "runblock error" + class AutoRun(object): here = os.path.abspath(__file__) - pycon = os.path.join(os.path.dirname(here),'pycon.py') + pycon = os.path.join(os.path.dirname(here), "pycon.py") config = dict( - pycon = 'python ' + pycon, - pycon_prefix_chars = 4, - pycon_show_source = False, - console = 'bash', - console_prefix_chars = 1 , + pycon="python " + pycon, + pycon_prefix_chars=4, + pycon_show_source=False, + console="bash", + console_prefix_chars=1, ) + @classmethod - def builder_init(cls,app): + def builder_init(cls, app): cls.config.update(app.builder.config.autorun_languages) - class RunBlock(Directive): has_content = True required_arguments = 1 optional_arguments = 0 final_argument_whitespace = False option_spec = { - 'linenos': directives.flag, + "linenos": directives.flag, } - def run(self): config = AutoRun.config language = self.arguments[0] if language not in config: - raise RunBlockError('Unknown language %s' % language) - + raise RunBlockError("Unknown language %s" % language) # Get configuration values for the language args = config[language].split() - input_encoding = config.get(language+'_input_encoding','ascii') - output_encoding = config.get(language+'_output_encoding','ascii') - prefix_chars = config.get(language+'_prefix_chars',0) - show_source = config.get(language+'_show_source',True) - + input_encoding = config.get(language + "_input_encoding", "ascii") + output_encoding = config.get(language + "_output_encoding", "ascii") + prefix_chars = config.get(language + "_prefix_chars", 0) + show_source = config.get(language + "_show_source", True) # Build the code text - proc = Popen(args,bufsize=1,stdin=PIPE,stdout=PIPE,stderr=PIPE) + proc = Popen(args, bufsize=1, stdin=PIPE, stdout=PIPE, stderr=PIPE) codelines = (line[prefix_chars:] for line in self.content) - code = u'\n'.join(codelines).encode(input_encoding) + code = u"\n".join(codelines).encode(input_encoding) # Run the code - stdout,stderr = proc.communicate(code) + stdout, stderr = proc.communicate(code) # Process output if stdout: - out = ''.join(stdout).decode(output_encoding) + out = "".join(stdout).decode(output_encoding) else: - out = ''.join(stderr).decode(output_encoding) + out = "".join(stderr).decode(output_encoding) # Get the original code with prefixes if show_source: - code = u'\n'.join(self.content) + code = u"\n".join(self.content) else: - code = '' - code_out = u'\n'.join((code, out)) + code = "" + code_out = u"\n".join((code, out)) literal = nodes.literal_block(code_out, code_out) - literal['language'] = language - literal['linenos'] = 'linenos' in self.options + literal["language"] = language + literal["linenos"] = "linenos" in self.options return [literal] - def setup(app): - app.add_directive('runblock', RunBlock) - app.connect('builder-inited',AutoRun.builder_init) - app.add_config_value('autorun_languages', AutoRun.config, 'env') + app.add_directive("runblock", RunBlock) + app.connect("builder-inited", AutoRun.builder_init) + app.add_config_value("autorun_languages", AutoRun.config, "env") + # vim: set expandtab shiftwidth=4 softtabstop=4 : diff --git a/src/python/doc/source/sphinx_turicreate_ext/pycon.py b/src/python/doc/source/sphinx_turicreate_ext/pycon.py index f71bc0fe58..713f6a3210 100644 --- a/src/python/doc/source/sphinx_turicreate_ext/pycon.py +++ b/src/python/doc/source/sphinx_turicreate_ext/pycon.py @@ -13,25 +13,24 @@ def main(): """ source_lines = (line.rstrip() for line in sys.stdin) console = InteractiveInterpreter() - console.runsource('import turicreate') - source = '' + console.runsource("import turicreate") + source = "" try: while True: source = source_lines.next() more = console.runsource(source) while more: next_line = source_lines.next() - print '...', next_line - source += '\n' + next_line + print "...", next_line + source += "\n" + next_line more = console.runsource(source) except StopIteration: if more: - print '... ' - more = console.runsource(source + '\n') + print "... " + more = console.runsource(source + "\n") - -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/python/setup.py b/src/python/setup.py index d85f82bba9..657b3a8975 100644 --- a/src/python/setup.py +++ b/src/python/setup.py @@ -12,14 +12,15 @@ from setuptools.dist import Distribution from setuptools.command.install import install -PACKAGE_NAME="turicreate" -VERSION='6.0'#{{VERSION_STRING}} +PACKAGE_NAME = "turicreate" +VERSION = "6.0" # {{VERSION_STRING}} # Prevent distutils from thinking we are a pure python package class BinaryDistribution(Distribution): def is_pure(self): return False + class InstallEngine(install): """Helper class to hook the python setup.py install path to download client libraries and engine""" @@ -31,9 +32,12 @@ def run(self): # Check correct version of architecture (64-bit only) arch = platform.architecture()[0] - if arch != '64bit': - msg = ("Turi Create currently supports only 64-bit operating systems, and only recent Linux/OSX " + - "architectures. Please install using a supported version. Your architecture is currently: %s" % arch) + if arch != "64bit": + msg = ( + "Turi Create currently supports only 64-bit operating systems, and only recent Linux/OSX " + + "architectures. Please install using a supported version. Your architecture is currently: %s" + % arch + ) sys.stderr.write(msg) sys.exit(1) @@ -41,40 +45,46 @@ def run(self): # if OSX, verify >= 10.8 from distutils.util import get_platform from pkg_resources import parse_version + cur_platform = get_platform() if cur_platform.startswith("macosx"): mac_ver = platform.mac_ver()[0] - if parse_version(mac_ver) < parse_version('10.8.0'): + if parse_version(mac_ver) < parse_version("10.8.0"): msg = ( - "Turi Create currently does not support versions of OSX prior to 10.8. Please upgrade your Mac OSX " - "installation to a supported version. Your current OSX version is: %s" % mac_ver) + "Turi Create currently does not support versions of OSX prior to 10.8. Please upgrade your Mac OSX " + "installation to a supported version. Your current OSX version is: %s" + % mac_ver + ) sys.stderr.write(msg) sys.exit(1) - elif cur_platform.startswith('linux'): + elif cur_platform.startswith("linux"): pass - elif cur_platform.startswith('win'): + elif cur_platform.startswith("win"): win_ver = platform.version() # Verify this is Vista or above - if parse_version(win_ver) < parse_version('6.0'): + if parse_version(win_ver) < parse_version("6.0"): msg = ( - "Turi Create currently does not support versions of Windows" - " prior to Vista, or versions of Windows Server prior to 2008." - "Your current version of Windows is: %s" % platform.release()) + "Turi Create currently does not support versions of Windows" + " prior to Vista, or versions of Windows Server prior to 2008." + "Your current version of Windows is: %s" % platform.release() + ) sys.stderr.write(msg) sys.exit(1) else: msg = ( - "Unsupported Platform: '%s'. Turi Create is only supported on Windows, Mac OSX, and Linux." % cur_platform + "Unsupported Platform: '%s'. Turi Create is only supported on Windows, Mac OSX, and Linux." + % cur_platform ) sys.stderr.write(msg) sys.exit(1) -if __name__ == '__main__': +if __name__ == "__main__": from distutils.util import get_platform - classifiers=[ + + classifiers = [ "Development Status :: 5 - Production/Stable", "Environment :: Console", "Intended Audience :: Developers", @@ -95,21 +105,24 @@ def run(self): cur_platform = get_platform() if cur_platform.startswith("macosx"): classifiers.append("Operating System :: MacOS :: MacOS X") - elif cur_platform.startswith('linux'): - classifiers += ["Operating System :: POSIX :: Linux", - "Operating System :: POSIX :: BSD", - "Operating System :: Unix"] - elif cur_platform.startswith('win'): + elif cur_platform.startswith("linux"): + classifiers += [ + "Operating System :: POSIX :: Linux", + "Operating System :: POSIX :: BSD", + "Operating System :: Unix", + ] + elif cur_platform.startswith("win"): classifiers += ["Operating System :: Microsoft :: Windows"] else: msg = ( - "Unsupported Platform: '%s'. Turi Create is only supported on Windows, Mac OSX, and Linux." % cur_platform - ) + "Unsupported Platform: '%s'. Turi Create is only supported on Windows, Mac OSX, and Linux." + % cur_platform + ) sys.stderr.write(msg) sys.exit(1) - with open(os.path.join(os.path.dirname(__file__), 'README.rst'), 'rb') as f: - long_description = f.read().decode('utf-8') + with open(os.path.join(os.path.dirname(__file__), "README.rst"), "rb") as f: + long_description = f.read().decode("utf-8") install_requires = [ "coremltools==3.1", @@ -122,63 +135,59 @@ def run(self): "requests >= 2.9.1", "scipy >= 1.1.0", "six >= 1.10.0", - "tensorflow >= 2.0.0" + "tensorflow >= 2.0.0", ] setup( name="turicreate", version=VERSION, - # This distribution contains platform-specific C++ libraries, but they are not # built with distutils. So we must create a dummy Extension object so when we # create a binary file it knows to make it platform-specific. - ext_modules=[Extension('turicreate.__dummy', sources = ['dummy.c'])], - - author='Apple Inc.', - author_email='turi-create@group.apple.com', + ext_modules=[Extension("turicreate.__dummy", sources=["dummy.c"])], + author="Apple Inc.", + author_email="turi-create@group.apple.com", cmdclass=dict(install=InstallEngine), distclass=BinaryDistribution, package_data={ - 'turicreate': [ - '_cython/*.so', '_cython/*.pyd', - '*.so', '*.dylib', 'toolkits/*.so', - + "turicreate": [ + "_cython/*.so", + "_cython/*.pyd", + "*.so", + "*.dylib", + "toolkits/*.so", # macOS visualization - 'Turi Create Visualization.app/Contents/*', - 'Turi Create Visualization.app/Contents/_CodeSignature/*', - 'Turi Create Visualization.app/Contents/MacOS/*', - 'Turi Create Visualization.app/Contents/Resources/*', - 'Turi Create Visualization.app/Contents/Resources/Base.lproj/*', - 'Turi Create Visualization.app/Contents/Resources/Base.lproj/Main.storyboardc/*', - 'Turi Create Visualization.app/Contents/Resources/build/*', - 'Turi Create Visualization.app/Contents/Resources/build/static/*', - 'Turi Create Visualization.app/Contents/Resources/build/static/css/*', - 'Turi Create Visualization.app/Contents/Resources/build/static/js/*', - 'Turi Create Visualization.app/Contents/Resources/build/static/media/*', - 'Turi Create Visualization.app/Contents/Frameworks/*', - + "Turi Create Visualization.app/Contents/*", + "Turi Create Visualization.app/Contents/_CodeSignature/*", + "Turi Create Visualization.app/Contents/MacOS/*", + "Turi Create Visualization.app/Contents/Resources/*", + "Turi Create Visualization.app/Contents/Resources/Base.lproj/*", + "Turi Create Visualization.app/Contents/Resources/Base.lproj/Main.storyboardc/*", + "Turi Create Visualization.app/Contents/Resources/build/*", + "Turi Create Visualization.app/Contents/Resources/build/static/*", + "Turi Create Visualization.app/Contents/Resources/build/static/css/*", + "Turi Create Visualization.app/Contents/Resources/build/static/js/*", + "Turi Create Visualization.app/Contents/Resources/build/static/media/*", + "Turi Create Visualization.app/Contents/Frameworks/*", # Linux visualization - 'Turi Create Visualization/*.*', - 'Turi Create Visualization/visualization_client', - 'Turi Create Visualization/swiftshader/*', - 'Turi Create Visualization/locales/*', - 'Turi Create Visualization/html/*.*', - 'Turi Create Visualization/html/static/js/*', - 'Turi Create Visualization/html/static/css/*', - + "Turi Create Visualization/*.*", + "Turi Create Visualization/visualization_client", + "Turi Create Visualization/swiftshader/*", + "Turi Create Visualization/locales/*", + "Turi Create Visualization/html/*.*", + "Turi Create Visualization/html/static/js/*", + "Turi Create Visualization/html/static/css/*", # Plot.save dependencies - 'visualization/vega_3.2.1.js', - 'visualization/vg2png', - 'visualization/vg2svg' + "visualization/vega_3.2.1.js", + "visualization/vg2png", + "visualization/vg2svg", ] }, - packages=find_packages( - exclude=["test"] - ), - url='https://github.com/apple/turicreate', - license='LICENSE.txt', - description='Turi Create simplifies the development of custom machine learning models.', + packages=find_packages(exclude=["test"]), + url="https://github.com/apple/turicreate", + license="LICENSE.txt", + description="Turi Create simplifies the development of custom machine learning models.", long_description=long_description, classifiers=classifiers, - install_requires=install_requires + install_requires=install_requires, ) diff --git a/src/python/turicreate/__init__.py b/src/python/turicreate/__init__.py index 26be0f0677..d7182ab58e 100644 --- a/src/python/turicreate/__init__.py +++ b/src/python/turicreate/__init__.py @@ -3,17 +3,17 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" @package turicreate ... Turi Create is a machine learning platform that enables data scientists and app developers to easily create intelligent applications at scale. -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ -__version__ = '{{VERSION_STRING}}' +__version__ = "{{VERSION_STRING}}" from turicreate.version_info import __version__ from turicreate.data_structures.sgraph import Vertex, Edge @@ -100,20 +100,24 @@ # rewrite the extensions module class _extensions_wrapper(object): - def __init__(self, wrapped): - self._wrapped = wrapped - self.__doc__ = wrapped.__doc__ - - def __getattr__(self, name): - try: + def __init__(self, wrapped): + self._wrapped = wrapped + self.__doc__ = wrapped.__doc__ + + def __getattr__(self, name): + try: + return getattr(self._wrapped, name) + except: + pass + turicreate._connect.main.get_unity() return getattr(self._wrapped, name) - except: - pass - turicreate._connect.main.get_unity() - return getattr(self._wrapped, name) + import sys as _sys -_sys.modules["turicreate.extensions"] = _extensions_wrapper(_sys.modules["turicreate.extensions"]) + +_sys.modules["turicreate.extensions"] = _extensions_wrapper( + _sys.modules["turicreate.extensions"] +) # rewrite the import extensions = _sys.modules["turicreate.extensions"] diff --git a/src/python/turicreate/_connect/__init__.py b/src/python/turicreate/_connect/__init__.py index a224b5d589..7167512633 100644 --- a/src/python/turicreate/_connect/__init__.py +++ b/src/python/turicreate/_connect/__init__.py @@ -4,10 +4,10 @@ # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause # of the BSD license. See the LICENSE file for details. -''' +""" This module defines classes and global functions for creating and managing connection to the turicreate backend server. -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ diff --git a/src/python/turicreate/_connect/main.py b/src/python/turicreate/_connect/main.py index 00f21ecc6e..c915633c88 100644 --- a/src/python/turicreate/_connect/main.py +++ b/src/python/turicreate/_connect/main.py @@ -3,9 +3,9 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" This module contains the main logic for start, query, stop turicreate server client connection. -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -51,6 +51,7 @@ def launch(server_log=None): def get_server(): return __SERVER__ + def get_unity(): """ Returns the unity global object of the current connection. diff --git a/src/python/turicreate/_cython/context.py b/src/python/turicreate/_cython/context.py index d1e77e2757..aa1de5002a 100755 --- a/src/python/turicreate/_cython/context.py +++ b/src/python/turicreate/_cython/context.py @@ -3,10 +3,10 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Provides utility context managers related to executing cython functions -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -21,8 +21,8 @@ class debug_trace(object): """ def __init__(self): - self.show_cython_trace = 'TURI_BUILD_ROOT' in os.environ - self.show_server_log = 'TURI_VERBOSE' in os.environ + self.show_cython_trace = "TURI_BUILD_ROOT" in os.environ + self.show_server_log = "TURI_VERBOSE" in os.environ def __enter__(self): pass @@ -30,12 +30,13 @@ def __enter__(self): def _fetch_unity_log(self): try: from .._connect.main import get_server + logfile = get_server().unity_log - logcontent = "\n=======================================" + logcontent = "\n=======================================" logcontent += "\n unity server log location: " + logfile logcontent += "\n=======================================\n" logcontent += "\nLast 100 lines:\n" - with open(logfile, 'r') as f: + with open(logfile, "r") as f: logcontent += "".join(f.readlines()[-100:]) logcontent += "\n========= End of server log ===========\n" return logcontent diff --git a/src/python/turicreate/_cython/python_printer_callback.py b/src/python/turicreate/_cython/python_printer_callback.py index 4b89c98756..4d2655e5c7 100755 --- a/src/python/turicreate/_cython/python_printer_callback.py +++ b/src/python/turicreate/_cython/python_printer_callback.py @@ -7,13 +7,16 @@ from __future__ import division as _ from __future__ import absolute_import as _ import sys + try: import IPython from IPython.core.interactiveshell import InteractiveShell + have_ipython = True except ImportError: have_ipython = False + def print_callback(val): """ Internal function. @@ -28,7 +31,9 @@ def print_callback(val): # I have to intrude rather deep into IPython to make it behave if have_ipython: if InteractiveShell.initialized(): - IPython.display.publish_display_data({'text/plain':val,'text/html':'
' + val + '
'}) + IPython.display.publish_display_data( + {"text/plain": val, "text/html": "
" + val + "
"} + ) success = True except: pass diff --git a/src/python/turicreate/_deps/__init__.py b/src/python/turicreate/_deps/__init__.py index 7e7557616d..bef1159ae8 100644 --- a/src/python/turicreate/_deps/__init__.py +++ b/src/python/turicreate/_deps/__init__.py @@ -13,55 +13,72 @@ def __get_version(version): # matching 1.6.1, and 1.6.1rc, 1.6.1.dev - version_regex = '^\d+\.\d+\.\d+' + version_regex = "^\d+\.\d+\.\d+" version = _re.search(version_regex, str(version)).group(0) return _StrictVersion(version) HAS_PANDAS = True -PANDAS_MIN_VERSION = '0.13.0' +PANDAS_MIN_VERSION = "0.13.0" try: import pandas + if __get_version(pandas.__version__) < _StrictVersion(PANDAS_MIN_VERSION): HAS_PANDAS = False - _logging.warn(('Pandas version %s is not supported. Minimum required version: %s. ' - 'Pandas support will be disabled.') - % (pandas.__version__, PANDAS_MIN_VERSION) ) + _logging.warn( + ( + "Pandas version %s is not supported. Minimum required version: %s. " + "Pandas support will be disabled." + ) + % (pandas.__version__, PANDAS_MIN_VERSION) + ) except: HAS_PANDAS = False from . import pandas_mock as pandas HAS_NUMPY = True -NUMPY_MIN_VERSION = '1.8.0' +NUMPY_MIN_VERSION = "1.8.0" try: import numpy if __get_version(numpy.__version__) < _StrictVersion(NUMPY_MIN_VERSION): HAS_NUMPY = False - _logging.warn(('Numpy version %s is not supported. Minimum required version: %s. ' - 'Numpy support will be disabled.') - % (numpy.__version__, NUMPY_MIN_VERSION) ) + _logging.warn( + ( + "Numpy version %s is not supported. Minimum required version: %s. " + "Numpy support will be disabled." + ) + % (numpy.__version__, NUMPY_MIN_VERSION) + ) except: HAS_NUMPY = False from . import numpy_mock as numpy HAS_SKLEARN = True -SKLEARN_MIN_VERSION = '0.15' +SKLEARN_MIN_VERSION = "0.15" + + def __get_sklearn_version(version): # matching 0.15b, 0.16bf, etc - version_regex = '^\d+\.\d+' + version_regex = "^\d+\.\d+" version = _re.search(version_regex, str(version)).group(0) return _StrictVersion(version) + try: import sklearn + if __get_sklearn_version(sklearn.__version__) < _StrictVersion(SKLEARN_MIN_VERSION): HAS_SKLEARN = False - _logging.warn(('sklearn version %s is not supported. Minimum required version: %s. ' - 'sklearn support will be disabled.') - % (sklearn.__version__, SKLEARN_MIN_VERSION) ) + _logging.warn( + ( + "sklearn version %s is not supported. Minimum required version: %s. " + "sklearn support will be disabled." + ) + % (sklearn.__version__, SKLEARN_MIN_VERSION) + ) except: HAS_SKLEARN = False from . import sklearn_mock as sklearn diff --git a/src/python/turicreate/_deps/numpy_mock.py b/src/python/turicreate/_deps/numpy_mock.py index da407e0394..19e1cf6ea8 100644 --- a/src/python/turicreate/_deps/numpy_mock.py +++ b/src/python/turicreate/_deps/numpy_mock.py @@ -3,12 +3,12 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Dummy mocking module for numpy. When numpy is not available we will import this module as turicreate._deps.numpy, and set HAS_NUMPY to false. All methods that access numpy should check the HAS_NUMPY flag, therefore, attributes/class/methods in this module should never be actually used. -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ diff --git a/src/python/turicreate/_gl_pickle.py b/src/python/turicreate/_gl_pickle.py index 27d46f1dfe..f24b616b5c 100644 --- a/src/python/turicreate/_gl_pickle.py +++ b/src/python/turicreate/_gl_pickle.py @@ -6,11 +6,16 @@ from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ -from . import util as _util, toolkits as _toolkits, SFrame as _SFrame, SArray as _SArray, \ - SGraph as _SGraph, load_sgraph as _load_graph +from . import ( + util as _util, + toolkits as _toolkits, + SFrame as _SFrame, + SArray as _SArray, + SGraph as _SGraph, + load_sgraph as _load_graph, +) -from .util import _get_aws_credentials as _util_get_aws_credentials, \ - _cloudpickle +from .util import _get_aws_credentials as _util_get_aws_credentials, _cloudpickle import pickle as _pickle import uuid as _uuid @@ -20,15 +25,19 @@ import atexit as _atexit import glob as _glob + def _get_aws_credentials(): (key, secret) = _util_get_aws_credentials() - return {'aws_access_key_id': key, 'aws_secret_access_key': secret} + return {"aws_access_key_id": key, "aws_secret_access_key": secret} + def _get_temp_filename(): - return _util._make_temp_filename(prefix='gl_pickle_') + return _util._make_temp_filename(prefix="gl_pickle_") + def _get_tmp_file_location(): - return _util._make_temp_directory(prefix='gl_pickle_') + return _util._make_temp_directory(prefix="gl_pickle_") + def _is_not_pickle_safe_gl_model_class(obj_class): """ @@ -49,6 +58,7 @@ def _is_not_pickle_safe_gl_model_class(obj_class): return not obj_class._is_gl_pickle_safe() return False + def _is_not_pickle_safe_gl_class(obj_class): """ Check if class is a Turi create model. @@ -70,6 +80,7 @@ class and verifies that _Model is the base class. # Object is GLC-DS or GLC-Model return (obj_class in gl_ds) or _is_not_pickle_safe_gl_model_class(obj_class) + def _get_gl_class_type(obj_class): """ Internal util to get the type of the GLC class. The pickle file stores @@ -96,6 +107,7 @@ def _get_gl_class_type(obj_class): else: return None + def _get_gl_object_from_persistent_id(type_tag, gl_archive_abs_path): """ Internal util to get a GLC object from a persistent ID in the pickle file. @@ -120,14 +132,17 @@ def _get_gl_object_from_persistent_id(type_tag, gl_archive_abs_path): obj = _SArray(gl_archive_abs_path) elif type_tag == "Model": from . import load_model as _load_model + obj = _load_model(gl_archive_abs_path) else: - raise _pickle.UnpicklingError("Turi pickling Error: Unsupported object." - " Only SFrames, SGraphs, SArrays, and Models are supported.") + raise _pickle.UnpicklingError( + "Turi pickling Error: Unsupported object." + " Only SFrames, SGraphs, SArrays, and Models are supported." + ) return obj -class GLPickler(_cloudpickle.CloudPickler): +class GLPickler(_cloudpickle.CloudPickler): def _to_abs_path_set(self, l): return set([_os.path.abspath(x) for x in l]) @@ -211,7 +226,8 @@ def _to_abs_path_set(self, l): """ - def __init__(self, filename, protocol = -1, min_bytes_to_save = 0): + + def __init__(self, filename, protocol=-1, min_bytes_to_save=0): """ Construct a GLC pickler. @@ -245,40 +261,43 @@ def __init__(self, filename, protocol = -1, min_bytes_to_save = 0): self.mark_for_delete = set() # Make sure the directory exists. - filename = _os.path.abspath( - _os.path.expanduser( - _os.path.expandvars(filename))) + filename = _os.path.abspath(_os.path.expanduser(_os.path.expandvars(filename))) if not _os.path.exists(filename): _os.makedirs(filename) elif _os.path.isdir(filename): self.mark_for_delete = self._to_abs_path_set( - _glob.glob(_os.path.join(filename, "*"))) + _glob.glob(_os.path.join(filename, "*")) + ) self.mark_for_delete -= self._to_abs_path_set( - [_os.path.join(filename, 'pickle_archive'), - _os.path.join(filename, 'version')]) + [ + _os.path.join(filename, "pickle_archive"), + _os.path.join(filename, "version"), + ] + ) elif _os.path.isfile(filename): - _os.remove(filename) - _os.makedirs(filename) + _os.remove(filename) + _os.makedirs(filename) # Create a new directory. self.gl_temp_storage_path = filename # The pickle file where all the Python objects are saved. relative_pickle_filename = "pickle_archive" - pickle_filename = _os.path.join(self.gl_temp_storage_path, - relative_pickle_filename) + pickle_filename = _os.path.join( + self.gl_temp_storage_path, relative_pickle_filename + ) try: # Initialize the pickle file with cloud _pickle. Note, cloud pickle # takes a file handle for initialization. - self.file = open(pickle_filename, 'wb') + self.file = open(pickle_filename, "wb") _cloudpickle.CloudPickler.__init__(self, self.file, protocol) except IOError as err: print("Turi create pickling error: %s" % err) # Write the version number. - with open(_os.path.join(self.gl_temp_storage_path, 'version'), 'w') as f: + with open(_os.path.join(self.gl_temp_storage_path, "version"), "w") as f: f.write("1.0") def dump(self, obj): @@ -322,13 +341,13 @@ def persistent_id(self, obj): """ # Get the class of the object (if it can be done) - obj_class = None if not hasattr(obj, '__class__') else obj.__class__ + obj_class = None if not hasattr(obj, "__class__") else obj.__class__ if obj_class is None: return None # If the object is a GLC class. if _is_not_pickle_safe_gl_class(obj_class): - if (id(obj) in self.gl_object_memo): + if id(obj) in self.gl_object_memo: # has already been pickled return (None, None, id(obj)) else: @@ -368,7 +387,7 @@ def close(self): def register_error(*args): error[0] = True - _shutil.rmtree(f, onerror = register_error) + _shutil.rmtree(f, onerror=register_error) if error[0]: _atexit.register(_shutil.rmtree, f, ignore_errors=True) @@ -376,6 +395,7 @@ def register_error(*args): def __del__(self): self.close() + class GLUnpickler(_pickle.Unpickler): """ # GLC unpickler works with a GLC pickler archive or a regular pickle @@ -418,11 +438,9 @@ def __init__(self, filename): # GLC 1.3 used Zipfiles for storing the objects. self.directory_mode = True - filename = _os.path.abspath( - _os.path.expanduser( - _os.path.expandvars(filename))) + filename = _os.path.abspath(_os.path.expanduser(_os.path.expandvars(filename))) if not _os.path.exists(filename): - raise IOError('%s is not a valid file name.' % filename) + raise IOError("%s is not a valid file name." % filename) # GLC 1.3 Pickle file if _zipfile.is_zipfile(filename): @@ -432,12 +450,16 @@ def __init__(self, filename): # Get the pickle file name. zf = _zipfile.ZipFile(filename, allowZip64=True) for info in zf.infolist(): - if info.filename == 'pickle_file': + if info.filename == "pickle_file": pickle_filename = zf.read(info.filename).decode() if pickle_filename is None: - raise IOError(("Cannot pickle file of the given format. File" + raise IOError( + ( + "Cannot pickle file of the given format. File" " must be one of (a) GLPickler archive, " - "(b) Cloudpickle archive, or (c) python pickle archive.")) + "(b) Cloudpickle archive, or (c) python pickle archive." + ) + ) # Extract the zip file. try: @@ -446,15 +468,18 @@ def __init__(self, filename): except IOError as err: print("Turi pickle extraction error: %s " % err) - self.pickle_filename = _os.path.join(self.gl_temp_storage_path, - pickle_filename) + self.pickle_filename = _os.path.join( + self.gl_temp_storage_path, pickle_filename + ) # GLC Pickle directory mode. elif _os.path.isdir(filename): self.directory_mode = True pickle_filename = _os.path.join(filename, "pickle_archive") if not _os.path.exists(pickle_filename): - raise IOError("Corrupted archive: Missing pickle file %s." % pickle_filename) + raise IOError( + "Corrupted archive: Missing pickle file %s." % pickle_filename + ) if not _os.path.exists(_os.path.join(filename, "version")): raise IOError("Corrupted archive: Missing version file.") self.pickle_filename = pickle_filename @@ -465,10 +490,9 @@ def __init__(self, filename): self.directory_mode = False self.pickle_filename = filename - self.file = open(self.pickle_filename, 'rb') + self.file = open(self.pickle_filename, "rb") _pickle.Unpickler.__init__(self, self.file) - def persistent_load(self, pid): """ Reconstruct a GLC object using the persistent ID. @@ -487,7 +511,7 @@ def persistent_load(self, pid): # Pre GLC-1.3 release behavior, without memorization type_tag, filename = pid abs_path = _os.path.join(self.gl_temp_storage_path, filename) - return _get_gl_object_from_persistent_id(type_tag, abs_path) + return _get_gl_object_from_persistent_id(type_tag, abs_path) else: # Post GLC-1.3 release behavior, with memorization type_tag, filename, object_id = pid diff --git a/src/python/turicreate/_json.py b/src/python/turicreate/_json.py index f1c5355228..9a7b6ca8ba 100644 --- a/src/python/turicreate/_json.py +++ b/src/python/turicreate/_json.py @@ -9,14 +9,19 @@ import json as _json + def to_serializable(obj): from . import extensions + return extensions.json.to_serializable(obj) + def from_serializable(data, schema): from . import extensions + return extensions.json.from_serializable(data, schema) + def dumps(obj): """ Dumps a serializable object to JSON. This API maps to the Python built-in @@ -35,7 +40,8 @@ def dumps(obj): (like Image) from dict. """ (data, schema) = to_serializable(obj) - return _json.dumps({'data': data, 'schema': schema}) + return _json.dumps({"data": data, "schema": schema}) + def loads(json_string): """ diff --git a/src/python/turicreate/_scripts/_pylambda_worker.py b/src/python/turicreate/_scripts/_pylambda_worker.py index b109c76b57..b8c449f15a 100644 --- a/src/python/turicreate/_scripts/_pylambda_worker.py +++ b/src/python/turicreate/_scripts/_pylambda_worker.py @@ -11,6 +11,7 @@ import os from os.path import split, abspath + def get_main_dir(): script_path = abspath(sys.modules[__name__].__file__) main_dir = split(split(script_path)[0])[0] @@ -18,9 +19,8 @@ def get_main_dir(): return main_dir -def setup_environment(info_log_function = None, error_log_function = None): - - def _write_log(s, error = False): +def setup_environment(info_log_function=None, error_log_function=None): + def _write_log(s, error=False): if error: if error_log_function is None: print(s) @@ -59,7 +59,6 @@ def _write_log(s, error = False): os.environ["MKL_DOMAIN_NUM_THREADS"] = "1" os.environ["NUMBA_NUM_THREADS"] = "1" - ######################################## # Now, import thnigs @@ -69,7 +68,7 @@ def _write_log(s, error = False): ######################################## # Finally, set the dll load path if we are on windows - if sys.platform == 'win32': + if sys.platform == "win32": import ctypes import ctypes.wintypes as wintypes @@ -91,13 +90,16 @@ def errcheck_bool(result, func, args): # folder so windows attempts to load all DLLs from this # directory. try: - kernel32 = ctypes.WinDLL('kernel32', use_last_error=True) + kernel32 = ctypes.WinDLL("kernel32", use_last_error=True) kernel32.SetDllDirectoryW.errcheck = errcheck_bool kernel32.SetDllDirectoryW.argtypes = (wintypes.LPCWSTR,) kernel32.SetDllDirectoryW(lib_path) except Exception as e: - _write_log("Error setting DLL load orders: %s (things may still work).\n" - % str(e), error = True) + _write_log( + "Error setting DLL load orders: %s (things may still work).\n" % str(e), + error=True, + ) + if __name__ == "__main__": @@ -117,7 +119,7 @@ def errcheck_bool(result, func, args): _write_out_file_name = os.environ.get("TURI_LAMBDA_WORKER_LOG_FILE", "") _write_out_file = None - def _write_log(s, error = False): + def _write_log(s, error=False): s = s + "\n" if error: @@ -155,17 +157,23 @@ def _write_log(s, error = False): try: _write_out_file = open(_write_out_file_name, "w") except Exception as e: - _write_log("Error opening '%s' for write: %s" % (_write_out_file_name, repr(e))) + _write_log( + "Error opening '%s' for write: %s" % (_write_out_file_name, repr(e)) + ) _write_out_file = None for s in sys.argv: _write_log("Lambda worker args: \n %s" % ("\n ".join(sys.argv))) if dry_run: - print("PyLambda script called with no IPC information; entering diagnostic mode.") + print( + "PyLambda script called with no IPC information; entering diagnostic mode." + ) - setup_environment(info_log_function = _write_log, - error_log_function = lambda s: _write_log(s, error=True)) + setup_environment( + info_log_function=_write_log, + error_log_function=lambda s: _write_log(s, error=True), + ) ############################################################ # Load in the cython lambda workers. On import, this will resolve @@ -175,8 +183,12 @@ def _write_log(s, error = False): main_dir = get_main_dir() - default_loglevel = 5 # 5: LOG_WARNING, 4: LOG_PROGRESS 3: LOG_EMPH 2: LOG_INFO 1: LOG_DEBUG - dryrun_loglevel = 1 # 5: LOG_WARNING, 4: LOG_PROGRESS 3: LOG_EMPH 2: LOG_INFO 1: LOG_DEBUG + default_loglevel = ( + 5 # 5: LOG_WARNING, 4: LOG_PROGRESS 3: LOG_EMPH 2: LOG_INFO 1: LOG_DEBUG + ) + dryrun_loglevel = ( + 1 # 5: LOG_WARNING, 4: LOG_PROGRESS 3: LOG_EMPH 2: LOG_INFO 1: LOG_DEBUG + ) if not dry_run: # This call only returns after the parent process is done. diff --git a/src/python/turicreate/_sys_util.py b/src/python/turicreate/_sys_util.py index 58362c44ec..0621669563 100644 --- a/src/python/turicreate/_sys_util.py +++ b/src/python/turicreate/_sys_util.py @@ -19,6 +19,7 @@ else: import configparser as _ConfigParser + def make_unity_server_env(): """ Returns the environment for unity_server. @@ -37,19 +38,21 @@ def make_unity_server_env(): # Add hadoop class path classpath = get_hadoop_class_path() - if ("CLASSPATH" in env): - env["CLASSPATH"] = env['CLASSPATH'] + (os.path.pathsep + classpath if classpath != '' else '') + if "CLASSPATH" in env: + env["CLASSPATH"] = env["CLASSPATH"] + ( + os.path.pathsep + classpath if classpath != "" else "" + ) else: env["CLASSPATH"] = classpath # Add python syspath - env['__GL_SYS_PATH__'] = (os.path.pathsep).join(sys.path + [os.getcwd()]) + env["__GL_SYS_PATH__"] = (os.path.pathsep).join(sys.path + [os.getcwd()]) # Add the python executable to the runtime config - env['__GL_PYTHON_EXECUTABLE__'] = os.path.abspath(sys.executable) + env["__GL_PYTHON_EXECUTABLE__"] = os.path.abspath(sys.executable) # Add the pylambda execution script to the runtime config - env['__GL_PYLAMBDA_SCRIPT__'] = os.path.abspath(_pylambda_worker.__file__) + env["__GL_PYLAMBDA_SCRIPT__"] = os.path.abspath(_pylambda_worker.__file__) #### Remove PYTHONEXECUTABLE #### # Anaconda overwrites this environment variable @@ -58,20 +61,24 @@ def make_unity_server_env(): # all subprocess launched under unity_server will use the # conda binary outside of virtualenv, which lacks the access # to all packages installed inside virtualenv. - if 'PYTHONEXECUTABLE' in env: - del env['PYTHONEXECUTABLE'] + if "PYTHONEXECUTABLE" in env: + del env["PYTHONEXECUTABLE"] # add certificate file - if 'TURI_FILEIO_ALTERNATIVE_SSL_CERT_FILE' not in env and \ - 'TURI_FILEIO_ALTERNATIVE_SSL_CERT_DIR' not in env: + if ( + "TURI_FILEIO_ALTERNATIVE_SSL_CERT_FILE" not in env + and "TURI_FILEIO_ALTERNATIVE_SSL_CERT_DIR" not in env + ): try: import certifi - env['TURI_FILEIO_ALTERNATIVE_SSL_CERT_FILE'] = certifi.where() - env['TURI_FILEIO_ALTERNATIVE_SSL_CERT_DIR'] = "" + + env["TURI_FILEIO_ALTERNATIVE_SSL_CERT_FILE"] = certifi.where() + env["TURI_FILEIO_ALTERNATIVE_SSL_CERT_DIR"] = "" except: pass return env + def set_windows_dll_path(): """ Sets the dll load path so that things are resolved correctly. @@ -95,25 +102,26 @@ def errcheck_bool(result, func, args): import ctypes.wintypes as wintypes try: - kernel32 = ctypes.WinDLL('kernel32', use_last_error=True) + kernel32 = ctypes.WinDLL("kernel32", use_last_error=True) kernel32.SetDllDirectoryW.errcheck = errcheck_bool kernel32.SetDllDirectoryW.argtypes = (wintypes.LPCWSTR,) kernel32.SetDllDirectoryW(lib_path) except Exception as e: logging.getLogger(__name__).warning( - "Error setting DLL load orders: %s (things should still work)." % str(e)) + "Error setting DLL load orders: %s (things should still work)." % str(e) + ) def get_current_platform_dll_extension(): """ Return the dynamic loading library extension for the current platform """ - if sys.platform == 'win32': - return 'dll' - elif sys.platform == 'darwin': - return 'dylib' + if sys.platform == "win32": + return "dll" + elif sys.platform == "darwin": + return "dylib" else: - return 'so' + return "so" def test_pylambda_worker(): @@ -131,11 +139,12 @@ def test_pylambda_worker(): import time import zipfile import sys + # change the temp directory to /tmp. # Otherwise we get interesting zeromq "too long file name" issues. - if sys.platform == 'darwin': - if exists('/tmp'): - tempfile.tempdir = '/tmp' + if sys.platform == "darwin": + if exists("/tmp"): + tempfile.tempdir = "/tmp" temp_dir = tempfile.mkdtemp() @@ -152,13 +161,13 @@ def test_pylambda_worker(): print("\nRunning simulation.") - env=make_unity_server_env() + env = make_unity_server_env() env["TURI_LAMBDA_WORKER_DEBUG_MODE"] = "1" env["TURI_LAMBDA_WORKER_LOG_FILE"] = lambda_log_file_sym proc = subprocess.Popen( - [sys.executable, os.path.abspath(_pylambda_worker.__file__)], - env = env) + [sys.executable, os.path.abspath(_pylambda_worker.__file__)], env=env + ) proc.wait() @@ -166,7 +175,8 @@ def test_pylambda_worker(): # Write out the current system path. open(join(temp_dir, "sys_path_1.log"), "w").write( - "\n".join(" sys.path[%d] = %s. " % (i, p) for i, p in enumerate(sys.path))) + "\n".join(" sys.path[%d] = %s. " % (i, p) for i, p in enumerate(sys.path)) + ) # Now run the program print("\nRunning full lambda worker process") @@ -181,14 +191,16 @@ def test_pylambda_worker(): run_temp_dir_copy = join(temp_dir, "run_temp_dir_copy") run_info_dict = { - "lambda_log" : lambda_log_file_run, - "temp_dir" : trial_temp_dir, - "run_temp_dir" : run_temp_dir, - "preserved_temp_dir" : run_temp_dir_copy, - "runtime_log" : join(trial_temp_dir, "runtime.log"), - "sys_path_log" : join(trial_temp_dir, "sys_path_2.log")} - - run_script = r""" + "lambda_log": lambda_log_file_run, + "temp_dir": trial_temp_dir, + "run_temp_dir": run_temp_dir, + "preserved_temp_dir": run_temp_dir_copy, + "runtime_log": join(trial_temp_dir, "runtime.log"), + "sys_path_log": join(trial_temp_dir, "sys_path_2.log"), + } + + run_script = ( + r""" import os import traceback import shutil @@ -292,7 +304,9 @@ def translate_name(d): sys.stderr.write("Error with: " + f) write_exception(e) - """ % run_info_dict + """ + % run_info_dict + ) run_script_file = join(temp_dir, "run_script.py") open(run_script_file, "w").write(run_script) @@ -301,18 +315,21 @@ def translate_name(d): log_file_stderr = join(trial_temp_dir, "stderr.log") env = os.environ.copy() - env['__GL_SYS_PATH__'] = (os.path.pathsep).join(sys.path) + env["__GL_SYS_PATH__"] = (os.path.pathsep).join(sys.path) proc = subprocess.Popen( - [sys.executable, os.path.abspath(run_script_file)], - stdout = open(log_file_stdout, "w"), - stderr = open(log_file_stderr, "w"), - env = env) + [sys.executable, os.path.abspath(run_script_file)], + stdout=open(log_file_stdout, "w"), + stderr=open(log_file_stderr, "w"), + env=env, + ) proc.wait() # Now zip up the output data into a package we can access. - timestamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d-%H-%M-%S') + timestamp = datetime.datetime.fromtimestamp(time.time()).strftime( + "%Y-%m-%d-%H-%M-%S" + ) zipfile_name = join(temp_dir, "testing_logs-%d-%s.zip" % (os.getpid(), timestamp)) print("Creating archive of log files in %s." % zipfile_name) @@ -322,7 +339,7 @@ def translate_name(d): for root, dirs, files in os.walk(temp_dir): save_files += [join(root, name) for name in files] - with zipfile.ZipFile(zipfile_name, 'w') as logzip: + with zipfile.ZipFile(zipfile_name, "w") as logzip: error_logs = [] for f in save_files: try: @@ -335,11 +352,15 @@ def translate_name(d): open(error_log_file, "w").write("\n\n".join(error_logs)) logzip.write(error_log_file) - print("################################################################################") + print( + "################################################################################" + ) print("# ") print("# Results of lambda test logged as %s." % zipfile_name) print("# ") - print("################################################################################") + print( + "################################################################################" + ) print("Cleaning up.") @@ -350,7 +371,7 @@ def translate_name(d): print("Could not delete: %s" % f) -def dump_directory_structure(out = sys.stdout): +def dump_directory_structure(out=sys.stdout): """ Dumps a detailed report of the turicreate/sframe directory structure and files, along with the output of os.lstat for each. This is useful @@ -362,57 +383,72 @@ def dump_directory_structure(out = sys.stdout): import sys, os from os.path import split, abspath, join from itertools import chain + main_dir = split(abspath(sys.modules[__name__].__file__))[0] visited_files = [] def on_error(err): - visited_files.append( (" ERROR", str(err)) ) + visited_files.append((" ERROR", str(err))) - for path, dirs, files in os.walk(main_dir, onerror = on_error): + for path, dirs, files in os.walk(main_dir, onerror=on_error): for fn in chain(files, dirs): name = join(path, fn) try: - visited_files.append( (name, repr(os.lstat(name))) ) + visited_files.append((name, repr(os.lstat(name)))) except: - visited_files.append( (name, "ERROR calling os.lstat.") ) + visited_files.append((name, "ERROR calling os.lstat.")) def strip_name(n): - if n[:len(main_dir)] == main_dir: - return "/" + n[len(main_dir):] + if n[: len(main_dir)] == main_dir: + return "/" + n[len(main_dir) :] else: return n - out.write("\n".join( (" %s: %s" % (strip_name(name), stats)) - for name, stats in sorted(visited_files))) + out.write( + "\n".join( + (" %s: %s" % (strip_name(name), stats)) + for name, stats in sorted(visited_files) + ) + ) out.flush() + __hadoop_class_warned = False + def get_hadoop_class_path(): # Try get the classpath directly from executing hadoop env = os.environ.copy() - hadoop_exe_name = 'hadoop' - if sys.platform == 'win32': - hadoop_exe_name += '.cmd' + hadoop_exe_name = "hadoop" + if sys.platform == "win32": + hadoop_exe_name += ".cmd" output = None try: try: - output = _subprocess.check_output([hadoop_exe_name, 'classpath']).decode() + output = _subprocess.check_output([hadoop_exe_name, "classpath"]).decode() except: - output = _subprocess.check_output(['/'.join([env['HADOOP_HOME'],'bin',hadoop_exe_name]), 'classpath']).decode() + output = _subprocess.check_output( + ["/".join([env["HADOOP_HOME"], "bin", hadoop_exe_name]), "classpath"] + ).decode() - output = (os.path.pathsep).join(os.path.realpath(path) for path in output.split(os.path.pathsep)) + output = (os.path.pathsep).join( + os.path.realpath(path) for path in output.split(os.path.pathsep) + ) return _get_expanded_classpath(output) except Exception as e: global __hadoop_class_warned if not __hadoop_class_warned: __hadoop_class_warned = True - logging.getLogger(__name__).debug("Exception trying to retrieve Hadoop classpath: %s" % e) + logging.getLogger(__name__).debug( + "Exception trying to retrieve Hadoop classpath: %s" % e + ) - logging.getLogger(__name__).debug("Hadoop not found. HDFS url is not supported. Please make hadoop available from PATH or set the environment variable HADOOP_HOME.") + logging.getLogger(__name__).debug( + "Hadoop not found. HDFS url is not supported. Please make hadoop available from PATH or set the environment variable HADOOP_HOME." + ) return "" @@ -426,16 +462,21 @@ def _get_expanded_classpath(classpath): mentioned in the path """ - if classpath is None or classpath == '': - return '' + if classpath is None or classpath == "": + return "" # so this set comprehension takes paths that end with * to be globbed to find the jars, and then # recombined back into a colon separated list of jar paths, removing dupes and using full file paths - jars = (os.path.pathsep).join((os.path.pathsep).join([os.path.abspath(jarpath) for jarpath in _glob.glob(path)]) - for path in classpath.split(os.path.pathsep)) - logging.getLogger(__name__).debug('classpath being used: %s' % jars) + jars = (os.path.pathsep).join( + (os.path.pathsep).join( + [os.path.abspath(jarpath) for jarpath in _glob.glob(path)] + ) + for path in classpath.split(os.path.pathsep) + ) + logging.getLogger(__name__).debug("classpath being used: %s" % jars) return jars + def get_library_name(): """ Returns either sframe or turicreate depending on which library @@ -468,9 +509,14 @@ def get_config_file(): __default_config_path = abspath(expanduser(os.environ["TURI_CONFIG_FILE"])) if not exists(__default_config_path): - print(("WARNING: Config file specified in environment variable " - "'TURI_CONFIG_FILE' as " - "'%s', but this path does not exist.") % __default_config_path) + print( + ( + "WARNING: Config file specified in environment variable " + "'TURI_CONFIG_FILE' as " + "'%s', but this path does not exist." + ) + % __default_config_path + ) return __default_config_path @@ -502,9 +548,13 @@ def setup_environment_from_config_file(): try: os.environ[k.upper()] = v except Exception as e: - print(("WARNING: Error setting environment variable " - "'%s = %s' from config file '%s': %s.") - % (k, str(v), config_file, str(e)) ) + print( + ( + "WARNING: Error setting environment variable " + "'%s = %s' from config file '%s': %s." + ) + % (k, str(v), config_file, str(e)) + ) except Exception as e: print("WARNING: Error reading config file '%s': %s." % (config_file, str(e))) @@ -525,10 +575,10 @@ def write_config_file_value(key, value): __section = "Environment" - if not(config.has_section(__section)): + if not (config.has_section(__section)): config.add_section(__section) config.set(__section, key, value) - with open(filename, 'w') as config_file: + with open(filename, "w") as config_file: config.write(config_file) diff --git a/src/python/turicreate/aggregate.py b/src/python/turicreate/aggregate.py index 044dba8e29..bb6f09c701 100644 --- a/src/python/turicreate/aggregate.py +++ b/src/python/turicreate/aggregate.py @@ -3,17 +3,18 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Builtin aggregators for SFrame groupby operator. -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ from .util import _is_non_string_iterable + def SUM(src_column): - """ + """ Builtin sum aggregator for groupby. Example: Get the sum of the rating column for each user. If @@ -24,10 +25,11 @@ def SUM(src_column): ... {'rating_sum':tc.aggregate.SUM('rating')}) """ - return ("__builtin__sum__", [src_column]) + return ("__builtin__sum__", [src_column]) -def ARGMAX(agg_column,out_column): - """ + +def ARGMAX(agg_column, out_column): + """ Builtin arg maximum aggregator for groupby Example: Get the movie with maximum rating per user. @@ -35,10 +37,11 @@ def ARGMAX(agg_column,out_column): >>> sf.groupby("user", ... {'best_movie':tc.aggregate.ARGMAX('rating','movie')}) """ - return ("__builtin__argmax__",[agg_column,out_column]) + return ("__builtin__argmax__", [agg_column, out_column]) -def ARGMIN(agg_column,out_column): - """ + +def ARGMIN(agg_column, out_column): + """ Builtin arg minimum aggregator for groupby Example: Get the movie with minimum rating per user. @@ -47,10 +50,11 @@ def ARGMIN(agg_column,out_column): ... {'best_movie':tc.aggregate.ARGMIN('rating','movie')}) """ - return ("__builtin__argmin__",[agg_column,out_column]) + return ("__builtin__argmin__", [agg_column, out_column]) + def MAX(src_column): - """ + """ Builtin maximum aggregator for groupby Example: Get the maximum rating of each user. @@ -59,10 +63,11 @@ def MAX(src_column): ... {'rating_max':tc.aggregate.MAX('rating')}) """ - return ("__builtin__max__", [src_column]) + return ("__builtin__max__", [src_column]) + def MIN(src_column): - """ + """ Builtin minimum aggregator for groupby Example: Get the minimum rating of each user. @@ -71,11 +76,11 @@ def MIN(src_column): ... {'rating_min':tc.aggregate.MIN('rating')}) """ - return ("__builtin__min__", [src_column]) + return ("__builtin__min__", [src_column]) def COUNT(): - """ + """ Builtin count aggregator for groupby Example: Get the number of occurrences of each user. @@ -84,11 +89,11 @@ def COUNT(): ... {'count':tc.aggregate.COUNT()}) """ - return ("__builtin__count__", [""]) + return ("__builtin__count__", [""]) def AVG(src_column): - """ + """ Builtin average aggregator for groupby. Synonym for tc.aggregate.MEAN. If src_column is of array type, and if array's do not match in length a NoneType is returned in the destination column. @@ -99,11 +104,11 @@ def AVG(src_column): ... {'rating_avg':tc.aggregate.AVG('rating')}) """ - return ("__builtin__avg__", [src_column]) + return ("__builtin__avg__", [src_column]) def MEAN(src_column): - """ + """ Builtin average aggregator for groupby. Synonym for tc.aggregate.AVG. If src_column is of array type, and if array's do not match in length a NoneType is returned in the destination column. @@ -115,10 +120,11 @@ def MEAN(src_column): ... {'rating_mean':tc.aggregate.MEAN('rating')}) """ - return ("__builtin__avg__", [src_column]) + return ("__builtin__avg__", [src_column]) + def VAR(src_column): - """ + """ Builtin variance aggregator for groupby. Synonym for tc.aggregate.VARIANCE Example: Get the rating variance of each user. @@ -127,11 +133,11 @@ def VAR(src_column): ... {'rating_var':tc.aggregate.VAR('rating')}) """ - return ("__builtin__var__", [src_column]) + return ("__builtin__var__", [src_column]) def VARIANCE(src_column): - """ + """ Builtin variance aggregator for groupby. Synonym for tc.aggregate.VAR Example: Get the rating variance of each user. @@ -140,11 +146,11 @@ def VARIANCE(src_column): ... {'rating_var':tc.aggregate.VARIANCE('rating')}) """ - return ("__builtin__var__", [src_column]) + return ("__builtin__var__", [src_column]) def STD(src_column): - """ + """ Builtin standard deviation aggregator for groupby. Synonym for tc.aggregate.STDV Example: Get the rating standard deviation of each user. @@ -153,11 +159,11 @@ def STD(src_column): ... {'rating_std':tc.aggregate.STD('rating')}) """ - return ("__builtin__stdv__", [src_column]) + return ("__builtin__stdv__", [src_column]) def STDV(src_column): - """ + """ Builtin standard deviation aggregator for groupby. Synonym for tc.aggregate.STD Example: Get the rating standard deviation of each user. @@ -166,11 +172,11 @@ def STDV(src_column): ... {'rating_stdv':tc.aggregate.STDV('rating')}) """ - return ("__builtin__stdv__", [src_column]) + return ("__builtin__stdv__", [src_column]) def SELECT_ONE(src_column): - """ + """ Builtin aggregator for groupby which selects one row in the group. Example: Get one rating row from a user. @@ -188,10 +194,11 @@ def SELECT_ONE(src_column): same row in the SFrame. """ - return ("__builtin__select_one__", [src_column]) + return ("__builtin__select_one__", [src_column]) -def CONCAT(src_column, dict_value_column = None): - """ + +def CONCAT(src_column, dict_value_column=None): + """ Builtin aggregator that combines values from one or two columns in one group into either a dictionary value or list value. @@ -215,10 +222,10 @@ def CONCAT(src_column, dict_value_column = None): "count" into a dictionary with keys being words and values being counts. """ - if dict_value_column is None: - return ("__builtin__concat__list__", [src_column]) - else: - return ("__builtin__concat__dict__", [src_column, dict_value_column]) + if dict_value_column is None: + return ("__builtin__concat__list__", [src_column]) + else: + return ("__builtin__concat__dict__", [src_column, dict_value_column]) def QUANTILE(src_column, *args): @@ -258,7 +265,7 @@ def QUANTILE(src_column, *args): def COUNT_DISTINCT(src_column): - """ + """ Builtin unique counter for groupby. Counts the number of unique values Example: Get the number of unique ratings produced by each user. @@ -267,21 +274,22 @@ def COUNT_DISTINCT(src_column): ... {'rating_distinct_count':tc.aggregate.COUNT_DISTINCT('rating')}) """ - return ("__builtin__count__distinct__", [src_column]) + return ("__builtin__count__distinct__", [src_column]) def DISTINCT(src_column): - """ + """ Builtin distinct values for groupby. Returns a list of distinct values. >>> sf.groupby("user", ... {'rating_distinct':tc.aggregate.DISTINCT('rating')}) """ - return ("__builtin__distinct__", [src_column]) + return ("__builtin__distinct__", [src_column]) + def FREQ_COUNT(src_column): - """ + """ Builtin frequency counts for groupby. Returns a dictionary where the key is the `src_column` and the value is the number of times each value occurs. @@ -289,4 +297,4 @@ def FREQ_COUNT(src_column): ... {'rating_distinct':tc.aggregate.FREQ_COUNT('rating')}) """ - return ("__builtin__freq_count__", [src_column]) + return ("__builtin__freq_count__", [src_column]) diff --git a/src/python/turicreate/config/__init__.py b/src/python/turicreate/config/__init__.py index 7545059080..c0d4a7f6e9 100644 --- a/src/python/turicreate/config/__init__.py +++ b/src/python/turicreate/config/__init__.py @@ -13,24 +13,29 @@ import re as _re # Return the root package name -_root_package_name = 'turicreate' -_client_log_file = _os.path.join(_tempfile.gettempdir(), - _root_package_name + - '_client_%d_%d.log' % (_time.time(), _os.getpid())) +_root_package_name = "turicreate" +_client_log_file = _os.path.join( + _tempfile.gettempdir(), + _root_package_name + "_client_%d_%d.log" % (_time.time(), _os.getpid()), +) + def _get_log_location(): from .._connect import main as _glconnect + server = _glconnect.get_server() - if hasattr(server, 'unity_log'): + if hasattr(server, "unity_log"): return server.unity_log else: return None + def _i_am_a_lambda_worker(): if _re.match(".*lambda_worker.*", _sys.argv[0]) is not None: return True return False + def init_logger(): """ Initialize the logging configuration for the turicreate package. @@ -41,40 +46,37 @@ def init_logger(): import logging.config # Package level logger - _logging.config.dictConfig({ - 'version': 1, - 'disable_existing_loggers': False, - 'formatters': { - 'standard': { - 'format': '%(asctime)s [%(levelname)s] %(name)s, %(lineno)s: %(message)s' + _logging.config.dictConfig( + { + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "standard": { + "format": "%(asctime)s [%(levelname)s] %(name)s, %(lineno)s: %(message)s" + }, + "brief": {"format": "[%(levelname)s] %(name)s: %(message)s"}, }, - 'brief': { - 'format': '[%(levelname)s] %(name)s: %(message)s' - } - }, - 'handlers': { - 'default': { - 'class': 'logging.StreamHandler', - 'formatter': 'brief' + "handlers": { + "default": {"class": "logging.StreamHandler", "formatter": "brief"}, + "file": { + "class": "logging.FileHandler", + "formatter": "standard", + "filename": _client_log_file, + "encoding": "UTF-8", + "delay": "False", + }, + }, + "loggers": { + _root_package_name: { + "handlers": ["default", "file"], + "propagate": "True", + } }, - 'file': { - 'class': 'logging.FileHandler', - 'formatter': 'standard', - 'filename': _client_log_file, - 'encoding': 'UTF-8', - 'delay': 'False', - } - }, - 'loggers': { - _root_package_name: { - 'handlers': ['default', 'file'], - 'propagate': 'True' - } } - }) + ) # Set module specific log levels - _logging.getLogger('requests').setLevel(_logging.CRITICAL) + _logging.getLogger("requests").setLevel(_logging.CRITICAL) if _i_am_a_lambda_worker(): _logging.getLogger(_root_package_name).setLevel(_logging.WARNING) else: @@ -84,18 +86,21 @@ def init_logger(): # Let's call init_logger on import init_logger() + def get_client_log_location(): """ Get the location of client logs """ return _client_log_file + def get_server_log_location(): """ Get the locations of server logs """ return _get_log_location() + def set_num_gpus(num_gpus): """ Set the number of GPUs to use whenever applicable. @@ -127,9 +132,9 @@ def set_num_gpus(num_gpus): >> turicreate.config.set_num_gpus(1) >> turicreate.image_classifier.create(data, target='label') """ - if(num_gpus < -1): + if num_gpus < -1: raise ValueError("'num_gpus' must be greater than or equal to -1") - set_runtime_config('TURI_NUM_GPUS', num_gpus) + set_runtime_config("TURI_NUM_GPUS", num_gpus) def get_num_gpus(): @@ -140,7 +145,8 @@ def get_num_gpus(): -------- set_num_gpus """ - return get_runtime_config()['TURI_NUM_GPUS'] + return get_runtime_config()["TURI_NUM_GPUS"] + def get_environment_config(): """ @@ -156,9 +162,11 @@ def get_environment_config(): Returns a dictionary of {key:value,..} """ from .._connect import main as _glconnect + unity = _glconnect.get_unity() return unity.list_globals(False) + def set_log_level(level): """ Sets the log level. @@ -166,6 +174,7 @@ def set_log_level(level): if level is 8, nothing is logged. If level is 0, everything is logged. """ from .._connect import main as _glconnect + unity = _glconnect.get_unity() return unity.set_log_level(level) @@ -185,9 +194,11 @@ def get_runtime_config(): set_runtime_config """ from .._connect import main as _glconnect + unity = _glconnect.get_unity() return unity.list_globals(True) + def set_runtime_config(name, value): """ Configures system behavior at runtime. These configuration values are also @@ -306,6 +317,7 @@ def set_runtime_config(name, value): file handle limit with "ulimit -n"). Defaults to 128. """ from .._connect import main as _glconnect + unity = _glconnect.get_unity() ret = unity.set_global(name, value) if ret != "": diff --git a/src/python/turicreate/data_structures/__init__.py b/src/python/turicreate/data_structures/__init__.py index cc8247c85c..8b5e598ca5 100644 --- a/src/python/turicreate/data_structures/__init__.py +++ b/src/python/turicreate/data_structures/__init__.py @@ -3,16 +3,16 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Turi Create offers several data structures for data analysis. Concise descriptions of the data structures and their methods are contained in the API documentation, along with a small number of simple examples. -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ -__all__ = ['sframe', 'sarray', 'sgraph', 'sketch', 'image'] +__all__ = ["sframe", "sarray", "sgraph", "sketch", "image"] from . import image from . import sframe diff --git a/src/python/turicreate/data_structures/gframe.py b/src/python/turicreate/data_structures/gframe.py index 8fc7d3b70f..4441021d83 100644 --- a/src/python/turicreate/data_structures/gframe.py +++ b/src/python/turicreate/data_structures/gframe.py @@ -45,6 +45,7 @@ class GFrame(SFrame): >>> # extract an SFrame >>> sf = vertices_gf.__to_sframe__() """ + def __init__(self, graph, gframe_type): self.__type__ = gframe_type self.__graph__ = graph @@ -54,11 +55,11 @@ def __init__(self, graph, gframe_type): def __to_sframe__(self): return copy.copy(self._get_cache()) -#/**************************************************************************/ -#/* */ -#/* Modifiers */ -#/* */ -#/**************************************************************************/ + # /**************************************************************************/ + # /* */ + # /* Modifiers */ + # /* */ + # /**************************************************************************/ def add_column(self, data, column_name="", inplace=False): """ Adds the specified column to this SFrame. The number of elements in @@ -91,16 +92,19 @@ def add_column(self, data, column_name="", inplace=False): self.__is_dirty__ = True with cython_context(): if self._is_vertex_frame(): - graph_proxy = self.__graph__.__proxy__.add_vertex_field(data.__proxy__, column_name) + graph_proxy = self.__graph__.__proxy__.add_vertex_field( + data.__proxy__, column_name + ) self.__graph__.__proxy__ = graph_proxy elif self._is_edge_frame(): - graph_proxy = self.__graph__.__proxy__.add_edge_field(data.__proxy__, column_name) + graph_proxy = self.__graph__.__proxy__.add_edge_field( + data.__proxy__, column_name + ) self.__graph__.__proxy__ = graph_proxy return self else: return super(GFrame, self).add_column(data, column_name, inplace=inplace) - def add_columns(self, data, column_names=None, inplace=False): """ Adds columns to the SFrame. The number of elements in all columns must @@ -134,7 +138,9 @@ def add_columns(self, data, column_names=None, inplace=False): my_columns = set(self.column_names()) for name in column_names: if name in my_columns: - raise ValueError("Column '" + name + "' already exists in current SFrame") + raise ValueError( + "Column '" + name + "' already exists in current SFrame" + ) else: if not _is_non_string_iterable(datalist): raise TypeError("datalist must be an iterable") @@ -151,8 +157,9 @@ def add_columns(self, data, column_names=None, inplace=False): self.add_column(data, name) return self else: - return super(GFrame, self).add_column(datalist, column_names, inplace=inplace) - + return super(GFrame, self).add_column( + datalist, column_names, inplace=inplace + ) def remove_column(self, column_name, inplace=False): """ @@ -173,19 +180,27 @@ def remove_column(self, column_name, inplace=False): Whether the SFrame is modified in place. """ if column_name not in self.column_names(): - raise KeyError('Cannot find column %s' % column_name) + raise KeyError("Cannot find column %s" % column_name) if inplace: self.__is_dirty__ = True try: with cython_context(): if self._is_vertex_frame(): - assert column_name != '__id', 'Cannot remove \"__id\" column' - graph_proxy = self.__graph__.__proxy__.delete_vertex_field(column_name) + assert column_name != "__id", 'Cannot remove "__id" column' + graph_proxy = self.__graph__.__proxy__.delete_vertex_field( + column_name + ) self.__graph__.__proxy__ = graph_proxy elif self._is_edge_frame(): - assert column_name != '__src_id', 'Cannot remove \"__src_id\" column' - assert column_name != '__dst_id', 'Cannot remove \"__dst_id\" column' - graph_proxy = self.__graph__.__proxy__.delete_edge_field(column_name) + assert ( + column_name != "__src_id" + ), 'Cannot remove "__src_id" column' + assert ( + column_name != "__dst_id" + ), 'Cannot remove "__dst_id" column' + graph_proxy = self.__graph__.__proxy__.delete_edge_field( + column_name + ) self.__graph__.__proxy__ = graph_proxy return self except: @@ -200,7 +215,7 @@ def remove_columns(self, column_names, inplace=False): for name in column_names: if name not in existing_columns: - raise KeyError('Cannot find column %s' % name) + raise KeyError("Cannot find column %s" % name) if inplace: for c in column_names: @@ -233,14 +248,20 @@ def swap_columns(self, column_name_1, column_name_2, inplace=False): self.__is_dirty__ = True with cython_context(): if self._is_vertex_frame(): - graph_proxy = self.__graph__.__proxy__.swap_vertex_fields(column_name_1, column_name_2) + graph_proxy = self.__graph__.__proxy__.swap_vertex_fields( + column_name_1, column_name_2 + ) self.__graph__.__proxy__ = graph_proxy elif self._is_edge_frame(): - graph_proxy = self.__graph__.__proxy__.swap_edge_fields(column_name_1, column_name_2) + graph_proxy = self.__graph__.__proxy__.swap_edge_fields( + column_name_1, column_name_2 + ) self.__graph__.__proxy__ = graph_proxy return self else: - return super(GFrame, self).swap_columns(column_name_1, column_name_2, inplace=inplace) + return super(GFrame, self).swap_columns( + column_name_1, column_name_2, inplace=inplace + ) def rename(self, names, inplace=False): """ @@ -262,23 +283,27 @@ def rename(self, names, inplace=False): inplace : bool, optional. Defaults to False. Whether the SFrame is modified in place. """ - if (type(names) is not dict): - raise TypeError('names must be a dictionary: oldname -> newname') + if type(names) is not dict: + raise TypeError("names must be a dictionary: oldname -> newname") if inplace: self.__is_dirty__ = True with cython_context(): if self._is_vertex_frame(): - graph_proxy = self.__graph__.__proxy__.rename_vertex_fields(names.keys(), names.values()) + graph_proxy = self.__graph__.__proxy__.rename_vertex_fields( + names.keys(), names.values() + ) self.__graph__.__proxy__ = graph_proxy elif self._is_edge_frame(): - graph_proxy = self.__graph__.__proxy__.rename_edge_fields(names.keys(), names.values()) + graph_proxy = self.__graph__.__proxy__.rename_edge_fields( + names.keys(), names.values() + ) self.__graph__.__proxy__ = graph_proxy return self else: return super(GFrame, self).rename(names, inplace=inplace) - def add_row_number(self, column_name='id', start=0, inplace=False): + def add_row_number(self, column_name="id", start=0, inplace=False): if type(column_name) is not str: raise TypeError("Must give column_name as str") @@ -294,9 +319,9 @@ def add_row_number(self, column_name='id', start=0, inplace=False): return self else: - return super(GFrame, self).add_row_number(column_name=column_name, start=start,inplace=inplace) - - + return super(GFrame, self).add_row_number( + column_name=column_name, start=start, inplace=inplace + ) def __setitem__(self, key, value): """ @@ -306,18 +331,21 @@ def __setitem__(self, key, value): every entry is equal to the constant value. Existing columns can also be replaced using this wrapper. """ - if (key in ['__id', '__src_id', '__dst_id']): - raise KeyError('Cannot modify column %s. Changing __id column will\ - change the graph structure' % key) + if key in ["__id", "__src_id", "__dst_id"]: + raise KeyError( + "Cannot modify column %s. Changing __id column will\ + change the graph structure" + % key + ) else: self.__is_dirty__ = True super(GFrame, self).__setitem__(key, value) -#/**************************************************************************/ -#/* */ -#/* Read-only Accessor */ -#/* */ -#/**************************************************************************/ + # /**************************************************************************/ + # /* */ + # /* Read-only Accessor */ + # /* */ + # /**************************************************************************/ def num_rows(self): """ Returns the number of rows. @@ -328,9 +356,9 @@ def num_rows(self): Number of rows in the SFrame. """ if self._is_vertex_frame(): - return self.__graph__.summary()['num_vertices'] + return self.__graph__.summary()["num_vertices"] elif self._is_edge_frame(): - return self.__graph__.summary()['num_edges'] + return self.__graph__.summary()["num_edges"] def num_columns(self): """ @@ -371,11 +399,11 @@ def column_types(self): elif self.__type__ == EDGE_GFRAME: return self.__graph__.__proxy__.get_edge_field_types() -#/**************************************************************************/ -#/* */ -#/* Internal Private Methods */ -#/* */ -#/**************************************************************************/ + # /**************************************************************************/ + # /* */ + # /* Internal Private Methods */ + # /* */ + # /**************************************************************************/ def _get_cache(self): if self.__sframe_cache__ is None or self.__is_dirty__: if self._is_vertex_frame(): diff --git a/src/python/turicreate/data_structures/grouped_sframe.py b/src/python/turicreate/data_structures/grouped_sframe.py index 44d7315385..20cfa34bd2 100644 --- a/src/python/turicreate/data_structures/grouped_sframe.py +++ b/src/python/turicreate/data_structures/grouped_sframe.py @@ -10,12 +10,16 @@ from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ + + class GroupedSFrame(object): """ Left undocumented intentionally. """ + def __init__(self, sframe, key_columns): from .. import extensions + self._sf_group = extensions.grouped_sframe() if isinstance(key_columns, str): key_columns = [key_columns] @@ -42,7 +46,7 @@ def generator(): elems_at_a_time = 16 self._sf_group.begin_iterator() ret = self._sf_group.iterator_get_next(elems_at_a_time) - while(True): + while True: for j in ret: yield tuple(j) diff --git a/src/python/turicreate/data_structures/image.py b/src/python/turicreate/data_structures/image.py index 9faae6f511..12b29a1402 100644 --- a/src/python/turicreate/data_structures/image.py +++ b/src/python/turicreate/data_structures/image.py @@ -52,7 +52,7 @@ class Image(object): >>> turicreate.SArray([img]).show() """ - def __init__(self, path=None, format='auto', **__internal_kw_args): + def __init__(self, path=None, format="auto", **__internal_kw_args): self._image_data = bytearray() self._height = 0 self._width = 0 @@ -61,9 +61,10 @@ def __init__(self, path=None, format='auto', **__internal_kw_args): self._version = _CURRENT_VERSION self._format_enum = _format[_UNDEFINED] - if (path is not None): + if path is not None: from ..util import _make_internal_url from .. import extensions as _extensions + img = _extensions.load_image(_make_internal_url(path), format) for key, value in list(img.__dict__.items()): setattr(self, key, value) @@ -160,6 +161,7 @@ def pixel_data(self): """ from .. import extensions as _extensions + data = _np.zeros((self.height, self.width, self.channels), dtype=_np.uint8) _extensions.image_load_to_numpy(self, data.ctypes.data, data.strides) if self.channels == 1: @@ -187,16 +189,20 @@ def __repr__(self): def _repr_png_(self): img = self._to_pil_image() from io import BytesIO + b = BytesIO() - img.save(b, format='png') + img.save(b, format="png") data = b.getvalue() - res = {"Height" :str(self._height), "Width":str(self._width), "Channels: " :str(self._channels)} - return (data,res) - - + res = { + "Height": str(self._height), + "Width": str(self._width), + "Channels: ": str(self._channels), + } + return (data, res) def _to_pil_image(self): from PIL import Image as _PIL_Image + return _PIL_Image.fromarray(self.pixel_data) def save(self, filename): @@ -238,21 +244,24 @@ def show(self): from ..visualization._plot import _target # Suppress visualization output if 'none' target is set - if _target == 'none': + if _target == "none": return try: img = self._to_pil_image() try: # output into jupyter notebook if possible - if _target == 'auto' and \ - (get_ipython().__class__.__name__ == "ZMQInteractiveShell" or get_ipython().__class__.__name__ == "Shell"): + if _target == "auto" and ( + get_ipython().__class__.__name__ == "ZMQInteractiveShell" + or get_ipython().__class__.__name__ == "Shell" + ): from io import BytesIO from IPython import display + b = BytesIO() - img.save(b, format='png') + img.save(b, format="png") data = b.getvalue() - ip_img = display.Image(data=data, format='png', embed=True) + ip_img = display.Image(data=data, format="png", embed=True) display.display(ip_img) else: # fall back to pillow .show (jupyter notebook integration disabled or not in jupyter notebook) diff --git a/src/python/turicreate/data_structures/sarray.py b/src/python/turicreate/data_structures/sarray.py index d80f3dd385..fecebd9e1a 100644 --- a/src/python/turicreate/data_structures/sarray.py +++ b/src/python/turicreate/data_structures/sarray.py @@ -3,13 +3,13 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" This module defines the SArray class which provides the ability to create, access and manipulate a remote scalable array object. SArray acts similarly to pandas.Series but without indexing. The data is immutable and homogeneous. -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -35,11 +35,12 @@ import six import types -__all__ = ['SArray'] +__all__ = ["SArray"] if sys.version_info.major > 2: long = int + def _create_sequential_sarray(size, start=0, reverse=False): if type(size) is not int: raise TypeError("size must be int") @@ -51,7 +52,10 @@ def _create_sequential_sarray(size, start=0, reverse=False): raise TypeError("reverse must me bool") with cython_context(): - return SArray(_proxy=glconnect.get_unity().create_sequential_sarray(size, start, reverse)) + return SArray( + _proxy=glconnect.get_unity().create_sequential_sarray(size, start, reverse) + ) + def load_sarray(filename): """ @@ -372,10 +376,21 @@ def gen(): @classmethod def _is_iterable_required_to_listify(cls, obj): # In Python 3, str implements '__iter__'. - return (isinstance(obj, types.GeneratorType) or - (sys.version_info.major < 3 and isinstance(obj, six.moves.xrange)) or - sys.version_info.major >= 3 and isinstance(obj, (range, filter, map, collections.abc.KeysView, collections.abc.ValuesView))) - + return ( + isinstance(obj, types.GeneratorType) + or (sys.version_info.major < 3 and isinstance(obj, six.moves.xrange)) + or sys.version_info.major >= 3 + and isinstance( + obj, + ( + range, + filter, + map, + collections.abc.KeysView, + collections.abc.ValuesView, + ), + ) + ) def __init__(self, data=[], dtype=None, ignore_cast_failure=False, _proxy=None): """ @@ -386,9 +401,9 @@ def __init__(self, data=[], dtype=None, ignore_cast_failure=False, _proxy=None): """ if dtype is not None and type(dtype) != type: - raise TypeError('dtype must be a type, e.g. use int rather than \'int\'') + raise TypeError("dtype must be a type, e.g. use int rather than 'int'") - if (_proxy): + if _proxy: self.__proxy__ = _proxy elif isinstance(data, SArray): if dtype is None: @@ -415,12 +430,17 @@ def __init__(self, data=[], dtype=None, ignore_cast_failure=False, _proxy=None): # first try the fast inproc method try: from .. import numpy_loader + if numpy_loader.numpy_activation_successful(): from ..numpy import _fast_numpy_to_sarray + ret = _fast_numpy_to_sarray(data) # conversion is good! # swap the proxy. - self.__proxy__, ret.__proxy__ = ret.__proxy__, self.__proxy__ + self.__proxy__, ret.__proxy__ = ( + ret.__proxy__, + self.__proxy__, + ) return else: dtype = infer_type_of_sequence(data) @@ -439,10 +459,13 @@ def __init__(self, data=[], dtype=None, ignore_cast_failure=False, _proxy=None): else: dtype = list elif len(data.shape) > 2: - raise TypeError("Cannot convert Numpy arrays of greater than 2 dimensions") + raise TypeError( + "Cannot convert Numpy arrays of greater than 2 dimensions" + ) - elif (isinstance(data, str) or - (sys.version_info.major < 3 and isinstance(data, unicode))): + elif isinstance(data, str) or ( + sys.version_info.major < 3 and isinstance(data, unicode) + ): # if it is a file, we default to string dtype = str elif isinstance(data, array.array): @@ -456,25 +479,33 @@ def __init__(self, data=[], dtype=None, ignore_cast_failure=False, _proxy=None): if HAS_PANDAS and isinstance(data, pandas.Series): with cython_context(): - self.__proxy__.load_from_iterable(data.values, dtype, ignore_cast_failure) - elif (isinstance(data, str) or (sys.version_info.major <= 2 and isinstance(data, unicode))): + self.__proxy__.load_from_iterable( + data.values, dtype, ignore_cast_failure + ) + elif isinstance(data, str) or ( + sys.version_info.major <= 2 and isinstance(data, unicode) + ): internal_url = _make_internal_url(data) with cython_context(): self.__proxy__.load_autodetect(internal_url, dtype) - elif ((HAS_NUMPY and isinstance(data, numpy.ndarray)) - or isinstance(data, array.array) - or isinstance(data, collections.Sequence)): + elif ( + (HAS_NUMPY and isinstance(data, numpy.ndarray)) + or isinstance(data, array.array) + or isinstance(data, collections.Sequence) + ): with cython_context(): self.__proxy__.load_from_iterable(data, dtype, ignore_cast_failure) else: - raise TypeError("Unexpected data source. " \ - "Possible data source types are: list, " \ - "numpy.ndarray, pandas.Series, and string(url)") + raise TypeError( + "Unexpected data source. " + "Possible data source types are: list, " + "numpy.ndarray, pandas.Series, and string(url)" + ) @classmethod - def date_range(cls,start_time,end_time,freq): - ''' + def date_range(cls, start_time, end_time, freq): + """ Returns a new SArray that represents a fixed frequency datetime index. Parameters @@ -505,19 +536,26 @@ def date_range(cls,start_time,end_time,freq): datetime.datetime(2013, 5, 8, 10, 4, 10), datetime.datetime(2013, 5, 9, 10, 4, 10), datetime.datetime(2013, 5, 10, 10, 4, 10)] - ''' + """ - if not isinstance(start_time,datetime.datetime): - raise TypeError("The ``start_time`` argument must be from type datetime.datetime.") + if not isinstance(start_time, datetime.datetime): + raise TypeError( + "The ``start_time`` argument must be from type datetime.datetime." + ) - if not isinstance(end_time,datetime.datetime): - raise TypeError("The ``end_time`` argument must be from type datetime.datetime.") + if not isinstance(end_time, datetime.datetime): + raise TypeError( + "The ``end_time`` argument must be from type datetime.datetime." + ) - if not isinstance(freq,datetime.timedelta): - raise TypeError("The ``freq`` argument must be from type datetime.timedelta.") + if not isinstance(freq, datetime.timedelta): + raise TypeError( + "The ``freq`` argument must be from type datetime.timedelta." + ) from .. import extensions - return extensions.date_range(start_time,end_time,freq.total_seconds()) + + return extensions.date_range(start_time, end_time, freq.total_seconds()) @classmethod def from_const(cls, value, size, dtype=type(None)): @@ -545,9 +583,14 @@ def from_const(cls, value, size, dtype=type(None)): >>> turicreate.SArray.from_const(None, 10, str) """ - assert isinstance(size, (int, long)) and size >= 0, "size must be a positive int" - if not isinstance(value, (type(None), int, float, str, array.array, list, dict, datetime.datetime)): - raise TypeError('Cannot create sarray of value type %s' % str(type(value))) + assert ( + isinstance(size, (int, long)) and size >= 0 + ), "size must be a positive int" + if not isinstance( + value, + (type(None), int, float, str, array.array, list, dict, datetime.datetime), + ): + raise TypeError("Cannot create sarray of value type %s" % str(type(value))) proxy = UnitySArrayProxy() proxy.load_from_const(value, size, dtype) return cls(_proxy=proxy) @@ -603,7 +646,7 @@ def from_sequence(cls, *args): size = stop - start # this matches the behavior of range # i.e. range(100,10) just returns an empty array - if (size < 0): + if size < 0: size = 0 return _create_sequential_sarray(size, start) @@ -611,8 +654,8 @@ def from_sequence(cls, *args): def read_json(cls, filename): """ Construct an SArray from a json file or glob of json files. - The json file must contain a list. Every element in the list - must also have the same type. The returned SArray type will be + The json file must contain a list. Every element in the list + must also have the same type. The returned SArray type will be inferred from the elements type. Parameters @@ -633,7 +676,7 @@ def read_json(cls, filename): """ proxy = UnitySArrayProxy() proxy.load_from_json_record_files(_make_internal_url(filename)) - return cls(_proxy = proxy) + return cls(_proxy=proxy) @classmethod def where(cls, condition, istrue, isfalse, dtype=None): @@ -715,10 +758,16 @@ def where(cls, condition, istrue, isfalse, dtype=None): elif type(istrue) == type(isfalse): dtype = type(istrue) if dtype is None: - raise TypeError("Both true and false are None. Resultant type cannot be inferred.") + raise TypeError( + "Both true and false are None. Resultant type cannot be inferred." + ) istrue = cls(_proxy=condition.__proxy__.to_const(istrue, dtype)) isfalse = cls(_proxy=condition.__proxy__.to_const(isfalse, dtype)) - return cls(_proxy=condition.__proxy__.ternary_operator(istrue.__proxy__, isfalse.__proxy__)) + return cls( + _proxy=condition.__proxy__.ternary_operator( + istrue.__proxy__, isfalse.__proxy__ + ) + ) def to_numpy(self): """ @@ -733,8 +782,9 @@ def to_numpy(self): A Numpy Array containing all the values of the SArray """ - assert HAS_NUMPY, 'numpy is not installed.' + assert HAS_NUMPY, "numpy is not installed." import numpy + return numpy.asarray(self) def __get_content_identifier__(self): @@ -774,17 +824,19 @@ def save(self, filename, format=None): from .sframe import SFrame as _SFrame if format is None: - if filename.endswith(('.csv', '.csv.gz', 'txt')): - format = 'text' + if filename.endswith((".csv", ".csv.gz", "txt")): + format = "text" else: - format = 'binary' - if format == 'binary': + format = "binary" + if format == "binary": with cython_context(): self.__proxy__.save(_make_internal_url(filename)) - elif format == 'text' or format == 'csv': - sf = _SFrame({'X1':self}) + elif format == "text" or format == "csv": + sf = _SFrame({"X1": self}) with cython_context(): - sf.__proxy__.save_as_csv(_make_internal_url(filename), {'header':False}) + sf.__proxy__.save_as_csv( + _make_internal_url(filename), {"header": False} + ) else: raise ValueError("Unsupported format: {}".format(format)) @@ -794,7 +846,7 @@ def __repr__(self): """ data_str = self.__str__() ret = "dtype: " + str(self.dtype.__name__) + "\n" - if (self.__has_size__()): + if self.__has_size__(): ret = ret + "Rows: " + str(len(self)) + "\n" else: ret = ret + "Rows: ?\n" @@ -812,10 +864,12 @@ def __str__(self): else: if sys.version_info.major < 3: headln = str(list(self.head(100))) - headln = unicode(headln.decode('string_escape'),'utf-8',errors='replace').encode('utf-8') + headln = unicode( + headln.decode("string_escape"), "utf-8", errors="replace" + ).encode("utf-8") else: headln = str(list(self.head(100))) - if (self.__proxy__.has_size() is False or len(self) > 100): + if self.__proxy__.has_size() is False or len(self) > 100: # cut the last close bracket # and replace it with ... headln = headln[0:-1] + ", ... ]" @@ -827,7 +881,9 @@ def __nonzero__(self): The truth value of an array with more than one element is ambiguous. Use a.any() or a.all(). """ # message copied from Numpy - raise ValueError("The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()") + raise ValueError( + "The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()" + ) def __bool__(self): """ @@ -835,7 +891,9 @@ def __bool__(self): The truth value of an array with more than one element is ambiguous. Use a.any() or a.all(). """ # message copied from Numpy - raise ValueError("The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()") + raise ValueError( + "The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()" + ) def __len__(self): """ @@ -847,11 +905,12 @@ def __iter__(self): """ Provides an iterator to the contents of the array. """ + def generator(): elems_at_a_time = 262144 self.__proxy__.begin_iterator() ret = self.__proxy__.iterator_get_next(elems_at_a_time) - while(True): + while True: for j in ret: yield j @@ -937,8 +996,7 @@ def contains(self, item): -------- is_in """ - return SArray(_proxy = self.__proxy__.left_scalar_operator(item, 'in')) - + return SArray(_proxy=self.__proxy__.left_scalar_operator(item, "in")) def is_in(self, other): """ @@ -980,7 +1038,7 @@ def is_in(self, other): -------- contains """ - return SArray(_proxy = self.__proxy__.right_scalar_operator(other, 'in')) + return SArray(_proxy=self.__proxy__.right_scalar_operator(other, "in")) # XXX: all of these functions are highly repetitive def __add__(self, other): @@ -991,9 +1049,11 @@ def __add__(self, other): """ with cython_context(): if type(other) is SArray: - return SArray(_proxy = self.__proxy__.vector_operator(other.__proxy__, '+')) + return SArray( + _proxy=self.__proxy__.vector_operator(other.__proxy__, "+") + ) else: - return SArray(_proxy = self.__proxy__.left_scalar_operator(other, '+')) + return SArray(_proxy=self.__proxy__.left_scalar_operator(other, "+")) def __sub__(self, other): """ @@ -1003,9 +1063,11 @@ def __sub__(self, other): """ with cython_context(): if type(other) is SArray: - return SArray(_proxy = self.__proxy__.vector_operator(other.__proxy__, '-')) + return SArray( + _proxy=self.__proxy__.vector_operator(other.__proxy__, "-") + ) else: - return SArray(_proxy = self.__proxy__.left_scalar_operator(other, '-')) + return SArray(_proxy=self.__proxy__.left_scalar_operator(other, "-")) def __mul__(self, other): """ @@ -1015,9 +1077,11 @@ def __mul__(self, other): """ with cython_context(): if type(other) is SArray: - return SArray(_proxy = self.__proxy__.vector_operator(other.__proxy__, '*')) + return SArray( + _proxy=self.__proxy__.vector_operator(other.__proxy__, "*") + ) else: - return SArray(_proxy = self.__proxy__.left_scalar_operator(other, '*')) + return SArray(_proxy=self.__proxy__.left_scalar_operator(other, "*")) def __div__(self, other): """ @@ -1027,9 +1091,11 @@ def __div__(self, other): """ with cython_context(): if type(other) is SArray: - return SArray(_proxy = self.__proxy__.vector_operator(other.__proxy__, '/')) + return SArray( + _proxy=self.__proxy__.vector_operator(other.__proxy__, "/") + ) else: - return SArray(_proxy = self.__proxy__.left_scalar_operator(other, '/')) + return SArray(_proxy=self.__proxy__.left_scalar_operator(other, "/")) def __truediv__(self, other): """ @@ -1039,10 +1105,11 @@ def __truediv__(self, other): """ with cython_context(): if type(other) is SArray: - return SArray(_proxy = self.__proxy__.vector_operator(other.__proxy__, '/')) + return SArray( + _proxy=self.__proxy__.vector_operator(other.__proxy__, "/") + ) else: - return SArray(_proxy = self.__proxy__.left_scalar_operator(other, '/')) - + return SArray(_proxy=self.__proxy__.left_scalar_operator(other, "/")) def __floordiv__(self, other): """ @@ -1052,9 +1119,11 @@ def __floordiv__(self, other): """ with cython_context(): if type(other) is SArray: - return SArray(_proxy = self.__proxy__.vector_operator(other.__proxy__, '//')) + return SArray( + _proxy=self.__proxy__.vector_operator(other.__proxy__, "//") + ) else: - return SArray(_proxy = self.__proxy__.left_scalar_operator(other, '//')) + return SArray(_proxy=self.__proxy__.left_scalar_operator(other, "//")) def __pow__(self, other): """ @@ -1065,31 +1134,35 @@ def __pow__(self, other): """ with cython_context(): if type(other) is SArray: - return SArray(_proxy = self.__proxy__.vector_operator(other.__proxy__, '**')) + return SArray( + _proxy=self.__proxy__.vector_operator(other.__proxy__, "**") + ) else: - return SArray(_proxy = self.__proxy__.left_scalar_operator(other, '**')) + return SArray(_proxy=self.__proxy__.left_scalar_operator(other, "**")) def __neg__(self): """ Returns the negative of each element. """ with cython_context(): - return SArray(_proxy = self.__proxy__.right_scalar_operator(0, '-')) + return SArray(_proxy=self.__proxy__.right_scalar_operator(0, "-")) def __pos__(self): if self.dtype not in [int, long, float, array.array]: - raise RuntimeError("Runtime Exception. Unsupported type operation. " - "cannot perform operation + on type %s" % str(self.dtype)) + raise RuntimeError( + "Runtime Exception. Unsupported type operation. " + "cannot perform operation + on type %s" % str(self.dtype) + ) with cython_context(): - return SArray(_proxy = self.__proxy__) + return SArray(_proxy=self.__proxy__) def __abs__(self): """ Returns the absolute value of each element. """ with cython_context(): - return SArray(_proxy = self.__proxy__.left_scalar_operator(0, 'left_abs')) + return SArray(_proxy=self.__proxy__.left_scalar_operator(0, "left_abs")) def __mod__(self, other): """ @@ -1097,10 +1170,11 @@ def __mod__(self, other): """ with cython_context(): if type(other) is SArray: - return SArray(_proxy = self.__proxy__.vector_operator(other.__proxy__, '%')) + return SArray( + _proxy=self.__proxy__.vector_operator(other.__proxy__, "%") + ) else: - return SArray(_proxy = self.__proxy__.left_scalar_operator(other, '%')) - + return SArray(_proxy=self.__proxy__.left_scalar_operator(other, "%")) def __lt__(self, other): """ @@ -1110,9 +1184,11 @@ def __lt__(self, other): """ with cython_context(): if type(other) is SArray: - return SArray(_proxy = self.__proxy__.vector_operator(other.__proxy__, '<')) + return SArray( + _proxy=self.__proxy__.vector_operator(other.__proxy__, "<") + ) else: - return SArray(_proxy = self.__proxy__.left_scalar_operator(other, '<')) + return SArray(_proxy=self.__proxy__.left_scalar_operator(other, "<")) def __gt__(self, other): """ @@ -1122,10 +1198,11 @@ def __gt__(self, other): """ with cython_context(): if type(other) is SArray: - return SArray(_proxy = self.__proxy__.vector_operator(other.__proxy__, '>')) + return SArray( + _proxy=self.__proxy__.vector_operator(other.__proxy__, ">") + ) else: - return SArray(_proxy = self.__proxy__.left_scalar_operator(other, '>')) - + return SArray(_proxy=self.__proxy__.left_scalar_operator(other, ">")) def __le__(self, other): """ @@ -1135,10 +1212,11 @@ def __le__(self, other): """ with cython_context(): if type(other) is SArray: - return SArray(_proxy = self.__proxy__.vector_operator(other.__proxy__, '<=')) + return SArray( + _proxy=self.__proxy__.vector_operator(other.__proxy__, "<=") + ) else: - return SArray(_proxy = self.__proxy__.left_scalar_operator(other, '<=')) - + return SArray(_proxy=self.__proxy__.left_scalar_operator(other, "<=")) def __ge__(self, other): """ @@ -1148,10 +1226,11 @@ def __ge__(self, other): """ with cython_context(): if type(other) is SArray: - return SArray(_proxy = self.__proxy__.vector_operator(other.__proxy__, '>=')) + return SArray( + _proxy=self.__proxy__.vector_operator(other.__proxy__, ">=") + ) else: - return SArray(_proxy = self.__proxy__.left_scalar_operator(other, '>=')) - + return SArray(_proxy=self.__proxy__.left_scalar_operator(other, ">=")) def __radd__(self, other): """ @@ -1159,8 +1238,7 @@ def __radd__(self, other): Returned array has the same type as the array on the right hand side """ with cython_context(): - return SArray(_proxy = self.__proxy__.right_scalar_operator(other, '+')) - + return SArray(_proxy=self.__proxy__.right_scalar_operator(other, "+")) def __rsub__(self, other): """ @@ -1168,8 +1246,7 @@ def __rsub__(self, other): Returned array has the same type as the array on the right hand side """ with cython_context(): - return SArray(_proxy = self.__proxy__.right_scalar_operator(other, '-')) - + return SArray(_proxy=self.__proxy__.right_scalar_operator(other, "-")) def __rmul__(self, other): """ @@ -1177,8 +1254,7 @@ def __rmul__(self, other): Returned array has the same type as the array on the right hand side """ with cython_context(): - return SArray(_proxy = self.__proxy__.right_scalar_operator(other, '*')) - + return SArray(_proxy=self.__proxy__.right_scalar_operator(other, "*")) def __rdiv__(self, other): """ @@ -1186,7 +1262,7 @@ def __rdiv__(self, other): Returned array has the same type as the array on the right hand side """ with cython_context(): - return SArray(_proxy = self.__proxy__.right_scalar_operator(other, '/')) + return SArray(_proxy=self.__proxy__.right_scalar_operator(other, "/")) def __rtruediv__(self, other): """ @@ -1194,7 +1270,7 @@ def __rtruediv__(self, other): Returned array has the same type as the array on the right hand side """ with cython_context(): - return SArray(_proxy = self.__proxy__.right_scalar_operator(other, '/')) + return SArray(_proxy=self.__proxy__.right_scalar_operator(other, "/")) def __rfloordiv__(self, other): """ @@ -1203,8 +1279,9 @@ def __rfloordiv__(self, other): right hand side """ with cython_context(): - return SArray(_proxy = self.__proxy__.right_scalar_operator(other, '/')).astype(int) - + return SArray( + _proxy=self.__proxy__.right_scalar_operator(other, "/") + ).astype(int) def __rmod__(self, other): """ @@ -1213,7 +1290,7 @@ def __rmod__(self, other): right hand side """ with cython_context(): - return SArray(_proxy = self.__proxy__.right_scalar_operator(other, '%')) + return SArray(_proxy=self.__proxy__.right_scalar_operator(other, "%")) def __rpow__(self, other): """ @@ -1221,7 +1298,7 @@ def __rpow__(self, other): value, returning floor of the result. """ with cython_context(): - return SArray(_proxy = self.__proxy__.right_scalar_operator(other, '**')) + return SArray(_proxy=self.__proxy__.right_scalar_operator(other, "**")) def __eq__(self, other): """ @@ -1231,10 +1308,11 @@ def __eq__(self, other): """ with cython_context(): if type(other) is SArray: - return SArray(_proxy = self.__proxy__.vector_operator(other.__proxy__, '==')) + return SArray( + _proxy=self.__proxy__.vector_operator(other.__proxy__, "==") + ) else: - return SArray(_proxy = self.__proxy__.left_scalar_operator(other, '==')) - + return SArray(_proxy=self.__proxy__.left_scalar_operator(other, "==")) def __ne__(self, other): """ @@ -1244,10 +1322,11 @@ def __ne__(self, other): """ with cython_context(): if type(other) is SArray: - return SArray(_proxy = self.__proxy__.vector_operator(other.__proxy__, '!=')) + return SArray( + _proxy=self.__proxy__.vector_operator(other.__proxy__, "!=") + ) else: - return SArray(_proxy = self.__proxy__.left_scalar_operator(other, '!=')) - + return SArray(_proxy=self.__proxy__.left_scalar_operator(other, "!=")) def __and__(self, other): """ @@ -1255,10 +1334,13 @@ def __and__(self, other): """ if type(other) is SArray: with cython_context(): - return SArray(_proxy = self.__proxy__.vector_operator(other.__proxy__, '&')) + return SArray( + _proxy=self.__proxy__.vector_operator(other.__proxy__, "&") + ) else: - raise TypeError("SArray can only perform logical and against another SArray") - + raise TypeError( + "SArray can only perform logical and against another SArray" + ) def __or__(self, other): """ @@ -1266,11 +1348,12 @@ def __or__(self, other): """ if type(other) is SArray: with cython_context(): - return SArray(_proxy = self.__proxy__.vector_operator(other.__proxy__, '|')) + return SArray( + _proxy=self.__proxy__.vector_operator(other.__proxy__, "|") + ) else: raise TypeError("SArray can only perform logical or against another SArray") - def __has_size__(self): """ Returns whether or not the size of the SArray is known. @@ -1311,15 +1394,17 @@ def __getitem__(self, other): lb = block_num * block_size ub = min(sa_len, lb + block_size) - val_list = list(SArray(_proxy = self.__proxy__.copy_range(lb, 1, ub))) + val_list = list(SArray(_proxy=self.__proxy__.copy_range(lb, 1, ub))) self._getitem_cache = (lb, ub, val_list) return val_list[other - lb] elif type(other) is SArray: if self.__has_size__() and other.__has_size__() and len(other) != len(self): - raise IndexError("Cannot perform logical indexing on arrays of different length.") + raise IndexError( + "Cannot perform logical indexing on arrays of different length." + ) with cython_context(): - return SArray(_proxy = self.__proxy__.logical_filter(other.__proxy__)) + return SArray(_proxy=self.__proxy__.logical_filter(other.__proxy__)) elif type(other) is slice: sa_len = len(self) @@ -1338,7 +1423,7 @@ def __getitem__(self, other): if stop < 0: stop = sa_len + stop - return SArray(_proxy = self.__proxy__.copy_range(start, step, stop)) + return SArray(_proxy=self.__proxy__.copy_range(start, step, stop)) else: raise IndexError("Invalid type to use for indexing") @@ -1579,7 +1664,6 @@ def dict_trim_by_keys(self, keys, exclude=True): if not _is_non_string_iterable(keys): keys = [keys] - with cython_context(): return SArray(_proxy=self.__proxy__.dict_trim_by_keys(keys, exclude)) @@ -1630,7 +1714,6 @@ def dict_trim_by_values(self, lower=None, upper=None): if not (upper is None or isinstance(upper, numbers.Number)): raise TypeError("upper bound has to be a numeric value") - with cython_context(): return SArray(_proxy=self.__proxy__.dict_trim_by_values(lower, upper)) @@ -1725,7 +1808,6 @@ def dict_has_any_keys(self, keys): if not _is_non_string_iterable(keys): keys = [keys] - with cython_context(): return SArray(_proxy=self.__proxy__.dict_has_any_keys(keys)) @@ -1763,7 +1845,6 @@ def dict_has_all_keys(self, keys): if not _is_non_string_iterable(keys): keys = [keys] - with cython_context(): return SArray(_proxy=self.__proxy__.dict_has_all_keys(keys)) @@ -1846,6 +1927,7 @@ def apply(self, fn, dtype=None, skip_na=True): nativefn = None try: from .. import extensions + nativefn = extensions._build_native_function_call(fn) except: # failure are fine. we just fall out into the next few phases @@ -1855,12 +1937,15 @@ def apply(self, fn, dtype=None, skip_na=True): # this is a toolkit lambda. We can do something about it nativefn.native_fn_name = nativefn.native_fn_name.encode() with cython_context(): - return SArray(_proxy=self.__proxy__.transform_native(nativefn, dtype, skip_na, seed)) + return SArray( + _proxy=self.__proxy__.transform_native( + nativefn, dtype, skip_na, seed + ) + ) with cython_context(): return SArray(_proxy=self.__proxy__.transform(fn, dtype, skip_na, seed)) - def filter(self, fn, skip_na=True, seed=None): """ Filter this SArray by a function. @@ -1899,11 +1984,9 @@ def filter(self, fn, skip_na=True, seed=None): if seed is None: seed = abs(hash("%0.20f" % time.time())) % (2 ** 31) - with cython_context(): return SArray(_proxy=self.__proxy__.filter(fn, skip_na, seed)) - def sample(self, fraction, seed=None, exact=False): """ Create an SArray which contains a subsample of the current SArray. @@ -1935,14 +2018,13 @@ def sample(self, fraction, seed=None, exact=False): Rows: 3 [2, 6, 9] """ - if (fraction > 1 or fraction < 0): - raise ValueError('Invalid sampling rate: ' + str(fraction)) - if (len(self) == 0): + if fraction > 1 or fraction < 0: + raise ValueError("Invalid sampling rate: " + str(fraction)) + if len(self) == 0: return SArray() if seed is None: seed = abs(hash("%0.20f" % time.time())) % (2 ** 31) - with cython_context(): return SArray(_proxy=self.__proxy__.sample(fraction, seed, exact)) @@ -1982,7 +2064,6 @@ def _save_as_text(self, url): """ raise NotImplementedError - def all(self): """ Return True if every element of the SArray evaluates to True. For @@ -2020,7 +2101,6 @@ def all(self): with cython_context(): return self.__proxy__.all() - def any(self): """ Return True if any element of the SArray evaluates to True. For numeric @@ -2057,7 +2137,6 @@ def any(self): with cython_context(): return self.__proxy__.any() - def max(self): """ Get maximum numeric value in SArray. @@ -2082,7 +2161,6 @@ def max(self): with cython_context(): return self.__proxy__.max() - def min(self): """ Get minimum numeric value in SArray. @@ -2132,12 +2210,15 @@ def argmax(self): if len(self) == 0: return None - if not any([isinstance(self[0], i) for i in [int,float,long]]): + if not any([isinstance(self[0], i) for i in [int, float, long]]): raise TypeError("SArray must be of type 'int', 'long', or 'float'.") sf = _SFrame(self).add_row_number() - sf_out = sf.groupby(key_column_names=[],operations={'maximum_x1': _aggregate.ARGMAX('X1','id')}) - return sf_out['maximum_x1'][0] + sf_out = sf.groupby( + key_column_names=[], + operations={"maximum_x1": _aggregate.ARGMAX("X1", "id")}, + ) + return sf_out["maximum_x1"][0] def argmin(self): """ @@ -2164,13 +2245,15 @@ def argmin(self): if len(self) == 0: return None - if not any([isinstance(self[0], i) for i in [int,float,long]]): + if not any([isinstance(self[0], i) for i in [int, float, long]]): raise TypeError("SArray must be of type 'int', 'long', or 'float'.") sf = _SFrame(self).add_row_number() - sf_out = sf.groupby(key_column_names=[],operations={'minimum_x1': _aggregate.ARGMIN('X1','id')}) - return sf_out['minimum_x1'][0] - + sf_out = sf.groupby( + key_column_names=[], + operations={"minimum_x1": _aggregate.ARGMIN("X1", "id")}, + ) + return sf_out["minimum_x1"][0] def sum(self): """ @@ -2205,12 +2288,12 @@ def mean(self): """ with cython_context(): if self.dtype == _Image: - from .. import extensions + from .. import extensions + return extensions.generate_mean(self) else: return self.__proxy__.mean() - def std(self, ddof=0): """ Standard deviation of all the values in the SArray. @@ -2231,7 +2314,6 @@ def std(self, ddof=0): with cython_context(): return self.__proxy__.std(ddof) - def var(self, ddof=0): """ Variance of all the values in the SArray. @@ -2276,7 +2358,7 @@ def nnz(self): with cython_context(): return self.__proxy__.nnz() - def datetime_to_str(self,format="%Y-%m-%dT%H:%M:%S%ZP"): + def datetime_to_str(self, format="%Y-%m-%dT%H:%M:%S%ZP"): """ Create a new SArray with all the values cast to str. The string format is specified by the 'format' parameter. @@ -2309,13 +2391,15 @@ def datetime_to_str(self,format="%Y-%m-%dT%H:%M:%S%ZP"): [1] Boost date time from string conversion guide (http://www.boost.org/doc/libs/1_48_0/doc/html/date_time/date_time_io.html) """ - if(self.dtype != datetime.datetime): - raise TypeError("datetime_to_str expects SArray of datetime as input SArray") + if self.dtype != datetime.datetime: + raise TypeError( + "datetime_to_str expects SArray of datetime as input SArray" + ) with cython_context(): return SArray(_proxy=self.__proxy__.datetime_to_str(format)) - def str_to_datetime(self,format="%Y-%m-%dT%H:%M:%S%ZP"): + def str_to_datetime(self, format="%Y-%m-%dT%H:%M:%S%ZP"): """ Create a new SArray with all the values cast to datetime. The string format is specified by the 'format' parameter. @@ -2347,13 +2431,15 @@ def str_to_datetime(self,format="%Y-%m-%dT%H:%M:%S%ZP"): [1] boost date time to string conversion guide (http://www.boost.org/doc/libs/1_48_0/doc/html/date_time/date_time_io.html) """ - if(self.dtype != str): + if self.dtype != str: raise TypeError("str_to_datetime expects SArray of str as input SArray") with cython_context(): return SArray(_proxy=self.__proxy__.str_to_datetime(format)) - def pixel_array_to_image(self, width, height, channels, undefined_on_failure=True, allow_rounding=False): + def pixel_array_to_image( + self, width, height, channels, undefined_on_failure=True, allow_rounding=False + ): """ Create a new SArray with all the values cast to :py:class:`turicreate.image.Image` of uniform size. @@ -2397,7 +2483,7 @@ def pixel_array_to_image(self, width, height, channels, undefined_on_failure=Tru >>> mnist_img_sarray = tc.SArray.pixel_array_to_image(scaled_mnist_array, 28, 28, 1, allow_rounding = True) """ - if(self.dtype != array.array): + if self.dtype != array.array: raise TypeError("array_to_img expects SArray of arrays as input SArray") num_to_test = 10 @@ -2406,17 +2492,25 @@ def pixel_array_to_image(self, width, height, channels, undefined_on_failure=Tru mod_values = [val % 1 for x in range(num_test) for val in self[x]] - out_of_range_values = [(val > 255 or val < 0) for x in range(num_test) for val in self[x]] + out_of_range_values = [ + (val > 255 or val < 0) for x in range(num_test) for val in self[x] + ] if sum(mod_values) != 0.0 and not allow_rounding: - raise ValueError("There are non-integer values in the array data. Images only support integer data values between 0 and 255. To permit rounding, set the 'allow_rounding' parameter to 1.") + raise ValueError( + "There are non-integer values in the array data. Images only support integer data values between 0 and 255. To permit rounding, set the 'allow_rounding' parameter to 1." + ) if sum(out_of_range_values) != 0: - raise ValueError("There are values outside the range of 0 to 255. Images only support integer data values between 0 and 255.") - + raise ValueError( + "There are values outside the range of 0 to 255. Images only support integer data values between 0 and 255." + ) from .. import extensions - return extensions.vector_sarray_to_image_sarray(self, width, height, channels, undefined_on_failure) + + return extensions.vector_sarray_to_image_sarray( + self, width, height, channels, undefined_on_failure + ) def astype(self, dtype, undefined_on_failure=False): """ @@ -2466,16 +2560,19 @@ def astype(self, dtype, undefined_on_failure=False): """ if (dtype == _Image) and (self.dtype == array.array): - raise TypeError("Cannot cast from image type to array with sarray.astype(). Please use sarray.pixel_array_to_image() instead.") - - if float('nan') in self: + raise TypeError( + "Cannot cast from image type to array with sarray.astype(). Please use sarray.pixel_array_to_image() instead." + ) + + if float("nan") in self: import turicreate as _tc - self=_tc.SArray.where(self == float("nan"),None,self) + + self = _tc.SArray.where(self == float("nan"), None, self) with cython_context(): return SArray(_proxy=self.__proxy__.astype(dtype, undefined_on_failure)) - def clip(self, lower=float('nan'), upper=float('nan')): + def clip(self, lower=float("nan"), upper=float("nan")): """ Create a new SArray with each value clipped to be within the given bounds. @@ -2546,8 +2643,7 @@ def clip_lower(self, threshold): [2, 2, 3] """ with cython_context(): - return SArray(_proxy=self.__proxy__.clip(threshold, float('nan'))) - + return SArray(_proxy=self.__proxy__.clip(threshold, float("nan"))) def clip_upper(self, threshold): """ @@ -2577,7 +2673,7 @@ def clip_upper(self, threshold): [1, 2, 2] """ with cython_context(): - return SArray(_proxy=self.__proxy__.clip(float('nan'), threshold)) + return SArray(_proxy=self.__proxy__.clip(float("nan"), threshold)) def tail(self, n=10): """ @@ -2596,7 +2692,6 @@ def tail(self, n=10): with cython_context(): return SArray(_proxy=self.__proxy__.tail(n)) - def dropna(self): """ Create new SArray containing only the non-missing values of the @@ -2611,9 +2706,8 @@ def dropna(self): The new SArray with missing values removed. """ - with cython_context(): - return SArray(_proxy = self.__proxy__.drop_missing_values()) + return SArray(_proxy=self.__proxy__.drop_missing_values()) def fillna(self, value): """ @@ -2637,7 +2731,7 @@ def fillna(self, value): """ with cython_context(): - return SArray(_proxy = self.__proxy__.fill_missing_values(value)) + return SArray(_proxy=self.__proxy__.fill_missing_values(value)) def is_topk(self, topk=10, reverse=False): """ @@ -2664,7 +2758,7 @@ def is_topk(self, topk=10, reverse=False): This is used internally by SFrame's topk function. """ with cython_context(): - return SArray(_proxy = self.__proxy__.topk_index(topk, reverse)) + return SArray(_proxy=self.__proxy__.topk_index(topk, reverse)) def summary(self, background=False, sub_sketch_keys=None): """ @@ -2696,28 +2790,37 @@ def summary(self, background=False, sub_sketch_keys=None): Many of the statistics are approximate. """ from ..data_structures.sketch import Sketch - if (self.dtype == _Image): + + if self.dtype == _Image: raise TypeError("summary() is not supported for arrays of image type") - if (type(background) != bool): + if type(background) != bool: raise TypeError("'background' parameter has to be a boolean value") - if (sub_sketch_keys is not None): - if (self.dtype != dict and self.dtype != array.array): - raise TypeError("sub_sketch_keys is only supported for SArray of dictionary or array type") + if sub_sketch_keys is not None: + if self.dtype != dict and self.dtype != array.array: + raise TypeError( + "sub_sketch_keys is only supported for SArray of dictionary or array type" + ) if not _is_non_string_iterable(sub_sketch_keys): sub_sketch_keys = [sub_sketch_keys] value_types = set([type(i) for i in sub_sketch_keys]) - if (len(value_types) != 1): - raise ValueError("sub_sketch_keys member values need to have the same type.") + if len(value_types) != 1: + raise ValueError( + "sub_sketch_keys member values need to have the same type." + ) value_type = value_types.pop() - if (self.dtype == dict and value_type != str): - raise TypeError("Only string value(s) can be passed to sub_sketch_keys for SArray of dictionary type. "+ - "For dictionary types, sketch summary is computed by casting keys to string values.") - if (self.dtype == array.array and value_type != int): - raise TypeError("Only int value(s) can be passed to sub_sketch_keys for SArray of array type") + if self.dtype == dict and value_type != str: + raise TypeError( + "Only string value(s) can be passed to sub_sketch_keys for SArray of dictionary type. " + + "For dictionary types, sketch summary is computed by casting keys to string values." + ) + if self.dtype == array.array and value_type != int: + raise TypeError( + "Only int value(s) can be passed to sub_sketch_keys for SArray of array type" + ) else: sub_sketch_keys = list() - return Sketch(self, background, sub_sketch_keys = sub_sketch_keys) + return Sketch(self, background, sub_sketch_keys=sub_sketch_keys) def value_counts(self): """ @@ -2753,8 +2856,12 @@ def value_counts(self): [3 rows x 2 columns] """ from .sframe import SFrame as _SFrame - return _SFrame({'value':self}).groupby('value', {'count':_aggregate.COUNT}).sort('count', ascending=False) + return ( + _SFrame({"value": self}) + .groupby("value", {"count": _aggregate.COUNT}) + .sort("count", ascending=False) + ) def append(self, other): """ @@ -2791,16 +2898,16 @@ def append(self, other): if self.dtype != other.dtype: if len(other) == 0: - other=other.astype(self.dtype) + other = other.astype(self.dtype) elif len(self) == 0: - self=self.astype(other.dtype) + self = self.astype(other.dtype) else: raise RuntimeError("Data types in both SArrays have to be the same") with cython_context(): - return SArray(_proxy = self.__proxy__.append(other.__proxy__)) + return SArray(_proxy=self.__proxy__.append(other.__proxy__)) def unique(self): """ @@ -2822,11 +2929,11 @@ def unique(self): from .sframe import SFrame as _SFrame tmp_sf = _SFrame() - tmp_sf.add_column(self, 'X1', inplace=True) + tmp_sf.add_column(self, "X1", inplace=True) - res = tmp_sf.groupby('X1',{}) + res = tmp_sf.groupby("X1", {}) - return SArray(_proxy=res['X1'].__proxy__) + return SArray(_proxy=res["X1"].__proxy__) def explore(self, title=None): """ @@ -2853,7 +2960,8 @@ def explore(self, title=None): >>> sa.explore(title="My Plot Title") """ from .sframe import SFrame as _SFrame - _SFrame({'SArray': self}).explore() + + _SFrame({"SArray": self}).explore() def show(self, title=LABEL_DEFAULT, xlabel=LABEL_DEFAULT, ylabel=LABEL_DEFAULT): """ @@ -2944,7 +3052,7 @@ def plot(self, title=LABEL_DEFAULT, xlabel=LABEL_DEFAULT, ylabel=LABEL_DEFAULT): ylabel = " " if title is None: - title = "" # C++ otherwise gets "None" as std::string + title = "" # C++ otherwise gets "None" as std::string if xlabel is None: xlabel = "" if ylabel is None: @@ -2982,12 +3090,13 @@ def item_length(self): Rows: 6 [2, 3, 3, 1, 2, None] """ - if (self.dtype not in [list, dict, array.array]): - raise TypeError("item_length() is only applicable for SArray of type list, dict and array.") - + if self.dtype not in [list, dict, array.array]: + raise TypeError( + "item_length() is only applicable for SArray of type list, dict and array." + ) with cython_context(): - return SArray(_proxy = self.__proxy__.item_length()) + return SArray(_proxy=self.__proxy__.item_length()) def random_split(self, fraction, seed=None): """ @@ -3022,12 +3131,13 @@ def random_split(self, fraction, seed=None): 922 102 """ from .sframe import SFrame + temporary_sf = SFrame() - temporary_sf['X1'] = self + temporary_sf["X1"] = self (train, test) = temporary_sf.random_split(fraction, seed) - return (train['X1'], test['X1']) + return (train["X1"], test["X1"]) - def split_datetime(self, column_name_prefix = "X", limit=None, timezone=False): + def split_datetime(self, column_name_prefix="X", limit=None, timezone=False): """ Splits an SArray of datetime type to multiple columns, return a new SFrame that contains expanded columns. A SArray of datetime will be @@ -3129,7 +3239,7 @@ def split_datetime(self, column_name_prefix = "X", limit=None, timezone=False): if column_name_prefix is None: column_name_prefix = "" if six.PY2 and type(column_name_prefix) == unicode: - column_name_prefix = column_name_prefix.encode('utf-8') + column_name_prefix = column_name_prefix.encode("utf-8") if type(column_name_prefix) != str: raise TypeError("'column_name_prefix' must be a string") @@ -3139,10 +3249,10 @@ def split_datetime(self, column_name_prefix = "X", limit=None, timezone=False): raise TypeError("'limit' must be a list") name_types = set([type(i) for i in limit]) - if (len(name_types) != 1): + if len(name_types) != 1: raise TypeError("'limit' contains values that are different types") - if (name_types.pop() != str): + if name_types.pop() != str: raise TypeError("'limit' must contain string values.") if len(set(limit)) != len(limit): @@ -3150,18 +3260,19 @@ def split_datetime(self, column_name_prefix = "X", limit=None, timezone=False): column_types = [] - if(limit is None): - limit = ['year','month','day','hour','minute','second'] + if limit is None: + limit = ["year", "month", "day", "hour", "minute", "second"] column_types = [int] * len(limit) - if(timezone == True): - limit += ['timezone'] + if timezone == True: + limit += ["timezone"] column_types += [float] - with cython_context(): - return _SFrame(_proxy=self.__proxy__.expand(column_name_prefix, limit, column_types)) + return _SFrame( + _proxy=self.__proxy__.expand(column_name_prefix, limit, column_types) + ) def stack(self, new_column_name=None, drop_na=False, new_column_type=None): """ @@ -3235,12 +3346,17 @@ def stack(self, new_column_name=None, drop_na=False, new_column_type=None): To drop that row, set drop_na=True in the parameters to stack. """ from .sframe import SFrame as _SFrame - return _SFrame({'SArray': self}).stack('SArray', - new_column_name=new_column_name, - drop_na=drop_na, - new_column_type=new_column_type) - def unpack(self, column_name_prefix = "X", column_types=None, na_value=None, limit=None): + return _SFrame({"SArray": self}).stack( + "SArray", + new_column_name=new_column_name, + drop_na=drop_na, + new_column_type=new_column_type, + ) + + def unpack( + self, column_name_prefix="X", column_types=None, na_value=None, limit=None + ): """ Convert an SArray of list, array, or dict type to an SFrame with multiple columns. @@ -3373,16 +3489,16 @@ def unpack(self, column_name_prefix = "X", column_types=None, na_value=None, lim if column_name_prefix is None: column_name_prefix = "" - if not(isinstance(column_name_prefix, six.string_types)): + if not (isinstance(column_name_prefix, six.string_types)): raise TypeError("'column_name_prefix' must be a string") # validate 'limit' if limit is not None: - if (not _is_non_string_iterable(limit)): + if not _is_non_string_iterable(limit): raise TypeError("'limit' must be a list") name_types = set([type(i) for i in limit]) - if (len(name_types) != 1): + if len(name_types) != 1: raise TypeError("'limit' contains values that are different types") # limit value should be numeric if unpacking sarray.array value @@ -3392,19 +3508,25 @@ def unpack(self, column_name_prefix = "X", column_types=None, na_value=None, lim if len(set(limit)) != len(limit): raise ValueError("'limit' contains duplicate values") - if (column_types is not None): + if column_types is not None: if not _is_non_string_iterable(column_types): raise TypeError("column_types must be a list") for column_type in column_types: - if (column_type not in (int, float, str, list, dict, array.array)): - raise TypeError("column_types contains unsupported types. Supported types are ['float', 'int', 'list', 'dict', 'str', 'array.array']") + if column_type not in (int, float, str, list, dict, array.array): + raise TypeError( + "column_types contains unsupported types. Supported types are ['float', 'int', 'list', 'dict', 'str', 'array.array']" + ) if limit is not None: if len(limit) != len(column_types): - raise ValueError("limit and column_types do not have the same length") + raise ValueError( + "limit and column_types do not have the same length" + ) elif self.dtype == dict: - raise ValueError("if 'column_types' is given, 'limit' has to be provided to unpack dict type.") + raise ValueError( + "if 'column_types' is given, 'limit' has to be provided to unpack dict type." + ) else: limit = range(len(column_types)) @@ -3412,7 +3534,9 @@ def unpack(self, column_name_prefix = "X", column_types=None, na_value=None, lim head_rows = self.head(100).dropna() lengths = [len(i) for i in head_rows] if len(lengths) == 0 or max(lengths) == 0: - raise RuntimeError("Cannot infer number of items from the SArray, SArray may be empty. please explicitly provide column types") + raise RuntimeError( + "Cannot infer number of items from the SArray, SArray may be empty. please explicitly provide column types" + ) # infer column types for dict type at server side, for list and array, infer from client side if self.dtype != dict: @@ -3428,16 +3552,29 @@ def unpack(self, column_name_prefix = "X", column_types=None, na_value=None, lim else: column_types = list() for i in limit: - t = [(x[i] if ((x is not None) and len(x) > i) else None) for x in head_rows] + t = [ + (x[i] if ((x is not None) and len(x) > i) else None) + for x in head_rows + ] column_types.append(infer_type_of_list(t)) - with cython_context(): - if (self.dtype == dict and column_types is None): + if self.dtype == dict and column_types is None: limit = limit if limit is not None else [] - return _SFrame(_proxy=self.__proxy__.unpack_dict(column_name_prefix.encode('utf-8'), limit, na_value)) + return _SFrame( + _proxy=self.__proxy__.unpack_dict( + column_name_prefix.encode("utf-8"), limit, na_value + ) + ) else: - return _SFrame(_proxy=self.__proxy__.unpack(column_name_prefix.encode('utf-8'), limit, column_types, na_value)) + return _SFrame( + _proxy=self.__proxy__.unpack( + column_name_prefix.encode("utf-8"), + limit, + column_types, + na_value, + ) + ) def sort(self, ascending=True): """ @@ -3467,10 +3604,12 @@ def sort(self, ascending=True): from .sframe import SFrame as _SFrame if self.dtype not in (int, float, str, datetime.datetime): - raise TypeError("Only sarray with type (int, float, str, datetime.datetime) can be sorted") + raise TypeError( + "Only sarray with type (int, float, str, datetime.datetime) can be sorted" + ) sf = _SFrame() - sf['a'] = self - return sf.sort('a', ascending)['a'] + sf["a"] = self + return sf.sort("a", ascending)["a"] def __check_min_observations(self, min_observations): if min_observations is None: @@ -3579,10 +3718,14 @@ def rolling_mean(self, window_start, window_end, min_observations=None): min_observations = self.__check_min_observations(min_observations) agg_op = None if self.dtype is array.array: - agg_op = '__builtin__vector__avg__' + agg_op = "__builtin__vector__avg__" else: - agg_op = '__builtin__avg__' - return SArray(_proxy=self.__proxy__.builtin_rolling_apply(agg_op, window_start, window_end, min_observations)) + agg_op = "__builtin__avg__" + return SArray( + _proxy=self.__proxy__.builtin_rolling_apply( + agg_op, window_start, window_end, min_observations + ) + ) def rolling_sum(self, window_start, window_end, min_observations=None): """ @@ -3684,10 +3827,14 @@ def rolling_sum(self, window_start, window_end, min_observations=None): min_observations = self.__check_min_observations(min_observations) agg_op = None if self.dtype is array.array: - agg_op = '__builtin__vector__sum__' + agg_op = "__builtin__vector__sum__" else: - agg_op = '__builtin__sum__' - return SArray(_proxy=self.__proxy__.builtin_rolling_apply(agg_op, window_start, window_end, min_observations)) + agg_op = "__builtin__sum__" + return SArray( + _proxy=self.__proxy__.builtin_rolling_apply( + agg_op, window_start, window_end, min_observations + ) + ) def rolling_max(self, window_start, window_end, min_observations=None): """ @@ -3786,8 +3933,12 @@ def rolling_max(self, window_start, window_end, min_observations=None): [None, None, 2, 3, 4] """ min_observations = self.__check_min_observations(min_observations) - agg_op = '__builtin__max__' - return SArray(_proxy=self.__proxy__.builtin_rolling_apply(agg_op, window_start, window_end, min_observations)) + agg_op = "__builtin__max__" + return SArray( + _proxy=self.__proxy__.builtin_rolling_apply( + agg_op, window_start, window_end, min_observations + ) + ) def rolling_min(self, window_start, window_end, min_observations=None): """ @@ -3886,8 +4037,12 @@ def rolling_min(self, window_start, window_end, min_observations=None): [None, None, 1, 2, 3] """ min_observations = self.__check_min_observations(min_observations) - agg_op = '__builtin__min__' - return SArray(_proxy=self.__proxy__.builtin_rolling_apply(agg_op, window_start, window_end, min_observations)) + agg_op = "__builtin__min__" + return SArray( + _proxy=self.__proxy__.builtin_rolling_apply( + agg_op, window_start, window_end, min_observations + ) + ) def rolling_var(self, window_start, window_end, min_observations=None): """ @@ -3986,8 +4141,12 @@ def rolling_var(self, window_start, window_end, min_observations=None): [None, None, 0.25, 0.25, 0.25] """ min_observations = self.__check_min_observations(min_observations) - agg_op = '__builtin__var__' - return SArray(_proxy=self.__proxy__.builtin_rolling_apply(agg_op, window_start, window_end, min_observations)) + agg_op = "__builtin__var__" + return SArray( + _proxy=self.__proxy__.builtin_rolling_apply( + agg_op, window_start, window_end, min_observations + ) + ) def rolling_stdv(self, window_start, window_end, min_observations=None): """ @@ -4087,8 +4246,12 @@ def rolling_stdv(self, window_start, window_end, min_observations=None): [None, None, 0.5, 0.5, 0.5] """ min_observations = self.__check_min_observations(min_observations) - agg_op = '__builtin__stdv__' - return SArray(_proxy=self.__proxy__.builtin_rolling_apply(agg_op, window_start, window_end, min_observations)) + agg_op = "__builtin__stdv__" + return SArray( + _proxy=self.__proxy__.builtin_rolling_apply( + agg_op, window_start, window_end, min_observations + ) + ) def rolling_count(self, window_start, window_end): """ @@ -4164,8 +4327,12 @@ def rolling_count(self, window_start, window_end): Rows: 5 [0, 1, 2, 2, 1] """ - agg_op = '__builtin__nonnull__count__' - return SArray(_proxy=self.__proxy__.builtin_rolling_apply(agg_op, window_start, window_end, 0)) + agg_op = "__builtin__nonnull__count__" + return SArray( + _proxy=self.__proxy__.builtin_rolling_apply( + agg_op, window_start, window_end, 0 + ) + ) def cumulative_sum(self): """ @@ -4195,7 +4362,7 @@ def cumulative_sum(self): [1, 3, 6, 10, 15] """ agg_op = "__builtin__cum_sum__" - return SArray(_proxy = self.__proxy__.builtin_cumulative_aggregate(agg_op)) + return SArray(_proxy=self.__proxy__.builtin_cumulative_aggregate(agg_op)) def cumulative_mean(self): """ @@ -4226,7 +4393,7 @@ def cumulative_mean(self): [1, 1.5, 2, 2.5, 3] """ agg_op = "__builtin__cum_avg__" - return SArray(_proxy = self.__proxy__.builtin_cumulative_aggregate(agg_op)) + return SArray(_proxy=self.__proxy__.builtin_cumulative_aggregate(agg_op)) def cumulative_min(self): """ @@ -4254,7 +4421,7 @@ def cumulative_min(self): [1, 1, 1, 1, 0] """ agg_op = "__builtin__cum_min__" - return SArray(_proxy = self.__proxy__.builtin_cumulative_aggregate(agg_op)) + return SArray(_proxy=self.__proxy__.builtin_cumulative_aggregate(agg_op)) def cumulative_max(self): """ @@ -4282,7 +4449,7 @@ def cumulative_max(self): [1, 1, 3, 4, 4] """ agg_op = "__builtin__cum_max__" - return SArray(_proxy = self.__proxy__.builtin_cumulative_aggregate(agg_op)) + return SArray(_proxy=self.__proxy__.builtin_cumulative_aggregate(agg_op)) def cumulative_std(self): """ @@ -4310,7 +4477,7 @@ def cumulative_std(self): [0.0, 0.5, 0.816496580927726, 1.118033988749895, 1.4142135623730951] """ agg_op = "__builtin__cum_std__" - return SArray(_proxy = self.__proxy__.builtin_cumulative_aggregate(agg_op)) + return SArray(_proxy=self.__proxy__.builtin_cumulative_aggregate(agg_op)) def cumulative_var(self): """ @@ -4338,7 +4505,7 @@ def cumulative_var(self): [0.0, 0.25, 0.6666666666666666, 1.25, 2.0] """ agg_op = "__builtin__cum_var__" - return SArray(_proxy = self.__proxy__.builtin_cumulative_aggregate(agg_op)) + return SArray(_proxy=self.__proxy__.builtin_cumulative_aggregate(agg_op)) def filter_by(self, values, exclude=False): @@ -4379,10 +4546,10 @@ def filter_by(self, values, exclude=False): from .sframe import SFrame as _SFrame - column_name = 'sarray' + column_name = "sarray" # Convert values to SArray - if not isinstance(values, SArray): #type(values) is not SArray: + if not isinstance(values, SArray): # type(values) is not SArray: # If we were given a single element, try to put in list and convert # to SArray if not _is_non_string_iterable(values): @@ -4392,7 +4559,7 @@ def filter_by(self, values, exclude=False): # Convert values to SFrame value_sf = _SFrame() value_sf.add_column(values, column_name, inplace=True) - given_type = value_sf.column_types()[0] #value column type + given_type = value_sf.column_types()[0] # value column type existing_type = self.dtype sarray_sf = _SFrame() @@ -4409,29 +4576,33 @@ def filter_by(self, values, exclude=False): if exclude: id_name = "id" value_sf = value_sf.add_row_number(id_name) - tmp = _SFrame(_proxy=sarray_sf.__proxy__.join(value_sf.__proxy__, - 'left', - {column_name:column_name})) + tmp = _SFrame( + _proxy=sarray_sf.__proxy__.join( + value_sf.__proxy__, "left", {column_name: column_name} + ) + ) ret_sf = tmp[tmp[id_name] == None] return ret_sf[column_name] else: - ret_sf = _SFrame(_proxy=sarray_sf.__proxy__.join(value_sf.__proxy__, - 'inner', - {column_name:column_name})) + ret_sf = _SFrame( + _proxy=sarray_sf.__proxy__.join( + value_sf.__proxy__, "inner", {column_name: column_name} + ) + ) return ret_sf[column_name] def __copy__(self): """ Returns a shallow copy of the sarray. """ - return SArray(_proxy = self.__proxy__) + return SArray(_proxy=self.__proxy__) def __deepcopy__(self, memo): """ Returns a deep copy of the sarray. As the data in an SArray is immutable, this is identical to __copy__. """ - return SArray(_proxy = self.__proxy__) + return SArray(_proxy=self.__proxy__) def abs(self): """ diff --git a/src/python/turicreate/data_structures/sarray_builder.py b/src/python/turicreate/data_structures/sarray_builder.py index 44f63fe525..bcb09b7903 100644 --- a/src/python/turicreate/data_structures/sarray_builder.py +++ b/src/python/turicreate/data_structures/sarray_builder.py @@ -3,9 +3,9 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" An interface for creating an SArray over time. -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -13,6 +13,7 @@ from .._cython.cy_sarray_builder import UnitySArrayBuilderProxy from .sarray import SArray + class SArrayBuilder(object): """ An interface to incrementally build an SArray element by element. @@ -50,6 +51,7 @@ class SArrayBuilder(object): Rows: 3 [1, 2, 3] """ + def __init__(self, dtype, num_segments=1, history_size=10): self._builder = UnitySArrayBuilderProxy() self._builder.init(num_segments, history_size, dtype) @@ -93,7 +95,7 @@ def append_multiple(self, data, segment=0): any value in segment 0, and the order of elements in each segment is preserved as they are added. """ - if not hasattr(data, '__iter__'): + if not hasattr(data, "__iter__"): raise TypeError("append_multiple must be passed an iterable object") tmp_list = [] diff --git a/src/python/turicreate/data_structures/sframe.py b/src/python/turicreate/data_structures/sframe.py index 2a99714194..f86374e64c 100644 --- a/src/python/turicreate/data_structures/sframe.py +++ b/src/python/turicreate/data_structures/sframe.py @@ -3,13 +3,13 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" This module defines the SFrame class which provides the ability to create, access and manipulate a remote scalable dataframe object. SFrame acts similarly to pandas.DataFrame, but the data is completely immutable and is stored column wise. -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -41,18 +41,23 @@ import csv from collections import Iterable as _Iterable -__all__ = ['SFrame'] +__all__ = ["SFrame"] __LOGGER__ = _logging.getLogger(__name__) -FOOTER_STRS = ['Note: Only the head of the SFrame is printed.', - 'You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.'] +FOOTER_STRS = [ + "Note: Only the head of the SFrame is printed.", + "You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.", +] -LAZY_FOOTER_STRS = ['Note: Only the head of the SFrame is printed. This SFrame is lazily evaluated.', - 'You can use sf.materialize() to force materialization.'] +LAZY_FOOTER_STRS = [ + "Note: Only the head of the SFrame is printed. This SFrame is lazily evaluated.", + "You can use sf.materialize() to force materialization.", +] if sys.version_info.major > 2: long = int + def load_sframe(filename): """ Load an SFrame. The filename extension is used to determine the format @@ -83,6 +88,7 @@ def load_sframe(filename): sf = SFrame(data=filename) return sf + def _get_global_dbapi_info(dbapi_module, conn): """ Fetches all needed information from the top-level DBAPI module, @@ -90,14 +96,18 @@ def _get_global_dbapi_info(dbapi_module, conn): dictionary of all the needed variables. This is put in one place to make sure the error message is clear if the module "guess" is wrong. """ - module_given_msg = "The DBAPI2 module given ({0}) is missing the global\n"+\ - "variable '{1}'. Please make sure you are supplying a module that\n"+\ - "conforms to the DBAPI 2.0 standard (PEP 0249)." - module_not_given_msg = "Hello! I gave my best effort to find the\n"+\ - "top-level module that the connection object you gave me came from.\n"+\ - "I found '{0}' which doesn't have the global variable '{1}'.\n"+\ - "To avoid this confusion, you can pass the module as a parameter using\n"+\ - "the 'dbapi_module' argument to either from_sql or to_sql." + module_given_msg = ( + "The DBAPI2 module given ({0}) is missing the global\n" + + "variable '{1}'. Please make sure you are supplying a module that\n" + + "conforms to the DBAPI 2.0 standard (PEP 0249)." + ) + module_not_given_msg = ( + "Hello! I gave my best effort to find the\n" + + "top-level module that the connection object you gave me came from.\n" + + "I found '{0}' which doesn't have the global variable '{1}'.\n" + + "To avoid this confusion, you can pass the module as a parameter using\n" + + "the 'dbapi_module' argument to either from_sql or to_sql." + ) if dbapi_module is None: dbapi_module = _get_module_from_object(conn) @@ -105,20 +115,20 @@ def _get_global_dbapi_info(dbapi_module, conn): else: module_given = True - module_name = dbapi_module.__name__ if hasattr(dbapi_module, '__name__') else None + module_name = dbapi_module.__name__ if hasattr(dbapi_module, "__name__") else None - needed_vars = ['apilevel','paramstyle','Error','DATETIME','NUMBER','ROWID'] + needed_vars = ["apilevel", "paramstyle", "Error", "DATETIME", "NUMBER", "ROWID"] ret_dict = {} - ret_dict['module_name'] = module_name + ret_dict["module_name"] = module_name for i in needed_vars: tmp = None try: - tmp = eval("dbapi_module."+i) + tmp = eval("dbapi_module." + i) except AttributeError as e: # Some DBs don't actually care about types, so they won't define # the types. These are the ACTUALLY needed variables though - if i not in ['apilevel','paramstyle','Error']: + if i not in ["apilevel", "paramstyle", "Error"]: pass elif module_given: raise AttributeError(module_given_msg.format(module_name, i)) @@ -127,22 +137,26 @@ def _get_global_dbapi_info(dbapi_module, conn): ret_dict[i] = tmp try: - if ret_dict['apilevel'][0:3] != "2.0": - raise NotImplementedError("Unsupported API version " +\ - str(ret_dict['apilevel']) + ". Only DBAPI 2.0 is supported.") + if ret_dict["apilevel"][0:3] != "2.0": + raise NotImplementedError( + "Unsupported API version " + + str(ret_dict["apilevel"]) + + ". Only DBAPI 2.0 is supported." + ) except TypeError as e: e.message = "Module's 'apilevel' value is invalid." raise e - acceptable_paramstyles = ['qmark','numeric','named','format','pyformat'] + acceptable_paramstyles = ["qmark", "numeric", "named", "format", "pyformat"] try: - if ret_dict['paramstyle'] not in acceptable_paramstyles: + if ret_dict["paramstyle"] not in acceptable_paramstyles: raise TypeError("Module's 'paramstyle' value is invalid.") except TypeError as e: raise TypeError("Module's 'paramstyle' value is invalid.") return ret_dict + def _convert_rows_to_builtin_seq(data): # Flexible type expects a builtin type (like list or tuple) for conversion. # Some DBAPI modules abstract rows as classes that act as single sequences @@ -152,6 +166,7 @@ def _convert_rows_to_builtin_seq(data): data = [list(row) for row in data] return data + # Expects list of tuples def _force_cast_sql_types(data, result_types, force_cast_cols): if len(force_cast_cols) == 0: @@ -712,78 +727,80 @@ class SFrame(object): 2 3 C """ - __slots__ = ['_proxy', '_cache'] + __slots__ = ["_proxy", "_cache"] - def __init__(self, data=None, - format='auto', - _proxy=None): + def __init__(self, data=None, format="auto", _proxy=None): """__init__(data=list(), format='auto') Construct a new SFrame from a url or a pandas.DataFrame. """ # emit metrics for num_rows, num_columns, and type (local://, s3, hdfs, http) - if (_proxy): + if _proxy: self.__proxy__ = _proxy else: self.__proxy__ = UnitySFrameProxy() _format = None if six.PY2 and isinstance(data, unicode): - data = data.encode('utf-8') - if (format == 'auto'): - if (HAS_PANDAS and isinstance(data, pandas.DataFrame)): - _format = 'dataframe' - elif (isinstance(data, str) or - (sys.version_info.major < 3 and isinstance(data, unicode))): - - if data.endswith(('.csv', '.csv.gz')): - _format = 'csv' - elif data.endswith(('.tsv', '.tsv.gz')): - _format = 'tsv' - elif data.endswith(('.txt', '.txt.gz')): - print("Assuming file is csv. For other delimiters, " + \ - "please use `SFrame.read_csv`.") - _format = 'csv' + data = data.encode("utf-8") + if format == "auto": + if HAS_PANDAS and isinstance(data, pandas.DataFrame): + _format = "dataframe" + elif isinstance(data, str) or ( + sys.version_info.major < 3 and isinstance(data, unicode) + ): + + if data.endswith((".csv", ".csv.gz")): + _format = "csv" + elif data.endswith((".tsv", ".tsv.gz")): + _format = "tsv" + elif data.endswith((".txt", ".txt.gz")): + print( + "Assuming file is csv. For other delimiters, " + + "please use `SFrame.read_csv`." + ) + _format = "csv" else: - _format = 'sframe' + _format = "sframe" elif type(data) == SArray: - _format = 'sarray' + _format = "sarray" elif isinstance(data, SFrame): - _format = 'sframe_obj' + _format = "sframe_obj" elif isinstance(data, dict): - _format = 'dict' + _format = "dict" elif _is_non_string_iterable(data): - _format = 'array' + _format = "array" elif data is None: - _format = 'empty' + _format = "empty" else: - raise ValueError('Cannot infer input type for data ' + str(data)) + raise ValueError("Cannot infer input type for data " + str(data)) else: _format = format - with cython_context(): - if (_format == 'dataframe'): + if _format == "dataframe": for c in data.columns.values: self.add_column(SArray(data[c].values), str(c), inplace=True) - elif (_format == 'sframe_obj'): + elif _format == "sframe_obj": for col in data.column_names(): self.__proxy__.add_column(data[col].__proxy__, col) - elif (_format == 'sarray'): - self.__proxy__.add_column(data.__proxy__, '') - elif (_format == 'array'): + elif _format == "sarray": + self.__proxy__.add_column(data.__proxy__, "") + elif _format == "array": if len(data) > 0: unique_types = set([type(x) for x in data if x is not None]) if len(unique_types) == 1 and SArray in unique_types: for arr in data: self.add_column(arr, inplace=True) elif SArray in unique_types: - raise ValueError("Cannot create SFrame from mix of regular values and SArrays") + raise ValueError( + "Cannot create SFrame from mix of regular values and SArrays" + ) else: - self.__proxy__.add_column(SArray(data).__proxy__, '') - elif (_format == 'dict'): + self.__proxy__.add_column(SArray(data).__proxy__, "") + elif _format == "dict": # Validate that every column is the same length. if len(set(len(value) for value in data.values())) > 1: # probably should be a value error. But we used to raise @@ -791,37 +808,48 @@ def __init__(self, data=None, raise RuntimeError("All column should be of the same length") # split into SArray values and other iterable values. # We convert the iterable values in bulk, and then add the sarray values as columns - sarray_keys = sorted(key for key,value in six.iteritems(data) if isinstance(value, SArray)) - self.__proxy__.load_from_dataframe({key:value for key,value in six.iteritems(data) if not isinstance(value, SArray)}) + sarray_keys = sorted( + key + for key, value in six.iteritems(data) + if isinstance(value, SArray) + ) + self.__proxy__.load_from_dataframe( + { + key: value + for key, value in six.iteritems(data) + if not isinstance(value, SArray) + } + ) for key in sarray_keys: self.__proxy__.add_column(data[key].__proxy__, key) - elif (_format == 'csv'): + elif _format == "csv": url = data - tmpsf = SFrame.read_csv(url, delimiter=',', header=True) + tmpsf = SFrame.read_csv(url, delimiter=",", header=True) self.__proxy__ = tmpsf.__proxy__ - elif (_format == 'tsv'): + elif _format == "tsv": url = data - tmpsf = SFrame.read_csv(url, delimiter='\t', header=True) + tmpsf = SFrame.read_csv(url, delimiter="\t", header=True) self.__proxy__ = tmpsf.__proxy__ - elif (_format == 'sframe'): + elif _format == "sframe": url = _make_internal_url(data) self.__proxy__.load_from_sframe_index(url) - elif (_format == 'empty'): + elif _format == "empty": pass else: - raise ValueError('Unknown input type: ' + format) + raise ValueError("Unknown input type: " + format) @staticmethod def _infer_column_types_from_lines(first_rows): - if (len(first_rows.column_names()) < 1): - print("Insufficient number of columns to perform type inference") - raise RuntimeError("Insufficient columns ") + if len(first_rows.column_names()) < 1: + print("Insufficient number of columns to perform type inference") + raise RuntimeError("Insufficient columns ") if len(first_rows) < 1: - print("Insufficient number of rows to perform type inference") - raise RuntimeError("Insufficient rows") + print("Insufficient number of rows to perform type inference") + raise RuntimeError("Insufficient rows") # gets all the values column-wise - all_column_values_transposed = [list(first_rows[col]) - for col in first_rows.column_names()] + all_column_values_transposed = [ + list(first_rows[col]) for col in first_rows.column_names() + ] # transpose all_column_values = [list(x) for x in list(zip(*all_column_values_transposed))] all_column_type_hints = [[type(t) for t in vals] for vals in all_column_values] @@ -834,56 +862,58 @@ def _infer_column_types_from_lines(first_rows): column_type_hints = all_column_type_hints[0] # now perform type combining across rows for i in range(1, len(all_column_type_hints)): - currow = all_column_type_hints[i] - for j in range(len(column_type_hints)): - # combine types - d = set([currow[j], column_type_hints[j]]) - if (len(d) == 1): - # easy case. both agree on the type - continue - if (((long in d) or (int in d)) and (float in d)): - # one is an int, one is a float. its a float - column_type_hints[j] = float - elif ((array.array in d) and (list in d)): - # one is an array , one is a list. its a list - column_type_hints[j] = list - elif type(None) in d: - # one is a NoneType. assign to other type - if currow[j] != type(None): - column_type_hints[j] = currow[j] - else: - column_type_hints[j] = str + currow = all_column_type_hints[i] + for j in range(len(column_type_hints)): + # combine types + d = set([currow[j], column_type_hints[j]]) + if len(d) == 1: + # easy case. both agree on the type + continue + if ((long in d) or (int in d)) and (float in d): + # one is an int, one is a float. its a float + column_type_hints[j] = float + elif (array.array in d) and (list in d): + # one is an array , one is a list. its a list + column_type_hints[j] = list + elif type(None) in d: + # one is a NoneType. assign to other type + if currow[j] != type(None): + column_type_hints[j] = currow[j] + else: + column_type_hints[j] = str # final pass. everything which is still NoneType is now a str for i in range(len(column_type_hints)): - if column_type_hints[i] == type(None): - column_type_hints[i] = str + if column_type_hints[i] == type(None): + column_type_hints[i] = str return column_type_hints @classmethod - def _read_csv_impl(cls, - url, - delimiter=',', - header=True, - error_bad_lines=False, - comment_char='', - escape_char='\\', - double_quote=True, - quote_char='\"', - skip_initial_space=True, - column_type_hints=None, - na_values=["NA"], - line_terminator="\n", - usecols=[], - nrows=None, - skiprows=0, - verbose=True, - store_errors=True, - nrows_to_infer=100, - true_values=[], - false_values=[], - _only_raw_string_substitutions=False, - **kwargs): + def _read_csv_impl( + cls, + url, + delimiter=",", + header=True, + error_bad_lines=False, + comment_char="", + escape_char="\\", + double_quote=True, + quote_char='"', + skip_initial_space=True, + column_type_hints=None, + na_values=["NA"], + line_terminator="\n", + usecols=[], + nrows=None, + skiprows=0, + verbose=True, + store_errors=True, + nrows_to_infer=100, + true_values=[], + false_values=[], + _only_raw_string_substitutions=False, + **kwargs + ): """ Constructs an SFrame from a CSV file or a path to multiple CSVs, and returns a pair containing the SFrame and optionally @@ -900,22 +930,22 @@ def _read_csv_impl(cls, """ # Pandas argument compatibility if "sep" in kwargs: - delimiter = kwargs['sep'] - del kwargs['sep'] + delimiter = kwargs["sep"] + del kwargs["sep"] if "quotechar" in kwargs: - quote_char = kwargs['quotechar'] - del kwargs['quotechar'] + quote_char = kwargs["quotechar"] + del kwargs["quotechar"] if "doublequote" in kwargs: - double_quote = kwargs['doublequote'] - del kwargs['doublequote'] + double_quote = kwargs["doublequote"] + del kwargs["doublequote"] if "comment" in kwargs: - comment_char = kwargs['comment'] - del kwargs['comment'] + comment_char = kwargs["comment"] + del kwargs["comment"] if comment_char is None: - comment_char = '' + comment_char = "" if "lineterminator" in kwargs: - line_terminator = kwargs['lineterminator'] - del kwargs['lineterminator'] + line_terminator = kwargs["lineterminator"] + del kwargs["lineterminator"] if len(kwargs) > 0: raise TypeError("Unexpected keyword arguments " + str(kwargs.keys())) @@ -924,7 +954,7 @@ def _read_csv_impl(cls, parsing_config["use_header"] = header parsing_config["continue_on_failure"] = not error_bad_lines parsing_config["comment_char"] = comment_char - parsing_config["escape_char"] = '\0' if escape_char is None else escape_char + parsing_config["escape_char"] = "\0" if escape_char is None else escape_char parsing_config["use_escape_char"] = escape_char is None parsing_config["double_quote"] = double_quote parsing_config["quote_char"] = quote_char @@ -932,18 +962,18 @@ def _read_csv_impl(cls, parsing_config["store_errors"] = store_errors parsing_config["line_terminator"] = line_terminator parsing_config["output_columns"] = usecols - parsing_config["skip_rows"] =skiprows + parsing_config["skip_rows"] = skiprows parsing_config["true_values"] = true_values parsing_config["false_values"] = false_values parsing_config["only_raw_string_substitutions"] = _only_raw_string_substitutions if type(na_values) is str: - na_values = [na_values] + na_values = [na_values] if na_values is not None and len(na_values) > 0: parsing_config["na_values"] = na_values if nrows is not None: - parsing_config["row_limit"] = nrows + parsing_config["row_limit"] = nrows proxy = UnitySFrameProxy() internal_url = _make_internal_url(url) @@ -954,86 +984,112 @@ def _read_csv_impl(cls, if column_type_hints is None: try: # Get the first nrows_to_infer rows (using all the desired arguments). - first_rows = SFrame.read_csv(url, nrows=nrows_to_infer, - column_type_hints=type(None), - header=header, - delimiter=delimiter, - comment_char=comment_char, - escape_char=escape_char, - double_quote=double_quote, - quote_char=quote_char, - skip_initial_space=skip_initial_space, - na_values=na_values, - line_terminator=line_terminator, - usecols=usecols, - skiprows=skiprows, - verbose=verbose, - true_values=true_values, - false_values=false_values, - _only_raw_string_substitutions=_only_raw_string_substitutions) + first_rows = SFrame.read_csv( + url, + nrows=nrows_to_infer, + column_type_hints=type(None), + header=header, + delimiter=delimiter, + comment_char=comment_char, + escape_char=escape_char, + double_quote=double_quote, + quote_char=quote_char, + skip_initial_space=skip_initial_space, + na_values=na_values, + line_terminator=line_terminator, + usecols=usecols, + skiprows=skiprows, + verbose=verbose, + true_values=true_values, + false_values=false_values, + _only_raw_string_substitutions=_only_raw_string_substitutions, + ) column_type_hints = SFrame._infer_column_types_from_lines(first_rows) - typelist = '[' + ','.join(t.__name__ for t in column_type_hints) + ']' + typelist = "[" + ",".join(t.__name__ for t in column_type_hints) + "]" if verbose: print("------------------------------------------------------") - print("Inferred types from first %d line(s) of file as " % nrows_to_infer) - print("column_type_hints="+ typelist) + print( + "Inferred types from first %d line(s) of file as " + % nrows_to_infer + ) + print("column_type_hints=" + typelist) print("If parsing fails due to incorrect types, you can correct") print("the inferred type list above and pass it to read_csv in") - print( "the column_type_hints argument") + print("the column_type_hints argument") print("------------------------------------------------------") column_type_inference_was_used = True except RuntimeError as e: - if type(e) == RuntimeError and ("cancel" in str(e.args[0]) or "Cancel" in str(e.args[0])): + if type(e) == RuntimeError and ( + "cancel" in str(e.args[0]) or "Cancel" in str(e.args[0]) + ): raise e # If the above fails, default back to str for all columns. column_type_hints = str if verbose: - print('Could not detect types. Using str for each column.') + print("Could not detect types. Using str for each column.") if type(column_type_hints) is type: - type_hints = {'__all_columns__': column_type_hints} + type_hints = {"__all_columns__": column_type_hints} elif type(column_type_hints) is list: - type_hints = dict(list(zip(['__X%d__' % i for i in range(len(column_type_hints))], column_type_hints))) + type_hints = dict( + list( + zip( + ["__X%d__" % i for i in range(len(column_type_hints))], + column_type_hints, + ) + ) + ) elif type(column_type_hints) is dict: # we need to fill in a potentially incomplete dictionary try: # Get the first nrows_to_infer rows (using all the desired arguments). - first_rows = SFrame.read_csv(url, nrows=nrows_to_infer, - column_type_hints=type(None), - header=header, - delimiter=delimiter, - comment_char=comment_char, - escape_char=escape_char, - double_quote=double_quote, - quote_char=quote_char, - skip_initial_space=skip_initial_space, - na_values=na_values, - line_terminator=line_terminator, - usecols=usecols, - skiprows=skiprows, - verbose=verbose, - true_values=true_values, - false_values=false_values, - _only_raw_string_substitutions=_only_raw_string_substitutions) + first_rows = SFrame.read_csv( + url, + nrows=nrows_to_infer, + column_type_hints=type(None), + header=header, + delimiter=delimiter, + comment_char=comment_char, + escape_char=escape_char, + double_quote=double_quote, + quote_char=quote_char, + skip_initial_space=skip_initial_space, + na_values=na_values, + line_terminator=line_terminator, + usecols=usecols, + skiprows=skiprows, + verbose=verbose, + true_values=true_values, + false_values=false_values, + _only_raw_string_substitutions=_only_raw_string_substitutions, + ) inferred_types = SFrame._infer_column_types_from_lines(first_rows) # make a dict of column_name to type - inferred_types = dict(list(zip(first_rows.column_names(), inferred_types))) + inferred_types = dict( + list(zip(first_rows.column_names(), inferred_types)) + ) # overwrite with the user's specified types for key in column_type_hints: inferred_types[key] = column_type_hints[key] column_type_hints = inferred_types except RuntimeError as e: - if type(e) == RuntimeError and ("cancel" in str(e) or "Cancel" in str(e)): + if type(e) == RuntimeError and ( + "cancel" in str(e) or "Cancel" in str(e) + ): raise e # If the above fails, default back to str for unmatched columns if verbose: - print('Could not detect types. Using str for all unspecified columns.') + print( + "Could not detect types. Using str for all unspecified columns." + ) type_hints = column_type_hints else: - raise TypeError("Invalid type for column_type_hints. Must be a dictionary, list or a single type.") + raise TypeError( + "Invalid type for column_type_hints. Must be a dictionary, list or a single type." + ) try: - if (not verbose): + if not verbose: glconnect.get_server().set_log_progress(False) with cython_context(): errors = proxy.load_from_csvs(internal_url, parsing_config, type_hints) @@ -1045,10 +1101,12 @@ def _read_csv_impl(cls, if verbose: print("Unable to parse the file with automatic type inference.") print("Defaulting to column_type_hints=str") - type_hints = {'__all_columns__': str} + type_hints = {"__all_columns__": str} try: with cython_context(): - errors = proxy.load_from_csvs(internal_url, parsing_config, type_hints) + errors = proxy.load_from_csvs( + internal_url, parsing_config, type_hints + ) except: glconnect.get_server().set_log_progress(True) raise @@ -1058,30 +1116,32 @@ def _read_csv_impl(cls, glconnect.get_server().set_log_progress(True) - return (cls(_proxy=proxy), { f: SArray(_proxy = es) for (f, es) in errors.items() }) + return (cls(_proxy=proxy), {f: SArray(_proxy=es) for (f, es) in errors.items()}) @classmethod - def read_csv_with_errors(cls, - url, - delimiter=',', - header=True, - comment_char='', - escape_char='\\', - double_quote=True, - quote_char='\"', - skip_initial_space=True, - column_type_hints=None, - na_values=["NA"], - line_terminator='\n', - usecols = [], - nrows=None, - skiprows=0, - verbose=True, - nrows_to_infer=100, - true_values=[], - false_values=[], - _only_raw_string_substitutions=False, - **kwargs): + def read_csv_with_errors( + cls, + url, + delimiter=",", + header=True, + comment_char="", + escape_char="\\", + double_quote=True, + quote_char='"', + skip_initial_space=True, + column_type_hints=None, + na_values=["NA"], + line_terminator="\n", + usecols=[], + nrows=None, + skiprows=0, + verbose=True, + nrows_to_infer=100, + true_values=[], + false_values=[], + _only_raw_string_substitutions=False, + **kwargs + ): """ Constructs an SFrame from a CSV file or a path to multiple CSVs, and returns a pair containing the SFrame and a dict of filenames to SArrays @@ -1200,53 +1260,58 @@ def read_csv_with_errors(cls, Rows: 1 ['x,y,z,a,b,c']} """ - return cls._read_csv_impl(url, - delimiter=delimiter, - header=header, - error_bad_lines=False, # we are storing errors, - # thus we must not fail - # on bad lines - comment_char=comment_char, - escape_char=escape_char, - double_quote=double_quote, - quote_char=quote_char, - skip_initial_space=skip_initial_space, - column_type_hints=column_type_hints, - na_values=na_values, - line_terminator=line_terminator, - usecols=usecols, - nrows=nrows, - verbose=verbose, - skiprows=skiprows, - store_errors=True, - nrows_to_infer=nrows_to_infer, - true_values=true_values, - false_values=false_values, - _only_raw_string_substitutions=_only_raw_string_substitutions, - **kwargs) + return cls._read_csv_impl( + url, + delimiter=delimiter, + header=header, + error_bad_lines=False, # we are storing errors, + # thus we must not fail + # on bad lines + comment_char=comment_char, + escape_char=escape_char, + double_quote=double_quote, + quote_char=quote_char, + skip_initial_space=skip_initial_space, + column_type_hints=column_type_hints, + na_values=na_values, + line_terminator=line_terminator, + usecols=usecols, + nrows=nrows, + verbose=verbose, + skiprows=skiprows, + store_errors=True, + nrows_to_infer=nrows_to_infer, + true_values=true_values, + false_values=false_values, + _only_raw_string_substitutions=_only_raw_string_substitutions, + **kwargs + ) + @classmethod - def read_csv(cls, - url, - delimiter=',', - header=True, - error_bad_lines=False, - comment_char='', - escape_char='\\', - double_quote=True, - quote_char='\"', - skip_initial_space=True, - column_type_hints=None, - na_values=["NA"], - line_terminator='\n', - usecols=[], - nrows=None, - skiprows=0, - verbose=True, - nrows_to_infer=100, - true_values=[], - false_values=[], - _only_raw_string_substitutions=False, - **kwargs): + def read_csv( + cls, + url, + delimiter=",", + header=True, + error_bad_lines=False, + comment_char="", + escape_char="\\", + double_quote=True, + quote_char='"', + skip_initial_space=True, + column_type_hints=None, + na_values=["NA"], + line_terminator="\n", + usecols=[], + nrows=None, + skiprows=0, + verbose=True, + nrows_to_infer=100, + true_values=[], + false_values=[], + _only_raw_string_substitutions=False, + **kwargs + ): """ Constructs an SFrame from a CSV file or a path to multiple CSVs. @@ -1489,34 +1554,33 @@ def read_csv(cls, Set error_bad_lines=False to skip bad lines """ - return cls._read_csv_impl(url, - delimiter=delimiter, - header=header, - error_bad_lines=error_bad_lines, - comment_char=comment_char, - escape_char=escape_char, - double_quote=double_quote, - quote_char=quote_char, - skip_initial_space=skip_initial_space, - column_type_hints=column_type_hints, - na_values=na_values, - line_terminator=line_terminator, - usecols=usecols, - nrows=nrows, - skiprows=skiprows, - verbose=verbose, - store_errors=False, - nrows_to_infer=nrows_to_infer, - true_values=true_values, - false_values=false_values, - _only_raw_string_substitutions=_only_raw_string_substitutions, - **kwargs)[0] - + return cls._read_csv_impl( + url, + delimiter=delimiter, + header=header, + error_bad_lines=error_bad_lines, + comment_char=comment_char, + escape_char=escape_char, + double_quote=double_quote, + quote_char=quote_char, + skip_initial_space=skip_initial_space, + column_type_hints=column_type_hints, + na_values=na_values, + line_terminator=line_terminator, + usecols=usecols, + nrows=nrows, + skiprows=skiprows, + verbose=verbose, + store_errors=False, + nrows_to_infer=nrows_to_infer, + true_values=true_values, + false_values=false_values, + _only_raw_string_substitutions=_only_raw_string_substitutions, + **kwargs + )[0] @classmethod - def read_json(cls, - url, - orient='records'): + def read_json(cls, url, orient="records"): """ Reads a JSON file representing a table into an SFrame. @@ -1604,28 +1668,42 @@ def read_json(cls, if len(g) == 0: return SFrame() if g.dtype != dict: - raise RuntimeError("Invalid input JSON format. Expected list of dictionaries") - g = SFrame({'X1':g}) - return g.unpack('X1','') + raise RuntimeError( + "Invalid input JSON format. Expected list of dictionaries" + ) + g = SFrame({"X1": g}) + return g.unpack("X1", "") elif orient == "lines": - g = cls.read_csv(url, header=False,na_values=['null'],true_values=['true'],false_values=['false'], - _only_raw_string_substitutions=True) + g = cls.read_csv( + url, + header=False, + na_values=["null"], + true_values=["true"], + false_values=["false"], + _only_raw_string_substitutions=True, + ) if g.num_rows() == 0: return SFrame() if g.num_columns() != 1: raise RuntimeError("Input JSON not of expected format") - if g['X1'].dtype == dict: - return g.unpack('X1','') + if g["X1"].dtype == dict: + return g.unpack("X1", "") else: return g else: raise ValueError("Invalid value for orient parameter (" + str(orient) + ")") - - @classmethod - def from_sql(cls, conn, sql_statement, params=None, type_inference_rows=100, - dbapi_module=None, column_type_hints=None, cursor_arraysize=128): + def from_sql( + cls, + conn, + sql_statement, + params=None, + type_inference_rows=100, + dbapi_module=None, + column_type_hints=None, + cursor_arraysize=128, + ): """ Convert the result of a SQL database query to an SFrame. @@ -1739,11 +1817,11 @@ def from_sql(cls, conn, sql_statement, params=None, type_inference_rows=100, c.execute(sql_statement) else: c.execute(sql_statement, params) - except mod_info['Error'] as e: + except mod_info["Error"] as e: # The rollback method is considered optional by DBAPI2, but some # modules that do implement it won't work again unless it is called # if an error happens on a cursor. - if hasattr(conn, 'rollback'): + if hasattr(conn, "rollback"): conn.rollback() raise e @@ -1757,18 +1835,20 @@ def from_sql(cls, conn, sql_statement, params=None, type_inference_rows=100, temp_vals = [] # Set any types that are given to us - col_name_to_num = {result_names[i]:i for i in range(len(result_names))} + col_name_to_num = {result_names[i]: i for i in range(len(result_names))} if column_type_hints is not None: if type(column_type_hints) is dict: - for k,v in column_type_hints.items(): + for k, v in column_type_hints.items(): col_num = col_name_to_num[k] cols_to_force_cast.add(col_num) result_types[col_num] = v elif type(column_type_hints) is list: if len(column_type_hints) != len(result_names): - __LOGGER__.warn("If column_type_hints is specified as a "+\ - "list, it must be of the same size as the result "+\ - "set's number of columns. Ignoring (use dict instead).") + __LOGGER__.warn( + "If column_type_hints is specified as a " + + "list, it must be of the same size as the result " + + "set's number of columns. Ignoring (use dict instead)." + ) else: result_types = column_type_hints cols_to_force_cast.update(range(len(result_desc))) @@ -1781,9 +1861,11 @@ def from_sql(cls, conn, sql_statement, params=None, type_inference_rows=100, # these are types that a "cast" makes sense, and we're not calling a # constructor that expects certain input (e.g. datetime.datetime), # since we could get lots of different input - hintable_types = [int,float,str] + hintable_types = [int, float, str] if not all([i in hintable_types or i is None for i in result_types]): - raise TypeError("Only " + str(hintable_types) + " can be provided as type hints!") + raise TypeError( + "Only " + str(hintable_types) + " can be provided as type hints!" + ) # Perform type inference by checking to see what python types are # returned from the cursor @@ -1792,8 +1874,8 @@ def from_sql(cls, conn, sql_statement, params=None, type_inference_rows=100, # will raise an exception is if execute didn't produce a result set try: row = c.fetchone() - except mod_info['Error'] as e: - if hasattr(conn, 'rollback'): + except mod_info["Error"] as e: + if hasattr(conn, "rollback"): conn.rollback() raise e while row is not None: @@ -1812,7 +1894,7 @@ def from_sql(cls, conn, sql_statement, params=None, type_inference_rows=100, # point. Try using DBAPI2 type_codes to pick a suitable type. If this # doesn't work, fall back to string. if not all(result_types): - missing_val_cols = [i for i,v in enumerate(result_types) if v is None] + missing_val_cols = [i for i, v in enumerate(result_types) if v is None] cols_to_force_cast.update(missing_val_cols) inferred_types = _infer_dbapi2_types(c, mod_info) cnt = 0 @@ -1822,7 +1904,9 @@ def from_sql(cls, conn, sql_statement, params=None, type_inference_rows=100, cnt += 1 sb = SFrameBuilder(result_types, column_names=result_names) - unsupported_cols = [i for i,v in enumerate(sb.column_types()) if v is type(None)] + unsupported_cols = [ + i for i, v in enumerate(sb.column_types()) if v is type(None) + ] if len(unsupported_cols) > 0: cols_to_force_cast.update(unsupported_cols) for i in unsupported_cols: @@ -1830,24 +1914,34 @@ def from_sql(cls, conn, sql_statement, params=None, type_inference_rows=100, sb = SFrameBuilder(result_types, column_names=result_names) temp_vals = _convert_rows_to_builtin_seq(temp_vals) - sb.append_multiple(_force_cast_sql_types(temp_vals, result_types, cols_to_force_cast)) + sb.append_multiple( + _force_cast_sql_types(temp_vals, result_types, cols_to_force_cast) + ) rows = c.fetchmany() while len(rows) > 0: rows = _convert_rows_to_builtin_seq(rows) - sb.append_multiple(_force_cast_sql_types(rows, result_types, cols_to_force_cast)) + sb.append_multiple( + _force_cast_sql_types(rows, result_types, cols_to_force_cast) + ) rows = c.fetchmany() cls = sb.close() try: c.close() - except mod_info['Error'] as e: - if hasattr(conn, 'rollback'): + except mod_info["Error"] as e: + if hasattr(conn, "rollback"): conn.rollback() raise e return cls - def to_sql(self, conn, table_name, dbapi_module=None, - use_python_type_specifiers=False, use_exact_column_names=True): + def to_sql( + self, + conn, + table_name, + dbapi_module=None, + use_python_type_specifiers=False, + use_exact_column_names=True, + ): """ Convert an SFrame to a single table in a SQL database. @@ -1893,18 +1987,21 @@ def to_sql(self, conn, table_name, dbapi_module=None, col_info = list(zip(self.column_names(), self.column_types())) if not use_python_type_specifiers: - _pytype_to_printf = lambda x: 's' + _pytype_to_printf = lambda x: "s" # DBAPI2 standard allows for five different ways to specify parameters sql_param = { - 'qmark' : lambda name,col_num,col_type: '?', - 'numeric' : lambda name,col_num,col_type:':'+str(col_num+1), - 'named' : lambda name,col_num,col_type:':'+str(name), - 'format' : lambda name,col_num,col_type:'%'+_pytype_to_printf(col_type), - 'pyformat': lambda name,col_num,col_type:'%('+str(name)+')'+_pytype_to_printf(col_type), - } - - get_sql_param = sql_param[mod_info['paramstyle']] + "qmark": lambda name, col_num, col_type: "?", + "numeric": lambda name, col_num, col_type: ":" + str(col_num + 1), + "named": lambda name, col_num, col_type: ":" + str(name), + "format": lambda name, col_num, col_type: "%" + _pytype_to_printf(col_type), + "pyformat": lambda name, col_num, col_type: "%(" + + str(name) + + ")" + + _pytype_to_printf(col_type), + } + + get_sql_param = sql_param[mod_info["paramstyle"]] # form insert string ins_str = "INSERT INTO " + str(table_name) @@ -1913,8 +2010,8 @@ def to_sql(self, conn, table_name, dbapi_module=None, count = 0 for i in col_info: col_str += i[0] - value_str += get_sql_param(i[0],count,i[1]) - if count < len(col_info)-1: + value_str += get_sql_param(i[0], count, i[1]) + if count < len(col_info) - 1: col_str += "," value_str += "," count += 1 @@ -1927,37 +2024,35 @@ def to_sql(self, conn, table_name, dbapi_module=None, ins_str += value_str # Some formats require values in an iterable, some a dictionary - if (mod_info['paramstyle'] == 'named' or\ - mod_info['paramstyle'] == 'pyformat'): - prepare_sf_row = lambda x:x + if mod_info["paramstyle"] == "named" or mod_info["paramstyle"] == "pyformat": + prepare_sf_row = lambda x: x else: - col_names = self.column_names() - prepare_sf_row = lambda x: [x[i] for i in col_names] + col_names = self.column_names() + prepare_sf_row = lambda x: [x[i] for i in col_names] for i in self: try: c.execute(ins_str, prepare_sf_row(i)) - except mod_info['Error'] as e: - if hasattr(conn, 'rollback'): + except mod_info["Error"] as e: + if hasattr(conn, "rollback"): conn.rollback() raise e conn.commit() c.close() - def __hash__(self): - ''' + """ Because we override `__eq__` we need to implement this function in Python 3. Just make it match default behavior in Python 2. - ''' + """ return id(self) // 16 - - def __add__(self,other): + + def __add__(self, other): """ Return append one frames to other """ - self=self.append(other) + self = self.append(other) return self def __repr__(self): @@ -1990,9 +2085,14 @@ def __get_column_description__(self): ret = ret + "\tNone\n\n" return ret - def __get_pretty_tables__(self, wrap_text=False, max_row_width=80, - max_column_width=30, max_columns=20, - max_rows_to_display=60): + def __get_pretty_tables__( + self, + wrap_text=False, + max_row_width=80, + max_column_width=30, + max_columns=20, + max_rows_to_display=60, + ): """ Returns a list of pretty print tables representing the current SFrame. If the number of columns is larger than max_columns, the last pretty @@ -2012,7 +2112,7 @@ def __get_pretty_tables__(self, wrap_text=False, max_row_width=80, ------- out : list[PrettyTable] """ - if (len(self) <= max_rows_to_display): + if len(self) <= max_rows_to_display: headsf = self.__copy__() else: headsf = self.head(max_rows_to_display) @@ -2027,31 +2127,37 @@ def __get_pretty_tables__(self, wrap_text=False, max_row_width=80, headsf[col] = headsf[col].astype(list) def _value_to_str(value): - if (type(value) is array.array): + if type(value) is array.array: return str(list(value)) - elif (type(value) is numpy.ndarray): - return str(value).replace('\n',' ') - elif (type(value) is list): - return '[' + ", ".join(_value_to_str(x) for x in value) + ']' + elif type(value) is numpy.ndarray: + return str(value).replace("\n", " ") + elif type(value) is list: + return "[" + ", ".join(_value_to_str(x) for x in value) + "]" else: return str(value) def _escape_space(s): if sys.version_info.major == 3: - return "".join([ch.encode('unicode_escape').decode() if ch.isspace() else ch for ch in s]) - return "".join([ch.encode('string_escape') if ch.isspace() else ch for ch in s]) + return "".join( + [ + ch.encode("unicode_escape").decode() if ch.isspace() else ch + for ch in s + ] + ) + return "".join( + [ch.encode("string_escape") if ch.isspace() else ch for ch in s] + ) def _truncate_respect_unicode(s, max_length): - if (len(s) <= max_length): + if len(s) <= max_length: return s else: if sys.version_info.major < 3: - u = unicode(s, 'utf-8', errors='replace') - return u[:max_length].encode('utf-8') + u = unicode(s, "utf-8", errors="replace") + return u[:max_length].encode("utf-8") else: return s[:max_length] - def _truncate_str(s, wrap_str=False): """ Truncate and optionally wrap the input string as unicode, replace @@ -2061,11 +2167,11 @@ def _truncate_str(s, wrap_str=False): if len(s) <= max_column_width: if sys.version_info.major < 3: - return unicode(s, 'utf-8', errors='replace') + return unicode(s, "utf-8", errors="replace") else: return s else: - ret = '' + ret = "" # if wrap_str is true, wrap the text and take at most 2 rows if wrap_str: wrapped_lines = wrap(s, max_column_width) @@ -2073,13 +2179,15 @@ def _truncate_str(s, wrap_str=False): return wrapped_lines[0] last_line = wrapped_lines[1] if len(last_line) >= max_column_width: - last_line = _truncate_respect_unicode(last_line, max_column_width - 4) - ret = wrapped_lines[0] + '\n' + last_line + ' ...' + last_line = _truncate_respect_unicode( + last_line, max_column_width - 4 + ) + ret = wrapped_lines[0] + "\n" + last_line + " ..." else: - ret = _truncate_respect_unicode(s, max_column_width - 4) + '...' + ret = _truncate_respect_unicode(s, max_column_width - 4) + "..." if sys.version_info.major < 3: - return unicode(ret, 'utf-8', errors='replace') + return unicode(ret, "utf-8", errors="replace") else: return ret @@ -2098,34 +2206,48 @@ def _truncate_str(s, wrap_str=False): col = columns.pop() # check the max length of element in the column if len(headsf) > 0: - col_width = min(max_column_width, max(len(str(x)) for x in headsf[col])) + col_width = min( + max_column_width, max(len(str(x)) for x in headsf[col]) + ) else: col_width = max_column_width - if (table_width + col_width < max_row_width): + if table_width + col_width < max_row_width: # truncate the header if necessary header = _truncate_str(col, wrap_text) - tbl.add_column(header, [_truncate_str(_value_to_str(x), wrap_text) for x in headsf[col]]) - table_width = str(tbl).find('\n') + tbl.add_column( + header, + [ + _truncate_str(_value_to_str(x), wrap_text) + for x in headsf[col] + ], + ) + table_width = str(tbl).find("\n") num_column_of_last_table += 1 else: # the column does not fit in the current table, push it back to columns columns.append(col) break - tbl.align = 'c' + tbl.align = "c" row_of_tables.append(tbl) # add a column of all "..." if there are more columns than displayed if self.num_columns() > max_columns: - row_of_tables[-1].add_column('...', ['...'] * len(headsf)) + row_of_tables[-1].add_column("...", ["..."] * len(headsf)) num_column_of_last_table += 1 # add a row of all "..." if there are more rows than displayed if self.__has_size__() and self.num_rows() > headsf.num_rows(): - row_of_tables[-1].add_row(['...'] * num_column_of_last_table) + row_of_tables[-1].add_row(["..."] * num_column_of_last_table) return row_of_tables - def print_rows(self, num_rows=10, num_columns=40, max_column_width=30, - max_row_width=80, output_file=None): + def print_rows( + self, + num_rows=10, + num_columns=40, + max_column_width=30, + max_row_width=80, + output_file=None, + ): """ Print the first M rows and N columns of the SFrame in human readable format. @@ -2161,13 +2283,18 @@ def print_rows(self, num_rows=10, num_columns=40, max_column_width=30, max_row_width = max(max_row_width, max_column_width + 1) printed_sf = self._imagecols_to_stringcols(num_rows) - row_of_tables = printed_sf.__get_pretty_tables__(wrap_text=False, - max_rows_to_display=num_rows, - max_columns=num_columns, - max_column_width=max_column_width, - max_row_width=max_row_width) + row_of_tables = printed_sf.__get_pretty_tables__( + wrap_text=False, + max_rows_to_display=num_rows, + max_columns=num_columns, + max_column_width=max_column_width, + max_row_width=max_row_width, + ) footer = "[%d rows x %d columns]\n" % self.shape - print('\n'.join([str(tb) for tb in row_of_tables]) + "\n" + footer, file=output_file) + print( + "\n".join([str(tb) for tb in row_of_tables]) + "\n" + footer, + file=output_file, + ) def _imagecols_to_stringcols(self, num_rows=10): # A list of column types @@ -2178,14 +2305,14 @@ def _imagecols_to_stringcols(self, num_rows=10): # Constructing names of sframe columns that are of image type image_column_names = [names[i] for i in range(len(names)) if types[i] == _Image] - #If there are image-type columns, copy the SFrame and cast the top MAX_NUM_ROWS_TO_DISPLAY of those columns to string + # If there are image-type columns, copy the SFrame and cast the top MAX_NUM_ROWS_TO_DISPLAY of those columns to string printed_sf = self.__copy__() if len(image_column_names) > 0: for t in names: if t in image_column_names: printed_sf[t] = self[t].astype(str) return printed_sf.head(num_rows) - + def drop_duplicates(self, subset): """ Return SFrame with duplicate rows removed, optionally only considering certain columns. @@ -2216,11 +2343,19 @@ def drop_duplicates(self, subset): [3 rows x 3 columns] """ - result = all(elem in self.column_names() for elem in subset) - if result : - return (self.groupby(subset, {col: aggregate.SELECT_ONE(col) for col in self.column_names() if col not in subset})) + result = all(elem in self.column_names() for elem in subset) + if result: + return self.groupby( + subset, + { + col: aggregate.SELECT_ONE(col) + for col in self.column_names() + if col not in subset + }, + ) else: raise TypeError("subset not in sframe column") + def __str_impl__(self, num_rows=10, footer=True): """ Returns a string containing the first num_rows elements of the frame, along @@ -2230,20 +2365,22 @@ def __str_impl__(self, num_rows=10, footer=True): printed_sf = self._imagecols_to_stringcols(MAX_ROWS_TO_DISPLAY) - row_of_tables = printed_sf.__get_pretty_tables__(wrap_text=False, max_rows_to_display=MAX_ROWS_TO_DISPLAY) + row_of_tables = printed_sf.__get_pretty_tables__( + wrap_text=False, max_rows_to_display=MAX_ROWS_TO_DISPLAY + ) is_empty = len(printed_sf) == 0 - if (not footer): - return (is_empty, '\n'.join([str(tb) for tb in row_of_tables])) + if not footer: + return (is_empty, "\n".join([str(tb) for tb in row_of_tables])) if self.__has_size__(): - footer = '[%d rows x %d columns]\n' % self.shape - if (self.num_rows() > MAX_ROWS_TO_DISPLAY): - footer += '\n'.join(FOOTER_STRS) + footer = "[%d rows x %d columns]\n" % self.shape + if self.num_rows() > MAX_ROWS_TO_DISPLAY: + footer += "\n".join(FOOTER_STRS) else: - footer = '[? rows x %d columns]\n' % self.num_columns() - footer += '\n'.join(LAZY_FOOTER_STRS) - return (is_empty, '\n'.join([str(tb) for tb in row_of_tables]) + "\n" + footer) + footer = "[? rows x %d columns]\n" % self.num_columns() + footer += "\n".join(LAZY_FOOTER_STRS) + return (is_empty, "\n".join([str(tb) for tb in row_of_tables]) + "\n" + footer) def __str__(self, num_rows=10, footer=True): """ @@ -2257,21 +2394,29 @@ def _repr_html_(self): printed_sf = self._imagecols_to_stringcols(MAX_ROWS_TO_DISPLAY) - row_of_tables = printed_sf.__get_pretty_tables__(wrap_text=True, - max_row_width=120, - max_columns=40, - max_column_width=25, - max_rows_to_display=MAX_ROWS_TO_DISPLAY) + row_of_tables = printed_sf.__get_pretty_tables__( + wrap_text=True, + max_row_width=120, + max_columns=40, + max_column_width=25, + max_rows_to_display=MAX_ROWS_TO_DISPLAY, + ) if self.__has_size__(): - footer = '[%d rows x %d columns]
' % self.shape - if (self.num_rows() > MAX_ROWS_TO_DISPLAY): - footer += '
'.join(FOOTER_STRS) + footer = "[%d rows x %d columns]
" % self.shape + if self.num_rows() > MAX_ROWS_TO_DISPLAY: + footer += "
".join(FOOTER_STRS) else: - footer = '[? rows x %d columns]
' % self.num_columns() - footer += '
'.join(LAZY_FOOTER_STRS) + footer = "[? rows x %d columns]
" % self.num_columns() + footer += "
".join(LAZY_FOOTER_STRS) begin = '
' - end = '\n
' - return begin + '\n'.join([tb.get_html_string(format=True) for tb in row_of_tables]) + "\n" + footer + end + end = "\n" + return ( + begin + + "\n".join([tb.get_html_string(format=True) for tb in row_of_tables]) + + "\n" + + footer + + end + ) def __nonzero__(self): """ @@ -2318,7 +2463,9 @@ def _row_selector(self, other): """ if type(other) is SArray: if self.__has_size__() and other.__has_size__() and len(other) != len(self): - raise IndexError("Cannot perform logical indexing on arrays of different length.") + raise IndexError( + "Cannot perform logical indexing on arrays of different length." + ) with cython_context(): return SFrame(_proxy=self.__proxy__.logical_filter(other.__proxy__)) @@ -2433,18 +2580,21 @@ def to_dataframe(self): from ..toolkits.image_classifier._evaluation import _image_resize - assert HAS_PANDAS, 'pandas is not installed.' + assert HAS_PANDAS, "pandas is not installed." df = pandas.DataFrame() for i in range(self.num_columns()): column_name = self.column_names()[i] - if(self.column_types()[i] == _Image): - df[column_name] = [_image_resize(x[column_name])._to_pil_image() for x in self.select_columns([column_name])] + if self.column_types()[i] == _Image: + df[column_name] = [ + _image_resize(x[column_name])._to_pil_image() + for x in self.select_columns([column_name]) + ] else: df[column_name] = list(self[column_name]) if len(df[column_name]) == 0: column_type = self.column_types()[i] if column_type in (array.array, type(None)): - column_type = 'object' + column_type = "object" df[column_name] = df[column_name].astype(column_type) return df @@ -2461,8 +2611,9 @@ def to_numpy(self): A Numpy Array containing all the values of the SFrame """ - assert HAS_NUMPY, 'numpy is not installed.' + assert HAS_NUMPY, "numpy is not installed." import numpy + return numpy.transpose(numpy.asarray([self[x] for x in self.column_names()])) def tail(self, n=10): @@ -2536,6 +2687,7 @@ def apply(self, fn, dtype=None): nativefn = None try: from .. import extensions as extensions + nativefn = extensions._build_native_function_call(fn) except: pass @@ -2543,12 +2695,14 @@ def apply(self, fn, dtype=None): if nativefn is not None: # this is a toolkit lambda. We can do something about it with cython_context(): - return SArray(_proxy=self.__proxy__.transform_native(nativefn, dtype, seed)) + return SArray( + _proxy=self.__proxy__.transform_native(nativefn, dtype, seed) + ) with cython_context(): return SArray(_proxy=self.__proxy__.transform(fn, dtype, seed)) - def flat_map(self, column_names, fn, column_types='auto', seed=None): + def flat_map(self, column_names, fn, column_types="auto", seed=None): """ Map each row of the SFrame to multiple rows in a new SFrame via a function. @@ -2614,28 +2768,32 @@ def flat_map(self, column_names, fn, column_types='auto', seed=None): if seed is None: seed = abs(hash("%0.20f" % time.time())) % (2 ** 31) - # determine the column_types - if column_types == 'auto': + if column_types == "auto": types = set() sample = self[0:10] results = [fn(row) for row in sample] for rows in results: if type(rows) is not list: - raise TypeError("Output type of the lambda function must be a list of lists") + raise TypeError( + "Output type of the lambda function must be a list of lists" + ) # note: this skips empty lists for row in rows: if type(row) is not list: - raise TypeError("Output type of the lambda function must be a list of lists") + raise TypeError( + "Output type of the lambda function must be a list of lists" + ) types.add(tuple([type(v) for v in row])) if len(types) == 0: raise TypeError( - "Could not infer output column types from the first ten rows " +\ - "of the SFrame. Please use the 'column_types' parameter to " +\ - "set the types.") + "Could not infer output column types from the first ten rows " + + "of the SFrame. Please use the 'column_types' parameter to " + + "set the types." + ) if len(types) > 1: raise TypeError("Mapped rows must have the same length and types") @@ -2643,9 +2801,13 @@ def flat_map(self, column_names, fn, column_types='auto', seed=None): column_types = list(types.pop()) assert type(column_types) is list, "'column_types' must be a list." - assert len(column_types) == len(column_names), "Number of output columns must match the size of column names" + assert len(column_types) == len( + column_names + ), "Number of output columns must match the size of column names" with cython_context(): - return SFrame(_proxy=self.__proxy__.flat_map(fn, column_names, column_types, seed)) + return SFrame( + _proxy=self.__proxy__.flat_map(fn, column_names, column_types, seed) + ) def sample(self, fraction, seed=None, exact=False): """ @@ -2686,10 +2848,10 @@ def sample(self, fraction, seed=None, exact=False): if seed is None: seed = abs(hash("%0.20f" % time.time())) % (2 ** 31) - if (fraction > 1 or fraction < 0): - raise ValueError('Invalid sampling rate: ' + str(fraction)) + if fraction > 1 or fraction < 0: + raise ValueError("Invalid sampling rate: " + str(fraction)) - if (self.num_rows() == 0 or self.num_columns() == 0): + if self.num_rows() == 0 or self.num_columns() == 0: return self else: with cython_context(): @@ -2735,10 +2897,10 @@ def random_split(self, fraction, seed=None, exact=False): >>> print(len(sf_train), len(sf_test)) 922 102 """ - if (fraction > 1 or fraction < 0): - raise ValueError('Invalid sampling rate: ' + str(fraction)) - - if (self.num_rows() == 0 or self.num_columns() == 0): + if fraction > 1 or fraction < 0: + raise ValueError("Invalid sampling rate: " + str(fraction)) + + if self.num_rows() == 0 or self.num_columns() == 0: return (SFrame(), SFrame()) if seed is None: @@ -2749,12 +2911,14 @@ def random_split(self, fraction, seed=None, exact=False): try: seed = int(seed) except ValueError: - raise ValueError('The \'seed\' parameter must be of type int.') - + raise ValueError("The 'seed' parameter must be of type int.") with cython_context(): proxy_pair = self.__proxy__.random_split(fraction, seed, exact) - return (SFrame(data=[], _proxy=proxy_pair[0]), SFrame(data=[], _proxy=proxy_pair[1])) + return ( + SFrame(data=[], _proxy=proxy_pair[0]), + SFrame(data=[], _proxy=proxy_pair[1]), + ) def topk(self, column_name, k=10, reverse=False): """ @@ -2810,7 +2974,6 @@ def topk(self, column_name, k=10, reverse=False): if type(column_name) is not str: raise TypeError("column_name must be a string") - sf = self[self[column_name].is_topk(k, reverse)] return sf.sort(column_name, ascending=reverse) @@ -2847,38 +3010,54 @@ def save(self, filename, format=None): """ if format is None: - if filename.endswith(('.csv', '.csv.gz')): - format = 'csv' - elif filename.endswith(('.json')): - format = 'json' + if filename.endswith((".csv", ".csv.gz")): + format = "csv" + elif filename.endswith((".json")): + format = "json" else: - format = 'binary' + format = "binary" else: - if format == 'csv': - if not filename.endswith(('.csv', '.csv.gz')): - filename = filename + '.csv' - elif format != 'binary' and format != 'json': - raise ValueError("Invalid format: {}. Supported formats are 'csv' and 'binary' and 'json'".format(format)) + if format == "csv": + if not filename.endswith((".csv", ".csv.gz")): + filename = filename + ".csv" + elif format != "binary" and format != "json": + raise ValueError( + "Invalid format: {}. Supported formats are 'csv' and 'binary' and 'json'".format( + format + ) + ) ## Save the SFrame url = _make_internal_url(filename) with cython_context(): - if format == 'binary': + if format == "binary": self.__proxy__.save(url) - elif format == 'csv': - assert filename.endswith(('.csv', '.csv.gz')) + elif format == "csv": + assert filename.endswith((".csv", ".csv.gz")) self.__proxy__.save_as_csv(url, {}) - elif format == 'json': + elif format == "json": self.export_json(url) else: raise ValueError("Unsupported format: {}".format(format)) - def export_csv(self, filename, delimiter=',', line_terminator='\n', - header=True, quote_level=csv.QUOTE_NONNUMERIC, double_quote=True, - escape_char='\\', quote_char='\"', na_rep='', - file_header='', file_footer='', line_prefix='', - _no_prefix_on_first_value=False, **kwargs): + def export_csv( + self, + filename, + delimiter=",", + line_terminator="\n", + header=True, + quote_level=csv.QUOTE_NONNUMERIC, + double_quote=True, + escape_char="\\", + quote_char='"', + na_rep="", + file_header="", + file_footer="", + line_prefix="", + _no_prefix_on_first_value=False, + **kwargs + ): """ Writes an SFrame to a CSV file. @@ -2925,49 +3104,47 @@ def export_csv(self, filename, delimiter=',', line_terminator='\n', """ # Pandas argument compatibility if "sep" in kwargs: - delimiter = kwargs['sep'] - del kwargs['sep'] + delimiter = kwargs["sep"] + del kwargs["sep"] if "quotechar" in kwargs: - quote_char = kwargs['quotechar'] - del kwargs['quotechar'] + quote_char = kwargs["quotechar"] + del kwargs["quotechar"] if "doublequote" in kwargs: - double_quote = kwargs['doublequote'] - del kwargs['doublequote'] + double_quote = kwargs["doublequote"] + del kwargs["doublequote"] if "lineterminator" in kwargs: - line_terminator = kwargs['lineterminator'] - del kwargs['lineterminator'] + line_terminator = kwargs["lineterminator"] + del kwargs["lineterminator"] if len(kwargs) > 0: raise TypeError("Unexpected keyword arguments " + str(list(kwargs.keys()))) write_csv_options = {} - write_csv_options['delimiter'] = delimiter - write_csv_options['escape_char'] = escape_char - write_csv_options['double_quote'] = double_quote - write_csv_options['quote_char'] = quote_char + write_csv_options["delimiter"] = delimiter + write_csv_options["escape_char"] = escape_char + write_csv_options["double_quote"] = double_quote + write_csv_options["quote_char"] = quote_char if quote_level == csv.QUOTE_MINIMAL: - write_csv_options['quote_level'] = 0 + write_csv_options["quote_level"] = 0 elif quote_level == csv.QUOTE_ALL: - write_csv_options['quote_level'] = 1 + write_csv_options["quote_level"] = 1 elif quote_level == csv.QUOTE_NONNUMERIC: - write_csv_options['quote_level'] = 2 + write_csv_options["quote_level"] = 2 elif quote_level == csv.QUOTE_NONE: - write_csv_options['quote_level'] = 3 - write_csv_options['header'] = header - write_csv_options['line_terminator'] = line_terminator - write_csv_options['na_value'] = na_rep - write_csv_options['file_header'] = file_header - write_csv_options['file_footer'] = file_footer - write_csv_options['line_prefix'] = line_prefix + write_csv_options["quote_level"] = 3 + write_csv_options["header"] = header + write_csv_options["line_terminator"] = line_terminator + write_csv_options["na_value"] = na_rep + write_csv_options["file_header"] = file_header + write_csv_options["file_footer"] = file_footer + write_csv_options["line_prefix"] = line_prefix # undocumented option. Disables line prefix on the first value line - write_csv_options['_no_prefix_on_first_value'] = _no_prefix_on_first_value + write_csv_options["_no_prefix_on_first_value"] = _no_prefix_on_first_value url = _make_internal_url(filename) self.__proxy__.save_as_csv(url, write_csv_options) - def export_json(self, - filename, - orient='records'): + def export_json(self, filename, orient="records"): """ Writes an SFrame to a JSON file. @@ -3033,14 +3210,19 @@ def export_json(self, """ if orient == "records": self.pack_columns(dtype=dict).export_csv( - filename, file_header='[', file_footer=']', - header=False, double_quote=False, - quote_level=csv.QUOTE_NONE, - line_prefix=',', - _no_prefix_on_first_value=True) + filename, + file_header="[", + file_footer="]", + header=False, + double_quote=False, + quote_level=csv.QUOTE_NONE, + line_prefix=",", + _no_prefix_on_first_value=True, + ) elif orient == "lines": self.pack_columns(dtype=dict).export_csv( - filename, header=False, double_quote=False, quote_level=csv.QUOTE_NONE) + filename, header=False, double_quote=False, quote_level=csv.QUOTE_NONE + ) else: raise ValueError("Invalid value for orient parameter (" + str(orient) + ")") @@ -3071,7 +3253,6 @@ def _save_reference(self, filename): with cython_context(): self.__proxy__.save_reference(url) - def select_column(self, column_name): """ Get a reference to the :class:`~turicreate.SArray` that corresponds with @@ -3152,19 +3333,32 @@ def select_columns(self, column_names): """ if not _is_non_string_iterable(column_names): raise TypeError("column_names must be an iterable") - if not (all([isinstance(x, six.string_types) or isinstance(x, type) or isinstance(x, bytes) - for x in column_names])): + if not ( + all( + [ + isinstance(x, six.string_types) + or isinstance(x, type) + or isinstance(x, bytes) + for x in column_names + ] + ) + ): raise TypeError("Invalid key type: must be str, unicode, bytes or type") - requested_str_columns = [s for s in column_names if isinstance(s, six.string_types)] + requested_str_columns = [ + s for s in column_names if isinstance(s, six.string_types) + ] # Make sure there are no duplicates keys from collections import Counter + column_names_counter = Counter(column_names) if (len(column_names)) != len(column_names_counter): for key in column_names_counter: if column_names_counter[key] > 1: - raise ValueError("There are duplicate keys in key list: '" + key + "'") + raise ValueError( + "There are duplicate keys in key list: '" + key + "'" + ) colnames_and_types = list(zip(self.column_names(), self.column_types())) @@ -3183,7 +3377,9 @@ def select_columns(self, column_names): selected_columns = selected_columns with cython_context(): - return SFrame(data=[], _proxy=self.__proxy__.select_columns(selected_columns)) + return SFrame( + data=[], _proxy=self.__proxy__.select_columns(selected_columns) + ) def add_column(self, data, column_name="", inplace=False): """ @@ -3236,7 +3432,6 @@ def add_column(self, data, column_name="", inplace=False): """ # Check type for pandas dataframe or SArray? - if not isinstance(data, SArray): if isinstance(data, _Iterable): data = SArray(data) @@ -3318,7 +3513,9 @@ def add_columns(self, data, column_names=None, inplace=False): my_columns = set(self.column_names()) for name in column_names: if name in my_columns: - raise ValueError("Column '" + name + "' already exists in current SFrame") + raise ValueError( + "Column '" + name + "' already exists in current SFrame" + ) else: if not _is_non_string_iterable(datalist): raise TypeError("datalist must be an iterable") @@ -3381,7 +3578,7 @@ def remove_column(self, column_name, inplace=False): """ column_name = str(column_name) if column_name not in self.column_names(): - raise KeyError('Cannot find column %s' % column_name) + raise KeyError("Cannot find column %s" % column_name) colid = self.column_names().index(column_name) if inplace: @@ -3437,7 +3634,7 @@ def remove_columns(self, column_names, inplace=False): for name in column_names: if name not in existing_columns: - raise KeyError('Cannot find column %s' % name) + raise KeyError("Cannot find column %s" % name) # Delete it going backwards so we don't invalidate indices deletion_indices = sorted(existing_columns[name] for name in column_names) @@ -3454,7 +3651,6 @@ def remove_columns(self, column_names, inplace=False): ret._cache = None return ret - def swap_columns(self, column_name_1, column_name_2, inplace=False): """ Returns an SFrame with two column positions swapped. @@ -3554,12 +3750,12 @@ def rename(self, names, inplace=False): +-------+-----------------+ [2 rows x 2 columns] """ - if (type(names) is not dict): - raise TypeError('names must be a dictionary: oldname -> newname') + if type(names) is not dict: + raise TypeError("names must be a dictionary: oldname -> newname") all_columns = set(self.column_names()) for k in names: if not k in all_columns: - raise ValueError('Cannot find column %s in the SFrame' % k) + raise ValueError("Cannot find column %s in the SFrame" % k) if inplace: ret = self @@ -3598,7 +3794,7 @@ def __getitem__(self, key): return self._row_selector(key) elif isinstance(key, six.string_types): if six.PY2 and type(key) == unicode: - key = key.encode('utf-8') + key = key.encode("utf-8") return self.select_column(key) elif type(key) is type: return self.select_columns([key]) @@ -3612,7 +3808,7 @@ def __getitem__(self, key): if key >= sf_len: raise IndexError("SFrame index out of range") - if not hasattr(self, '_cache') or self._cache is None: + if not hasattr(self, "_cache") or self._cache is None: self._cache = {} try: @@ -3627,8 +3823,9 @@ def __getitem__(self, key): # Do we have a good block size that won't cause memory to blow up? if not "getitem_cache_blocksize" in self._cache: - block_size = \ - (8*1024) // sum( (2 if dt in [int, long, float] else 8) for dt in self.column_types()) + block_size = (8 * 1024) // sum( + (2 if dt in [int, long, float] else 8) for dt in self.column_types() + ) block_size = max(16, block_size) self._cache["getitem_cache_blocksize"] = block_size @@ -3640,7 +3837,7 @@ def __getitem__(self, key): lb = block_num * block_size ub = min(sf_len, lb + block_size) - val_list = list(SFrame(_proxy = self.__proxy__.copy_range(lb, 1, ub))) + val_list = list(SFrame(_proxy=self.__proxy__.copy_range(lb, 1, ub))) self._cache["getitem_cache"] = (lb, ub, val_list) return val_list[int(key - lb)] @@ -3659,7 +3856,7 @@ def __getitem__(self, key): start = len(self) + start if stop < 0: stop = len(self) + stop - return SFrame(_proxy = self.__proxy__.copy_range(start, step, stop)) + return SFrame(_proxy=self.__proxy__.copy_range(start, step, stop)) else: raise TypeError("Invalid index type: must be SArray, list, int, or str") @@ -3675,7 +3872,7 @@ def __setitem__(self, key, value): self.add_columns(value, key, inplace=True) elif type(key) is str: sa_value = None - if (type(value) is SArray): + if type(value) is SArray: sa_value = value elif _is_non_string_iterable(value): # wrap list, array... to sarray sa_value = SArray(value) @@ -3692,29 +3889,29 @@ def __setitem__(self, key, value): # length than current one, which doesn't make sense if we are replacing # the only column. To support this, we first take out the only column # and then put it back if exception happens - single_column = (self.num_columns() == 1) - if (single_column): + single_column = self.num_columns() == 1 + if single_column: tmpname = key saved_column = self.select_column(key) self.remove_column(key, inplace=True) else: # add the column to a unique column name. - tmpname = '__' + '-'.join(self.column_names()) + tmpname = "__" + "-".join(self.column_names()) try: self.add_column(sa_value, tmpname, inplace=True) except Exception: - if (single_column): + if single_column: self.add_column(saved_column, key, inplace=True) raise - if (not single_column): + if not single_column: # if add succeeded, remove the column name and rename tmpname->columnname. self.swap_columns(key, tmpname, inplace=True) self.remove_column(key, inplace=True) self.rename({tmpname: key}, inplace=True) else: - raise TypeError('Cannot set column with key type ' + str(type(key))) + raise TypeError("Cannot set column with key type " + str(type(key))) def __delitem__(self, key): """ @@ -3759,13 +3956,12 @@ def __iter__(self): Provides an iterator to the rows of the SFrame. """ - def generator(): elems_at_a_time = 262144 self.__proxy__.begin_iterator() ret = self.__proxy__.iterator_get_next(elems_at_a_time) column_names = self.column_names() - while(True): + while True: for j in ret: yield dict(list(zip(column_names, j))) @@ -4119,83 +4315,109 @@ def groupby(self, key_column_names, operations, *args): # element (probably COUNT). wrap it in a list so we can reuse the # list processing code operation = op_entry - if not(isinstance(operation, list) or isinstance(operation, dict)): - operation = [operation] + if not (isinstance(operation, list) or isinstance(operation, dict)): + operation = [operation] if isinstance(operation, dict): - # now sweep the dict and add to group_columns and group_ops - for key in operation: - val = operation[key] - if type(val) is tuple: - (op, column) = val - if (op == '__builtin__avg__' and self[column[0]].dtype in [array.array, numpy.ndarray]): - op = '__builtin__vector__avg__' - - if (op == '__builtin__sum__' and self[column[0]].dtype in [array.array, numpy.ndarray]): - op = '__builtin__vector__sum__' - - if (op == '__builtin__argmax__' or op == '__builtin__argmin__') and ((type(column[0]) is tuple) != (type(key) is tuple)): - raise TypeError("Output column(s) and aggregate column(s) for aggregate operation should be either all tuple or all string.") - - if (op == '__builtin__argmax__' or op == '__builtin__argmin__') and type(column[0]) is tuple: - for (col,output) in zip(column[0],key): - group_columns = group_columns + [[col,column[1]]] + # now sweep the dict and add to group_columns and group_ops + for key in operation: + val = operation[key] + if type(val) is tuple: + (op, column) = val + if op == "__builtin__avg__" and self[column[0]].dtype in [ + array.array, + numpy.ndarray, + ]: + op = "__builtin__vector__avg__" + + if op == "__builtin__sum__" and self[column[0]].dtype in [ + array.array, + numpy.ndarray, + ]: + op = "__builtin__vector__sum__" + + if ( + op == "__builtin__argmax__" or op == "__builtin__argmin__" + ) and ((type(column[0]) is tuple) != (type(key) is tuple)): + raise TypeError( + "Output column(s) and aggregate column(s) for aggregate operation should be either all tuple or all string." + ) + + if ( + op == "__builtin__argmax__" or op == "__builtin__argmin__" + ) and type(column[0]) is tuple: + for (col, output) in zip(column[0], key): + group_columns = group_columns + [[col, column[1]]] + group_ops = group_ops + [op] + group_output_columns = group_output_columns + [output] + else: + group_columns = group_columns + [column] + group_ops = group_ops + [op] + group_output_columns = group_output_columns + [key] + + if op == "__builtin__concat__dict__": + key_column = column[0] + key_column_type = self.select_column(key_column).dtype + if not key_column_type in (int, float, str): + raise TypeError( + "CONCAT key column must be int, float or str type" + ) + + elif val == aggregate.COUNT: + group_output_columns = group_output_columns + [key] + val = aggregate.COUNT() + (op, column) = val + group_columns = group_columns + [column] group_ops = group_ops + [op] - group_output_columns = group_output_columns + [output] else: - group_columns = group_columns + [column] - group_ops = group_ops + [op] - group_output_columns = group_output_columns + [key] - - if (op == '__builtin__concat__dict__'): - key_column = column[0] - key_column_type = self.select_column(key_column).dtype - if not key_column_type in (int, float, str): - raise TypeError('CONCAT key column must be int, float or str type') - - elif val == aggregate.COUNT: - group_output_columns = group_output_columns + [key] - val = aggregate.COUNT() - (op, column) = val - group_columns = group_columns + [column] - group_ops = group_ops + [op] - else: - raise TypeError("Unexpected type in aggregator definition of output column: " + key) + raise TypeError( + "Unexpected type in aggregator definition of output column: " + + key + ) elif isinstance(operation, list): - # we will be using automatically defined column names - for val in operation: - if type(val) is tuple: - (op, column) = val - if (op == '__builtin__avg__' and self[column[0]].dtype in [array.array, numpy.ndarray]): - op = '__builtin__vector__avg__' - - if (op == '__builtin__sum__' and self[column[0]].dtype in [array.array, numpy.ndarray]): - op = '__builtin__vector__sum__' - - if (op == '__builtin__argmax__' or op == '__builtin__argmin__') and type(column[0]) is tuple: - for col in column[0]: - group_columns = group_columns + [[col,column[1]]] - group_ops = group_ops + [op] + # we will be using automatically defined column names + for val in operation: + if type(val) is tuple: + (op, column) = val + if op == "__builtin__avg__" and self[column[0]].dtype in [ + array.array, + numpy.ndarray, + ]: + op = "__builtin__vector__avg__" + + if op == "__builtin__sum__" and self[column[0]].dtype in [ + array.array, + numpy.ndarray, + ]: + op = "__builtin__vector__sum__" + + if ( + op == "__builtin__argmax__" or op == "__builtin__argmin__" + ) and type(column[0]) is tuple: + for col in column[0]: + group_columns = group_columns + [[col, column[1]]] + group_ops = group_ops + [op] + group_output_columns = group_output_columns + [""] + else: + group_columns = group_columns + [column] + group_ops = group_ops + [op] + group_output_columns = group_output_columns + [""] + + if op == "__builtin__concat__dict__": + key_column = column[0] + key_column_type = self.select_column(key_column).dtype + if not key_column_type in (int, float, str): + raise TypeError( + "CONCAT key column must be int, float or str type" + ) + + elif val == aggregate.COUNT: group_output_columns = group_output_columns + [""] + val = aggregate.COUNT() + (op, column) = val + group_columns = group_columns + [column] + group_ops = group_ops + [op] else: - group_columns = group_columns + [column] - group_ops = group_ops + [op] - group_output_columns = group_output_columns + [""] - - if (op == '__builtin__concat__dict__'): - key_column = column[0] - key_column_type = self.select_column(key_column).dtype - if not key_column_type in (int, float, str): - raise TypeError('CONCAT key column must be int, float or str type') - - elif val == aggregate.COUNT: - group_output_columns = group_output_columns + [""] - val = aggregate.COUNT() - (op, column) = val - group_columns = group_columns + [column] - group_ops = group_ops + [op] - else: - raise TypeError("Unexpected type in aggregator definition.") - + raise TypeError("Unexpected type in aggregator definition.") # let's validate group_columns and group_ops are valid for (cols, op) in zip(group_columns, group_ops): @@ -4211,14 +4433,14 @@ def groupby(self, key_column_names, operations, *args): if col not in my_column_names: raise KeyError("Column " + col + " does not exist in SFrame") - with cython_context(): - return SFrame(_proxy=self.__proxy__.groupby_aggregate(key_columns_array, - group_columns, - group_output_columns, - group_ops)) + return SFrame( + _proxy=self.__proxy__.groupby_aggregate( + key_columns_array, group_columns, group_output_columns, group_ops + ) + ) - def join(self, right, on=None, how='inner', alter_name=None): + def join(self, right, on=None, how="inner", alter_name=None): """ Merge two SFrames. Merges the current (left) SFrame with the given (right) SFrame using a SQL-style equi-join operation by columns. @@ -4334,7 +4556,7 @@ def join(self, right, on=None, how='inner', alter_name=None): +----+-------+-------+ [5 rows x 3 columns] """ - available_join_types = ['left','right','outer','inner'] + available_join_types = ["left", "right", "outer", "inner"] if not isinstance(right, SFrame): raise TypeError("Can only join two SFrames") @@ -4366,7 +4588,9 @@ def join(self, right, on=None, how='inner', alter_name=None): with cython_context(): if alter_name is None: - return SFrame(_proxy=self.__proxy__.join(right.__proxy__, how, join_keys)) + return SFrame( + _proxy=self.__proxy__.join(right.__proxy__, how, join_keys) + ) if type(alter_name) is dict: left_names = self.column_names() right_names = right.column_names() @@ -4377,7 +4601,11 @@ def join(self, right, on=None, how='inner', alter_name=None): raise ValueError("Key %s should not be equal to value" % k) if v in left_names or v in right_names: raise ValueError("Value %s will cause further collision" % v) - return SFrame(_proxy=self.__proxy__.join_with_custom_name(right.__proxy__, how, join_keys, alter_name)) + return SFrame( + _proxy=self.__proxy__.join_with_custom_name( + right.__proxy__, how, join_keys, alter_name + ) + ) def filter_by(self, values, column_name, exclude=False): """ @@ -4495,8 +4723,13 @@ def filter_by(self, values, column_name, exclude=False): given_type = value_sf.column_types()[0] if given_type != existing_type: - raise TypeError(("Type of given values ({0}) does not match type of column '" + - column_name + "' ({1}) in SFrame.").format(given_type, existing_type)) + raise TypeError( + ( + "Type of given values ({0}) does not match type of column '" + + column_name + + "' ({1}) in SFrame." + ).format(given_type, existing_type) + ) # Make sure the values list has unique values, or else join will not # filter. @@ -4511,16 +4744,20 @@ def filter_by(self, values, column_name, exclude=False): id_name += "1" value_sf = value_sf.add_row_number(id_name) - tmp = SFrame(_proxy=self.__proxy__.join(value_sf.__proxy__, - 'left', - {column_name:column_name})) + tmp = SFrame( + _proxy=self.__proxy__.join( + value_sf.__proxy__, "left", {column_name: column_name} + ) + ) ret_sf = tmp[tmp[id_name] == None] del ret_sf[id_name] return ret_sf else: - return SFrame(_proxy=self.__proxy__.join(value_sf.__proxy__, - 'inner', - {column_name:column_name})) + return SFrame( + _proxy=self.__proxy__.join( + value_sf.__proxy__, "inner", {column_name: column_name} + ) + ) def explore(self, title=None): """ @@ -4549,33 +4786,47 @@ def explore(self, title=None): import sys - if sys.platform != 'darwin' and sys.platform != 'linux2' and sys.platform != 'linux': - raise NotImplementedError('Visualization is currently supported only on macOS and Linux.') + if ( + sys.platform != "darwin" + and sys.platform != "linux2" + and sys.platform != "linux" + ): + raise NotImplementedError( + "Visualization is currently supported only on macOS and Linux." + ) # Suppress visualization output if 'none' target is set - from ..visualization._plot import _target, display_table_in_notebook, _ensure_web_server - if _target == 'none': + from ..visualization._plot import ( + _target, + display_table_in_notebook, + _ensure_web_server, + ) + + if _target == "none": return if title is None: title = "" # If browser target is set, launch in web browser - if _target == 'browser': + if _target == "browser": # First, make sure TURI_VISUALIZATION_WEB_SERVER_ROOT_DIRECTORY is set _ensure_web_server() # Launch localhost URL using Python built-in webbrowser module import webbrowser import turicreate as tc + url = tc.extensions.get_url_for_table(self, title) webbrowser.open_new_tab(url) return # If auto target is set, try to show inline in Jupyter Notebook try: - if _target == 'auto' and \ - (get_ipython().__class__.__name__ == "ZMQInteractiveShell" or get_ipython().__class__.__name__ == "Shell"): + if _target == "auto" and ( + get_ipython().__class__.__name__ == "ZMQInteractiveShell" + or get_ipython().__class__.__name__ == "Shell" + ): display_table_in_notebook(self, title) return except NameError: @@ -4633,8 +4884,15 @@ def plot(self): """ return Plot(_proxy=self.__proxy__.plot()) - def pack_columns(self, column_names=None, column_name_prefix=None, dtype=list, - fill_na=None, remove_prefix=True, new_column_name=None): + def pack_columns( + self, + column_names=None, + column_name_prefix=None, + dtype=list, + fill_na=None, + remove_prefix=True, + new_column_name=None, + ): """ Pack columns of the current SFrame into one single column. The result is a new SFrame with the unaffected columns from the original SFrame @@ -4799,7 +5057,9 @@ def pack_columns(self, column_names=None, column_name_prefix=None, dtype=list, """ if column_names is not None and column_name_prefix is not None: - raise ValueError("'column_names' and 'column_name_prefix' parameter cannot be given at the same time.") + raise ValueError( + "'column_names' and 'column_name_prefix' parameter cannot be given at the same time." + ) if new_column_name is None and column_name_prefix is not None: new_column_name = column_name_prefix @@ -4807,9 +5067,15 @@ def pack_columns(self, column_names=None, column_name_prefix=None, dtype=list, if column_name_prefix is not None: if type(column_name_prefix) != str: raise TypeError("'column_name_prefix' must be a string") - column_names = [name for name in self.column_names() if name.startswith(column_name_prefix)] + column_names = [ + name + for name in self.column_names() + if name.startswith(column_name_prefix) + ] if len(column_names) == 0: - raise ValueError("There is no column starts with prefix '" + column_name_prefix + "'") + raise ValueError( + "There is no column starts with prefix '" + column_name_prefix + "'" + ) elif column_names is None: column_names = self.column_names() else: @@ -4818,15 +5084,21 @@ def pack_columns(self, column_names=None, column_name_prefix=None, dtype=list, column_name_set = set(self.column_names()) for column in column_names: - if (column not in column_name_set): - raise ValueError("Current SFrame has no column called '" + str(column) + "'.") + if column not in column_name_set: + raise ValueError( + "Current SFrame has no column called '" + str(column) + "'." + ) # check duplicate names if len(set(column_names)) != len(column_names): - raise ValueError("There is duplicate column names in column_names parameter") + raise ValueError( + "There is duplicate column names in column_names parameter" + ) - if (dtype not in (dict, list, array.array)): - raise ValueError("Resulting dtype has to be one of dict/array.array/list type") + if dtype not in (dict, list, array.array): + raise ValueError( + "Resulting dtype has to be one of dict/array.array/list type" + ) # fill_na value for array needs to be numeric if dtype == array.array: @@ -4835,44 +5107,59 @@ def pack_columns(self, column_names=None, column_name_prefix=None, dtype=list, # all column_names have to be numeric type for column in column_names: if self[column].dtype not in (int, float): - raise TypeError("Column '" + column + "' type is not numeric, cannot pack into array type") + raise TypeError( + "Column '" + + column + + "' type is not numeric, cannot pack into array type" + ) # generate dict key names if pack to dictionary # we try to be smart here # if all column names are like: a.b, a.c, a.d,... # we then use "b", "c", "d", etc as the dictionary key during packing - if (dtype == dict) and (column_name_prefix is not None) and (remove_prefix == True): + if ( + (dtype == dict) + and (column_name_prefix is not None) + and (remove_prefix == True) + ): size_prefix = len(column_name_prefix) - first_char = set([c[size_prefix:size_prefix+1] for c in column_names]) - if ((len(first_char) == 1) and first_char.pop() in ['.','-','_']): - dict_keys = [name[size_prefix+1:] for name in column_names] + first_char = set([c[size_prefix : size_prefix + 1] for c in column_names]) + if (len(first_char) == 1) and first_char.pop() in [".", "-", "_"]: + dict_keys = [name[size_prefix + 1 :] for name in column_names] else: dict_keys = [name[size_prefix:] for name in column_names] else: dict_keys = column_names - rest_columns = [name for name in self.column_names() if name not in column_names] + rest_columns = [ + name for name in self.column_names() if name not in column_names + ] if new_column_name is not None: if type(new_column_name) != str: raise TypeError("'new_column_name' has to be a string") if new_column_name in rest_columns: - raise KeyError("Current SFrame already contains a column name " + new_column_name) + raise KeyError( + "Current SFrame already contains a column name " + new_column_name + ) else: new_column_name = "" - ret_sa = None with cython_context(): - ret_sa = SArray(_proxy=self.__proxy__.pack_columns(column_names, dict_keys, - dtype, fill_na)) + ret_sa = SArray( + _proxy=self.__proxy__.pack_columns( + column_names, dict_keys, dtype, fill_na + ) + ) new_sf = self.select_columns(rest_columns) new_sf.add_column(ret_sa, new_column_name, inplace=True) return new_sf - - def split_datetime(self, column_name, column_name_prefix=None, limit=None, timezone=False): + def split_datetime( + self, column_name, column_name_prefix=None, limit=None, timezone=False + ): """ Splits a datetime column of SFrame to multiple columns, with each value in a separate column. Returns a new SFrame with the expanded column replaced with @@ -4938,7 +5225,9 @@ def split_datetime(self, column_name, column_name_prefix=None, limit=None, timez +----+-----------------+-------------------+ """ if column_name not in self.column_names(): - raise KeyError("column '" + column_name + "' does not exist in current SFrame") + raise KeyError( + "column '" + column_name + "' does not exist in current SFrame" + ) if column_name_prefix is None: column_name_prefix = column_name @@ -4946,7 +5235,7 @@ def split_datetime(self, column_name, column_name_prefix=None, limit=None, timez new_sf = self[column_name].split_datetime(column_name_prefix, limit, timezone) # construct return SFrame, check if there is conflict - rest_columns = [name for name in self.column_names() if name != column_name] + rest_columns = [name for name in self.column_names() if name != column_name] new_names = new_sf.column_names() while set(new_names).intersection(rest_columns): new_names = [name + ".1" for name in new_names] @@ -4956,8 +5245,14 @@ def split_datetime(self, column_name, column_name_prefix=None, limit=None, timez ret_sf.add_columns(new_sf, inplace=True) return ret_sf - def unpack(self, column_name=None, column_name_prefix=None, column_types=None, - na_value=None, limit=None): + def unpack( + self, + column_name=None, + column_name_prefix=None, + column_types=None, + na_value=None, + limit=None, + ): """ Expand one column of this SFrame to multiple columns with each value in a separate column. Returns a new SFrame with the unpacked column @@ -5096,34 +5391,40 @@ def unpack(self, column_name=None, column_name_prefix=None, column_types=None, """ if column_name is None: - if self.num_columns()==0: + if self.num_columns() == 0: raise RuntimeError("No column exists in the current SFrame") for t in range(self.num_columns()): column_type = self.column_types()[t] - if column_type==dict or column_type==list or column_type==array.array: + if ( + column_type == dict + or column_type == list + or column_type == array.array + ): if column_name is None: column_name = self.column_names()[t] else: raise RuntimeError("Column name needed to unpack") - if column_name is None: raise RuntimeError("No columns can be unpacked") elif column_name_prefix is None: - column_name_prefix="" + column_name_prefix = "" elif column_name not in self.column_names(): - raise KeyError("Column '" + column_name + "' does not exist in current SFrame") + raise KeyError( + "Column '" + column_name + "' does not exist in current SFrame" + ) if column_name_prefix is None: column_name_prefix = column_name - - new_sf = self[column_name].unpack(column_name_prefix, column_types, na_value, limit) + new_sf = self[column_name].unpack( + column_name_prefix, column_types, na_value, limit + ) # construct return SFrame, check if there is conflict - rest_columns = [name for name in self.column_names() if name != column_name] + rest_columns = [name for name in self.column_names() if name != column_name] new_names = new_sf.column_names() while set(new_names).intersection(rest_columns): new_names = [name + ".1" for name in new_names] @@ -5133,7 +5434,9 @@ def unpack(self, column_name=None, column_name_prefix=None, column_types=None, ret_sf.add_columns(new_sf, inplace=True) return ret_sf - def stack(self, column_name, new_column_name=None, drop_na=False, new_column_type=None): + def stack( + self, column_name, new_column_name=None, drop_na=False, new_column_type=None + ): """ Convert a "wide" column of an SFrame to one or two "tall" columns by stacking all values. @@ -5261,11 +5564,15 @@ def stack(self, column_name, new_column_name=None, drop_na=False, new_column_typ # validate column_name column_name = str(column_name) if column_name not in self.column_names(): - raise ValueError("Cannot find column '" + str(column_name) + "' in the SFrame.") + raise ValueError( + "Cannot find column '" + str(column_name) + "' in the SFrame." + ) - stack_column_type = self[column_name].dtype - if (stack_column_type not in [dict, array.array, list]): - raise TypeError("Stack is only supported for column of dict/list/array type.") + stack_column_type = self[column_name].dtype + if stack_column_type not in [dict, array.array, list]: + raise TypeError( + "Stack is only supported for column of dict/list/array type." + ) # user defined types. do some checking if new_column_type is not None: @@ -5274,60 +5581,70 @@ def stack(self, column_name, new_column_name=None, drop_na=False, new_column_typ new_column_type = [new_column_type] if (stack_column_type in [list, array.array]) and len(new_column_type) != 1: - raise ValueError("Expecting a single column type to unpack list or array columns") + raise ValueError( + "Expecting a single column type to unpack list or array columns" + ) if (stack_column_type in [dict]) and len(new_column_type) != 2: raise ValueError("Expecting two column types to unpack a dict column") - if (new_column_name is not None): + if new_column_name is not None: if stack_column_type == dict: - if (type(new_column_name) is not list): - raise TypeError("new_column_name has to be a list to stack dict type") - elif (len(new_column_name) != 2): + if type(new_column_name) is not list: + raise TypeError( + "new_column_name has to be a list to stack dict type" + ) + elif len(new_column_name) != 2: raise TypeError("new_column_name must have length of two") else: - if (type(new_column_name) != str): + if type(new_column_name) != str: raise TypeError("new_column_name has to be a str") new_column_name = [new_column_name] # check if the new column name conflicts with existing ones for name in new_column_name: if (name in self.column_names()) and (name != column_name): - raise ValueError("Column with name '" + name + "' already exists, pick a new column name") + raise ValueError( + "Column with name '" + + name + + "' already exists, pick a new column name" + ) else: if stack_column_type == dict: - new_column_name = ["",""] + new_column_name = ["", ""] else: new_column_name = [""] # infer column types head_row = SArray(self[column_name].head(100)).dropna() - if (len(head_row) == 0): - raise ValueError("Cannot infer column type because there is not enough rows to infer value") + if len(head_row) == 0: + raise ValueError( + "Cannot infer column type because there is not enough rows to infer value" + ) if new_column_type is None: # we have to perform type inference if stack_column_type == dict: # infer key/value type - keys = []; values = [] + keys = [] + values = [] for row in head_row: for val in row: keys.append(val) - if val is not None: values.append(row[val]) + if val is not None: + values.append(row[val]) - new_column_type = [ - infer_type_of_list(keys), - infer_type_of_list(values) - ] + new_column_type = [infer_type_of_list(keys), infer_type_of_list(values)] else: values = [v for v in itertools.chain.from_iterable(head_row)] new_column_type = [infer_type_of_list(values)] - with cython_context(): - return SFrame(_proxy=self.__proxy__.stack(column_name, - new_column_name, - new_column_type, drop_na)) + return SFrame( + _proxy=self.__proxy__.stack( + column_name, new_column_name, new_column_type, drop_na + ) + ) def unstack(self, column_names, new_column_name=None): """ @@ -5401,22 +5718,35 @@ def unstack(self, column_names, new_column_name=None): +------+-----------+ [4 rows x 2 columns] """ - if (type(column_names) != str and len(column_names) != 2): - raise TypeError("'column_names' parameter has to be either a string or a list of two strings.") + if type(column_names) != str and len(column_names) != 2: + raise TypeError( + "'column_names' parameter has to be either a string or a list of two strings." + ) with cython_context(): if type(column_names) == str: key_columns = [i for i in self.column_names() if i != column_names] if new_column_name is not None: - return self.groupby(key_columns, {new_column_name : aggregate.CONCAT(column_names)}) + return self.groupby( + key_columns, {new_column_name: aggregate.CONCAT(column_names)} + ) else: return self.groupby(key_columns, aggregate.CONCAT(column_names)) elif len(column_names) == 2: key_columns = [i for i in self.column_names() if i not in column_names] if new_column_name is not None: - return self.groupby(key_columns, {new_column_name: aggregate.CONCAT(column_names[0], column_names[1])}) + return self.groupby( + key_columns, + { + new_column_name: aggregate.CONCAT( + column_names[0], column_names[1] + ) + }, + ) else: - return self.groupby(key_columns, aggregate.CONCAT(column_names[0], column_names[1])) + return self.groupby( + key_columns, aggregate.CONCAT(column_names[0], column_names[1]) + ) def unique(self): """ @@ -5463,7 +5793,7 @@ def unique(self): +----+-------+ [4 rows x 2 columns] """ - return self.groupby(self.column_names(),{}) + return self.groupby(self.column_names(), {}) def sort(self, key_column_names, ascending=True): """ @@ -5566,46 +5896,51 @@ def sort(self, key_column_names, ascending=True): sort_column_orders = [] # validate key_column_names - if (type(key_column_names) == str): + if type(key_column_names) == str: sort_column_names = [key_column_names] - elif (type(key_column_names) == list): - if (len(key_column_names) == 0): + elif type(key_column_names) == list: + if len(key_column_names) == 0: raise ValueError("Please provide at least one column to sort") first_param_types = set([type(i) for i in key_column_names]) - if (len(first_param_types) != 1): + if len(first_param_types) != 1: raise ValueError("key_column_names element are not of the same type") first_param_type = first_param_types.pop() - if (first_param_type == tuple): + if first_param_type == tuple: sort_column_names = [i[0] for i in key_column_names] sort_column_orders = [i[1] for i in key_column_names] - elif(first_param_type == str): + elif first_param_type == str: sort_column_names = key_column_names else: raise TypeError("key_column_names type is not supported") else: - raise TypeError("key_column_names type is not correct. Supported types are str, list of str or list of (str,bool) pair.") + raise TypeError( + "key_column_names type is not correct. Supported types are str, list of str or list of (str,bool) pair." + ) # use the second parameter if the sort order is not given - if (len(sort_column_orders) == 0): + if len(sort_column_orders) == 0: sort_column_orders = [ascending for i in sort_column_names] # make sure all column exists my_column_names = set(self.column_names()) for column in sort_column_names: - if (type(column) != str): - raise TypeError("Only string parameter can be passed in as column names") - if (column not in my_column_names): + if type(column) != str: + raise TypeError( + "Only string parameter can be passed in as column names" + ) + if column not in my_column_names: raise ValueError("SFrame has no column named: '" + str(column) + "'") - if (self[column].dtype not in (str, int, float,datetime.datetime)): + if self[column].dtype not in (str, int, float, datetime.datetime): raise TypeError("Only columns of type (str, int, float) can be sorted") - with cython_context(): - return SFrame(_proxy=self.__proxy__.sort(sort_column_names, sort_column_orders)) + return SFrame( + _proxy=self.__proxy__.sort(sort_column_names, sort_column_orders) + ) - def dropna(self, columns=None, how='any', recursive=False): + def dropna(self, columns=None, how="any", recursive=False): """ Remove missing values from an SFrame. A missing value is either ``None`` or ``NaN``. If ``how`` is 'any', a row will be removed if any of the @@ -5685,9 +6020,13 @@ def dropna(self, columns=None, how='any', recursive=False): (columns, all_behavior) = self.__dropna_errchk(columns, how) with cython_context(): - return SFrame(_proxy=self.__proxy__.drop_missing_values(columns, all_behavior, False, recursive)) + return SFrame( + _proxy=self.__proxy__.drop_missing_values( + columns, all_behavior, False, recursive + ) + ) - def dropna_split(self, columns=None, how='any', recursive=False): + def dropna_split(self, columns=None, how="any", recursive=False): """ Split rows with missing values from this SFrame. This function has the same functionality as :py:func:`~turicreate.SFrame.dropna`, but returns a @@ -5752,7 +6091,9 @@ def dropna_split(self, columns=None, how='any', recursive=False): (columns, all_behavior) = self.__dropna_errchk(columns, how) - sframe_tuple = self.__proxy__.drop_missing_values(columns, all_behavior, True, recursive) + sframe_tuple = self.__proxy__.drop_missing_values( + columns, all_behavior, True, recursive + ) if len(sframe_tuple) != 2: raise RuntimeError("Did not return two SFrames!") @@ -5776,11 +6117,10 @@ def __dropna_errchk(self, columns, how): if (str not in list_types) or (len(list_types) > 1): raise TypeError("All columns must be of 'str' type") - - if how not in ['any','all']: + if how not in ["any", "all"]: raise ValueError("Must specify 'any' or 'all'") - if how == 'all': + if how == "all": all_behavior = True else: all_behavior = False @@ -5833,7 +6173,7 @@ def fillna(self, column_name, value): ret[column_name] = ret[column_name].fillna(value) return ret - def add_row_number(self, column_name='id', start=0, inplace=False): + def add_row_number(self, column_name="id", start=0, inplace=False): """ Returns an SFrame with a new column that numbers each row sequentially. By default the count starts at 0, but this can be changed @@ -5891,7 +6231,9 @@ def add_row_number(self, column_name='id', start=0, inplace=False): raise TypeError("Must give start as int") if column_name in self.column_names(): - raise RuntimeError("Column '" + column_name + "' already exists in the current SFrame") + raise RuntimeError( + "Column '" + column_name + "' already exists in the current SFrame" + ) the_col = _create_sequential_sarray(self.num_rows(), start) diff --git a/src/python/turicreate/data_structures/sframe_builder.py b/src/python/turicreate/data_structures/sframe_builder.py index cc45788075..58344a7b6b 100644 --- a/src/python/turicreate/data_structures/sframe_builder.py +++ b/src/python/turicreate/data_structures/sframe_builder.py @@ -17,6 +17,7 @@ __LOGGER__ = _logging.getLogger(__name__) + class SFrameBuilder(object): """ An interface to incrementally build an SFrame (row by row). It has some @@ -75,8 +76,15 @@ class SFrameBuilder(object): [3 rows x 3 columns] """ - def __init__(self, column_types, column_names=None, num_segments=1, - history_size=10, save_location=None): + + def __init__( + self, + column_types, + column_names=None, + num_segments=1, + history_size=10, + save_location=None, + ): self._column_names = column_names self._column_types = column_types self._num_segments = num_segments @@ -88,22 +96,26 @@ def __init__(self, column_types, column_names=None, num_segments=1, if column_names is not None and column_types is not None: if len(column_names) != len(column_types): - raise AssertionError("There must be same amount of column names as column types.") + raise AssertionError( + "There must be same amount of column names as column types." + ) elif column_names is None and column_types is not None: self._column_names = self._generate_column_names(len(column_types)) else: raise AssertionError("Column types must be defined!") self._builder = UnitySFrameBuilderProxy() - self._builder.init(self._column_types, - self._column_names, - self._num_segments, - self._history_size, - self._save_location) + self._builder.init( + self._column_types, + self._column_names, + self._num_segments, + self._history_size, + self._save_location, + ) self._block_size = 1024 def _generate_column_names(self, num_columns): - return ["X"+str(i) for i in range(1,num_columns+1)] + return ["X" + str(i) for i in range(1, num_columns + 1)] def append(self, data, segment=0): """ @@ -124,7 +136,7 @@ def append(self, data, segment=0): preserved as they are added. """ # Assume this case refers to an SFrame with a single column - if not hasattr(data, '__iter__'): + if not hasattr(data, "__iter__"): data = [data] self._builder.append(data, segment) @@ -146,13 +158,13 @@ def append_multiple(self, data, segment=0): any value in segment 0, and the order of rows in each segment is preserved as they are added. """ - if not hasattr(data, '__iter__'): + if not hasattr(data, "__iter__"): raise TypeError("append_multiple must be passed an iterable object") tmp_list = [] # Avoid copy in cases that we are passed materialized data that is # smaller than our block size - if hasattr(data, '__len__'): + if hasattr(data, "__len__"): if len(data) <= self._block_size: self._builder.append_multiple(data, segment) return @@ -181,7 +193,7 @@ def read_history(self, num=10, segment=0): out : list[list] """ if num < 0: - num = 0 + num = 0 return self._builder.read_history(num, segment) def close(self): diff --git a/src/python/turicreate/data_structures/sgraph.py b/src/python/turicreate/data_structures/sgraph.py index 6f98537702..21da99d697 100644 --- a/src/python/turicreate/data_structures/sgraph.py +++ b/src/python/turicreate/data_structures/sgraph.py @@ -3,12 +3,12 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" This package defines the Turi Create SGraph, Vertex, and Edge objects. The SGraph is a directed graph, consisting of a set of Vertex objects and Edges that connect pairs of Vertices. The methods in this module are available from the top level import of the turicreate package. -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -27,24 +27,25 @@ import copy import sys + if sys.version_info.major > 2: from functools import reduce ## \internal Default column name for vertex id. -_VID_COLUMN = '__id' +_VID_COLUMN = "__id" ## \internal Default column name for source vid. -_SRC_VID_COLUMN = '__src_id' +_SRC_VID_COLUMN = "__src_id" ## \internal Default column name for target vid. -_DST_VID_COLUMN = '__dst_id' +_DST_VID_COLUMN = "__dst_id" -#/**************************************************************************/ -#/* */ -#/* SGraph Related Classes */ -#/* */ -#/**************************************************************************/ +# /**************************************************************************/ +# /* */ +# /* SGraph Related Classes */ +# /* */ +# /**************************************************************************/ class Vertex(object): """ A vertex object, consisting of a vertex ID and a dictionary of vertex @@ -74,7 +75,7 @@ class Vertex(object): >>> g = g.add_vertices(verts) """ - __slots__ = ['vid', 'attr'] + __slots__ = ["vid", "attr"] def __init__(self, vid, attr={}, _series=None): """__init__(self, vid, attr={}) @@ -129,7 +130,7 @@ class Edge(object): >>> g = g.add_vertices(verts).add_edges(edges) """ - __slots__ = ['src_vid', 'dst_vid', 'attr'] + __slots__ = ["src_vid", "dst_vid", "attr"] def __init__(self, src_vid, dst_vid, attr={}, _series=None): """__init__(self, vid, attr={}) @@ -147,12 +148,26 @@ def __init__(self, src_vid, dst_vid, attr={}, _series=None): self.attr = attr def __repr__(self): - return ("E(" + str(self.src_vid) + " -> " + str(self.dst_vid) + ", " + - str(self.attr) + ")") + return ( + "E(" + + str(self.src_vid) + + " -> " + + str(self.dst_vid) + + ", " + + str(self.attr) + + ")" + ) def __str__(self): - return ("E(" + str(self.src_vid) + " -> " + str(self.dst_vid) + ", " + - str(self.attr) + ")") + return ( + "E(" + + str(self.src_vid) + + " -> " + + str(self.dst_vid) + + ", " + + str(self.attr) + + ")" + ) class SGraph(object): @@ -214,10 +229,17 @@ class SGraph(object): >>> g = g.add_edges(Edge(1, 2)) """ - __slots__ = ['__proxy__', '_vertices', '_edges'] - - def __init__(self, vertices=None, edges=None, vid_field='__id', - src_field='__src_id', dst_field='__dst_id', _proxy=None): + __slots__ = ["__proxy__", "_vertices", "_edges"] + + def __init__( + self, + vertices=None, + edges=None, + vid_field="__id", + src_field="__src_id", + dst_field="__dst_id", + _proxy=None, + ): """ __init__(vertices=None, edges=None, vid_field='__id', src_field='__src_id', dst_field='__dst_id') @@ -243,7 +265,7 @@ def __init__(self, vertices=None, edges=None, vid_field='__id', dst_field : str, optional The name of target id column in the `edges` SFrame. """ - if (_proxy is None): + if _proxy is None: self.__proxy__ = UnityGraphProxy() if vertices is not None: self.__proxy__ = self.add_vertices(vertices, vid_field).__proxy__ @@ -260,8 +282,11 @@ def __str__(self): def __repr__(self): """Returns a readable string representation summarizing the graph.""" - return "SGraph(%s)\nVertex Fields:%s\nEdge Fields:%s" % \ - (str(self.summary()), str(self.get_vertex_fields()), str(self.get_edge_fields())) + return "SGraph(%s)\nVertex Fields:%s\nEdge Fields:%s" % ( + str(self.summary()), + str(self.get_vertex_fields()), + str(self.get_edge_fields()), + ) def __copy__(self): return SGraph(_proxy=self.__proxy__) @@ -385,7 +410,7 @@ def summary(self): ret = self.__proxy__.summary() return dict(ret.items()) - def get_vertices(self, ids=[], fields={}, format='sframe'): + def get_vertices(self, ids=[], fields={}, format="sframe"): """ get_vertices(self, ids=list(), fields={}, format='sframe') Return a collection of vertices and their attributes. @@ -459,26 +484,28 @@ def get_vertices(self, ids=[], fields={}, format='sframe'): ids = [ids] if type(ids) not in (list, SArray): - raise TypeError('ids must be list or SArray type') + raise TypeError("ids must be list or SArray type") with cython_context(): sf = SFrame(_proxy=self.__proxy__.get_vertices(ids, fields)) - if (format == 'sframe'): + if format == "sframe": return sf - elif (format == 'dataframe'): - assert HAS_PANDAS, 'Cannot use dataframe because Pandas is not available or version is too low.' + elif format == "dataframe": + assert ( + HAS_PANDAS + ), "Cannot use dataframe because Pandas is not available or version is too low." if sf.num_rows() == 0: return pd.DataFrame() else: df = sf.head(sf.num_rows()).to_dataframe() - return df.set_index('__id') - elif (format == 'list'): + return df.set_index("__id") + elif format == "list": return _dataframe_to_vertex_list(sf.to_dataframe()) else: raise ValueError("Invalid format specifier") - def get_edges(self, src_ids=[], dst_ids=[], fields={}, format='sframe'): + def get_edges(self, src_ids=[], dst_ids=[], fields={}, format="sframe"): """ get_edges(self, src_ids=list(), dst_ids=list(), fields={}, format='sframe') Return a collection of edges and their attributes. This function is used @@ -560,9 +587,9 @@ def get_edges(self, src_ids=[], dst_ids=[], fields={}, format='sframe'): dst_ids = [dst_ids] if type(src_ids) not in (list, SArray): - raise TypeError('src_ids must be list or SArray type') + raise TypeError("src_ids must be list or SArray type") if type(dst_ids) not in (list, SArray): - raise TypeError('dst_ids must be list or SArray type') + raise TypeError("dst_ids must be list or SArray type") # implicit Nones if len(src_ids) == 0 and len(dst_ids) > 0: @@ -574,15 +601,17 @@ def get_edges(self, src_ids=[], dst_ids=[], fields={}, format='sframe'): with cython_context(): sf = SFrame(_proxy=self.__proxy__.get_edges(src_ids, dst_ids, fields)) - if (format == 'sframe'): + if format == "sframe": return sf - if (format == 'dataframe'): - assert HAS_PANDAS, 'Cannot use dataframe because Pandas is not available or version is too low.' + if format == "dataframe": + assert ( + HAS_PANDAS + ), "Cannot use dataframe because Pandas is not available or version is too low." if sf.num_rows() == 0: return pd.DataFrame() else: return sf.head(sf.num_rows()).to_dataframe() - elif (format == 'list'): + elif format == "list": return _dataframe_to_edge_list(sf.to_dataframe()) else: raise ValueError("Invalid format specifier") @@ -721,7 +750,9 @@ def add_edges(self, edges, src_field=None, dst_field=None): sf = _edge_data_to_sframe(edges, src_field, dst_field) with cython_context(): - proxy = self.__proxy__.add_edges(sf.__proxy__, _SRC_VID_COLUMN, _DST_VID_COLUMN) + proxy = self.__proxy__.add_edges( + sf.__proxy__, _SRC_VID_COLUMN, _DST_VID_COLUMN + ) return SGraph(_proxy=proxy) def get_fields(self): @@ -840,10 +871,10 @@ def select_fields(self, fields): >>> g2 = g.select_fields(fields=['breed']) """ - if (type(fields) is str): + if type(fields) is str: fields = [fields] if not isinstance(fields, list) or not all(type(x) is str for x in fields): - raise TypeError('\"fields\" must be a str or list[str]') + raise TypeError('"fields" must be a str or list[str]') vfields = self.__proxy__.get_vertex_fields() efields = self.__proxy__.get_edge_fields() @@ -858,7 +889,7 @@ def select_fields(self, fields): selected_efields.append(f) found = True if not found: - raise ValueError('Field \'%s\' not in graph' % f) + raise ValueError("Field '%s' not in graph" % f) with cython_context(): proxy = self.__proxy__ @@ -867,7 +898,7 @@ def select_fields(self, fields): return SGraph(_proxy=proxy) def triple_apply(self, triple_apply_fn, mutated_fields, input_fields=None): - ''' + """ Apply a transform function to each edge and its associated source and target vertices in parallel. Each edge is visited once and in parallel. Modification to vertex data is protected by lock. The effect on the @@ -967,25 +998,29 @@ def triple_apply(self, triple_apply_fn, mutated_fields, input_fields=None): dtype: int Rows: 5 [4, 1, 1, 1, 4] - ''' + """ assert inspect.isfunction(triple_apply_fn), "Input must be a function" if not (type(mutated_fields) is list or type(mutated_fields) is str): - raise TypeError('mutated_fields must be str or list of str') - if not (input_fields is None or type(input_fields) is list or type(input_fields) is str): - raise TypeError('input_fields must be str or list of str') + raise TypeError("mutated_fields must be str or list of str") + if not ( + input_fields is None + or type(input_fields) is list + or type(input_fields) is str + ): + raise TypeError("input_fields must be str or list of str") if type(mutated_fields) == str: mutated_fields = [mutated_fields] if len(mutated_fields) == 0: - raise ValueError('mutated_fields cannot be empty') - for f in ['__id', '__src_id', '__dst_id']: + raise ValueError("mutated_fields cannot be empty") + for f in ["__id", "__src_id", "__dst_id"]: if f in mutated_fields: - raise ValueError('mutated_fields cannot contain %s' % f) + raise ValueError("mutated_fields cannot contain %s" % f) all_fields = self.get_fields() if not set(mutated_fields).issubset(set(all_fields)): extra_fields = list(set(mutated_fields).difference(set(all_fields))) - raise ValueError('graph does not contain fields: %s' % str(extra_fields)) + raise ValueError("graph does not contain fields: %s" % str(extra_fields)) # select input fields if input_fields is None: @@ -1001,18 +1036,27 @@ def triple_apply(self, triple_apply_fn, mutated_fields, input_fields=None): nativefn = None try: from .. import extensions + nativefn = extensions._build_native_function_call(triple_apply_fn) except: # failure are fine. we just fall out into the next few phases pass if nativefn is not None: with cython_context(): - return SGraph(_proxy=g.__proxy__.lambda_triple_apply_native(nativefn, mutated_fields)) + return SGraph( + _proxy=g.__proxy__.lambda_triple_apply_native( + nativefn, mutated_fields + ) + ) else: with cython_context(): - return SGraph(_proxy=g.__proxy__.lambda_triple_apply(triple_apply_fn, mutated_fields)) + return SGraph( + _proxy=g.__proxy__.lambda_triple_apply( + triple_apply_fn, mutated_fields + ) + ) - def save(self, filename, format='auto'): + def save(self, filename, format="auto"): """ Save the SGraph to disk. If the graph is saved in binary format, the graph can be re-loaded using the :py:func:`load_sgraph` method. @@ -1049,15 +1093,17 @@ def save(self, filename, format='auto'): >>> g.save('mygraph.json', format='json') """ - if format == 'auto': - if filename.endswith(('.json', '.json.gz')): - format = 'json' + if format == "auto": + if filename.endswith((".json", ".json.gz")): + format = "json" else: - format = 'binary' + format = "binary" - if format not in ['binary', 'json', 'csv']: - raise ValueError('Invalid format: %s. Supported formats are: %s' - % (format, ['binary', 'json', 'csv'])) + if format not in ["binary", "json", "csv"]: + raise ValueError( + "Invalid format: %s. Supported formats are: %s" + % (format, ["binary", "json", "csv"]) + ) with cython_context(): self.__proxy__.save_graph(_make_internal_url(filename), format) @@ -1111,7 +1157,6 @@ def get_neighborhood(self, ids, radius=1, full_subgraph=True): full_subgraph=True) """ - verts = ids ## find the vertices within radius (and the path edges) @@ -1119,13 +1164,17 @@ def get_neighborhood(self, ids, radius=1, full_subgraph=True): edges_out = self.get_edges(src_ids=verts) edges_in = self.get_edges(dst_ids=verts) - verts = list(edges_in['__src_id']) + list(edges_in['__dst_id']) + \ - list(edges_out['__src_id']) + list(edges_out['__dst_id']) + verts = ( + list(edges_in["__src_id"]) + + list(edges_in["__dst_id"]) + + list(edges_out["__src_id"]) + + list(edges_out["__dst_id"]) + ) verts = list(set(verts)) ## make a new graph to return and add the vertices g = SGraph() - g = g.add_vertices(self.get_vertices(verts), vid_field='__id') + g = g.add_vertices(self.get_vertices(verts), vid_field="__id") ## add the requested edge set if full_subgraph is True: @@ -1142,16 +1191,16 @@ def get_neighborhood(self, ids, radius=1, full_subgraph=True): path_edges = edges_out.append(edges_in) edges = path_edges.groupby(path_edges.column_names(), {}) - g = g.add_edges(edges, src_field='__src_id', dst_field='__dst_id') + g = g.add_edges(edges, src_field="__src_id", dst_field="__dst_id") return g -#/**************************************************************************/ -#/* */ -#/* Module Function */ -#/* */ -#/**************************************************************************/ -def load_sgraph(filename, format='binary', delimiter='auto'): +# /**************************************************************************/ +# /* */ +# /* Module Function */ +# /* */ +# /**************************************************************************/ +def load_sgraph(filename, format="binary", delimiter="auto"): """ Load SGraph from text file or previously saved SGraph binary. @@ -1193,51 +1242,58 @@ def load_sgraph(filename, format='binary', delimiter='auto'): >>> g2 = turicreate.load_sgraph('mygraph') """ - - if not format in ['binary', 'snap', 'csv', 'tsv']: - raise ValueError('Invalid format: %s' % format) + if not format in ["binary", "snap", "csv", "tsv"]: + raise ValueError("Invalid format: %s" % format) with cython_context(): g = None - if format == 'binary': + if format == "binary": proxy = glconnect.get_unity().load_graph(_make_internal_url(filename)) g = SGraph(_proxy=proxy) - elif format == 'snap': - if delimiter == 'auto': - delimiter = '\t' - sf = SFrame.read_csv(filename, comment_char='#', delimiter=delimiter, - header=False, column_type_hints=int) - g = SGraph().add_edges(sf, 'X1', 'X2') - elif format == 'csv': - if delimiter == 'auto': - delimiter = ',' + elif format == "snap": + if delimiter == "auto": + delimiter = "\t" + sf = SFrame.read_csv( + filename, + comment_char="#", + delimiter=delimiter, + header=False, + column_type_hints=int, + ) + g = SGraph().add_edges(sf, "X1", "X2") + elif format == "csv": + if delimiter == "auto": + delimiter = "," sf = SFrame.read_csv(filename, header=False, delimiter=delimiter) - g = SGraph().add_edges(sf, 'X1', 'X2') - elif format == 'tsv': - if delimiter == 'auto': - delimiter = '\t' + g = SGraph().add_edges(sf, "X1", "X2") + elif format == "tsv": + if delimiter == "auto": + delimiter = "\t" sf = SFrame.read_csv(filename, header=False, delimiter=delimiter) - g = SGraph().add_edges(sf, 'X1', 'X2') + g = SGraph().add_edges(sf, "X1", "X2") g.summary() # materialize return g -#/**************************************************************************/ -#/* */ -#/* Helper Function */ -#/* */ -#/**************************************************************************/ +# /**************************************************************************/ +# /* */ +# /* Helper Function */ +# /* */ +# /**************************************************************************/ def _vertex_list_to_dataframe(ls, id_column_name): """ Convert a list of vertices into dataframe. """ - assert HAS_PANDAS, 'Cannot use dataframe because Pandas is not available or version is too low.' + assert ( + HAS_PANDAS + ), "Cannot use dataframe because Pandas is not available or version is too low." cols = reduce(set.union, (set(v.attr.keys()) for v in ls)) df = pd.DataFrame({id_column_name: [v.vid for v in ls]}) for c in cols: df[c] = [v.attr.get(c) for v in ls] return df + def _vertex_list_to_sframe(ls, id_column_name): """ Convert a list of vertices into an SFrame. @@ -1256,23 +1312,30 @@ def _vertex_list_to_sframe(ls, id_column_name): sf[col] = [val] else: - raise TypeError('Vertices type {} is Not supported.'.format(type(ls))) + raise TypeError("Vertices type {} is Not supported.".format(type(ls))) return sf + def _edge_list_to_dataframe(ls, src_column_name, dst_column_name): """ Convert a list of edges into dataframe. """ - assert HAS_PANDAS, 'Cannot use dataframe because Pandas is not available or version is too low.' + assert ( + HAS_PANDAS + ), "Cannot use dataframe because Pandas is not available or version is too low." cols = reduce(set.union, (set(e.attr.keys()) for e in ls)) - df = pd.DataFrame({ - src_column_name: [e.src_vid for e in ls], - dst_column_name: [e.dst_vid for e in ls]}) + df = pd.DataFrame( + { + src_column_name: [e.src_vid for e in ls], + dst_column_name: [e.dst_vid for e in ls], + } + ) for c in cols: df[c] = [e.attr.get(c) for e in ls] return df + def _edge_list_to_sframe(ls, src_column_name, dst_column_name): """ Convert a list of edges into an SFrame. @@ -1291,17 +1354,20 @@ def _edge_list_to_sframe(ls, src_column_name, dst_column_name): sf[dst_column_name] = [ls.dst_vid] else: - raise TypeError('Edges type {} is Not supported.'.format(type(ls))) + raise TypeError("Edges type {} is Not supported.".format(type(ls))) return sf + def _dataframe_to_vertex_list(df): """ Convert dataframe into list of vertices, assuming that vertex ids are stored in _VID_COLUMN. """ cols = df.columns if len(cols): - assert _VID_COLUMN in cols, "Vertex DataFrame must contain column %s" % _VID_COLUMN + assert _VID_COLUMN in cols, ( + "Vertex DataFrame must contain column %s" % _VID_COLUMN + ) df = df[cols].T ret = [Vertex(None, _series=df[col]) for col in df] return ret @@ -1315,8 +1381,12 @@ def _dataframe_to_edge_list(df): """ cols = df.columns if len(cols): - assert _SRC_VID_COLUMN in cols, "Vertex DataFrame must contain column %s" % _SRC_VID_COLUMN - assert _DST_VID_COLUMN in cols, "Vertex DataFrame must contain column %s" % _DST_VID_COLUMN + assert _SRC_VID_COLUMN in cols, ( + "Vertex DataFrame must contain column %s" % _SRC_VID_COLUMN + ) + assert _DST_VID_COLUMN in cols, ( + "Vertex DataFrame must contain column %s" % _DST_VID_COLUMN + ) df = df[cols].T ret = [Edge(None, None, _series=df[col]) for col in df] return ret @@ -1340,7 +1410,7 @@ def _vertex_data_to_sframe(data, vid_field): return data_copy if type(data) == Vertex or type(data) == list: - return _vertex_list_to_sframe(data, '__id') + return _vertex_list_to_sframe(data, "__id") elif HAS_PANDAS and type(data) == pd.DataFrame: if vid_field is None: @@ -1349,24 +1419,29 @@ def _vertex_data_to_sframe(data, vid_field): if not ("index" in data.columns): # pandas reset_index() will insert a new column of name "index". sf = SFrame(data.reset_index()) # "index" - sf.rename({'index': _VID_COLUMN}, inplace=True) + sf.rename({"index": _VID_COLUMN}, inplace=True) return sf else: # pandas reset_index() will insert a new column of name "level_0" if there exists a column named "index". sf = SFrame(data.reset_index()) # "level_0" - sf.rename({'level_0': _VID_COLUMN}, inplace=True) + sf.rename({"level_0": _VID_COLUMN}, inplace=True) return sf else: - raise ValueError("Index of the vertices dataframe is not unique, \ - try specifying vid_field name to use a column for vertex ids.") + raise ValueError( + "Index of the vertices dataframe is not unique, \ + try specifying vid_field name to use a column for vertex ids." + ) else: sf = SFrame(data) if _VID_COLUMN in sf.column_names(): - raise ValueError('%s reserved vid column name already exists in the SFrame' % _VID_COLUMN) + raise ValueError( + "%s reserved vid column name already exists in the SFrame" + % _VID_COLUMN + ) sf.rename({vid_field: _VID_COLUMN}, inplace=True) return sf else: - raise TypeError('Vertices type %s is Not supported.' % str(type(data))) + raise TypeError("Vertices type %s is Not supported." % str(type(data))) def _edge_data_to_sframe(data, src_field, dst_field): @@ -1378,9 +1453,12 @@ def _edge_data_to_sframe(data, src_field, dst_field): if isinstance(data, SFrame): # '__src_vid' and '__dst_vid' already in the sframe, and # it is ok to not specify src_field and dst_field - if src_field is None and dst_field is None and \ - _SRC_VID_COLUMN in data.column_names() and \ - _DST_VID_COLUMN in data.column_names(): + if ( + src_field is None + and dst_field is None + and _SRC_VID_COLUMN in data.column_names() + and _DST_VID_COLUMN in data.column_names() + ): return data if src_field is None: raise ValueError("src_field must be specified for SFrame input") @@ -1395,7 +1473,9 @@ def _edge_data_to_sframe(data, src_field, dst_field): data_copy.rename({_SRC_VID_COLUMN: _DST_VID_COLUMN}, inplace=True) data_copy[_SRC_VID_COLUMN] = dst_id_column else: - data_copy.rename({src_field: _SRC_VID_COLUMN, dst_field: _DST_VID_COLUMN}, inplace=True) + data_copy.rename( + {src_field: _SRC_VID_COLUMN, dst_field: _DST_VID_COLUMN}, inplace=True + ) return data_copy elif HAS_PANDAS and type(data) == pd.DataFrame: @@ -1412,7 +1492,9 @@ def _edge_data_to_sframe(data, src_field, dst_field): sf.rename({_SRC_VID_COLUMN: _DST_VID_COLUMN}, inplace=True) sf[_SRC_VID_COLUMN] = dst_id_column else: - sf.rename({src_field: _SRC_VID_COLUMN, dst_field: _DST_VID_COLUMN}, inplace=True) + sf.rename( + {src_field: _SRC_VID_COLUMN, dst_field: _DST_VID_COLUMN}, inplace=True + ) return sf elif type(data) == Edge: @@ -1422,7 +1504,8 @@ def _edge_data_to_sframe(data, src_field, dst_field): return _edge_list_to_sframe(data, _SRC_VID_COLUMN, _DST_VID_COLUMN) else: - raise TypeError('Edges type %s is Not supported.' % str(type(data))) + raise TypeError("Edges type %s is Not supported." % str(type(data))) + ## Hack: overriding GFrame class name to make it appears as SFrame## GFrame.__name__ = SFrame.__name__ diff --git a/src/python/turicreate/data_structures/sketch.py b/src/python/turicreate/data_structures/sketch.py index f71a37e5c9..89bffea5ef 100644 --- a/src/python/turicreate/data_structures/sketch.py +++ b/src/python/turicreate/data_structures/sketch.py @@ -17,7 +17,7 @@ import operator from math import sqrt -__all__ = ['Sketch'] +__all__ = ["Sketch"] class Sketch(object): @@ -160,95 +160,101 @@ def __init__(self, array=None, background=False, sub_sketch_keys=[], _proxy=None key needs to be a string, for SArray of vector(array) type, the key needs to be positive integer """ - if (_proxy): + if _proxy: self.__proxy__ = _proxy else: self.__proxy__ = UnitySketchProxy() if not isinstance(array, SArray): raise TypeError("Sketch object can only be constructed from SArrays") - self.__proxy__.construct_from_sarray(array.__proxy__, background, sub_sketch_keys) + self.__proxy__.construct_from_sarray( + array.__proxy__, background, sub_sketch_keys + ) def __repr__(self): - """ + """ Emits a brief summary of all the statistics as a string. """ - fields = [ - ['size', 'Length' , 'Yes'], - ['min', 'Min' , 'Yes'], - ['max', 'Max' , 'Yes'], - ['mean', 'Mean' , 'Yes'], - ['sum', 'Sum' , 'Yes'], - ['var', 'Variance' , 'Yes'], - ['std', 'Standard Deviation' , 'Yes'], - ['num_missing', '# Missing Values' , 'Yes',], - ['num_unique', '# unique values', 'No' ] - ] - - s = '\n' - result = [] - for field in fields: + fields = [ + ["size", "Length", "Yes"], + ["min", "Min", "Yes"], + ["max", "Max", "Yes"], + ["mean", "Mean", "Yes"], + ["sum", "Sum", "Yes"], + ["var", "Variance", "Yes"], + ["std", "Standard Deviation", "Yes"], + ["num_missing", "# Missing Values", "Yes",], + ["num_unique", "# unique values", "No"], + ] + + s = "\n" + result = [] + for field in fields: + try: + method_to_call = getattr(self, field[0]) + result.append([field[1], str(method_to_call()), field[2]]) + except: + pass + sf = SArray(result).unpack(column_name_prefix="") + sf.rename({"0": "item", "1": "value", "2": "is exact"}, inplace=True) + s += sf.__str__(footer=False) + s += "\n" + + s += "\nMost frequent items:\n" + frequent = self.frequent_items() + # convert to string key + frequent_strkeys = {} + for key in frequent: + strkey = str(key) + if strkey in frequent_strkeys: + frequent_strkeys[strkey] += frequent[key] + else: + frequent_strkeys[strkey] = frequent[key] + + sorted_freq = sorted( + frequent_strkeys.items(), key=operator.itemgetter(1), reverse=True + ) + if len(sorted_freq) == 0: + s += " -- All elements appear with less than 0.01% frequency -- \n" + else: + sorted_freq = sorted_freq[:10] + sf = SFrame() + sf["value"] = [elem[0] for elem in sorted_freq] + sf["count"] = [elem[1] for elem in sorted_freq] + s += sf.__str__(footer=False) + "\n" + s += "\n" + + try: + # print quantiles + self.quantile(0) # XXX: is this necessary? + s += "Quantiles: \n" + sf = SFrame() + for q in [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.00]: + sf.add_column( + SArray([self.quantile(q)]), str(int(q * 100)) + "%", inplace=True + ) + s += sf.__str__(footer=False) + "\n" + except: + pass + + try: + t_k = self.dict_key_summary() + t_v = self.dict_value_summary() + s += "\n******** Dictionary Element Key Summary ********\n" + s += t_k.__repr__() + s += "\n******** Dictionary Element Value Summary ********\n" + s += t_v.__repr__() + "\n" + except: + pass + try: - method_to_call = getattr(self, field[0]) - result.append([field[1], str(method_to_call()), field[2]]) + t_k = self.element_summary() + s += "\n******** Element Summary ********\n" + s += t_k.__repr__() + "\n" except: - pass - sf = SArray(result).unpack(column_name_prefix = "") - sf.rename({'0': 'item', '1':'value', '2': 'is exact'}, inplace=True) - s += sf.__str__(footer=False) - s += "\n" - - s += "\nMost frequent items:\n" - frequent = self.frequent_items() - # convert to string key - frequent_strkeys = {} - for key in frequent: - strkey = str(key) - if strkey in frequent_strkeys: - frequent_strkeys[strkey] += frequent[key] - else: - frequent_strkeys[strkey] = frequent[key] - - sorted_freq = sorted(frequent_strkeys.items(), key=operator.itemgetter(1), reverse=True) - if len(sorted_freq) == 0: - s += " -- All elements appear with less than 0.01% frequency -- \n" - else: - sorted_freq = sorted_freq[:10] - sf = SFrame() - sf['value'] = [elem[0] for elem in sorted_freq] - sf['count'] = [elem[1] for elem in sorted_freq] - s += sf.__str__(footer=False) + "\n" - s += "\n" - - try: - # print quantiles - self.quantile(0) # XXX: is this necessary? - s += "Quantiles: \n" - sf = SFrame() - for q in [0.0,0.01,0.05,0.25,0.5,0.75,0.95,0.99,1.00]: - sf.add_column(SArray([self.quantile(q)]), str(int(q * 100)) + '%', inplace=True) - s += sf.__str__(footer=False) + "\n" - except: - pass - - try: - t_k = self.dict_key_summary() - t_v = self.dict_value_summary() - s += "\n******** Dictionary Element Key Summary ********\n" - s += t_k.__repr__() - s += "\n******** Dictionary Element Value Summary ********\n" - s += t_v.__repr__() + '\n' - except: - pass - - try: - t_k = self.element_summary() - s += "\n******** Element Summary ********\n" - s += t_k.__repr__() + '\n' - except: - pass - - return s.expandtabs(8) + pass + + return s.expandtabs(8) def __str__(self): """ @@ -546,7 +552,7 @@ def element_length_summary(self): """ with cython_context(): - return Sketch(_proxy = self.__proxy__.element_length_summary()) + return Sketch(_proxy=self.__proxy__.element_length_summary()) def dict_key_summary(self): """ @@ -574,7 +580,7 @@ def dict_key_summary(self): """ with cython_context(): - return Sketch(_proxy = self.__proxy__.dict_key_summary()) + return Sketch(_proxy=self.__proxy__.dict_key_summary()) def dict_value_summary(self): """ @@ -616,7 +622,7 @@ def dict_value_summary(self): """ with cython_context(): - return Sketch(_proxy = self.__proxy__.dict_value_summary()) + return Sketch(_proxy=self.__proxy__.dict_value_summary()) def element_summary(self): """ @@ -657,9 +663,9 @@ def element_summary(self): +-----+-----+-----+-----+-----+-----+-----+-----+------+ """ with cython_context(): - return Sketch(_proxy = self.__proxy__.element_summary()) + return Sketch(_proxy=self.__proxy__.element_summary()) - def element_sub_sketch(self, keys = None): + def element_sub_sketch(self, keys=None): """ Returns the sketch summary for the given set of keys. This is only applicable for sketch summary created from SArray of sarray or dict type. @@ -720,7 +726,7 @@ def element_sub_sketch(self, keys = None): single_val = True keys = [keys] value_types = set([type(i) for i in keys]) - if (len(value_types) > 1): + if len(value_types) > 1: raise ValueError("All keys should have the same type.") with cython_context(): @@ -729,10 +735,14 @@ def element_sub_sketch(self, keys = None): # check return key matches input key for key in keys: - if key not in ret_sketches: - raise KeyError("Cannot retrieve element sub sketch for key '" + str(key) + "'. Element sub sketch can only be retrieved when the summary object was created using the 'sub_sketch_keys' option.") + if key not in ret_sketches: + raise KeyError( + "Cannot retrieve element sub sketch for key '" + + str(key) + + "'. Element sub sketch can only be retrieved when the summary object was created using the 'sub_sketch_keys' option." + ) for key in ret_sketches: - ret[key] = Sketch(_proxy = ret_sketches[key]) + ret[key] = Sketch(_proxy=ret_sketches[key]) if single_val: return ret[keys[0]] @@ -740,7 +750,7 @@ def element_sub_sketch(self, keys = None): return ret def cancel(self): - """ + """ Cancels a background sketch computation immediately if one is ongoing. Does nothing otherwise. @@ -749,5 +759,5 @@ def cancel(self): >>> s = sa.summary(array, background=True) >>> s.cancel() """ - with cython_context(): - self.__proxy__.cancel() + with cython_context(): + self.__proxy__.cancel() diff --git a/src/python/turicreate/extensions.py b/src/python/turicreate/extensions.py index 18a2e3c65d..3c538b427e 100644 --- a/src/python/turicreate/extensions.py +++ b/src/python/turicreate/extensions.py @@ -33,8 +33,10 @@ from ._cython.context import debug_trace as cython_context from sys import version_info as _version_info import types as _types + if _sys.version_info.major == 2: from types import ClassType as _ClassType + _class_type = _ClassType else: _class_type = type @@ -79,6 +81,7 @@ _thismodule = _sys.modules[__name__] class_uid_to_class = {} + def _wrap_function_return(val): """ Recursively walks each thing in val, opening lists and dictionaries, @@ -87,11 +90,11 @@ def _wrap_function_return(val): """ if type(val) is _UnityGraphProxy: - return _SGraph(_proxy = val) + return _SGraph(_proxy=val) elif type(val) is _UnitySFrameProxy: - return _SFrame(_proxy = val) + return _SFrame(_proxy=val) elif type(val) is _UnitySArrayProxy: - return _SArray(_proxy = val) + return _SArray(_proxy=val) elif type(val) is _UnityModel: # we need to cast it up to the appropriate type uid = val.get_uid() @@ -102,10 +105,11 @@ def _wrap_function_return(val): elif type(val) is list: return [_wrap_function_return(i) for i in val] elif type(val) is dict: - return dict( (i, _wrap_function_return(val[i])) for i in val) + return dict((i, _wrap_function_return(val[i])) for i in val) else: return val + def _setattr_wrapper(mod, key, value): """ A setattr wrapper call used only by _publish(). This ensures that anything @@ -115,6 +119,7 @@ def _setattr_wrapper(mod, key, value): if mod == _thismodule: setattr(_sys.modules[__name__], key, value) + def _toolkit_function_pack_args(arguments, args, kwargs): """ Packs the arguments into the proper form. @@ -124,7 +129,12 @@ def _toolkit_function_pack_args(arguments, args, kwargs): num_args_got = len(args) + len(kwargs) num_args_required = len(arguments) if num_args_got != num_args_required: - raise TypeError("Expecting " + str(num_args_required) + " arguments, got " + str(num_args_got)) + raise TypeError( + "Expecting " + + str(num_args_required) + + " arguments, got " + + str(num_args_got) + ) ## fill the dict first with the regular args argument_dict = {} @@ -139,6 +149,7 @@ def _toolkit_function_pack_args(arguments, args, kwargs): return argument_dict + def _toolkit_function_unpack_return(ret): # handle errors @@ -149,8 +160,8 @@ def _toolkit_function_unpack_return(ret): raise _ToolkitError("Toolkit failed with unknown error") ret = _wrap_function_return(ret[2]) - if type(ret) is dict and 'return_value' in ret: - return ret['return_value'] + if type(ret) is dict and "return_value" in ret: + return ret["return_value"] else: return ret @@ -181,14 +192,14 @@ def _run_toolkit_function(fnname, arguments, args, kwargs): return _toolkit_function_unpack_return(ret) + # Implementation for backgrounding function calls -class ToolkitFunctionFuture(object): +class ToolkitFunctionFuture(object): def __init__(self, proxy): self.__proxy__ = proxy - def result(self): """ Waits for the answer to finish processing in the background, returning @@ -232,8 +243,11 @@ def _run_toolkit_function_background(fnname, arguments, args, kwargs): def _make_injected_function(fn, arguments): return lambda *args, **kwargs: _run_toolkit_function(fn, arguments, args, kwargs) + def _make_injected_function_background(fn, arguments): - return lambda *args, **kwargs: _run_toolkit_function_background(fn, arguments, args, kwargs) + return lambda *args, **kwargs: _run_toolkit_function_background( + fn, arguments, args, kwargs + ) def _class_instance_from_name(class_name, *arg, **kwarg): @@ -249,20 +263,23 @@ def _class_instance_from_name(class_name, *arg, **kwarg): """ # we first look in tc.extensions for the class name - module_path = class_name.split('.') + module_path = class_name.split(".") import_path = module_path[0:-1] - module = __import__('.'.join(import_path), fromlist=[module_path[-1]]) + module = __import__(".".join(import_path), fromlist=[module_path[-1]]) class_ = getattr(module, module_path[-1]) instance = class_(*arg, **kwarg) return instance + def _create_class_instance(class_name, _proxy): """ Look for the class in .extensions in case it has already been imported (perhaps as a builtin extensions hard compiled into unity_server). """ try: - return _class_instance_from_name('turicreate.extensions.' + class_name, _proxy=_proxy) + return _class_instance_from_name( + "turicreate.extensions." + class_name, _proxy=_proxy + ) except: pass return _class_instance_from_name(class_name, _proxy=_proxy) @@ -277,48 +294,52 @@ class _ToolkitClass: inject functions, and attributes into their appropriate places. """ - _functions = {} # The functions in the class - _get_properties = [] # The getable properties in the class - _set_properties = [] # The setable properties in the class + _functions = {} # The functions in the class + _get_properties = [] # The getable properties in the class + _set_properties = [] # The setable properties in the class _tkclass = None - def __init__(self, *args, **kwargs): tkclass_name = getattr(self.__init__, "tkclass_name") _proxy = None if "_proxy" in kwargs: - _proxy = kwargs['_proxy'] - del kwargs['_proxy'] + _proxy = kwargs["_proxy"] + del kwargs["_proxy"] if _proxy: - self.__dict__['_tkclass'] = _proxy + self.__dict__["_tkclass"] = _proxy elif tkclass_name: - self.__dict__['_tkclass'] = _get_unity().create_toolkit_class(tkclass_name) + self.__dict__["_tkclass"] = _get_unity().create_toolkit_class(tkclass_name) try: # fill the functions and properties - self.__dict__['_functions'] = self._tkclass.list_functions() - self.__dict__['_get_properties'] = self._tkclass.list_get_properties() - self.__dict__['_set_properties'] = self._tkclass.list_set_properties() + self.__dict__["_functions"] = self._tkclass.list_functions() + self.__dict__["_get_properties"] = self._tkclass.list_get_properties() + self.__dict__["_set_properties"] = self._tkclass.list_set_properties() # rewrite the doc string for this class try: - self.__dict__['__doc__'] = self._tkclass.get('get_docstring', {'__symbol__':'__doc__'}) - self.__class__.__dict__['__doc__'] = self.__dict__['__doc__'] + self.__dict__["__doc__"] = self._tkclass.get( + "get_docstring", {"__symbol__": "__doc__"} + ) + self.__class__.__dict__["__doc__"] = self.__dict__["__doc__"] except: pass except: - raise _ToolkitError("Cannot create Toolkit Class for this class. " - "This class was not created with the new toolkit class system.") + raise _ToolkitError( + "Cannot create Toolkit Class for this class. " + "This class was not created with the new toolkit class system." + ) # for compatibility with older classes / models - self.__dict__['__proxy__'] = self.__dict__['_tkclass'] + self.__dict__["__proxy__"] = self.__dict__["_tkclass"] - if '__init__' in self.__dict__['_functions']: + if "__init__" in self.__dict__["_functions"]: self.__run_class_function("__init__", args, kwargs) elif len(args) != 0 or len(kwargs) != 0: raise TypeError("This constructor takes no arguments") def __dir__(self): - return list(self._functions.keys()) + self._get_properties + self._set_properties - + return ( + list(self._functions.keys()) + self._get_properties + self._set_properties + ) def __run_class_function(self, fnname, args, kwargs): arguments = self._functions[fnname] @@ -342,37 +363,38 @@ def __run_class_function(self, fnname, args, kwargs): ret = _wrap_function_return(ret) return ret - def __getattr__(self, name): - if name == '__proxy__': - return self.__dict__['__proxy__'] + if name == "__proxy__": + return self.__dict__["__proxy__"] elif name in self._get_properties: # is it an attribute? return _wrap_function_return(self._tkclass.get_property(name)) elif name in self._functions: # is it a function? ret = lambda *args, **kwargs: self.__run_class_function(name, args, kwargs) - ret.__doc__ = "Name: " + name + "\nParameters: " + str(self._functions[name]) + "\n" + ret.__doc__ = ( + "Name: " + name + "\nParameters: " + str(self._functions[name]) + "\n" + ) try: ret.__doc__ += self._tkclass.get_docstring(name) - ret.__doc__ += '\n' + ret.__doc__ += "\n" except: pass return ret else: raise AttributeError("no attribute " + name) - def __setattr__(self, name, value): - if name == '__proxy__': - self.__dict__['__proxy__'] = value + if name == "__proxy__": + self.__dict__["__proxy__"] = value elif name in self._set_properties: # is it a setable property? - arguments = {'value':value} + arguments = {"value": value} return _wrap_function_return(self._tkclass.set_property(name, arguments)) else: raise AttributeError("no attribute " + name) + def _list_functions(): """ Lists all the functions registered in unity_server. @@ -380,9 +402,11 @@ def _list_functions(): unity = _get_unity() return unity.list_toolkit_functions() + def _publish(): import copy + """ Publishes all functions and classes registered in unity_server. The functions and classes will appear in the module turicreate.extensions @@ -397,21 +421,21 @@ def _publish(): for fn in fnlist: props = unity.describe_toolkit_function(fn) # quit if there is nothing we can process - if 'arguments' not in props: + if "arguments" not in props: continue - arguments = props['arguments'] + arguments = props["arguments"] newfunc = _make_injected_function(fn, arguments) newfunc.run_background = _make_injected_function_background(fn, arguments) newfunc.__doc__ = "Name: " + fn + "\nParameters: " + str(arguments) + "\n" - if 'documentation' in props: - newfunc.__doc__ += props['documentation'] + "\n" + if "documentation" in props: + newfunc.__doc__ += props["documentation"] + "\n" - newfunc.__dict__['__glmeta__'] = {'extension_name':fn, 'arguments': arguments} + newfunc.__dict__["__glmeta__"] = {"extension_name": fn, "arguments": arguments} - modpath = fn.split('.') + modpath = fn.split(".") # walk the module tree mod = _thismodule for path in modpath[:-1]: @@ -427,31 +451,38 @@ def _publish(): for tkclass in tkclasslist: m = unity.describe_toolkit_class(tkclass) # of v2 type - if not ('functions' in m and 'get_properties' in m and 'set_properties' in m and 'uid' in m): + if not ( + "functions" in m + and "get_properties" in m + and "set_properties" in m + and "uid" in m + ): continue # create a new class if _version_info.major == 3: new_class = _ToolkitClass.__dict__.copy() - del new_class['__dict__'] - del new_class['__weakref__'] + del new_class["__dict__"] + del new_class["__weakref__"] else: new_class = copy.deepcopy(_ToolkitClass.__dict__) - new_class['__init__'] = _types.FunctionType(new_class['__init__'].__code__, - new_class['__init__'].__globals__, - name='__init__', - argdefs=(), - closure=()) + new_class["__init__"] = _types.FunctionType( + new_class["__init__"].__code__, + new_class["__init__"].__globals__, + name="__init__", + argdefs=(), + closure=(), + ) # rewrite the init method to add the toolkit class name so it will # default construct correctly - new_class['__init__'].tkclass_name = tkclass + new_class["__init__"].tkclass_name = tkclass newclass = _class_type(tkclass, (), new_class) - setattr(newclass, '__glmeta__', {'extension_name':tkclass}) - class_uid_to_class[m['uid']] = newclass - modpath = tkclass.split('.') + setattr(newclass, "__glmeta__", {"extension_name": tkclass}) + class_uid_to_class[m["uid"]] = newclass + modpath = tkclass.split(".") # walk the module tree mod = _thismodule for path in modpath[:-1]: @@ -463,7 +494,6 @@ def _publish(): _setattr_wrapper(mod, modpath[-1], newclass) - class _ExtMetaPath(object): """ This is a magic metapath searcher. To understand how this works, @@ -473,6 +503,7 @@ class _ExtMetaPath(object): particular module import was requested, allowing this to essentially 'override' the default import behaviors. """ + def find_module(self, fullname, submodule_path=None): """ We have to see if fullname refers to a module we can import. @@ -491,24 +522,25 @@ def find_module(self, fullname, submodule_path=None): # locations import sys import os + # This drops the last "." So if I am importing aaa.bbb.xxx # module_subpath is aaa.bbb - module_subpath = ".".join(fullname.split('.')[:-1]) + module_subpath = ".".join(fullname.split(".")[:-1]) for path in sys.path: # joins the path to aaa/bbb/xxx - pathname = os.path.join(path, os.sep.join(fullname.split('.'))) + pathname = os.path.join(path, os.sep.join(fullname.split("."))) # try to laod the ".so" extension try: - if os.path.exists(pathname + '.so'): - ext_import(pathname + '.so', module_subpath) + if os.path.exists(pathname + ".so"): + ext_import(pathname + ".so", module_subpath) break except: pass # try to laod the ".dylib" extension try: - if os.path.exists(pathname + '.dylib'): - ext_import(pathname + '.dylib', module_subpath) + if os.path.exists(pathname + ".dylib"): + ext_import(pathname + ".dylib", module_subpath) break except: pass @@ -522,7 +554,7 @@ def try_find_module(self, fullname, submodule_path=None): # Essentially: if fullname == aaa.bbb.xxx # Then we try to see if we have loaded tc.extensions.aaa.bbb.xxx mod = _thismodule - modpath = fullname.split('.') + modpath = fullname.split(".") # walk the module tree mod = _thismodule for path in modpath: @@ -534,6 +566,7 @@ def try_find_module(self, fullname, submodule_path=None): def load_module(self, fullname): import sys + # we may have already been loaded if fullname in sys.modules: return sys.modules[fullname] @@ -541,7 +574,7 @@ def load_module(self, fullname): # Essentially: if fullname == aaa.bbb.xxx # Then we try to look for tc.extensions.aaa.bbb.xxx mod = _thismodule - modpath = fullname.split('.') + modpath = fullname.split(".") for path in modpath: mod = getattr(mod, path) @@ -552,13 +585,16 @@ def load_module(self, fullname): sys.modules[fullname] = mod return mod + _ext_meta_path_singleton = None + def _add_meta_path(): """ called on unity_server import to insert the meta path loader. """ import sys + global _ext_meta_path_singleton if _ext_meta_path_singleton is None: _ext_meta_path_singleton = _ExtMetaPath() @@ -641,10 +677,14 @@ def ext_import(soname, module_subpath=""): 3.0 """ import warnings - warnings.warn("turicreate.ext_import is deprecated. It will be removed in the next major release.") + + warnings.warn( + "turicreate.ext_import is deprecated. It will be removed in the next major release." + ) unity = _get_unity() import os + if os.path.exists(soname): soname = os.path.abspath(soname) else: @@ -654,8 +694,9 @@ def ext_import(soname, module_subpath=""): raise RuntimeError(ret) _publish() # push the functions into the corresponding module namespace - return unity.list_toolkit_functions_in_dynamic_module(soname) + unity.list_toolkit_classes_in_dynamic_module(soname) - + return unity.list_toolkit_functions_in_dynamic_module( + soname + ) + unity.list_toolkit_classes_in_dynamic_module(soname) def _get_toolkit_function_name_from_function(fn): @@ -665,22 +706,24 @@ def _get_toolkit_function_name_from_function(fn): Otherwise we return an empty string. """ try: - if '__glmeta__' in fn.__dict__: - return fn.__dict__['__glmeta__']['extension_name'] + if "__glmeta__" in fn.__dict__: + return fn.__dict__["__glmeta__"]["extension_name"] else: return "" except: return "" + def _get_argument_list_from_toolkit_function_name(fn): """ Given a toolkit function name, return the argument list """ unity = _get_unity() fnprops = unity.describe_toolkit_function(fn) - argnames = fnprops['arguments'] + argnames = fnprops["arguments"] return argnames + class _Closure: """ Defines a closure class describing a lambda closure. Contains 2 fields: @@ -704,6 +747,7 @@ class _Closure: [0, 0], --> is not captured value. is argument 0 of the lambda. [0, 1] --> is not captured value. is argument 1 of the lambda. """ + def __init__(self, native_fn_name, arguments): self.native_fn_name = native_fn_name self.arguments = arguments @@ -715,7 +759,7 @@ def _descend_namespace(caller_globals, name): walk the globals expanding caller_globals['a']['b']['c']['d'] returning the result. Raises an exception (IndexError) on failure. """ - names = name.split('.') + names = name.split(".") cur = caller_globals for i in names: if type(cur) is dict: @@ -724,6 +768,7 @@ def _descend_namespace(caller_globals, name): cur = getattr(cur, i) return cur + def _build_native_function_call(fn): """ If fn can be interpreted and handled as a native function: i.e. @@ -798,8 +843,9 @@ def _build_native_function_call(fn): # attempt to recursively break down any other functions import inspect + for i in range(len(arglist)): - if arglist[i][0] == 1 and inspect.isfunction(arglist[i][1]): + if arglist[i][0] == 1 and inspect.isfunction(arglist[i][1]): try: arglist[i][1] = _build_native_function_call(arglist[i][1]) except: diff --git a/src/python/turicreate/meta/__init__.py b/src/python/turicreate/meta/__init__.py index 44c3e35e93..9e78e7dcd8 100644 --- a/src/python/turicreate/meta/__init__.py +++ b/src/python/turicreate/meta/__init__.py @@ -10,24 +10,30 @@ from .decompiler.instructions import make_module from .asttools.visitors.pysourcegen import dump_python_source import sys -def decompile(code, mode='exec'): - ''' + + +def decompile(code, mode="exec"): + """ Decompile a code object into python ast. :param mode: must be 'exec' to compile a module or 'eval' to compile an expression. - ''' - if mode == 'exec': + """ + if mode == "exec": return make_module(code) else: raise Exception("can not handle mode %r yet" % mode) -def test(stream=sys.stdout, descriptions=True, verbosity=2, failfast=False, buffer=False): - ''' + +def test( + stream=sys.stdout, descriptions=True, verbosity=2, failfast=False, buffer=False +): + """ Load and run the meta test suite. - ''' + """ import unittest as _unit import os as _os + star_dir = _os.path.dirname(__file__) test_suite = _unit.defaultTestLoader.discover(star_dir) runner = _unit.TextTestRunner(stream, descriptions, verbosity, failfast, buffer) diff --git a/src/python/turicreate/meta/asttools/__init__.py b/src/python/turicreate/meta/asttools/__init__.py index d4c3fc3fd7..b9551b49b1 100644 --- a/src/python/turicreate/meta/asttools/__init__.py +++ b/src/python/turicreate/meta/asttools/__init__.py @@ -3,11 +3,11 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Module to augment and analyze python ast nodes. This module uses the python `ast` module exclusively not the deprecated `compiler.ast`. -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -18,12 +18,14 @@ from ..asttools.visitors import dont_visit, visit_children, Visitor -class Undedined: pass +class Undedined: + pass + def cmp_ast(node1, node2): - ''' + """ Compare if two nodes are equal. - ''' + """ if type(node1) != type(node2): return False @@ -49,11 +51,9 @@ def cmp_ast(node1, node2): return True - - -#=============================================================================== +# =============================================================================== # -#=============================================================================== +# =============================================================================== from ..asttools.visitors.print_visitor import print_ast, dump_ast as str_ast from ..asttools.visitors.pysourcegen import python_source, dump_python_source diff --git a/src/python/turicreate/meta/asttools/mutators/__init__.py b/src/python/turicreate/meta/asttools/mutators/__init__.py index da3827cdde..2157fc9c80 100644 --- a/src/python/turicreate/meta/asttools/mutators/__init__.py +++ b/src/python/turicreate/meta/asttools/mutators/__init__.py @@ -3,9 +3,9 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Tools to mutate ast nodes. -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ diff --git a/src/python/turicreate/meta/asttools/mutators/prune_mutator.py b/src/python/turicreate/meta/asttools/mutators/prune_mutator.py index 701e6dcb6b..0ad9c8955a 100644 --- a/src/python/turicreate/meta/asttools/mutators/prune_mutator.py +++ b/src/python/turicreate/meta/asttools/mutators/prune_mutator.py @@ -3,46 +3,50 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Created on Jul 18, 2011 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ import _ast from ...asttools import Visitor, visit_children + def removable(self, node): - ''' + """ node is removable only if all of its children are as well. - ''' + """ throw_away = [] for child in self.children(node): throw_away.append(self.visit(child)) - if self.mode == 'exclusive': + if self.mode == "exclusive": return all(throw_away) - elif self.mode == 'inclusive': + elif self.mode == "inclusive": return any(throw_away) else: raise TypeError("mode must be one of 'exclusive' or 'inclusive'") + # Helper function to create a pass node with a line number and col_offset Pass = lambda node: _ast.Pass(lineno=node.lineno, col_offset=node.col_offset) + class PruneVisitor(Visitor): - ''' + """ Visitor to remove ast nodes :param symbols: set of symbol that are removable. - ''' - def __init__(self, symbols, mode='exclusive'): + """ + + def __init__(self, symbols, mode="exclusive"): self.remove_symbols = symbols - if mode not in ['exclusive', 'inclusive']: + if mode not in ["exclusive", "inclusive"]: raise TypeError("mode must be one of 'exclusive' or 'inclusive'") self.mode = mode @@ -50,9 +54,9 @@ def __init__(self, symbols, mode='exclusive'): visitDefault = removable def reduce(self, body): - ''' + """ remove nodes from a list - ''' + """ i = 0 while i < len(body): stmnt = body[i] @@ -85,7 +89,6 @@ def visitFor(self, node): if len_body == 0: node.body.append(_ast.Pass(lineno=node.lineno, col_offset=node.col_offset)) - self.reduce(node.orelse) len_orelse = len(node.orelse) @@ -175,7 +178,15 @@ def visitTryExcept(self, node): hndlr.body.append(Pass(hndlr)) if len_body == 0: - node.handlers = [_ast.ExceptHandler(type=None, name=None, body=[Pass(node)], lineno=node.lineno, col_offset=node.col_offset)] + node.handlers = [ + _ast.ExceptHandler( + type=None, + name=None, + body=[Pass(node)], + lineno=node.lineno, + col_offset=node.col_offset, + ) + ] return len_body == 0 and len(node.orelse) == 0 diff --git a/src/python/turicreate/meta/asttools/mutators/remove_trivial.py b/src/python/turicreate/meta/asttools/mutators/remove_trivial.py index fbc20416d9..72d34d3898 100644 --- a/src/python/turicreate/meta/asttools/mutators/remove_trivial.py +++ b/src/python/turicreate/meta/asttools/mutators/remove_trivial.py @@ -3,11 +3,11 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Created on Aug 3, 2011 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -20,12 +20,13 @@ from ...asttools.visitors.symbol_visitor import get_symbols from ...asttools.visitors.cond_symbol_visitor import conditional_lhs -class Assignment(object): +class Assignment(object): def __init__(self, root, assignments): self.root = root self.assignments = assignments + def visit_conditional(self, node): conditional, stable = conditional_lhs(node) @@ -34,10 +35,12 @@ def visit_conditional(self, node): return bgather = GatherAssignments() - for stmnt in node.body: bgather.visit(stmnt) + for stmnt in node.body: + bgather.visit(stmnt) egather = GatherAssignments() - for stmnt in node.orelse: egather.visit(stmnt) + for stmnt in node.orelse: + egather.visit(stmnt) for symbol in stable: node_list = self.assign_id_map.setdefault(symbol, []) @@ -50,10 +53,11 @@ def visit_conditional(self, node): node_list.append(Assignment(root=node, assignments=assignments)) + class GatherAssignments(Visitor): - ''' + """ Collect ast nodes that assign to the same variable. - ''' + """ def __init__(self): self.assign_id_map = {} @@ -75,8 +79,9 @@ def visitAssign(self, node): node_list = self.assign_id_map.setdefault(id, []) node_list.append(Assignment(root=node, assignments=(node,))) + def remove_trivial(root): - ''' + """ Remove redundant statements. The statement `a = 1` will be removed:: @@ -91,7 +96,7 @@ def remove_trivial(root): a = 2 :param root: ast node - ''' + """ gen = GatherAssignments() gen.visit(root) @@ -106,7 +111,7 @@ def remove_trivial(root): i1 = root.body.index(assignments[j].root) i2 = root.body.index(assignments[j + 1].root) - body = root.body[i1 + 1:i2] + body = root.body[i1 + 1 : i2] grapher = GraphGen() for stmnt in body: grapher.visit(stmnt) @@ -119,8 +124,9 @@ def remove_trivial(root): for old in to_remove: replace_nodes(root, old, Pass(old)) + def remove_unused_assign(root, symbol): - ''' + """ Remove redundant statements. The statement `a = 1` will be removed:: @@ -135,7 +141,7 @@ def remove_unused_assign(root, symbol): a = 2 :param root: ast node - ''' + """ gen = GatherAssignments() gen.visit(root) @@ -145,7 +151,6 @@ def remove_unused_assign(root, symbol): if symbol not in gen.assign_id_map: return - assignments = gen.assign_id_map[symbol] if len(assignments) < 2: @@ -155,7 +160,7 @@ def remove_unused_assign(root, symbol): i1 = root.body.index(assignments[j].root) i2 = root.body.index(assignments[j + 1].root) - body = root.body[i1 + 1:i2] + body = root.body[i1 + 1 : i2] grapher = GraphGen() for stmnt in body: grapher.visit(stmnt) diff --git a/src/python/turicreate/meta/asttools/mutators/replace_mutator.py b/src/python/turicreate/meta/asttools/mutators/replace_mutator.py index 5584fe2010..8dd5bca428 100644 --- a/src/python/turicreate/meta/asttools/mutators/replace_mutator.py +++ b/src/python/turicreate/meta/asttools/mutators/replace_mutator.py @@ -3,11 +3,11 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Created on Aug 3, 2011 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -15,10 +15,11 @@ import _ast from ...asttools.visitors import Visitor + class Replacer(Visitor): - ''' + """ Visitor to replace nodes. - ''' + """ def __init__(self, old, new): self.old = old @@ -46,25 +47,28 @@ def visitDefault(self, node): return + def replace_nodes(root, old, new): - ''' + """ Replace the old node with the new one. Old must be an indirect child of root :param root: ast node that contains an indirect reference to old :param old: node to replace :param new: node to replace `old` with - ''' + """ rep = Replacer(old, new) rep.visit(root) return + class NodeRemover(Visitor): - ''' + """ Remove a node. - ''' + """ + def __init__(self, to_remove): self.to_remove diff --git a/src/python/turicreate/meta/asttools/tests/__init__.py b/src/python/turicreate/meta/asttools/tests/__init__.py index c144b1c405..747b360c0d 100644 --- a/src/python/turicreate/meta/asttools/tests/__init__.py +++ b/src/python/turicreate/meta/asttools/tests/__init__.py @@ -12,6 +12,7 @@ from ...asttools import Visitor, cmp_ast, str_ast from ...asttools.visitors.graph_visitor import GraphGen + class NodeRecorder(Visitor): def __init__(self): self.ast_nodenames = set() @@ -22,11 +23,13 @@ def visitDefault(self, node): for child in self.children(node): self.visit(child) + def ast_types(node): rec = NodeRecorder() rec.visit(node) return rec.ast_nodenames + class AllTypesTested(object): def __init__(self): @@ -45,17 +48,22 @@ def tested(self): return all_ast_nodes - self.nodenames + def assert_ast_eq(testcase, orig_ast, expected_ast): if not cmp_ast(orig_ast, expected_ast): - str1 = str_ast(orig_ast, indent=' ', newline='\n') - str2 = str_ast(expected_ast, indent=' ', newline='\n') - msg = 'AST Trees are not equal\n## left ########### \n%s\n## right ########### \n%s' % (str1, str2) + str1 = str_ast(orig_ast, indent=" ", newline="\n") + str2 = str_ast(expected_ast, indent=" ", newline="\n") + msg = ( + "AST Trees are not equal\n## left ########### \n%s\n## right ########### \n%s" + % (str1, str2) + ) testcase.fail(msg) try: import networkx + have_networkx = True except: have_networkx = False @@ -64,5 +72,5 @@ def assert_ast_eq(testcase, orig_ast, expected_ast): if __name__ == "__main__": - #import sys;sys.argv = ['', 'Test.testName'] + # import sys;sys.argv = ['', 'Test.testName'] unittest.main() diff --git a/src/python/turicreate/meta/asttools/tests/test_conditional_symbols.py b/src/python/turicreate/meta/asttools/tests/test_conditional_symbols.py index 49adf5fe9a..bae10e90ea 100644 --- a/src/python/turicreate/meta/asttools/tests/test_conditional_symbols.py +++ b/src/python/turicreate/meta/asttools/tests/test_conditional_symbols.py @@ -3,11 +3,11 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Created on Aug 9, 2011 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -16,12 +16,17 @@ import ast from ...testing import py2only -class Test(unittest.TestCase): - - def assertCorrect(self, source, lhs_conditional=None, lhs_stable=None, - rhs_conditional=None, rhs_stable=None, - undefined=None): +class Test(unittest.TestCase): + def assertCorrect( + self, + source, + lhs_conditional=None, + lhs_stable=None, + rhs_conditional=None, + rhs_stable=None, + undefined=None, + ): mod = ast.parse(source) @@ -43,127 +48,397 @@ def assertCorrect(self, source, lhs_conditional=None, lhs_stable=None, self.assertEqual(un, set(undefined)) def test_assign(self): - self.assertCorrect(source='a = 1', - lhs_conditional=[], lhs_stable=['a'], - rhs_conditional=[], rhs_stable=[], - undefined=[]) + self.assertCorrect( + source="a = 1", + lhs_conditional=[], + lhs_stable=["a"], + rhs_conditional=[], + rhs_stable=[], + undefined=[], + ) def test_assign2(self): - self.assertCorrect(source='a = b', lhs_conditional=[], lhs_stable=['a'], rhs_conditional=[], rhs_stable=['b'], undefined=['b']) + self.assertCorrect( + source="a = b", + lhs_conditional=[], + lhs_stable=["a"], + rhs_conditional=[], + rhs_stable=["b"], + undefined=["b"], + ) def test_assign3(self): - self.assertCorrect(source='a, b = b', lhs_conditional=[], lhs_stable=['a', 'b'], rhs_conditional=[], rhs_stable=['b'], undefined=['b']) + self.assertCorrect( + source="a, b = b", + lhs_conditional=[], + lhs_stable=["a", "b"], + rhs_conditional=[], + rhs_stable=["b"], + undefined=["b"], + ) def test_assign4(self): - self.assertCorrect(source='b = 1; a = b', lhs_conditional=[], lhs_stable=['a', 'b'], rhs_conditional=[], rhs_stable=['b'], undefined=[]) + self.assertCorrect( + source="b = 1; a = b", + lhs_conditional=[], + lhs_stable=["a", "b"], + rhs_conditional=[], + rhs_stable=["b"], + undefined=[], + ) def test_assign5(self): - self.assertCorrect(source='a = b; b = 1', lhs_conditional=[], lhs_stable=['a', 'b'], rhs_conditional=[], rhs_stable=['b'], undefined=['b']) + self.assertCorrect( + source="a = b; b = 1", + lhs_conditional=[], + lhs_stable=["a", "b"], + rhs_conditional=[], + rhs_stable=["b"], + undefined=["b"], + ) def test_assign6(self): - self.assertCorrect(source='a = a', lhs_conditional=[], lhs_stable=['a', ], rhs_conditional=[], rhs_stable=['a'], undefined=['a']) + self.assertCorrect( + source="a = a", + lhs_conditional=[], + lhs_stable=["a",], + rhs_conditional=[], + rhs_stable=["a"], + undefined=["a"], + ) def test_aug_assign(self): - self.assertCorrect(source='a += 1', lhs_conditional=[], lhs_stable=['a'], rhs_conditional=[], rhs_stable=['a'], undefined=['a']) + self.assertCorrect( + source="a += 1", + lhs_conditional=[], + lhs_stable=["a"], + rhs_conditional=[], + rhs_stable=["a"], + undefined=["a"], + ) def test_assign_attr(self): - self.assertCorrect(source='a.a = 1', lhs_conditional=[], lhs_stable=[], rhs_conditional=[], rhs_stable=['a'], undefined=['a']) + self.assertCorrect( + source="a.a = 1", + lhs_conditional=[], + lhs_stable=[], + rhs_conditional=[], + rhs_stable=["a"], + undefined=["a"], + ) def test_assign_subscr(self): - self.assertCorrect(source='a[b] = 1', lhs_conditional=[], lhs_stable=[], rhs_conditional=[], rhs_stable=['a', 'b'], undefined=['a', 'b']) + self.assertCorrect( + source="a[b] = 1", + lhs_conditional=[], + lhs_stable=[], + rhs_conditional=[], + rhs_stable=["a", "b"], + undefined=["a", "b"], + ) def test_if0(self): - self.assertCorrect(source='if a: b', lhs_conditional=[], lhs_stable=[], rhs_conditional=['b'], rhs_stable=['a'], undefined=['a', 'b']) + self.assertCorrect( + source="if a: b", + lhs_conditional=[], + lhs_stable=[], + rhs_conditional=["b"], + rhs_stable=["a"], + undefined=["a", "b"], + ) def test_if1(self): - self.assertCorrect(source='if a: b = 1', lhs_conditional=['b'], lhs_stable=[], rhs_conditional=[], rhs_stable=['a'], undefined=['a']) + self.assertCorrect( + source="if a: b = 1", + lhs_conditional=["b"], + lhs_stable=[], + rhs_conditional=[], + rhs_stable=["a"], + undefined=["a"], + ) def test_if_else0(self): - self.assertCorrect(source='if a: b\nelse: b', lhs_conditional=[], lhs_stable=[], rhs_conditional=[], rhs_stable=['a', 'b'], undefined=['a', 'b']) + self.assertCorrect( + source="if a: b\nelse: b", + lhs_conditional=[], + lhs_stable=[], + rhs_conditional=[], + rhs_stable=["a", "b"], + undefined=["a", "b"], + ) def test_if_else1(self): - self.assertCorrect(source='if a: b = 1\nelse: b = 1', lhs_conditional=[], lhs_stable=['b'], rhs_conditional=[], rhs_stable=['a'], undefined=['a']) + self.assertCorrect( + source="if a: b = 1\nelse: b = 1", + lhs_conditional=[], + lhs_stable=["b"], + rhs_conditional=[], + rhs_stable=["a"], + undefined=["a"], + ) def test_if_elif0(self): - self.assertCorrect(source='if a: b\nelif c: b', lhs_conditional=[], lhs_stable=[], rhs_conditional=['b', 'c'], rhs_stable=['a'], undefined=['a', 'b', 'c']) + self.assertCorrect( + source="if a: b\nelif c: b", + lhs_conditional=[], + lhs_stable=[], + rhs_conditional=["b", "c"], + rhs_stable=["a"], + undefined=["a", "b", "c"], + ) def test_if_elif1(self): - self.assertCorrect(source='if a: b\nelif c: b\nelse: b', lhs_conditional=[], lhs_stable=[], rhs_conditional=['c'], rhs_stable=['a', 'b'], undefined=['a', 'b', 'c']) + self.assertCorrect( + source="if a: b\nelif c: b\nelse: b", + lhs_conditional=[], + lhs_stable=[], + rhs_conditional=["c"], + rhs_stable=["a", "b"], + undefined=["a", "b", "c"], + ) def test_if_elif2(self): - self.assertCorrect(source='if a: b\nelif c: pass\nelse: b', lhs_conditional=[], lhs_stable=[], rhs_conditional=['b', 'c'], rhs_stable=['a'], undefined=['a', 'b', 'c']) + self.assertCorrect( + source="if a: b\nelif c: pass\nelse: b", + lhs_conditional=[], + lhs_stable=[], + rhs_conditional=["b", "c"], + rhs_stable=["a"], + undefined=["a", "b", "c"], + ) def test_for(self): - self.assertCorrect(source='for i in j: k = 1', lhs_conditional=['i', 'k'], lhs_stable=[], rhs_conditional=[], rhs_stable=['j'], undefined=['j']) + self.assertCorrect( + source="for i in j: k = 1", + lhs_conditional=["i", "k"], + lhs_stable=[], + rhs_conditional=[], + rhs_stable=["j"], + undefined=["j"], + ) def test_for_else0(self): - self.assertCorrect(source='for i in j: k = 1\nelse: k = 2', lhs_conditional=['i'], lhs_stable=['k'], rhs_conditional=[], rhs_stable=['j'], undefined=['j']) + self.assertCorrect( + source="for i in j: k = 1\nelse: k = 2", + lhs_conditional=["i"], + lhs_stable=["k"], + rhs_conditional=[], + rhs_stable=["j"], + undefined=["j"], + ) def test_for_else1(self): - self.assertCorrect(source='for i in j: k = 1\nb = k', lhs_conditional=['i', 'k'], lhs_stable=['b'], rhs_conditional=[], rhs_stable=['j', 'k'], undefined=['j', 'k']) + self.assertCorrect( + source="for i in j: k = 1\nb = k", + lhs_conditional=["i", "k"], + lhs_stable=["b"], + rhs_conditional=[], + rhs_stable=["j", "k"], + undefined=["j", "k"], + ) def test_for_break0(self): - self.assertCorrect(source='for i in j:\n break\n k = 1', lhs_conditional=['i', 'k'], lhs_stable=[], rhs_conditional=[], rhs_stable=['j'], undefined=['j']) + self.assertCorrect( + source="for i in j:\n break\n k = 1", + lhs_conditional=["i", "k"], + lhs_stable=[], + rhs_conditional=[], + rhs_stable=["j"], + undefined=["j"], + ) def test_for_break1(self): - self.assertCorrect(source='for i in j:\n break\n k = 1\nelse: k = 2', lhs_conditional=['i', 'k'], lhs_stable=[], rhs_conditional=[], rhs_stable=['j'], undefined=['j']) + self.assertCorrect( + source="for i in j:\n break\n k = 1\nelse: k = 2", + lhs_conditional=["i", "k"], + lhs_stable=[], + rhs_conditional=[], + rhs_stable=["j"], + undefined=["j"], + ) def test_while(self): - self.assertCorrect(source='while b: a = 1', lhs_conditional=['a'], lhs_stable=[], rhs_conditional=[], rhs_stable=['b'], undefined=['b']) + self.assertCorrect( + source="while b: a = 1", + lhs_conditional=["a"], + lhs_stable=[], + rhs_conditional=[], + rhs_stable=["b"], + undefined=["b"], + ) def test_while_else(self): - self.assertCorrect(source='while b: a = 1\nelse: a = 2', lhs_conditional=[], lhs_stable=['a'], rhs_conditional=[], rhs_stable=['b'], undefined=['b']) + self.assertCorrect( + source="while b: a = 1\nelse: a = 2", + lhs_conditional=[], + lhs_stable=["a"], + rhs_conditional=[], + rhs_stable=["b"], + undefined=["b"], + ) def test_while_else_break(self): - self.assertCorrect(source='while b:\n break\n a = 1\nelse: a = 2', lhs_conditional=['a'], lhs_stable=[], rhs_conditional=[], rhs_stable=['b'], undefined=['b']) + self.assertCorrect( + source="while b:\n break\n a = 1\nelse: a = 2", + lhs_conditional=["a"], + lhs_stable=[], + rhs_conditional=[], + rhs_stable=["b"], + undefined=["b"], + ) def test_while_break(self): - self.assertCorrect(source='while b:\n break\n a = 1', lhs_conditional=['a'], lhs_stable=[], rhs_conditional=[], rhs_stable=['b'], undefined=['b']) + self.assertCorrect( + source="while b:\n break\n a = 1", + lhs_conditional=["a"], + lhs_stable=[], + rhs_conditional=[], + rhs_stable=["b"], + undefined=["b"], + ) def test_nested_if(self): - self.assertCorrect(source='if a:\n if b: c', lhs_conditional=[], lhs_stable=[], rhs_conditional=['b', 'c'], rhs_stable=['a'], undefined=['a', 'b', 'c']) + self.assertCorrect( + source="if a:\n if b: c", + lhs_conditional=[], + lhs_stable=[], + rhs_conditional=["b", "c"], + rhs_stable=["a"], + undefined=["a", "b", "c"], + ) def test_nested_if1(self): - self.assertCorrect(source='if a:\n if b: c = 1', lhs_conditional=['c'], lhs_stable=[], rhs_conditional=['b'], rhs_stable=['a'], undefined=['a', 'b']) + self.assertCorrect( + source="if a:\n if b: c = 1", + lhs_conditional=["c"], + lhs_stable=[], + rhs_conditional=["b"], + rhs_stable=["a"], + undefined=["a", "b"], + ) def test_nested_for(self): - self.assertCorrect(source='for a in b:\n for c in a: d', lhs_conditional=['a', 'c'], lhs_stable=[], rhs_conditional=['a', 'd'], rhs_stable=['b'], undefined=['b', 'd']) + self.assertCorrect( + source="for a in b:\n for c in a: d", + lhs_conditional=["a", "c"], + lhs_stable=[], + rhs_conditional=["a", "d"], + rhs_stable=["b"], + undefined=["b", "d"], + ) def test_nested_while(self): - self.assertCorrect(source='while a:\n while c: d', lhs_conditional=[], lhs_stable=[], rhs_conditional=['c', 'd'], rhs_stable=['a'], undefined=['a', 'c', 'd']) + self.assertCorrect( + source="while a:\n while c: d", + lhs_conditional=[], + lhs_stable=[], + rhs_conditional=["c", "d"], + rhs_stable=["a"], + undefined=["a", "c", "d"], + ) def test_conditional_after_stable(self): - self.assertCorrect(source='a = 1\nif b: a = 2', lhs_conditional=[], lhs_stable=['a'], rhs_conditional=[], rhs_stable=['b'], undefined=['b']) + self.assertCorrect( + source="a = 1\nif b: a = 2", + lhs_conditional=[], + lhs_stable=["a"], + rhs_conditional=[], + rhs_stable=["b"], + undefined=["b"], + ) @py2only def test_exec(self): - self.assertCorrect(source='exec a in b, c', lhs_conditional=[], lhs_stable=[], rhs_conditional=[], rhs_stable=['a', 'b', 'c'], undefined=['a', 'b', 'c']) + self.assertCorrect( + source="exec a in b, c", + lhs_conditional=[], + lhs_stable=[], + rhs_conditional=[], + rhs_stable=["a", "b", "c"], + undefined=["a", "b", "c"], + ) def test_assert(self): - self.assertCorrect(source='assert b, msg', lhs_conditional=[], lhs_stable=[], rhs_conditional=[], rhs_stable=[ 'b', 'msg'], undefined=['b', 'msg']) + self.assertCorrect( + source="assert b, msg", + lhs_conditional=[], + lhs_stable=[], + rhs_conditional=[], + rhs_stable=["b", "msg"], + undefined=["b", "msg"], + ) def test_raise(self): - self.assertCorrect(source='raise b', lhs_conditional=[], lhs_stable=[], rhs_conditional=[], rhs_stable=[ 'b'], undefined=['b']) + self.assertCorrect( + source="raise b", + lhs_conditional=[], + lhs_stable=[], + rhs_conditional=[], + rhs_stable=["b"], + undefined=["b"], + ) def test_try(self): - self.assertCorrect(source='try: a \nexcept: b', lhs_conditional=[], lhs_stable=[], rhs_conditional=['a', 'b'], rhs_stable=[], undefined=['a', 'b']) + self.assertCorrect( + source="try: a \nexcept: b", + lhs_conditional=[], + lhs_stable=[], + rhs_conditional=["a", "b"], + rhs_stable=[], + undefined=["a", "b"], + ) def test_try2(self): - self.assertCorrect(source='try: a = 1 \nexcept c as d: a = 2', lhs_conditional=['d'], lhs_stable=['a'], rhs_conditional=['c'], rhs_stable=[], undefined=['c']) + self.assertCorrect( + source="try: a = 1 \nexcept c as d: a = 2", + lhs_conditional=["d"], + lhs_stable=["a"], + rhs_conditional=["c"], + rhs_stable=[], + undefined=["c"], + ) def test_try_else(self): - self.assertCorrect(source='try: a = 1 \nexcept c as d: a = 2\nelse: x = 1', lhs_conditional=['d', 'x'], lhs_stable=['a'], rhs_conditional=['c'], rhs_stable=[], undefined=['c']) + self.assertCorrect( + source="try: a = 1 \nexcept c as d: a = 2\nelse: x = 1", + lhs_conditional=["d", "x"], + lhs_stable=["a"], + rhs_conditional=["c"], + rhs_stable=[], + undefined=["c"], + ) def test_try_finally(self): - self.assertCorrect(source='try: a = 1 \nexcept c as d: a = 2\nfinally: x = 1', lhs_conditional=['d'], lhs_stable=['a', 'x'], rhs_conditional=['c'], rhs_stable=[], undefined=['c']) + self.assertCorrect( + source="try: a = 1 \nexcept c as d: a = 2\nfinally: x = 1", + lhs_conditional=["d"], + lhs_stable=["a", "x"], + rhs_conditional=["c"], + rhs_stable=[], + undefined=["c"], + ) def test_bug001(self): - self.assertCorrect(source='if a: d\nd', lhs_conditional=[], lhs_stable=[], rhs_conditional=[], rhs_stable=['a', 'd'], undefined=['a', 'd']) + self.assertCorrect( + source="if a: d\nd", + lhs_conditional=[], + lhs_stable=[], + rhs_conditional=[], + rhs_stable=["a", "d"], + undefined=["a", "d"], + ) def test_bug002(self): - self.assertCorrect(source='if a: d = 1\nd = 1', lhs_conditional=[], lhs_stable=['d'], rhs_conditional=[], rhs_stable=['a'], undefined=['a']) + self.assertCorrect( + source="if a: d = 1\nd = 1", + lhs_conditional=[], + lhs_stable=["d"], + rhs_conditional=[], + rhs_stable=["a"], + undefined=["a"], + ) if __name__ == "__main__": - #import sys;sys.argv = ['', 'Test.test_assign'] + # import sys;sys.argv = ['', 'Test.test_assign'] unittest.main() diff --git a/src/python/turicreate/meta/asttools/tests/test_depgraph.py b/src/python/turicreate/meta/asttools/tests/test_depgraph.py index a0cb79c3c3..e6280c26a5 100644 --- a/src/python/turicreate/meta/asttools/tests/test_depgraph.py +++ b/src/python/turicreate/meta/asttools/tests/test_depgraph.py @@ -3,11 +3,11 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Created on Aug 2, 2011 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -22,21 +22,22 @@ def binop_method(op): def test_binop(self): - source = 'c = a %s b' % (op,) - self.assertDepends(source, {('c', 'a'), ('c', 'b')}, - {'a', 'b'}, {'c'}) + source = "c = a %s b" % (op,) + self.assertDepends(source, {("c", "a"), ("c", "b")}, {"a", "b"}, {"c"}) + return test_binop + def unarynop_method(op): def test_unaryop(self): - source = 'c = %s b' % (op,) - self.assertDepends(source, { ('c', 'b')}, - { 'b'}, {'c'}) + source = "c = %s b" % (op,) + self.assertDepends(source, {("c", "b")}, {"b"}, {"c"}) + return test_unaryop + @skip_networkx class Test(unittest.TestCase): - def assertDepends(self, source, edges, undefined=None, modified=None): mod = ast.parse(source) @@ -54,249 +55,251 @@ def assertDepends(self, source, edges, undefined=None, modified=None): return gen def test_name(self): - source = 'a' + source = "a" self.assertDepends(source, set()) def test_assign(self): - source = 'a = b' - self.assertDepends(source, {('a', 'b')}, {'b'}, {'a'}) + source = "a = b" + self.assertDepends(source, {("a", "b")}, {"b"}, {"a"}) def test_assign_tuple(self): - source = '(a, c) = b' - self.assertDepends(source, {('a', 'b'), ('c', 'b')}, {'b'}, {'a', 'c'}) + source = "(a, c) = b" + self.assertDepends(source, {("a", "b"), ("c", "b")}, {"b"}, {"a", "c"}) def test_assign_multi(self): - source = 'a = b = c' - self.assertDepends(source, {('a', 'c'), ('b', 'c')}, - {'c'}, {'a', 'b'}) + source = "a = b = c" + self.assertDepends(source, {("a", "c"), ("b", "c")}, {"c"}, {"a", "b"}) def test_assign_attr(self): - source = 'a.x = b' - self.assertDepends(source, {('a', 'b')}, - {'b', 'a'}, {'a'}) + source = "a.x = b" + self.assertDepends(source, {("a", "b")}, {"b", "a"}, {"a"}) def test_attr_assign(self): - source = 'a = b.x' - self.assertDepends(source, {('a', 'b')}, - {'b'}, {'a'}) + source = "a = b.x" + self.assertDepends(source, {("a", "b")}, {"b"}, {"a"}) def test_subscr(self): - source = 'a[:] = b[:]' - self.assertDepends(source, {('a', 'b')}, - {'a', 'b'}, {'a'}) + source = "a[:] = b[:]" + self.assertDepends(source, {("a", "b")}, {"a", "b"}, {"a"}) def test_subscr_value(self): - source = 'a = b[c]' - self.assertDepends(source, {('a', 'b'), ('a', 'c')}, - {'b', 'c'}, {'a'}) + source = "a = b[c]" + self.assertDepends(source, {("a", "b"), ("a", "c")}, {"b", "c"}, {"a"}) def test_subscr_lvalue(self): - source = 'a[c] = b' - self.assertDepends(source, {('a', 'b'), ('a', 'c')}, - {'a', 'b', 'c'}, {'a'}) + source = "a[c] = b" + self.assertDepends(source, {("a", "b"), ("a", "c")}, {"a", "b", "c"}, {"a"}) def test_subscr_attr(self): - source = 'a[:] = b[:].b' - self.assertDepends(source, {('a', 'b')}, - {'a', 'b'}, {'a'}) + source = "a[:] = b[:].b" + self.assertDepends(source, {("a", "b")}, {"a", "b"}, {"a"}) def test_import(self): - source = 'import foo; foo.a = b' - self.assertDepends(source, {('foo', 'b')}, - {'b'}, {'foo'}) + source = "import foo; foo.a = b" + self.assertDepends(source, {("foo", "b")}, {"b"}, {"foo"}) def test_import_from(self): - source = 'from bar import foo; foo.a = b' - self.assertDepends(source, {('foo', 'b')}, - {'b'}, {'foo'}) - + source = "from bar import foo; foo.a = b" + self.assertDepends(source, {("foo", "b")}, {"b"}, {"foo"}) def test_import_as(self): - source = 'import bar as foo; foo.a = b' - self.assertDepends(source, {('foo', 'b')}, - {'b'}, {'foo'}) + source = "import bar as foo; foo.a = b" + self.assertDepends(source, {("foo", "b")}, {"b"}, {"foo"}) def test_import_from_as(self): - source = 'from bar import baz as foo; foo.a = b' - self.assertDepends(source, {('foo', 'b')}, - {'b'}, {'foo'}) - + source = "from bar import baz as foo; foo.a = b" + self.assertDepends(source, {("foo", "b")}, {"b"}, {"foo"}) def test_augment_assign(self): - source = 'a += b' - self.assertDepends(source, {('a', 'b'), ('a', 'a')}, {'b'}, {'a'}) + source = "a += b" + self.assertDepends(source, {("a", "b"), ("a", "a")}, {"b"}, {"a"}) - test_add = binop_method('+') - test_sub = binop_method('-') - test_pow = binop_method('**') + test_add = binop_method("+") + test_sub = binop_method("-") + test_pow = binop_method("**") - test_eq = binop_method('==') - test_ne = binop_method('!=') + test_eq = binop_method("==") + test_ne = binop_method("!=") - test_rshift = binop_method('>>') - test_lshift = binop_method('<<') + test_rshift = binop_method(">>") + test_lshift = binop_method("<<") - test_mult = binop_method('*') - test_mod = binop_method('%') - test_div = binop_method('/') - test_floordiv = binop_method('//') - test_bitxor = binop_method('^') + test_mult = binop_method("*") + test_mod = binop_method("%") + test_div = binop_method("/") + test_floordiv = binop_method("//") + test_bitxor = binop_method("^") - test_lt = binop_method('<') - test_gt = binop_method('>') + test_lt = binop_method("<") + test_gt = binop_method(">") - test_lte = binop_method('<=') - test_gte = binop_method('>=') + test_lte = binop_method("<=") + test_gte = binop_method(">=") - test_in = binop_method('in') - test_not_in = binop_method('not in') - test_is = binop_method('is') - test_is_not = binop_method('is not') + test_in = binop_method("in") + test_not_in = binop_method("not in") + test_is = binop_method("is") + test_is_not = binop_method("is not") - test_bit_or = binop_method('|') - test_bit_and = binop_method('&') + test_bit_or = binop_method("|") + test_bit_and = binop_method("&") - test_or = binop_method('or') - test_and = binop_method('and') + test_or = binop_method("or") + test_and = binop_method("and") - test_not = unarynop_method('not') - test_uadd = unarynop_method('+') - test_usub = unarynop_method('-') - test_invert = unarynop_method('~') + test_not = unarynop_method("not") + test_uadd = unarynop_method("+") + test_usub = unarynop_method("-") + test_invert = unarynop_method("~") def test_call(self): - source = 'foo(a)' - self.assertDepends(source, {('foo', 'a'), ('a', 'foo')}, - {'a', 'foo'},) - + source = "foo(a)" + self.assertDepends( + source, {("foo", "a"), ("a", "foo")}, {"a", "foo"}, + ) def test_for(self): - source = 'for i in a:\n b' - self.assertDepends(source, {('i', 'a'), ('b', 'a')}, - {'a', 'b'}, {'i'}) + source = "for i in a:\n b" + self.assertDepends(source, {("i", "a"), ("b", "a")}, {"a", "b"}, {"i"}) def test_for2(self): - source = 'for i in a:\n x += b[i]' - self.assertDepends(source, {('i', 'a'), ('b', 'a'), ('x', 'a'), ('x', 'i'), ('x', 'b'), ('x', 'x')}, - {'a', 'b'}, {'x', 'i'}) + source = "for i in a:\n x += b[i]" + self.assertDepends( + source, + {("i", "a"), ("b", "a"), ("x", "a"), ("x", "i"), ("x", "b"), ("x", "x")}, + {"a", "b"}, + {"x", "i"}, + ) def test_for_unpack(self): - source = 'for i, j in a:\n x += b[i]' - self.assertDepends(source, {('i', 'a'), ('j', 'a'), ('b', 'a'), ('x', 'a'), ('x', 'i'), ('x', 'b'), ('x', 'x')}, - {'a', 'b'}, {'x', 'i', 'j'}) - + source = "for i, j in a:\n x += b[i]" + self.assertDepends( + source, + { + ("i", "a"), + ("j", "a"), + ("b", "a"), + ("x", "a"), + ("x", "i"), + ("x", "b"), + ("x", "x"), + }, + {"a", "b"}, + {"x", "i", "j"}, + ) def test_dict(self): - source = 'c = {a:b}' - self.assertDepends(source, {('c', 'a'), ('c', 'b')}, - {'a', 'b'}, {'c'}) + source = "c = {a:b}" + self.assertDepends(source, {("c", "a"), ("c", "b")}, {"a", "b"}, {"c"}) def test_list(self): - source = 'c = [a,b]' - self.assertDepends(source, {('c', 'a'), ('c', 'b')}, - {'a', 'b'}, {'c'}) + source = "c = [a,b]" + self.assertDepends(source, {("c", "a"), ("c", "b")}, {"a", "b"}, {"c"}) def test_tuple(self): - source = 'c = (a,b)' - self.assertDepends(source, {('c', 'a'), ('c', 'b')}, - {'a', 'b'}, {'c'}) + source = "c = (a,b)" + self.assertDepends(source, {("c", "a"), ("c", "b")}, {"a", "b"}, {"c"}) def test_set(self): - source = 'c = {a,b}' - self.assertDepends(source, {('c', 'a'), ('c', 'b')}, - {'a', 'b'}, {'c'}) + source = "c = {a,b}" + self.assertDepends(source, {("c", "a"), ("c", "b")}, {"a", "b"}, {"c"}) def test_if(self): - source = 'if a: b' - self.assertDepends(source, {('b', 'a')}, {'a', 'b'}, set()) + source = "if a: b" + self.assertDepends(source, {("b", "a")}, {"a", "b"}, set()) def test_if_else(self): - source = 'if a: b\nelse: c' - self.assertDepends(source, {('b', 'a'), ('c', 'a')}, {'a', 'b', 'c'}, set()) + source = "if a: b\nelse: c" + self.assertDepends(source, {("b", "a"), ("c", "a")}, {"a", "b", "c"}, set()) def test_if_elif_else(self): - source = 'if a: b\nelif x: c\nelse: d' - self.assertDepends(source, {('b', 'a'), - ('c', 'x'), ('c', 'a'), - ('d', 'a'), ('d', 'x'), - ('x', 'a')}, {'a', 'b', 'c', 'd', 'x'}, set()) + source = "if a: b\nelif x: c\nelse: d" + self.assertDepends( + source, + {("b", "a"), ("c", "x"), ("c", "a"), ("d", "a"), ("d", "x"), ("x", "a")}, + {"a", "b", "c", "d", "x"}, + set(), + ) def test_if_expr(self): - source = 'd = b if a else c' - self.assertDepends(source, {('d', 'a'), ('d', 'b'), ('d', 'c')}, {'a', 'b', 'c'}, {'d'}) + source = "d = b if a else c" + self.assertDepends( + source, {("d", "a"), ("d", "b"), ("d", "c")}, {"a", "b", "c"}, {"d"} + ) def test_assert(self): - source = 'assert a' - self.assertDepends(source, set(), {'a' }, set()) + source = "assert a" + self.assertDepends(source, set(), {"a"}, set()) def test_with(self): - source = 'with a as b: c' - self.assertDepends(source, {('b', 'a'), ('c', 'a')}, {'a', 'c'}, {'b'}) + source = "with a as b: c" + self.assertDepends(source, {("b", "a"), ("c", "a")}, {"a", "c"}, {"b"}) def test_while(self): - source = 'while a: c' - self.assertDepends(source, {('c', 'a')}, {'a', 'c'}) + source = "while a: c" + self.assertDepends(source, {("c", "a")}, {"a", "c"}) def test_function_def(self): - source = '''a = 1 + source = """a = 1 def foo(b): return a + b -''' - self.assertDepends(source, {('foo', 'a')}) +""" + self.assertDepends(source, {("foo", "a")}) def test_lambda(self): - source = '''a = 1 + source = """a = 1 foo = lambda b: a + b -''' - self.assertDepends(source, {('foo', 'a')}) - +""" + self.assertDepends(source, {("foo", "a")}) def test_list_comp(self): - source = 'a = [b for b in c]' - self.assertDepends(source, {('a', 'c')}) + source = "a = [b for b in c]" + self.assertDepends(source, {("a", "c")}) def test_dict_comp(self): - source = 'a = {b:d for b,d in c}' - self.assertDepends(source, {('a', 'c')}) + source = "a = {b:d for b,d in c}" + self.assertDepends(source, {("a", "c")}) def test_set_comp(self): - source = 'a = {b for b in c}' - self.assertDepends(source, {('a', 'c')}) + source = "a = {b for b in c}" + self.assertDepends(source, {("a", "c")}) def test_try_except(self): - source = ''' + source = """ try: a except b: c - ''' - self.assertDepends(source, {('c', 'a'), ('c', 'b')}) + """ + self.assertDepends(source, {("c", "a"), ("c", "b")}) def test_try_except_else(self): - source = ''' + source = """ try: a except b: c else: d - ''' - self.assertDepends(source, {('c', 'a'), ('c', 'b'), ('d', 'a')}) + """ + self.assertDepends(source, {("c", "a"), ("c", "b"), ("d", "a")}) def test_try_finally(self): - source = ''' + source = """ try: a except b: c finally: d - ''' - self.assertDepends(source, {('c', 'a'), ('c', 'b'), ('d', 'a'), - ('d', 'b'), ('d', 'c')}) + """ + self.assertDepends( + source, {("c", "a"), ("c", "b"), ("d", "a"), ("d", "b"), ("d", "c")} + ) + if __name__ == "__main__": - #import sys;sys.argv = ['', 'Test.test_assign'] + # import sys;sys.argv = ['', 'Test.test_assign'] unittest.main(exit=False) print(tested.tested()) diff --git a/src/python/turicreate/meta/asttools/tests/test_prune.py b/src/python/turicreate/meta/asttools/tests/test_prune.py index c2f2538229..c0a6980657 100644 --- a/src/python/turicreate/meta/asttools/tests/test_prune.py +++ b/src/python/turicreate/meta/asttools/tests/test_prune.py @@ -3,11 +3,11 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Created on Aug 2, 2011 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -22,9 +22,8 @@ class TestExclusive(unittest.TestCase): - def assertPruned(self, source, pruned, symbols): - mutator = PruneVisitor(symbols=symbols, mode='exclusive') + mutator = PruneVisitor(symbols=symbols, mode="exclusive") orig_ast = ast.parse(source) expected_ast = ast.parse(pruned) @@ -35,162 +34,160 @@ def assertPruned(self, source, pruned, symbols): tested.update(orig_ast) def test_assign(self): - source = 'a = b; c = d' - pruned = 'a = b;' - - self.assertPruned(source, pruned, symbols=['c', 'd']) + source = "a = b; c = d" + pruned = "a = b;" - pruned2 = 'c = d' + self.assertPruned(source, pruned, symbols=["c", "d"]) - self.assertPruned(source, pruned2, symbols=['a', 'b']) + pruned2 = "c = d" - pruned = 'a = b; c = d' + self.assertPruned(source, pruned2, symbols=["a", "b"]) - self.assertPruned(source, pruned, symbols=['c']) + pruned = "a = b; c = d" - pruned2 = 'a = b; c = d' + self.assertPruned(source, pruned, symbols=["c"]) - self.assertPruned(source, pruned2, symbols=['b']) + pruned2 = "a = b; c = d" + self.assertPruned(source, pruned2, symbols=["b"]) def test_binop(self): - source = 'a + b; c + d' - pruned = 'a + b' + source = "a + b; c + d" + pruned = "a + b" - self.assertPruned(source, pruned, symbols=['c', 'd']) + self.assertPruned(source, pruned, symbols=["c", "d"]) def test_unaryop(self): - source = '+b; -c' - pruned = '+b' + source = "+b; -c" + pruned = "+b" - self.assertPruned(source, pruned, symbols=['c']) + self.assertPruned(source, pruned, symbols=["c"]) def test_for(self): - source = 'for i in j: k' + source = "for i in j: k" - pruned = 'for i in j: pass' - self.assertPruned(source, pruned, symbols=['k']) + pruned = "for i in j: pass" + self.assertPruned(source, pruned, symbols=["k"]) - pruned = '' - self.assertPruned(source, pruned, symbols=['k', 'i', 'j']) + pruned = "" + self.assertPruned(source, pruned, symbols=["k", "i", "j"]) def test_for_else(self): - source = 'for i in j:\n k\nelse:\n l' - - pruned = 'for i in j:\n k' - self.assertPruned(source, pruned, symbols=['l']) + source = "for i in j:\n k\nelse:\n l" + pruned = "for i in j:\n k" + self.assertPruned(source, pruned, symbols=["l"]) - pruned = 'for i in j:\n pass\nelse:\n l' - self.assertPruned(source, pruned, symbols=['i', 'j', 'k']) + pruned = "for i in j:\n pass\nelse:\n l" + self.assertPruned(source, pruned, symbols=["i", "j", "k"]) def test_with_as(self): - source = 'with a as b: c' + source = "with a as b: c" - pruned = '' - self.assertPruned(source, pruned, symbols=['a', 'b', 'c']) + pruned = "" + self.assertPruned(source, pruned, symbols=["a", "b", "c"]) - pruned = 'with a as b: pass' - self.assertPruned(source, pruned, symbols=['c']) + pruned = "with a as b: pass" + self.assertPruned(source, pruned, symbols=["c"]) def test_with(self): - source = 'with a: c' + source = "with a: c" - pruned = '' - self.assertPruned(source, pruned, symbols=['a', 'c']) + pruned = "" + self.assertPruned(source, pruned, symbols=["a", "c"]) - pruned = '' - self.assertPruned(source, pruned, symbols=['c']) + pruned = "" + self.assertPruned(source, pruned, symbols=["c"]) def test_if(self): - source = 'if a: b\nelse: c' + source = "if a: b\nelse: c" - pruned = '' - self.assertPruned(source, pruned, symbols=['a', 'b', 'c']) + pruned = "" + self.assertPruned(source, pruned, symbols=["a", "b", "c"]) - pruned = '' - self.assertPruned(source, pruned, symbols=['b', 'c']) + pruned = "" + self.assertPruned(source, pruned, symbols=["b", "c"]) - pruned = 'if a: b' - self.assertPruned(source, pruned, symbols=['c']) + pruned = "if a: b" + self.assertPruned(source, pruned, symbols=["c"]) def test_if_expr(self): - source = 'a = b if c else d' + source = "a = b if c else d" - pruned = 'a = b if c else d' - self.assertPruned(source, pruned, symbols=['b', 'c', 'd']) + pruned = "a = b if c else d" + self.assertPruned(source, pruned, symbols=["b", "c", "d"]) - pruned = '' - self.assertPruned(source, pruned, symbols=['a', 'b', 'c', 'd']) + pruned = "" + self.assertPruned(source, pruned, symbols=["a", "b", "c", "d"]) def test_while(self): - source = 'while a: b' + source = "while a: b" - pruned = '' - self.assertPruned(source, pruned, symbols=['b', ]) + pruned = "" + self.assertPruned(source, pruned, symbols=["b",]) - pruned = '' - self.assertPruned(source, pruned, symbols=['a', 'b', 'c', 'd']) + pruned = "" + self.assertPruned(source, pruned, symbols=["a", "b", "c", "d"]) def test_import(self): - source = 'import a' + source = "import a" - pruned = '' - self.assertPruned(source, pruned, symbols=['a', ]) + pruned = "" + self.assertPruned(source, pruned, symbols=["a",]) - source = 'import a, b' + source = "import a, b" - pruned = 'import a, b' - self.assertPruned(source, pruned, symbols=['a', ]) + pruned = "import a, b" + self.assertPruned(source, pruned, symbols=["a",]) - pruned = '' - self.assertPruned(source, pruned, symbols=['a', 'b']) + pruned = "" + self.assertPruned(source, pruned, symbols=["a", "b"]) def test_import_from(self): - source = 'from a import b' + source = "from a import b" - pruned = 'from a import b' - self.assertPruned(source, pruned, symbols=['a', ]) + pruned = "from a import b" + self.assertPruned(source, pruned, symbols=["a",]) - pruned = '' - self.assertPruned(source, pruned, symbols=['b', ]) + pruned = "" + self.assertPruned(source, pruned, symbols=["b",]) def test_try(self): - source = ''' + source = """ try: a except b as c: d -''' +""" - pruned = ''' -''' - self.assertPruned(source, pruned, symbols=['a', 'b', 'c', 'd']) + pruned = """ +""" + self.assertPruned(source, pruned, symbols=["a", "b", "c", "d"]) - pruned = ''' + pruned = """ try: a except b as c: pass -''' +""" - self.assertPruned(source, pruned, symbols=['d']) + self.assertPruned(source, pruned, symbols=["d"]) - pruned = ''' -''' - self.assertPruned(source, pruned, symbols=['a', 'd']) + pruned = """ +""" + self.assertPruned(source, pruned, symbols=["a", "d"]) def test_try_else(self): - source = ''' + source = """ try: a except b as c: d else: e -''' +""" - pruned = ''' + pruned = """ try: pass except: @@ -198,11 +195,11 @@ def test_try_else(self): else: e -''' - self.assertPruned(source, pruned, symbols=['a', ]) +""" + self.assertPruned(source, pruned, symbols=["a",]) def test_try_finally(self): - source = ''' + source = """ try: a except b as c: @@ -211,9 +208,9 @@ def test_try_finally(self): e finally: f -''' +""" - pruned = ''' + pruned = """ try: pass except: @@ -223,10 +220,10 @@ def test_try_finally(self): finally: f -''' - self.assertPruned(source, pruned, symbols=['a', ]) +""" + self.assertPruned(source, pruned, symbols=["a",]) - pruned = ''' + pruned = """ try: pass except: @@ -236,28 +233,25 @@ def test_try_finally(self): finally: pass -''' - self.assertPruned(source, pruned, symbols=['a', 'f']) +""" + self.assertPruned(source, pruned, symbols=["a", "f"]) - pruned = '' - self.assertPruned(source, pruned, symbols=['a', 'd', 'e', 'f']) + pruned = "" + self.assertPruned(source, pruned, symbols=["a", "d", "e", "f"]) @py2only def test_exec(self): - source = 'exec a' - pruned = 'exec a' - self.assertPruned(source, pruned, symbols=['a']) - + source = "exec a" + pruned = "exec a" + self.assertPruned(source, pruned, symbols=["a"]) def test_attr(self): pass - class TestInclusive(unittest.TestCase): - def assertPruned(self, source, pruned, symbols): - mutator = PruneVisitor(symbols=symbols, mode='inclusive') + mutator = PruneVisitor(symbols=symbols, mode="inclusive") orig_ast = ast.parse(source) expected_ast = ast.parse(pruned) @@ -268,162 +262,160 @@ def assertPruned(self, source, pruned, symbols): tested.update(orig_ast) def test_assign(self): - source = 'a = b; c = d' - pruned = 'a = b;' - - self.assertPruned(source, pruned, symbols=['c', 'd']) + source = "a = b; c = d" + pruned = "a = b;" - pruned2 = 'c = d' + self.assertPruned(source, pruned, symbols=["c", "d"]) - self.assertPruned(source, pruned2, symbols=['a', 'b']) + pruned2 = "c = d" - pruned = 'a = b' + self.assertPruned(source, pruned2, symbols=["a", "b"]) - self.assertPruned(source, pruned, symbols=['c']) + pruned = "a = b" - pruned2 = 'c = d' + self.assertPruned(source, pruned, symbols=["c"]) - self.assertPruned(source, pruned2, symbols=['b']) + pruned2 = "c = d" + self.assertPruned(source, pruned2, symbols=["b"]) def test_binop(self): - source = 'a + b; c + d' - pruned = 'a + b' + source = "a + b; c + d" + pruned = "a + b" - self.assertPruned(source, pruned, symbols=['c', 'd']) + self.assertPruned(source, pruned, symbols=["c", "d"]) def test_unaryop(self): - source = '+b; -c' - pruned = '+b' + source = "+b; -c" + pruned = "+b" - self.assertPruned(source, pruned, symbols=['c']) + self.assertPruned(source, pruned, symbols=["c"]) def test_for(self): - source = 'for i in j: k' + source = "for i in j: k" - pruned = 'for i in j: pass' - self.assertPruned(source, pruned, symbols=['k']) + pruned = "for i in j: pass" + self.assertPruned(source, pruned, symbols=["k"]) - pruned = '' - self.assertPruned(source, pruned, symbols=['k', 'i', 'j']) + pruned = "" + self.assertPruned(source, pruned, symbols=["k", "i", "j"]) def test_for_else(self): - source = 'for i in j:\n k\nelse:\n l' - - pruned = 'for i in j:\n k' - self.assertPruned(source, pruned, symbols=['l']) + source = "for i in j:\n k\nelse:\n l" + pruned = "for i in j:\n k" + self.assertPruned(source, pruned, symbols=["l"]) - pruned = 'for i in j:\n pass\nelse:\n l' - self.assertPruned(source, pruned, symbols=['i', 'j', 'k']) + pruned = "for i in j:\n pass\nelse:\n l" + self.assertPruned(source, pruned, symbols=["i", "j", "k"]) def test_with_as(self): - source = 'with a as b: c' + source = "with a as b: c" - pruned = '' - self.assertPruned(source, pruned, symbols=['a', 'b', 'c']) + pruned = "" + self.assertPruned(source, pruned, symbols=["a", "b", "c"]) - pruned = 'with a as b: pass' - self.assertPruned(source, pruned, symbols=['c']) + pruned = "with a as b: pass" + self.assertPruned(source, pruned, symbols=["c"]) def test_with(self): - source = 'with a: c' + source = "with a: c" - pruned = '' - self.assertPruned(source, pruned, symbols=['a', 'c']) + pruned = "" + self.assertPruned(source, pruned, symbols=["a", "c"]) - pruned = '' - self.assertPruned(source, pruned, symbols=['c']) + pruned = "" + self.assertPruned(source, pruned, symbols=["c"]) def test_if(self): - source = 'if a: b\nelse: c' + source = "if a: b\nelse: c" - pruned = '' - self.assertPruned(source, pruned, symbols=['a', 'b', 'c']) + pruned = "" + self.assertPruned(source, pruned, symbols=["a", "b", "c"]) - pruned = '' - self.assertPruned(source, pruned, symbols=['b', 'c']) + pruned = "" + self.assertPruned(source, pruned, symbols=["b", "c"]) - pruned = 'if a: b' - self.assertPruned(source, pruned, symbols=['c']) + pruned = "if a: b" + self.assertPruned(source, pruned, symbols=["c"]) def test_if_expr(self): - source = 'a = b if c else d' + source = "a = b if c else d" - pruned = '' - self.assertPruned(source, pruned, symbols=['b', 'c', 'd']) + pruned = "" + self.assertPruned(source, pruned, symbols=["b", "c", "d"]) - pruned = '' - self.assertPruned(source, pruned, symbols=['a', 'b', 'c', 'd']) + pruned = "" + self.assertPruned(source, pruned, symbols=["a", "b", "c", "d"]) def test_while(self): - source = 'while a: b' + source = "while a: b" - pruned = '' - self.assertPruned(source, pruned, symbols=['b', ]) + pruned = "" + self.assertPruned(source, pruned, symbols=["b",]) - pruned = '' - self.assertPruned(source, pruned, symbols=['a', 'b', 'c', 'd']) + pruned = "" + self.assertPruned(source, pruned, symbols=["a", "b", "c", "d"]) def test_import(self): - source = 'import a' + source = "import a" - pruned = '' - self.assertPruned(source, pruned, symbols=['a', ]) + pruned = "" + self.assertPruned(source, pruned, symbols=["a",]) - source = 'import a, b' + source = "import a, b" - pruned = '' - self.assertPruned(source, pruned, symbols=['a', ]) + pruned = "" + self.assertPruned(source, pruned, symbols=["a",]) - pruned = '' - self.assertPruned(source, pruned, symbols=['a', 'b']) + pruned = "" + self.assertPruned(source, pruned, symbols=["a", "b"]) def test_import_from(self): - source = 'from a import b' + source = "from a import b" - pruned = 'from a import b' - self.assertPruned(source, pruned, symbols=['a', ]) + pruned = "from a import b" + self.assertPruned(source, pruned, symbols=["a",]) - pruned = '' - self.assertPruned(source, pruned, symbols=['b', ]) + pruned = "" + self.assertPruned(source, pruned, symbols=["b",]) def test_try(self): - source = ''' + source = """ try: a except b as c: d -''' +""" - pruned = ''' -''' - self.assertPruned(source, pruned, symbols=['a', 'b', 'c', 'd']) + pruned = """ +""" + self.assertPruned(source, pruned, symbols=["a", "b", "c", "d"]) - pruned = ''' + pruned = """ try: a except b as c: pass -''' +""" - self.assertPruned(source, pruned, symbols=['d']) + self.assertPruned(source, pruned, symbols=["d"]) - pruned = ''' -''' - self.assertPruned(source, pruned, symbols=['a', 'd']) + pruned = """ +""" + self.assertPruned(source, pruned, symbols=["a", "d"]) def test_try_else(self): - source = ''' + source = """ try: a except b as c: d else: e -''' +""" - pruned = ''' + pruned = """ try: pass except: @@ -431,11 +423,11 @@ def test_try_else(self): else: e -''' - self.assertPruned(source, pruned, symbols=['a', ]) +""" + self.assertPruned(source, pruned, symbols=["a",]) def test_try_finally(self): - source = ''' + source = """ try: a except b as c: @@ -444,9 +436,9 @@ def test_try_finally(self): e finally: f -''' +""" - pruned = ''' + pruned = """ try: pass except: @@ -456,10 +448,10 @@ def test_try_finally(self): finally: f -''' - self.assertPruned(source, pruned, symbols=['a', ]) +""" + self.assertPruned(source, pruned, symbols=["a",]) - pruned = ''' + pruned = """ try: pass except: @@ -469,26 +461,24 @@ def test_try_finally(self): finally: pass -''' - self.assertPruned(source, pruned, symbols=['a', 'f']) +""" + self.assertPruned(source, pruned, symbols=["a", "f"]) - pruned = '' - self.assertPruned(source, pruned, symbols=['a', 'd', 'e', 'f']) + pruned = "" + self.assertPruned(source, pruned, symbols=["a", "d", "e", "f"]) @py2only def test_exec(self): - source = 'exec a' - pruned = 'exec a' - self.assertPruned(source, pruned, symbols=['a']) - + source = "exec a" + pruned = "exec a" + self.assertPruned(source, pruned, symbols=["a"]) def test_attr(self): pass - if __name__ == "__main__": - #import sys;sys.argv = ['', 'Test.test_assign'] + # import sys;sys.argv = ['', 'Test.test_assign'] unittest.main(exit=False) print(tested.tested()) diff --git a/src/python/turicreate/meta/asttools/tests/test_remove_trivial.py b/src/python/turicreate/meta/asttools/tests/test_remove_trivial.py index d688754a9d..98165d92cd 100644 --- a/src/python/turicreate/meta/asttools/tests/test_remove_trivial.py +++ b/src/python/turicreate/meta/asttools/tests/test_remove_trivial.py @@ -3,11 +3,11 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Created on Aug 5, 2011 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -29,9 +29,9 @@ def simple_case(self, toremove, expected): assert_ast_eq(self, root, expected_root) + @skip_networkx class Test(unittest.TestCase): - def assertRemoved(self, toremove, expected): root = ast.parse(toremove) @@ -43,65 +43,71 @@ def assertRemoved(self, toremove, expected): assert_ast_eq(self, root, expected) def test_single(self): - simple_case(self, 'a = 1', - 'a = 1') + simple_case(self, "a = 1", "a = 1") def test_empty(self): - simple_case(self,'', - '') + simple_case(self, "", "") def test_simple(self): - simple_case(self, 'a = 1; a = 2', - 'pass; a = 2') + simple_case(self, "a = 1; a = 2", "pass; a = 2") def test_multi(self): - simple_case(self, 'a = 1; a = 2; a = 3', - 'pass; pass; a = 3') + simple_case(self, "a = 1; a = 2; a = 3", "pass; pass; a = 3") def test_apart(self): - simple_case(self, 'a = 1; b = 1; a = 2', - 'pass; b = 1; a = 2') + simple_case(self, "a = 1; b = 1; a = 2", "pass; b = 1; a = 2") def test_if(self): - simple_case(self, 'a = 1\nif x: a = 2', - 'a = 1\nif x: a = 2') + simple_case(self, "a = 1\nif x: a = 2", "a = 1\nif x: a = 2") def test_if2(self): - simple_case(self, 'if x: a = 2\na = 1', - 'if x: a = 2\na = 1') + simple_case(self, "if x: a = 2\na = 1", "if x: a = 2\na = 1") def test_if_else(self): - simple_case(self, 'a = 1\nif x: a = 2\nelse: a = 3', - 'pass\nif x: a = 2\nelse: a = 3') + simple_case( + self, "a = 1\nif x: a = 2\nelse: a = 3", "pass\nif x: a = 2\nelse: a = 3" + ) def test_if_else2(self): - simple_case(self, 'if x: a = 2\nelse: a = 3\na = 1', - 'if x: pass\nelse: pass\na = 1') + simple_case( + self, "if x: a = 2\nelse: a = 3\na = 1", "if x: pass\nelse: pass\na = 1" + ) def test_for(self): - simple_case(self, 'a = 1\nfor x in y: a = 2', - 'a = 1\nfor x in y: a = 2') + simple_case(self, "a = 1\nfor x in y: a = 2", "a = 1\nfor x in y: a = 2") def test_for_else(self): - simple_case(self, 'a = 1\nfor x in y: a = 2\nelse: a = 3', - 'pass\nfor x in y: a = 2\nelse: a = 3') + simple_case( + self, + "a = 1\nfor x in y: a = 2\nelse: a = 3", + "pass\nfor x in y: a = 2\nelse: a = 3", + ) def test_for_else_break(self): - simple_case(self, 'a = 1\nfor x in y:\n break\n a = 2\nelse: a = 3', - 'a = 1\nfor x in y:\n break\n a = 2\nelse: a = 3') + simple_case( + self, + "a = 1\nfor x in y:\n break\n a = 2\nelse: a = 3", + "a = 1\nfor x in y:\n break\n a = 2\nelse: a = 3", + ) def test_for_else_conti(self): - simple_case(self, 'a = 1\nfor x in y:\n continue\n a = 2\nelse: a = 3', - 'a = 1\nfor x in y:\n continue\n a = 2\nelse: a = 3') + simple_case( + self, + "a = 1\nfor x in y:\n continue\n a = 2\nelse: a = 3", + "a = 1\nfor x in y:\n continue\n a = 2\nelse: a = 3", + ) def test_while(self): - simple_case(self, 'a = 1\nwhile x: a = 2', - 'a = 1\nwhile x: a = 2') + simple_case(self, "a = 1\nwhile x: a = 2", "a = 1\nwhile x: a = 2") def test_while_else(self): - simple_case(self, 'a = 1\nwhile x: a = 2\nelse: a = 3', - 'pass\nwhile x: a = 2\nelse: a = 3') + simple_case( + self, + "a = 1\nwhile x: a = 2\nelse: a = 3", + "pass\nwhile x: a = 2\nelse: a = 3", + ) + if __name__ == "__main__": - #import sys;sys.argv = ['', 'Test.testName'] + # import sys;sys.argv = ['', 'Test.testName'] unittest.main() diff --git a/src/python/turicreate/meta/asttools/tests/test_replace.py b/src/python/turicreate/meta/asttools/tests/test_replace.py index c016e86aa9..bfde06acef 100644 --- a/src/python/turicreate/meta/asttools/tests/test_replace.py +++ b/src/python/turicreate/meta/asttools/tests/test_replace.py @@ -3,11 +3,11 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Created on Aug 5, 2011 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -17,31 +17,31 @@ import ast from ...asttools.tests import assert_ast_eq -class Test(unittest.TestCase): +class Test(unittest.TestCase): def test_replace_name(self): - root = ast.parse('a = 1') + root = ast.parse("a = 1") name_a = root.body[0].targets[0] - name_b = ast.Name(id='b', ctx=ast.Store()) + name_b = ast.Name(id="b", ctx=ast.Store()) replace_nodes(root, name_a, name_b) - expected = ast.parse('b = 1') + expected = ast.parse("b = 1") assert_ast_eq(self, root, expected) - def test_replace_non_existent(self): - root = ast.parse('a = 1') + root = ast.parse("a = 1") name_a = root.body[0].targets[0] - name_b = ast.Name(id='b', ctx=ast.Store()) + name_b = ast.Name(id="b", ctx=ast.Store()) replace_nodes(root, name_b, name_a) - expected = ast.parse('a = 1') + expected = ast.parse("a = 1") assert_ast_eq(self, root, expected) + if __name__ == "__main__": - #import sys;sys.argv = ['', 'Test.testName'] + # import sys;sys.argv = ['', 'Test.testName'] unittest.main() diff --git a/src/python/turicreate/meta/asttools/tests/test_sourcegen.py b/src/python/turicreate/meta/asttools/tests/test_sourcegen.py index 3bec0700ae..14267ea3a5 100644 --- a/src/python/turicreate/meta/asttools/tests/test_sourcegen.py +++ b/src/python/turicreate/meta/asttools/tests/test_sourcegen.py @@ -3,11 +3,11 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Created on Aug 3, 2011 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -20,33 +20,36 @@ tested = AllTypesTested() + def simple_expr(expr): def test_sourcegen_expr(self): self.assertSame(expr) return test_sourcegen_expr + def bin_op(op): def test_bin_op(self): - self.assertSame('(a %s b)' % (op,)) + self.assertSame("(a %s b)" % (op,)) return test_bin_op + def unary_op(op): def test_bin_op(self): - self.assertSame('(%sb)' % (op,)) + self.assertSame("(%sb)" % (op,)) return test_bin_op + def aug_assign(op): def test_bin_op(self): - self.assertSame('a %s= b' % (op,)) + self.assertSame("a %s= b" % (op,)) return test_bin_op -class Test(unittest.TestCase): - +class Test(unittest.TestCase): def assertSame(self, source): module = ast.parse(source) @@ -56,11 +59,10 @@ def assertSame(self, source): gen.visit(module) generated_source = gen.dumps() - self.assertMultiLineEqual(source, generated_source.strip('\n')) - -class TestSimple(Test): + self.assertMultiLineEqual(source, generated_source.strip("\n")) +class TestSimple(Test): def assertSame(self, source): module = ast.parse(source) @@ -70,75 +72,73 @@ def assertSame(self, source): gen.visit(module) generated_source = gen.dumps() - self.assertEqual(source, generated_source.strip('\n')) - + self.assertEqual(source, generated_source.strip("\n")) - test_expr = simple_expr('a') - test_del = simple_expr('del a') - test_assign = simple_expr('a = 1') - test_assign_multi = simple_expr('a = b = 1') + test_expr = simple_expr("a") + test_del = simple_expr("del a") + test_assign = simple_expr("a = 1") + test_assign_multi = simple_expr("a = b = 1") - test_attr = simple_expr('a.b') - test_assattr = simple_expr('a.b = 1') + test_attr = simple_expr("a.b") + test_assattr = simple_expr("a.b = 1") - test_index = simple_expr('a[b]') - test_index2 = simple_expr('a[b, c]') + test_index = simple_expr("a[b]") + test_index2 = simple_expr("a[b, c]") - test_slice0 = simple_expr('a[:]') - test_slice1 = simple_expr('a[1:]') - test_slice2 = simple_expr('a[1:2]') - test_slice3 = simple_expr('a[1:2:3]') - test_slice4 = simple_expr('a[1::3]') - test_slice5 = simple_expr('a[::3]') - test_slice6 = simple_expr('a[:3]') - test_slice7 = simple_expr('a[...]') + test_slice0 = simple_expr("a[:]") + test_slice1 = simple_expr("a[1:]") + test_slice2 = simple_expr("a[1:2]") + test_slice3 = simple_expr("a[1:2:3]") + test_slice4 = simple_expr("a[1::3]") + test_slice5 = simple_expr("a[::3]") + test_slice6 = simple_expr("a[:3]") + test_slice7 = simple_expr("a[...]") - test_raise = simple_expr('raise Foo') + test_raise = simple_expr("raise Foo") - test_raise1 = py2only(simple_expr('raise Foo, bar')) - test_raise2 = py2only(simple_expr('raise Foo, bar, baz')) + test_raise1 = py2only(simple_expr("raise Foo, bar")) + test_raise2 = py2only(simple_expr("raise Foo, bar, baz")) - test_raise_from = py3only(simple_expr('raise Foo() from bar')) + test_raise_from = py3only(simple_expr("raise Foo() from bar")) + test_call0 = simple_expr("foo()") + test_call1 = simple_expr("a = foo()") + test_call2 = simple_expr("foo(x)") + test_call3 = simple_expr("foo(x, y)") + test_call4 = simple_expr("foo(x=y)") + test_call5 = simple_expr("foo(z, x=y)") + test_call6 = simple_expr("foo(*z)") + test_call7 = simple_expr("foo(**z)") + test_call8 = simple_expr("foo(a, b=c, *d, **z)") - test_call0 = simple_expr('foo()') - test_call1 = simple_expr('a = foo()') - test_call2 = simple_expr('foo(x)') - test_call3 = simple_expr('foo(x, y)') - test_call4 = simple_expr('foo(x=y)') - test_call5 = simple_expr('foo(z, x=y)') - test_call6 = simple_expr('foo(*z)') - test_call7 = simple_expr('foo(**z)') - test_call8 = simple_expr('foo(a, b=c, *d, **z)') + test_pass = simple_expr("pass") - test_pass = simple_expr('pass') + test_import = simple_expr("import a") + test_import_as = simple_expr("import a as b") - test_import = simple_expr('import a') - test_import_as = simple_expr('import a as b') + test_from_import = simple_expr("from c import a") + test_from_import_as = simple_expr("from c import a as b") - test_from_import = simple_expr('from c import a') - test_from_import_as = simple_expr('from c import a as b') + test_dict0 = simple_expr("{}") + test_dict1 = simple_expr("{a:b}") + test_dict2 = simple_expr("{a:b, c:d}") - test_dict0 = simple_expr('{}') - test_dict1 = simple_expr('{a:b}') - test_dict2 = simple_expr('{a:b, c:d}') + test_list0 = simple_expr("[]") + test_list1 = simple_expr("[a]") + test_list2 = simple_expr("[a, b]") - test_list0 = simple_expr('[]') - test_list1 = simple_expr('[a]') - test_list2 = simple_expr('[a, b]') + test_set1 = simple_expr("{a}") + test_set2 = simple_expr("{a, b}") - test_set1 = simple_expr('{a}') - test_set2 = simple_expr('{a, b}') + test_exec0 = py2only(simple_expr("exec a in None, None")) + test_exec1 = py2only(simple_expr("exec a in b, None")) + test_exec2 = py2only(simple_expr("exec a in b, c")) - test_exec0 = py2only(simple_expr('exec a in None, None')) - test_exec1 = py2only(simple_expr('exec a in b, None')) - test_exec2 = py2only(simple_expr('exec a in b, c')) + test_assert1 = simple_expr("assert False") + test_assert2 = simple_expr("assert False, msg") - test_assert1 = simple_expr('assert False') - test_assert2 = simple_expr('assert False, msg') - - test_global1 = simple_expr('global a') - test_global2 = simple_expr('global a, b') + test_global1 = simple_expr("global a") + test_global2 = simple_expr("global a, b") test_str = simple_expr("x = 'a'") @@ -156,226 +156,226 @@ def assertSame(self, source): test_set_comp = simple_expr("{a for b in c}") test_dict_comp = simple_expr("{a:d for b in c}") - test_iadd = aug_assign('+') - test_isub = aug_assign('-') - test_imult = aug_assign('*') - test_ipow = aug_assign('**') - test_idiv = aug_assign('/') - test_ifdiv = aug_assign('//') + test_iadd = aug_assign("+") + test_isub = aug_assign("-") + test_imult = aug_assign("*") + test_ipow = aug_assign("**") + test_idiv = aug_assign("/") + test_ifdiv = aug_assign("//") + test_add = bin_op("+") + test_sub = bin_op("-") + test_mult = bin_op("*") + test_pow = bin_op("**") + test_div = bin_op("/") + test_floordiv = bin_op("//") + test_mod = bin_op("%") - test_add = bin_op('+') - test_sub = bin_op('-') - test_mult = bin_op('*') - test_pow = bin_op('**') - test_div = bin_op('/') - test_floordiv = bin_op('//') - test_mod = bin_op('%') + test_eq = bin_op("==") + test_neq = bin_op("!=") + test_lt = bin_op("<") + test_gt = bin_op(">") + test_lte = bin_op("<=") + test_gte = bin_op(">=") - test_eq = bin_op('==') - test_neq = bin_op('!=') - test_lt = bin_op('<') - test_gt = bin_op('>') - test_lte = bin_op('<=') - test_gte = bin_op('>=') + test_lshift = bin_op("<<") + test_rshift = bin_op(">>") - test_lshift = bin_op('<<') - test_rshift = bin_op('>>') + test_lshift = bin_op("and") + test_rshift = bin_op("or") - test_lshift = bin_op('and') - test_rshift = bin_op('or') + test_in = bin_op("in") + test_not_in = bin_op("not in") - test_in = bin_op('in') - test_not_in = bin_op('not in') + test_is = bin_op("is") + test_is_not = bin_op("is not") - test_is = bin_op('is') - test_is_not = bin_op('is not') + test_bitand = bin_op("&") + test_bitor = bin_op("|") + test_bitxor = bin_op("^") - test_bitand = bin_op('&') - test_bitor = bin_op('|') - test_bitxor = bin_op('^') + test_usub = unary_op("-") + test_uadd = unary_op("+") + test_unot = unary_op("not ") + test_uinvert = unary_op("~") - test_usub = unary_op('-') - test_uadd = unary_op('+') - test_unot = unary_op('not ') - test_uinvert = unary_op('~') class ControlFlow(Test): - def test_if(self): - source = '''if a: - b''' + source = """if a: + b""" self.assertSame(source) def test_if_else(self): - source = '''if a: + source = """if a: b else: - c''' + c""" self.assertSame(source) def test_elif_else(self): - source = '''if a: + source = """if a: b elif d: e else: - c''' + c""" self.assertSame(source) def test_while(self): - source = '''while a: - b''' + source = """while a: + b""" self.assertSame(source) def test_break(self): - source = '''while a: - break''' + source = """while a: + break""" self.assertSame(source) def test_continue(self): - source = '''while a: - continue''' + source = """while a: + continue""" self.assertSame(source) def test_with0(self): - source = '''with a: - b''' + source = """with a: + b""" self.assertSame(source) def test_with1(self): - source = '''with a as b: - c''' + source = """with a as b: + c""" self.assertSame(source) def test_function_def(self): - source = '''def foo(): - pass''' + source = """def foo(): + pass""" self.assertSame(source) def test_return(self): - source = '''def foo(): - return 1.1''' + source = """def foo(): + return 1.1""" self.assertSame(source) def test_yield(self): - source = '''def foo(): - yield 1.1''' + source = """def foo(): + yield 1.1""" self.assertSame(source) def test_function_args1(self): - source = '''def foo(a): - pass''' + source = """def foo(a): + pass""" self.assertSame(source) def test_function_args2(self): - source = '''def foo(a, b): - pass''' + source = """def foo(a, b): + pass""" self.assertSame(source) def test_function_args3(self): - source = '''def foo(b=c): - pass''' + source = """def foo(b=c): + pass""" self.assertSame(source) def test_function_args4(self): - source = '''def foo(b=c, d=e): - pass''' + source = """def foo(b=c, d=e): + pass""" self.assertSame(source) def test_function_args5(self): - source = '''def foo(*a): - pass''' + source = """def foo(*a): + pass""" self.assertSame(source) def test_try_except(self): - source = '''try: + source = """try: a except: - b''' + b""" self.assertSame(source) def test_try_except1(self): - source = '''try: + source = """try: a except Exception: - b''' + b""" self.assertSame(source) def test_try_except2(self): - source = '''try: + source = """try: a except Exception as error: - b''' + b""" self.assertSame(source) def test_try_except3(self): - source = '''try: + source = """try: a except Exception as error: pass except: - b''' + b""" self.assertSame(source) def test_try_except_else(self): - source = '''try: + source = """try: a except Exception as error: pass except: b else: - c''' + c""" self.assertSame(source) def test_try_except_finally(self): - source = '''try: + source = """try: a except Exception as error: pass except: b finally: - c''' + c""" self.assertSame(source) def test_for(self): - source = '''for i in j: - pass''' + source = """for i in j: + pass""" self.assertSame(source) def test_for_else(self): - source = '''for i in j: + source = """for i in j: l else: - k''' + k""" self.assertSame(source) def test_class_def(self): - source = '''class A(): - pass''' + source = """class A(): + pass""" self.assertSame(source) def test_class_def1(self): - source = '''class A(object): - pass''' + source = """class A(object): + pass""" self.assertSame(source) def test_class_def2(self): - source = '''class A(object, foo): - pass''' + source = """class A(object, foo): + pass""" self.assertSame(source) def test_class_def3(self): - source = '''class A(object, foo): + source = """class A(object, foo): a = 1 def bar(): - pass''' + pass""" self.assertSame(source) + if __name__ == "__main__": - #import sys;sys.argv = ['', 'Test.test_expr'] + # import sys;sys.argv = ['', 'Test.test_expr'] unittest.main(exit=False) print(tested.tested()) diff --git a/src/python/turicreate/meta/asttools/tests/test_symbol_visitor.py b/src/python/turicreate/meta/asttools/tests/test_symbol_visitor.py index 9aaa58d486..d75ff01973 100644 --- a/src/python/turicreate/meta/asttools/tests/test_symbol_visitor.py +++ b/src/python/turicreate/meta/asttools/tests/test_symbol_visitor.py @@ -3,11 +3,11 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Created on Aug 5, 2011 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -18,41 +18,41 @@ class Test(unittest.TestCase): - - def assertHasSymbols(self, codestring, expected_symbols, ctxts=(ast.Load, ast.Store)): + def assertHasSymbols( + self, codestring, expected_symbols, ctxts=(ast.Load, ast.Store) + ): root = ast.parse(codestring) symbols = get_symbols(root, ctxts) self.assertEqual(symbols, expected_symbols) - def test_simple(self): - self.assertHasSymbols('a', {'a'}) + self.assertHasSymbols("a", {"a"}) def test_load(self): - self.assertHasSymbols('a', {'a'}, ast.Load) - self.assertHasSymbols('a', set(), ast.Store) + self.assertHasSymbols("a", {"a"}, ast.Load) + self.assertHasSymbols("a", set(), ast.Store) def test_store(self): - self.assertHasSymbols('a = 1', {'a'}, ast.Store) - self.assertHasSymbols('a = 1', set(), ast.Load) + self.assertHasSymbols("a = 1", {"a"}, ast.Store) + self.assertHasSymbols("a = 1", set(), ast.Load) def test_store_item(self): - self.assertHasSymbols('a[:] = 1', {'a'}, ast.Load) - self.assertHasSymbols('a[:] = 1', set(), ast.Store) + self.assertHasSymbols("a[:] = 1", {"a"}, ast.Load) + self.assertHasSymbols("a[:] = 1", set(), ast.Store) def test_store_attr(self): - self.assertHasSymbols('a.b = 1', {'a'}, ast.Load) - self.assertHasSymbols('a.b = 1', set(), ast.Store) + self.assertHasSymbols("a.b = 1", {"a"}, ast.Load) + self.assertHasSymbols("a.b = 1", set(), ast.Store) def test_for(self): - self.assertHasSymbols('for i in x:\n a.b = 1', {'a', 'x'}, ast.Load) - self.assertHasSymbols('for i in x:\n a.b = 1', {'i'}, ast.Store) + self.assertHasSymbols("for i in x:\n a.b = 1", {"a", "x"}, ast.Load) + self.assertHasSymbols("for i in x:\n a.b = 1", {"i"}, ast.Store) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/src/python/turicreate/meta/asttools/visitors/__init__.py b/src/python/turicreate/meta/asttools/visitors/__init__.py index afaee5760c..bdc115b13c 100644 --- a/src/python/turicreate/meta/asttools/visitors/__init__.py +++ b/src/python/turicreate/meta/asttools/visitors/__init__.py @@ -9,16 +9,17 @@ import _ast + def dont_visit(self, node): pass + def visit_children(self, node): for child in self.children(node): self.visit(child) class Visitor(object): - def children(self, node): for field in node._fields: value = getattr(node, field) @@ -28,13 +29,11 @@ def children(self, node): yield item else: pass - elif isinstance(value, _ast.AST): + elif isinstance(value, _ast.AST): yield value return - - def visit_list(self, nodes, *args, **kwargs): result = [] @@ -45,22 +44,20 @@ def visit_list(self, nodes, *args, **kwargs): def visit(self, node, *args, **kwargs): node_name = type(node).__name__ - attr = 'visit' + node_name + attr = "visit" + node_name if hasattr(self, attr): - method = getattr(self, 'visit' + node_name) + method = getattr(self, "visit" + node_name) return method(node, *args, **kwargs) - elif hasattr(self, 'visitDefault'): - method = getattr(self, 'visitDefault') + elif hasattr(self, "visitDefault"): + method = getattr(self, "visitDefault") return method(node, *args, **kwargs) else: - method = getattr(self, 'visit' + node_name) + method = getattr(self, "visit" + node_name) return method(node, *args, **kwargs) - class Mutator(Visitor): - def mutateDefault(self, node): for field in node._fields: value = getattr(node, field) @@ -73,7 +70,7 @@ def mutateDefault(self, node): else: pass - elif isinstance(value, _ast.AST): + elif isinstance(value, _ast.AST): new_value = self.mutate(value) if new_value is not None: setattr(node, field, new_value) @@ -84,14 +81,14 @@ def mutate(self, node, *args, **kwargs): node_name = type(node).__name__ - attr = 'mutate' + node_name + attr = "mutate" + node_name if hasattr(self, attr): - mehtod = getattr(self, 'mutate' + node_name) + mehtod = getattr(self, "mutate" + node_name) return mehtod(node, *args, **kwargs) - elif hasattr(self, 'mutateDefault'): - mehtod = getattr(self, 'mutateDefault') + elif hasattr(self, "mutateDefault"): + mehtod = getattr(self, "mutateDefault") return mehtod(node, *args, **kwargs) else: - mehtod = getattr(self, 'mutate' + node_name) + mehtod = getattr(self, "mutate" + node_name) return mehtod(node, *args, **kwargs) diff --git a/src/python/turicreate/meta/asttools/visitors/cond_symbol_visitor.py b/src/python/turicreate/meta/asttools/visitors/cond_symbol_visitor.py index a23bda9058..527b05ece3 100644 --- a/src/python/turicreate/meta/asttools/visitors/cond_symbol_visitor.py +++ b/src/python/turicreate/meta/asttools/visitors/cond_symbol_visitor.py @@ -3,11 +3,11 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Created on Aug 4, 2011 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -17,8 +17,8 @@ import ast from ...utils import py2op -class ConditionalSymbolVisitor(Visitor): +class ConditionalSymbolVisitor(Visitor): def __init__(self): self._cond_lhs = set() self._stable_lhs = set() @@ -33,7 +33,6 @@ def __init__(self): visitModule = visit_children visitPass = visit_children - def update_stable_rhs(self, symbols): new_symbols = symbols - self._stable_rhs self._update_undefined(new_symbols) @@ -64,7 +63,9 @@ def update_cond_lhs(self, symbols): def _update_undefined(self, symbols): self.undefined.update(symbols - self._stable_lhs) + update_undefined = _update_undefined + @property def stable_lhs(self): assert not (self._stable_lhs & self._cond_lhs) @@ -125,7 +126,6 @@ def visitBreak(self, node): def visitContinue(self, node): self.seen_break = True - def visit_loop(self, node): gen = ConditionalSymbolVisitor() @@ -355,19 +355,21 @@ def visitWith(self, node): def visitReturn(self, node): self.update_stable_rhs(get_symbols(node.value, ast.Load)) + def csv(node): gen = ConditionalSymbolVisitor() gen.visit(node) return gen + def lhs(node): - ''' + """ Return a set of symbols in `node` that are assigned. :param node: ast node :returns: set of strings. - ''' + """ gen = ConditionalSymbolVisitor() if isinstance(node, (list, tuple)): @@ -376,14 +378,15 @@ def lhs(node): gen.visit(node) return gen.lhs + def rhs(node): - ''' + """ Return a set of symbols in `node` that are used. :param node: ast node :returns: set of strings. - ''' + """ gen = ConditionalSymbolVisitor() if isinstance(node, (list, tuple)): @@ -392,14 +395,15 @@ def rhs(node): gen.visit(node) return gen.rhs + def conditional_lhs(node): - ''' + """ Group outputs into conditional and stable :param node: ast node :returns: tuple of (conditional, stable) - ''' + """ gen = ConditionalSymbolVisitor() gen.visit(node) @@ -407,13 +411,13 @@ def conditional_lhs(node): def conditional_symbols(node): - ''' + """ Group lhs and rhs into conditional, stable and undefined :param node: ast node :returns: tuple of (conditional_lhs, stable_lhs),(conditional_rhs, stable_rhs), undefined - ''' + """ gen = ConditionalSymbolVisitor() gen.visit(node) @@ -422,9 +426,10 @@ def conditional_symbols(node): undefined = gen.undefined return lhs, rhs, undefined -if __name__ == '__main__': - source = ''' +if __name__ == "__main__": + + source = """ while k: a = 1 b = 1 @@ -434,6 +439,6 @@ def conditional_symbols(node): a =2 c= 3 d = 1 - ''' + """ print(conditional_lhs(ast.parse(source))) diff --git a/src/python/turicreate/meta/asttools/visitors/copy_tree.py b/src/python/turicreate/meta/asttools/visitors/copy_tree.py index d745798188..fa5510b955 100644 --- a/src/python/turicreate/meta/asttools/visitors/copy_tree.py +++ b/src/python/turicreate/meta/asttools/visitors/copy_tree.py @@ -3,22 +3,23 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Created on Dec 12, 2011 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ from ..asttools import Visitor import ast -#FIXME: add tests +# FIXME: add tests class CopyVisitor(Visitor): - ''' + """ Copy only ast nodes and lists - ''' + """ + def visitDefault(self, node): Node = type(node) new_node = Node() diff --git a/src/python/turicreate/meta/asttools/visitors/graph_visitor.py b/src/python/turicreate/meta/asttools/visitors/graph_visitor.py index 4c2dd473b6..61488efdac 100644 --- a/src/python/turicreate/meta/asttools/visitors/graph_visitor.py +++ b/src/python/turicreate/meta/asttools/visitors/graph_visitor.py @@ -3,11 +3,11 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Created on Jul 18, 2011 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -15,17 +15,19 @@ import _ast from ...asttools.visitors.symbol_visitor import get_symbols + try: from networkx import DiGraph except ImportError: DiGraph = None + def collect_(self, node): names = set() for child in self.children(node): names.update(self.visit(child)) - if hasattr(node, 'ctx'): + if hasattr(node, "ctx"): if isinstance(node.ctx, _ast.Store): self.modified.update(names) elif isinstance(node.ctx, _ast.Load): @@ -34,7 +36,6 @@ def collect_(self, node): class CollectNodes(Visitor): - def __init__(self, call_deps=False): self.graph = DiGraph() self.modified = set() @@ -70,8 +71,8 @@ def visitName(self, node): def visitalias(self, node): name = node.asname if node.asname else node.name - if '.' in name: - name = name.split('.', 1)[0] + if "." in name: + name = name.split(".", 1)[0] if not self.graph.has_node(name): self.graph.add_node(name) @@ -82,12 +83,12 @@ def visitCall(self, node): left = self.visit(node.func) right = set() - for attr in ('args', 'keywords'): + for attr in ("args", "keywords"): for child in getattr(node, attr): if child: right.update(self.visit(child)) - for attr in ('starargs', 'kwargs'): + for attr in ("starargs", "kwargs"): child = getattr(node, attr) if child: right.update(self.visit(child)) @@ -162,11 +163,12 @@ def visitDictComp(self, node): def add_edges(graph, targets, sources): - for target in targets: - for src in sources: - edge = target, src - if not graph.has_edge(*edge): - graph.add_edge(*edge) + for target in targets: + for src in sources: + edge = target, src + if not graph.has_edge(*edge): + graph.add_edge(*edge) + class GlobalDeps(object): def __init__(self, gen, nodes): @@ -180,10 +182,11 @@ def __enter__(self): def __exit__(self, *args): self.gen.context_names = self._old_context_names + class GraphGen(CollectNodes): - ''' + """ Create a graph from the execution flow of the ast - ''' + """ visitModule = visit_children @@ -213,7 +216,6 @@ def visit_function_def(self, node): for stmnt in node.body: self.visit(stmnt) - def visitFunctionDef(self, node): gen = GraphGen() @@ -329,7 +331,6 @@ def visitWith(self, node): return nodes - def visitWhile(self, node): nodes = set() @@ -385,13 +386,11 @@ def visitTryExcept(self, node): all_nodes.update(nodes) - return all_nodes - def make_graph(node, call_deps=False): - ''' + """ Create a dependency graph from an ast node. :param node: ast node. @@ -399,7 +398,7 @@ def make_graph(node, call_deps=False): function calls. (i.e for `a.b(c)` a depends on b and b depends on a) :returns: a tuple of (graph, undefined) - ''' + """ gen = GraphGen(call_deps=call_deps) gen.visit(node) diff --git a/src/python/turicreate/meta/asttools/visitors/print_visitor.py b/src/python/turicreate/meta/asttools/visitors/print_visitor.py index 3009e611d2..71255236c0 100644 --- a/src/python/turicreate/meta/asttools/visitors/print_visitor.py +++ b/src/python/turicreate/meta/asttools/visitors/print_visitor.py @@ -3,11 +3,11 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Created on Jul 19, 2011 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -24,23 +24,26 @@ class Indentor(object): - def __init__(self, printer, indent=' '): + def __init__(self, printer, indent=" "): self.printer = printer self.indent = indent + def __enter__(self): self.printer._indent = self.printer._indent + self.indent def __exit__(self, *args): - indent = self.printer._indent[:-len(self.indent)] + indent = self.printer._indent[: -len(self.indent)] self.printer._indent = indent + clsname = lambda node: type(node).__name__ def depth(node): return len(flatten(node)) + def flatten(node): result = [] @@ -56,21 +59,23 @@ def flatten(node): return result + def ast_keys(node): return node._fields + def ast_values(node): return [getattr(node, field, None) for field in node._fields] + def ast_items(node): return [(field, getattr(node, field, None)) for field in node._fields] class ASTPrinter(Visitor): - - def __init__(self, indent=' ', level=0, newline='\n'): + def __init__(self, indent=" ", level=0, newline="\n"): self.out = StringIO() - self._indent = '' + self._indent = "" self.one_indent = indent self.level = level self.newline = newline @@ -84,40 +89,52 @@ def dumps(self): return self.out.read() def print(self, text, noindent=False, **kwargs): -# if noindent: -# prf = '' -# else: -# prf = self._indent + # if noindent: + # prf = '' + # else: + # prf = self._indent new_text = text.format(**kwargs) -# print(prf, new_text, file=self.out, sep='', end='') - print(new_text, file=self.out, sep='', end='') + # print(prf, new_text, file=self.out, sep='', end='') + print(new_text, file=self.out, sep="", end="") def indent(self, level): ident = self.one_indent * level return Indentor(self, ident) - - def visitDefault(self, node): - nodename = '%s(' % clsname(node) + nodename = "%s(" % clsname(node) self.print(nodename, noindent=True) undefined = [attr for attr in node._fields if not hasattr(node, attr)] if undefined: - warn('ast node %r does not have required field(s) %r ' % (clsname(node), undefined,), stacklevel=2) + warn( + "ast node %r does not have required field(s) %r " + % (clsname(node), undefined,), + stacklevel=2, + ) undefined = [attr for attr in node._attributes if not hasattr(node, attr)] if undefined: - warn('ast does %r not have required attribute(s) %r ' % (clsname(node), undefined,), stacklevel=2) - - children = sorted([(attr, getattr(node, attr)) for attr in node._fields if hasattr(node, attr)]) + warn( + "ast does %r not have required attribute(s) %r " + % (clsname(node), undefined,), + stacklevel=2, + ) + + children = sorted( + [ + (attr, getattr(node, attr)) + for attr in node._fields + if hasattr(node, attr) + ] + ) with self.indent(len(nodename)): i = 0 while children: attr, child = children.pop(0) if isinstance(child, (list, tuple)): - text = '{attr}=['.format(attr=attr) + text = "{attr}=[".format(attr=attr) self.print(text) with self.indent(len(text)): for j, inner_child in enumerate(child): @@ -126,11 +143,13 @@ def visitDefault(self, node): else: self.print(repr(inner_child)) if j < (len(child) - 1): - self.print(", {nl}{idnt}", nl=self.newline, idnt=self._indent) + self.print( + ", {nl}{idnt}", nl=self.newline, idnt=self._indent + ) - self.print(']') + self.print("]") else: - text = '{attr}='.format(attr=attr) + text = "{attr}=".format(attr=attr) self.print(text) with self.indent(len(text)): @@ -142,14 +161,13 @@ def visitDefault(self, node): if children: self.print(", {nl}{idnt}", nl=self.newline, idnt=self._indent) - - i += 1 self.print(")") -def dump_ast(ast, indent=' ', newline='\n'): - ''' + +def dump_ast(ast, indent=" ", newline="\n"): + """ Returns a string representing the ast. @@ -157,14 +175,15 @@ def dump_ast(ast, indent=' ', newline='\n'): :param indent: how far to indent a newline. :param newline: The newline character. - ''' + """ visitor = ASTPrinter(indent=indent, level=0, newline=newline) visitor.visit(ast) return visitor.dumps() -def print_ast(ast, indent=' ', initlevel=0, newline='\n', file=sys.stdout): - ''' + +def print_ast(ast, indent=" ", initlevel=0, newline="\n", file=sys.stdout): + """ Pretty print an ast node. :param ast: the ast to print. @@ -178,7 +197,7 @@ def print_ast(ast, indent=' ', initlevel=0, newline='\n', file=sys.stdout): node = ast.parse(source) print_ast(node, indent='', newline='') - ''' + """ visitor = ASTPrinter(indent=indent, level=initlevel, newline=newline) visitor.visit(ast) diff --git a/src/python/turicreate/meta/asttools/visitors/pysourcegen.py b/src/python/turicreate/meta/asttools/visitors/pysourcegen.py index f6e24b0ea8..c50e03ac82 100644 --- a/src/python/turicreate/meta/asttools/visitors/pysourcegen.py +++ b/src/python/turicreate/meta/asttools/visitors/pysourcegen.py @@ -3,11 +3,11 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Created on Jul 15, 2011 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -24,19 +24,18 @@ class ASTFormatter(Formatter): - def format_field(self, value, format_spec): - if format_spec == 'node': + if format_spec == "node": gen = ExprSourceGen() gen.visit(value) return gen.dumps() - elif value == '': + elif value == "": return value else: return super(ASTFormatter, self).format_field(value, format_spec) def get_value(self, key, args, kwargs): - if key == '': + if key == "": return args[0] elif key in kwargs: return kwargs[key] @@ -48,24 +47,27 @@ def get_value(self, key, args, kwargs): raise Exception + def str_node(node): gen = ExprSourceGen() gen.visit(node) return gen.dumps() + def simple_string(value): def visitNode(self, node): self.print(value, **node.__dict__) + return visitNode + class ExprSourceGen(Visitor): def __init__(self): self.out = StringIO() self.formatter = ASTFormatter() - self.indent = ' ' + self.indent = " " self.level = 0 - @property def indenter(self): return Indenter(self) @@ -86,14 +88,16 @@ def dumps(self): def print(self, line, *args, **kwargs): line = self.formatter.format(line, *args, **kwargs) - level = kwargs.get('level') + level = kwargs.get("level") prx = self.indent * (level if level else self.level) - print(prx, line, sep='', end='', file=self.out) + print(prx, line, sep="", end="", file=self.out) - def print_lines(self, lines,): + def print_lines( + self, lines, + ): prx = self.indent * self.level for line in lines: - print(prx, line, sep='', file=self.out) + print(prx, line, sep="", file=self.out) def visitName(self, node): self.print(node.id) @@ -112,20 +116,20 @@ def visitarguments(self, node): default = defaults.pop(0) self.visit(arg) if default is not None: - self.print('={:node}', default) + self.print("={:node}", default) while args: arg = args.pop(0) default = defaults.pop(0) - self.print(', ') + self.print(", ") self.visit(arg) if default is not None: - self.print('={:node}', default) + self.print("={:node}", default) if node.vararg: - self.print('{0}*{1}', ', ' if i else '', node.vararg) + self.print("{0}*{1}", ", " if i else "", node.vararg) if node.kwarg: - self.print('{0}**{1}', ', ' if i else '', node.kwarg) + self.print("{0}**{1}", ", " if i else "", node.kwarg) @visitarguments.py3op def visitarguments(self, node): @@ -141,22 +145,22 @@ def visitarguments(self, node): default = defaults.pop(0) self.visit(arg) if default is not None: - self.print('={:node}', default) + self.print("={:node}", default) while args: arg = args.pop(0) default = defaults.pop(0) - self.print(', ') + self.print(", ") self.visit(arg) if default is not None: - self.print('={:node}', default) + self.print("={:node}", default) if node.vararg: - self.print('{0}*{1}', ', ' if i else '', node.vararg) + self.print("{0}*{1}", ", " if i else "", node.vararg) if node.varargannotation: - self.print(':{:node}', node.varargannotation) + self.print(":{:node}", node.varargannotation) elif node.kwonlyargs: - self.print('{0}*', ', ' if i else '') + self.print("{0}*", ", " if i else "") kwonlyargs = list(node.kwonlyargs) @@ -168,24 +172,29 @@ def visitarguments(self, node): while kwonlyargs: kw_arg = kwonlyargs.pop(0) kw_default = kw_defaults.pop(0) - self.print(', ') + self.print(", ") self.visit(kw_arg) if kw_default is not None: - self.print('={:node}', kw_default) + self.print("={:node}", kw_default) if node.kwarg: - self.print('{0}**{1}', ', ' if i else '', node.kwarg) + self.print("{0}**{1}", ", " if i else "", node.kwarg) if node.varargannotation: - self.print(':{:node}', node.kwargannotation) + self.print(":{:node}", node.kwargannotation) def visitNum(self, node): self.print(repr(node.n)) def visitBinOp(self, node): - self.print('({left:node} {op:node} {right:node})', left=node.left, op=node.op, right=node.right) + self.print( + "({left:node} {op:node} {right:node})", + left=node.left, + op=node.op, + right=node.right, + ) def visitAdd(self, node): - self.print('+') + self.print("+") def visitalias(self, node): if node.asname is None: @@ -195,7 +204,7 @@ def visitalias(self, node): def visitCall(self, node): - self.print('{func:node}(' , func=node.func) + self.print("{func:node}(", func=node.func) i = 0 print_comma = lambda i: self.print(", ") if i > 0 else None @@ -203,25 +212,25 @@ def visitCall(self, node): for arg in node.args: print_comma(i) - self.print('{:node}', arg) + self.print("{:node}", arg) i += 1 for kw in node.keywords: print_comma(i) - self.print('{:node}', kw) + self.print("{:node}", kw) i += 1 if node.starargs: print_comma(i) - self.print('*{:node}', node.starargs) + self.print("*{:node}", node.starargs) i += 1 if node.kwargs: print_comma(i) - self.print('**{:node}', node.kwargs) + self.print("**{:node}", node.kwargs) i += 1 - self.print(')') + self.print(")") def visitkeyword(self, node): self.print("{0}={1:node}", node.arg, node.value) @@ -230,184 +239,184 @@ def visitStr(self, node): self.print(repr(node.s)) def visitMod(self, node): - self.print('%') + self.print("%") - def visitTuple(self, node, brace='()'): + def visitTuple(self, node, brace="()"): self.print(brace[0]) print_comma = lambda i: self.print(", ") if i > 0 else None i = 0 with self.no_indent: - for elt in node.elts: + for elt in node.elts: print_comma(i) - self.print('{:node}', elt) + self.print("{:node}", elt) i += 1 if len(node.elts) == 1: - self.print(',') + self.print(",") self.print(brace[1]) def visitCompare(self, node): - self.print('({0:node} ', node.left) + self.print("({0:node} ", node.left) with self.no_indent: for (op, right) in zip(node.ops, node.comparators): - self.print('{0:node} {1:node}' , op, right) - self.print(')') + self.print("{0:node} {1:node}", op, right) + self.print(")") @py2op def visitRaise(self, node): - self.print('raise ') + self.print("raise ") with self.no_indent: if node.type: - self.print('{:node}' , node.type) + self.print("{:node}", node.type) if node.inst: - self.print(', {:node}' , node.inst) + self.print(", {:node}", node.inst) if node.tback: - self.print(', {:node}' , node.tback) + self.print(", {:node}", node.tback) @visitRaise.py3op def visitRaise(self, node): - self.print('raise ') + self.print("raise ") with self.no_indent: if node.exc: - self.print('{:node}' , node.exc) + self.print("{:node}", node.exc) if node.cause: - self.print(' from {:node}' , node.cause) + self.print(" from {:node}", node.cause) def visitAttribute(self, node): - self.print('{:node}.{attr}', node.value, attr=node.attr) + self.print("{:node}.{attr}", node.value, attr=node.attr) def visitDict(self, node): - self.print('{{') + self.print("{{") items = zip(node.keys, node.values) with self.no_indent: i = 0 - pc = lambda : self.print(", ") if i > 0 else None + pc = lambda: self.print(", ") if i > 0 else None for key, value in items: pc() - self.print('{0:node}:{1:node}', key, value) + self.print("{0:node}:{1:node}", key, value) i += 1 - self.print('}}') + self.print("}}") def visitSet(self, node): - self.print('{{') + self.print("{{") items = node.elts with self.no_indent: i = 0 - pc = lambda : self.print(", ") if i > 0 else None + pc = lambda: self.print(", ") if i > 0 else None for value in items: pc() - self.print('{0:node}', value) + self.print("{0:node}", value) i += 1 - self.print('}}') + self.print("}}") def visitList(self, node): - self.print('[') + self.print("[") with self.no_indent: i = 0 - pc = lambda : self.print(", ") if i > 0 else None + pc = lambda: self.print(", ") if i > 0 else None for item in node.elts: pc() - self.print('{:node}', item) + self.print("{:node}", item) i += 1 - self.print(']') + self.print("]") def visitSubscript(self, node): - self.print('{0:node}[{1:node}]', node.value, node.slice) + self.print("{0:node}[{1:node}]", node.value, node.slice) def visitIndex(self, node): if isinstance(node.value, _ast.Tuple): with self.no_indent: - self.visit(node.value, brace=['', '']) + self.visit(node.value, brace=["", ""]) else: - self.print('{:node}', node.value) + self.print("{:node}", node.value) def visitSlice(self, node): with self.no_indent: if node.lower is not None: - self.print('{:node}', node.lower) - self.print(':') + self.print("{:node}", node.lower) + self.print(":") if node.upper is not None: - self.print('{:node}', node.upper) + self.print("{:node}", node.upper) if node.step is not None: - self.print(':') - self.print('{:node}', node.step) + self.print(":") + self.print("{:node}", node.step) def visitExtSlice(self, node): dims = list(node.dims) with self.no_indent: dim = dims.pop(0) - self.print('{0:node}', dim) + self.print("{0:node}", dim) while dims: dim = dims.pop(0) - self.print(', {0:node}', dim) + self.print(", {0:node}", dim) def visitUnaryOp(self, node): - self.print('({0:node}{1:node})', node.op, node.operand) + self.print("({0:node}{1:node})", node.op, node.operand) def visitAssert(self, node): - self.print('assert {0:node}', node.test) + self.print("assert {0:node}", node.test) if node.msg: with self.no_indent: - self.print(', {0:node}', node.msg) + self.print(", {0:node}", node.msg) - visitUSub = simple_string('-') - visitUAdd = simple_string('+') - visitNot = simple_string('not ') - visitInvert = simple_string('~') + visitUSub = simple_string("-") + visitUAdd = simple_string("+") + visitNot = simple_string("not ") + visitInvert = simple_string("~") - visitAnd = simple_string('and') - visitOr = simple_string('or') + visitAnd = simple_string("and") + visitOr = simple_string("or") - visitSub = simple_string('-') - visitFloorDiv = simple_string('//') - visitDiv = simple_string('/') - visitMod = simple_string('%') - visitMult = simple_string('*') - visitPow = simple_string('**') + visitSub = simple_string("-") + visitFloorDiv = simple_string("//") + visitDiv = simple_string("/") + visitMod = simple_string("%") + visitMult = simple_string("*") + visitPow = simple_string("**") - visitEq = simple_string('==') - visitNotEq = simple_string('!=') + visitEq = simple_string("==") + visitNotEq = simple_string("!=") - visitLt = simple_string('<') - visitGt = simple_string('>') + visitLt = simple_string("<") + visitGt = simple_string(">") - visitLtE = simple_string('<=') - visitGtE = simple_string('>=') + visitLtE = simple_string("<=") + visitGtE = simple_string(">=") - visitLShift = simple_string('<<') - visitRShift = simple_string('>>') + visitLShift = simple_string("<<") + visitRShift = simple_string(">>") - visitIn = simple_string('in') - visitNotIn = simple_string('not in') + visitIn = simple_string("in") + visitNotIn = simple_string("not in") - visitIs = simple_string('is') - visitIsNot = simple_string('is not') + visitIs = simple_string("is") + visitIsNot = simple_string("is not") - visitBitAnd = simple_string('&') - visitBitOr = simple_string('|') - visitBitXor = simple_string('^') + visitBitAnd = simple_string("&") + visitBitOr = simple_string("|") + visitBitXor = simple_string("^") - visitEllipsis = simple_string('...') + visitEllipsis = simple_string("...") - visitYield = simple_string('yield {value:node}') + visitYield = simple_string("yield {value:node}") def visitBoolOp(self, node): @@ -415,56 +424,52 @@ def visitBoolOp(self, node): values = list(node.values) left = values.pop(0) - self.print('({:node} ', left) + self.print("({:node} ", left) while values: left = values.pop(0) - self.print('{0:node} {1:node})', node.op, left) + self.print("{0:node} {1:node})", node.op, left) def visitIfExp(self, node): - self.print('{body:node} if {test:node} else {orelse:node}', **node.__dict__) + self.print("{body:node} if {test:node} else {orelse:node}", **node.__dict__) def visitLambda(self, node): - self.print('lambda {0:node}: {1:node}', node.args, node.body) + self.print("lambda {0:node}: {1:node}", node.args, node.body) def visitListComp(self, node): - self.print('[{0:node}', node.elt) - + self.print("[{0:node}", node.elt) generators = list(node.generators) with self.no_indent: while generators: generator = generators.pop(0) - self.print('{0:node}', generator) + self.print("{0:node}", generator) - self.print(']') + self.print("]") def visitSetComp(self, node): - self.print('{{{0:node}', node.elt) - + self.print("{{{0:node}", node.elt) generators = list(node.generators) with self.no_indent: while generators: generator = generators.pop(0) - self.print('{0:node}', generator) + self.print("{0:node}", generator) - self.print('}}') + self.print("}}") def visitDictComp(self, node): - self.print('{{{0:node}:{1:node}', node.key, node.value) - + self.print("{{{0:node}:{1:node}", node.key, node.value) generators = list(node.generators) with self.no_indent: while generators: generator = generators.pop(0) - self.print('{0:node}', generator) - - self.print('}}') + self.print("{0:node}", generator) + self.print("}}") def visitcomprehension(self, node): - self.print(' for {0:node} in {1:node}', node.target, node.iter) + self.print(" for {0:node} in {1:node}", node.target, node.iter) ifs = list(node.ifs) while ifs: @@ -477,13 +482,15 @@ def visitarg(self, node): if node.annotation: with self.no_indent: - self.print(':{0:node}', node.annotation) + self.print(":{0:node}", node.annotation) + def visit_expr(node): gen = ExprSourceGen() gen.visit(node) return gen.dumps() + class NoIndent(object): def __init__(self, gen): self.gen = gen @@ -495,20 +502,21 @@ def __enter__(self): def __exit__(self, *args): self.gen.level = self.level + class Indenter(object): def __init__(self, gen): self.gen = gen def __enter__(self): - self.gen.print('\n', level=0) + self.gen.print("\n", level=0) self.gen.level += 1 def __exit__(self, *args): self.gen.level -= 1 -class SourceGen(ExprSourceGen): - def __init__(self, header=''): +class SourceGen(ExprSourceGen): + def __init__(self, header=""): super(SourceGen, self).__init__() print(header, file=self.out) @@ -519,33 +527,33 @@ def visitModule(self, node): if isinstance(children[0].value, _ast.Str): doc = children.pop(0).value self.print("'''") - self.print_lines(doc.s.split('\n')) - self.print_lines(["'''", '\n', '\n']) + self.print_lines(doc.s.split("\n")) + self.print_lines(["'''", "\n", "\n"]) for node in children: self.visit(node) def visitFor(self, node): - self.print('for {0:node} in {1:node}:', node.target, node.iter) + self.print("for {0:node} in {1:node}:", node.target, node.iter) with self.indenter: for stmnt in node.body: self.visit(stmnt) if node.orelse: - self.print('else:') + self.print("else:") with self.indenter: for stmnt in node.orelse: self.visit(stmnt) @py2op def visitFunctionDef(self, node): - #fields = ('name', 'args', 'body', 'decorator_list') + # fields = ('name', 'args', 'body', 'decorator_list') for decorator in node.decorator_list: - self.print('@{decorator:node}\n', decorator=decorator) + self.print("@{decorator:node}\n", decorator=decorator) args = visit_expr(node.args) - self.print('def {name}({args}):' , name=node.name, args=args) + self.print("def {name}({args}):", name=node.name, args=args) with self.indenter: for child in node.body: @@ -556,16 +564,16 @@ def visitFunctionDef(self, node): def visitFunctionDef(self, node): for decorator in node.decorator_list: - self.print('@{decorator:node}\n', decorator=decorator) + self.print("@{decorator:node}\n", decorator=decorator) args = visit_expr(node.args) - self.print('def {name}({args})' , name=node.name, args=args) + self.print("def {name}({args})", name=node.name, args=args) with self.no_indent: if node.returns: - self.print(' -> {:node}:', node.returns) + self.print(" -> {:node}:", node.returns) else: - self.print(':', node.returns) + self.print(":", node.returns) with self.indenter: for child in node.body: @@ -575,31 +583,37 @@ def visitFunctionDef(self, node): def visitAssign(self, node): targets = [visit_expr(target) for target in node.targets] - self.print('{targets} = {value:node}\n', targets=' = '.join(targets), value=node.value) + self.print( + "{targets} = {value:node}\n", targets=" = ".join(targets), value=node.value + ) def visitAugAssign(self, node): - self.print('{target:node} {op:node}= {value:node}\n', **node.__dict__) + self.print("{target:node} {op:node}= {value:node}\n", **node.__dict__) def visitIf(self, node, indent_first=True): - self.print('if {:node}:', node.test, level=self.level if indent_first else 0) + self.print("if {:node}:", node.test, level=self.level if indent_first else 0) with self.indenter: if node.body: for expr in node.body: self.visit(expr) else: - self.print('pass') - + self.print("pass") - if node.orelse and len(node.orelse) == 1 and isinstance(node.orelse[0], _ast.If): - self.print('el'); self.visit(node.orelse[0], indent_first=False) + if ( + node.orelse + and len(node.orelse) == 1 + and isinstance(node.orelse[0], _ast.If) + ): + self.print("el") + self.visit(node.orelse[0], indent_first=False) elif node.orelse: - self.print('else:') + self.print("else:") with self.indenter: for expr in node.orelse: self.visit(expr) - self.print('\n') + self.print("\n") def visitImportFrom(self, node): for name in node.names: @@ -614,7 +628,7 @@ def visitPrint(self, node): with self.no_indent: if node.dest: - self.print(">> {:node}" , node.dest) + self.print(">> {:node}", node.dest) if not node.values and node.nl: self.print("\n") return @@ -622,10 +636,10 @@ def visitPrint(self, node): self.print(", ") i = 0 - pc = lambda : self.print(", ") if i > 0 else None - for value in node.values: + pc = lambda: self.print(", ") if i > 0 else None + for value in node.values: pc() - self.print("{:node}" , value) + self.print("{:node}", value) if not node.nl: self.print(",") @@ -633,25 +647,28 @@ def visitPrint(self, node): self.print("\n") def visitExec(self, node): - self.print('exec {0:node} in {1}, {2}\n', node.body, - 'None' if node.globals is None else str_node(node.globals), - 'None' if node.locals is None else str_node(node.locals)) + self.print( + "exec {0:node} in {1}, {2}\n", + node.body, + "None" if node.globals is None else str_node(node.globals), + "None" if node.locals is None else str_node(node.locals), + ) def visitWith(self, node): - self.print('with {0:node}', node.context_expr) + self.print("with {0:node}", node.context_expr) if node.optional_vars is not None: - self.print(' as {0:node}', node.optional_vars, level=0) - self.print(':', level=0) + self.print(" as {0:node}", node.optional_vars, level=0) + self.print(":", level=0) with self.indenter: if node.body: for expr in node.body: self.visit(expr) else: - self.print('pass\n') + self.print("pass\n") def visitGlobal(self, node): - self.print('global ') + self.print("global ") with self.no_indent: names = list(node.names) if names: @@ -659,25 +676,24 @@ def visitGlobal(self, node): self.print(name) while names: name = names.pop(0) - self.print(', {0}', name) - self.print('\n') - + self.print(", {0}", name) + self.print("\n") def visitDelete(self, node): - self.print('del ') + self.print("del ") targets = list(node.targets) with self.no_indent: target = targets.pop(0) - self.print('{0:node}', target) + self.print("{0:node}", target) while targets: target = targets.pop(0) - self.print(', {0:node}', target) - self.print('\n') + self.print(", {0:node}", target) + self.print("\n") def visitWhile(self, node): - self.print('while {0:node}:', node.test) + self.print("while {0:node}:", node.test) with self.indenter: if node.body: @@ -686,49 +702,48 @@ def visitWhile(self, node): else: self.print("pass") - if node.orelse: - self.print('else:') + self.print("else:") with self.indenter: for expr in node.orelse: self.visit(expr) - self.print('\n') - self.print('\n') - + self.print("\n") + self.print("\n") def visitExpr(self, node): - self.print('{:node}\n', node.value) + self.print("{:node}\n", node.value) - visitBreak = simple_string('break\n') - visitPass = simple_string('pass\n') - visitContinue = simple_string('continue\n') + visitBreak = simple_string("break\n") + visitPass = simple_string("pass\n") + visitContinue = simple_string("continue\n") def visitReturn(self, node): if node.value is not None: - self.print('return {:node}\n', node.value) + self.print("return {:node}\n", node.value) def visitTryExcept(self, node): - self.print('try:') + self.print("try:") with self.indenter: if node.body: for stmnt in node.body: self.visit(stmnt) else: - self.print('pass') + self.print("pass") for hndlr in node.handlers: self.visit(hndlr) if node.orelse: - self.print('else:') + self.print("else:") with self.indenter: for stmnt in node.orelse: self.visit(stmnt) + @py2op def visitExceptHandler(self, node): - self.print('except') + self.print("except") with self.no_indent: if node.type: @@ -743,11 +758,11 @@ def visitExceptHandler(self, node): for stmnt in node.body: self.visit(stmnt) else: - self.print('pass') + self.print("pass") @visitExceptHandler.py3op def visitExceptHandler(self, node): - self.print('except') + self.print("except") with self.no_indent: if node.type: @@ -761,12 +776,11 @@ def visitExceptHandler(self, node): for stmnt in node.body: self.visit(stmnt) - def visitTryFinally(self, node): for item in node.body: self.visit(item) - self.print('finally:') + self.print("finally:") with self.indenter: for item in node.finalbody: @@ -776,12 +790,12 @@ def visitTryFinally(self, node): def visitClassDef(self, node): for decorator in node.decorator_list: - self.print('@{0:node}\n', decorator) + self.print("@{0:node}\n", decorator) - self.print('class {0}', node.name) + self.print("class {0}", node.name) with self.no_indent: - self.print('(') + self.print("(") bases = list(node.bases) if bases: base = bases.pop(0) @@ -789,7 +803,7 @@ def visitClassDef(self, node): while bases: base = bases.pop(0) self.print(", {0:node}", base) - self.print(')') + self.print(")") self.print(":") @@ -804,12 +818,12 @@ def visitClassDef(self, node): def visitClassDef(self, node): for decorator in node.decorator_list: - self.print('@{0:node}\n', decorator) + self.print("@{0:node}\n", decorator) - self.print('class {0}', node.name) + self.print("class {0}", node.name) with self.no_indent: - self.print('(') + self.print("(") bases = list(node.bases) i = 0 if bases: @@ -822,7 +836,8 @@ def visitClassDef(self, node): keywords = list(node.keywords) if keywords: - if i: self.print(', ') + if i: + self.print(", ") i += 1 keyword = keywords.pop(0) self.print("{0:node}", keyword) @@ -831,16 +846,18 @@ def visitClassDef(self, node): self.print(", {0:node}", keyword) if node.starargs: - if i: self.print(', ') + if i: + self.print(", ") i += 1 self.print("*{0:node}", node.starargs) if node.kwargs: - if i: self.print(', ') + if i: + self.print(", ") i += 1 self.print("*{0:node}", node.kwargs) - self.print(')') + self.print(")") self.print(":") @@ -851,24 +868,26 @@ def visitClassDef(self, node): else: self.print("pass\n\n") + def python_source(ast, file=sys.stdout): - ''' + """ Generate executable python source code from an ast node. :param ast: ast node :param file: file to write output to. - ''' + """ gen = SourceGen() gen.visit(ast) gen.dump(file) + def dump_python_source(ast): - ''' + """ :return: a string containing executable python source code from an ast node. :param ast: ast node :param file: file to write output to. - ''' + """ gen = SourceGen() gen.visit(ast) return gen.dumps() diff --git a/src/python/turicreate/meta/asttools/visitors/symbol_visitor.py b/src/python/turicreate/meta/asttools/visitors/symbol_visitor.py index 7c773f63cf..817a49a5d5 100644 --- a/src/python/turicreate/meta/asttools/visitors/symbol_visitor.py +++ b/src/python/turicreate/meta/asttools/visitors/symbol_visitor.py @@ -3,17 +3,18 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Created on Aug 3, 2011 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ from ...asttools.visitors import Visitor import ast + class SymbolVisitor(Visitor): def __init__(self, ctx_types=(ast.Load, ast.Store)): @@ -45,8 +46,8 @@ def visitalias(self, node): name = node.asname if node.asname else node.name - if '.' in name: - name = name.split('.', 1)[0] + if "." in name: + name = name.split(".", 1)[0] if ast.Store in self.ctx_types: return {name} @@ -54,9 +55,8 @@ def visitalias(self, node): return set() - def get_symbols(node, ctx_types=(ast.Load, ast.Store)): - ''' + """ Returns all symbols defined in an ast node. if ctx_types is given, then restrict the symbols to ones with that context. @@ -65,6 +65,6 @@ def get_symbols(node, ctx_types=(ast.Load, ast.Store)): :param ctx_types: type or tuple of types that may be found assigned to the `ctx` attribute of an ast Name node. - ''' + """ gen = SymbolVisitor(ctx_types) return gen.visit(node) diff --git a/src/python/turicreate/meta/bytecodetools/__init__.py b/src/python/turicreate/meta/bytecodetools/__init__.py index b715ffa6a4..835a5af847 100644 --- a/src/python/turicreate/meta/bytecodetools/__init__.py +++ b/src/python/turicreate/meta/bytecodetools/__init__.py @@ -3,10 +3,10 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Python byte-code tools expands on the Python dis module. -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ diff --git a/src/python/turicreate/meta/bytecodetools/bytecode_consumer.py b/src/python/turicreate/meta/bytecodetools/bytecode_consumer.py index dc16157e66..2655c5b609 100644 --- a/src/python/turicreate/meta/bytecodetools/bytecode_consumer.py +++ b/src/python/turicreate/meta/bytecodetools/bytecode_consumer.py @@ -3,11 +3,11 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Created on Apr 28, 2012 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -15,44 +15,47 @@ class ByteCodeConsumer(object): - ''' + """ ByteCodeVisitor - ''' + """ + def __init__(self, code): self.code = code self.byte_code = code.co_code def consume(self): - ''' + """ Consume byte-code - ''' - generic_consume = getattr(self, 'generic_consume', None) + """ + generic_consume = getattr(self, "generic_consume", None) for instr in disassembler(self.code): - method_name = 'consume_%s' % (instr.opname) + method_name = "consume_%s" % (instr.opname) method = getattr(self, method_name, generic_consume) if not method: - raise AttributeError("class %r has no method %r" % (type(self).__name__, method_name)) + raise AttributeError( + "class %r has no method %r" % (type(self).__name__, method_name) + ) self.instruction_pre(instr) method(instr) self.instruction_post(instr) def instruction_pre(self, instr): - ''' + """ consumer calls this instruction before every instruction. - ''' + """ def instruction_post(self, instr): - ''' + """ consumer calls this instruction after every instruction. - ''' + """ class StackedByteCodeConsumer(ByteCodeConsumer): - ''' + """ A consumer with the concept of a stack. - ''' + """ def __init__(self, code): ByteCodeConsumer.__init__(self, code) diff --git a/src/python/turicreate/meta/bytecodetools/disassembler_.py b/src/python/turicreate/meta/bytecodetools/disassembler_.py index f49d7b2b88..309c64814c 100644 --- a/src/python/turicreate/meta/bytecodetools/disassembler_.py +++ b/src/python/turicreate/meta/bytecodetools/disassembler_.py @@ -3,11 +3,11 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Created on May 10, 2012 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ @@ -21,13 +21,14 @@ if sys.version_info < (3, 4): - co_ord = (lambda c:c) if py3 else ord + co_ord = (lambda c: c) if py3 else ord + def _walk_ops(code): """ Yield (offset, opcode, argument number) tuples for all instructions in *code*. """ - code = getattr(code, 'co_code', b'') + code = getattr(code, "co_code", b"") code = [co_ord(instr) for instr in code] n = len(code) @@ -44,7 +45,9 @@ def _walk_ops(code): extended_arg = oparg * 65536 yield i, op, oparg + else: + def _walk_ops(code): """ Yield (offset, opcode, argument number) tuples for all @@ -55,7 +58,7 @@ def _walk_ops(code): yield instr.offset, op, instr.arg -def disassembler(co, lasti= -1): +def disassembler(co, lasti=-1): """Disassemble a code object. :param co: code object diff --git a/src/python/turicreate/meta/bytecodetools/instruction.py b/src/python/turicreate/meta/bytecodetools/instruction.py index 3c586deb23..776a3e0378 100644 --- a/src/python/turicreate/meta/bytecodetools/instruction.py +++ b/src/python/turicreate/meta/bytecodetools/instruction.py @@ -3,11 +3,11 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Created on May 10, 2012 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -16,14 +16,15 @@ import sys py3 = sys.version_info.major >= 3 -co_ord = (lambda c:c) if py3 else ord +co_ord = (lambda c: c) if py3 else ord class Instruction(object): - ''' + """ A Python byte-code instruction. - ''' - def __init__(self, i= -1, op=None, lineno=None): + """ + + def __init__(self, i=-1, op=None, lineno=None): self.i = i self.op = op self.lineno = lineno @@ -50,13 +51,13 @@ def to(self): raise Exception("this is not a jump op (%s)" % (self.opname,)) def __repr__(self): - res = '<%s(%i)' % (opcode.opname[self.op], self.i,) + res = "<%s(%i)" % (opcode.opname[self.op], self.i,) if self.arg is not None: - res += ' arg=%r' % (self.arg,) + res += " arg=%r" % (self.arg,) elif self.oparg is not None: - res += ' oparg=%r' % (self.oparg,) - return res + '>' + res += " oparg=%r" % (self.oparg,) + return res + ">" def __str__(self): result = [] @@ -67,14 +68,14 @@ def __str__(self): result.append(" ") if self.lasti: - result.append('-->') + result.append("-->") else: - result.append(' ') + result.append(" ") if self.label: - result.append('>>') + result.append(">>") else: - result.append(' ') + result.append(" ") result.append(repr(self.i).rjust(4)) @@ -84,15 +85,15 @@ def __str__(self): result.append(repr(self.oparg).rjust(5)) if self.op in opcode.hasconst: - result.append('(' + repr(self.arg) + ')') + result.append("(" + repr(self.arg) + ")") elif self.op in opcode.hasname: - result.append('(' + repr(self.arg) + ')') + result.append("(" + repr(self.arg) + ")") elif self.op in opcode.hasjrel: - result.append('(to ' + repr(self.arg) + ')') + result.append("(to " + repr(self.arg) + ")") elif self.op in opcode.haslocal: - result.append('(' + repr(self.arg) + ')') + result.append("(" + repr(self.arg) + ")") elif self.op in opcode.hascompare: - result.append('(' + repr(self.arg) + ')') + result.append("(" + repr(self.arg) + ")") elif self.op in opcode.hasfree: - result.append('(' + repr(self.arg) + ')') - return ' '.join(result) + result.append("(" + repr(self.arg) + ")") + return " ".join(result) diff --git a/src/python/turicreate/meta/bytecodetools/print_code.py b/src/python/turicreate/meta/bytecodetools/print_code.py index 044cb5c377..7126619cc6 100644 --- a/src/python/turicreate/meta/bytecodetools/print_code.py +++ b/src/python/turicreate/meta/bytecodetools/print_code.py @@ -3,25 +3,27 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Created on May 10, 2012 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ from .bytecode_consumer import ByteCodeConsumer from argparse import ArgumentParser -class ByteCodePrinter(ByteCodeConsumer): +class ByteCodePrinter(ByteCodeConsumer): def generic_consume(self, instr): print(instr) + def main(): parser = ArgumentParser() parser.add_argument() -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/src/python/turicreate/meta/bytecodetools/pyc_file.py b/src/python/turicreate/meta/bytecodetools/pyc_file.py index 71ab706986..179ea50072 100644 --- a/src/python/turicreate/meta/bytecodetools/pyc_file.py +++ b/src/python/turicreate/meta/bytecodetools/pyc_file.py @@ -3,11 +3,11 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Created on May 10, 2012 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -17,22 +17,27 @@ import struct import marshal + def extract(binary): - ''' + """ Extract a code object from a binary pyc file. :param binary: a sequence of bytes from a pyc file. - ''' + """ if len(binary) <= 8: - raise Exception("Binary pyc must be greater than 8 bytes (got %i)" % len(binary)) + raise Exception( + "Binary pyc must be greater than 8 bytes (got %i)" % len(binary) + ) magic = binary[:4] MAGIC = get_magic() if magic != MAGIC: - raise Exception("Python version mismatch (%r != %r) Is this a pyc file?" % (magic, MAGIC)) + raise Exception( + "Python version mismatch (%r != %r) Is this a pyc file?" % (magic, MAGIC) + ) - modtime = time.asctime(time.localtime(struct.unpack('i', binary[4:8])[0])) + modtime = time.asctime(time.localtime(struct.unpack("i", binary[4:8])[0])) code = marshal.loads(binary[8:]) diff --git a/src/python/turicreate/meta/decompiler/__init__.py b/src/python/turicreate/meta/decompiler/__init__.py index 284709f96a..a51a478b99 100644 --- a/src/python/turicreate/meta/decompiler/__init__.py +++ b/src/python/turicreate/meta/decompiler/__init__.py @@ -3,11 +3,11 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Decompiler module. This module can decompile arbitrary code objects into a python ast. -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -21,30 +21,30 @@ import marshal - def decompile_func(func): - ''' + """ Decompile a function into ast.FunctionDef node. :param func: python function (can not be a built-in) :return: ast.FunctionDef instance. - ''' + """ code = func.__code__ # For python 3 -# defaults = func.func_defaults if sys.version_info.major < 3 else func.__defaults__ -# if defaults: -# default_names = code.co_varnames[:code.co_argcount][-len(defaults):] -# else: -# default_names = [] -# defaults = [_ast.Name(id='%s_default' % name, ctx=_ast.Load() , lineno=0, col_offset=0) for name in default_names] + # defaults = func.func_defaults if sys.version_info.major < 3 else func.__defaults__ + # if defaults: + # default_names = code.co_varnames[:code.co_argcount][-len(defaults):] + # else: + # default_names = [] + # defaults = [_ast.Name(id='%s_default' % name, ctx=_ast.Load() , lineno=0, col_offset=0) for name in default_names] ast_node = make_function(code, defaults=[], lineno=code.co_firstlineno) return ast_node + def compile_func(ast_node, filename, globals, **defaults): - ''' + """ Compile a function from an ast.FunctionDef instance. :param ast_node: ast.FunctionDef instance @@ -52,14 +52,14 @@ def compile_func(ast_node, filename, globals, **defaults): :param globals: will be used as func_globals :return: A python function object - ''' + """ function_name = ast_node.name module = _ast.Module(body=[ast_node]) - ctx = {'%s_default' % key : arg for key, arg in defaults.items()} + ctx = {"%s_default" % key: arg for key, arg in defaults.items()} - code = compile(module, filename, 'exec') + code = compile(module, filename, "exec") eval(code, globals, ctx) @@ -67,9 +67,10 @@ def compile_func(ast_node, filename, globals, **defaults): return function -#from imp import get_magic + +# from imp import get_magic # -#def extract(binary): +# def extract(binary): # # if len(binary) <= 8: # raise Exception("Binary pyc must be greater than 8 bytes (got %i)" % len(binary)) @@ -86,13 +87,14 @@ def compile_func(ast_node, filename, globals, **defaults): # # return modtime, code + def decompile_pyc(bin_pyc, output=sys.stdout): - ''' + """ decompile apython pyc or pyo binary file. :param bin_pyc: input file objects :param output: output file objects - ''' + """ from turicreate.meta.asttools import python_source diff --git a/src/python/turicreate/meta/decompiler/control_flow_instructions.py b/src/python/turicreate/meta/decompiler/control_flow_instructions.py index c90575715e..bea83e376a 100644 --- a/src/python/turicreate/meta/decompiler/control_flow_instructions.py +++ b/src/python/turicreate/meta/decompiler/control_flow_instructions.py @@ -3,11 +3,11 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Created on Jul 14, 2011 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -16,8 +16,9 @@ from ..bytecodetools.instruction import Instruction from ..asttools.visitors.print_visitor import print_ast from ..utils import py3op, py2op, py3 -AND_JUMPS = ['JUMP_IF_FALSE_OR_POP', 'POP_JUMP_IF_FALSE'] -OR_JUMPS = ['JUMP_IF_TRUE_OR_POP', 'POP_JUMP_IF_TRUE'] + +AND_JUMPS = ["JUMP_IF_FALSE_OR_POP", "POP_JUMP_IF_FALSE"] +OR_JUMPS = ["JUMP_IF_TRUE_OR_POP", "POP_JUMP_IF_TRUE"] JUMPS = AND_JUMPS + OR_JUMPS JUMP_OPS = [opcode.opmap[name] for name in JUMPS] @@ -26,6 +27,7 @@ def split(block, name): func = lambda instr: instr.opname == name return split_cond(block, func) + def split_cond(block, func, raise_=True): block = block[:] @@ -41,6 +43,7 @@ def split_cond(block, func, raise_=True): return new_block, None, block + def find_index(lst, func, default=None): for i, item in enumerate(lst): if func(item): @@ -48,6 +51,7 @@ def find_index(lst, func, default=None): else: return default + def rfind_index(lst, func, default=None): for i, item in reversed(list(enumerate(lst))): if func(item): @@ -55,10 +59,11 @@ def rfind_index(lst, func, default=None): else: return default + def refactor_ifs(stmnt, ifs): - ''' + """ for if statements in list comprehension - ''' + """ if isinstance(stmnt, _ast.BoolOp): test, right = stmnt.values if isinstance(stmnt.op, _ast.Or): @@ -69,6 +74,7 @@ def refactor_ifs(stmnt, ifs): return refactor_ifs(right, ifs) return stmnt + def parse_logic(struct): lineno = struct.lineno @@ -82,7 +88,7 @@ def parse_logic(struct): parent = struct.parent - Logic = _ast.Or if struct.flag == 'OR' else _ast.And + Logic = _ast.Or if struct.flag == "OR" else _ast.And if isinstance(parent, LogicalOp): ast_parent, insert_into = parse_logic(struct.parent) @@ -107,6 +113,7 @@ def __init__(self, assign, list, ifs, lineno): self.ifs = ifs self.lineno = lineno + class LogicalOp(object): def __init__(self, flag, right, parent, lineno): self.flag = flag @@ -115,11 +122,10 @@ def __init__(self, flag, right, parent, lineno): self.lineno = lineno def __repr__(self): - return '%s(%r, parent=%r)' % (self.flag, self.right, self.parent) + return "%s(%r, parent=%r)" % (self.flag, self.right, self.parent) class CtrlFlowInstructions(object): - def split_handlers(self, handlers_blocks): handlers = [] except_instrs = [] @@ -130,10 +136,10 @@ def split_handlers(self, handlers_blocks): instr = handlers_blocks.pop(0) except_instrs.append(instr) - if (instr.opname == 'COMPARE_OP') and (instr.arg == 'exception match'): + if (instr.opname == "COMPARE_OP") and (instr.arg == "exception match"): jump = handlers_blocks.pop(0) - assert jump.opname == 'POP_JUMP_IF_FALSE' + assert jump.opname == "POP_JUMP_IF_FALSE" next_handler = jump.oparg @@ -144,9 +150,9 @@ def split_handlers(self, handlers_blocks): instr = handlers_blocks.pop(0) except_instrs.append(instr) - assert except_instrs[0].opname == 'DUP_TOP' - assert except_instrs[-3].opname == 'POP_TOP' - assert except_instrs[-1].opname == 'POP_TOP' + assert except_instrs[0].opname == "DUP_TOP" + assert except_instrs[-3].opname == "POP_TOP" + assert except_instrs[-1].opname == "POP_TOP" exec_stmnt = self.decompile_block(except_instrs[1:-4]).stmnt() @@ -154,11 +160,15 @@ def split_handlers(self, handlers_blocks): exc_type = exec_stmnt[0] - - if except_instrs[-2].opname == 'STORE_NAME': - exc_name = _ast.Name(id=except_instrs[-2].arg, ctx=_ast.Store(), lineno=except_instrs[-2].lineno, col_offset=0) + if except_instrs[-2].opname == "STORE_NAME": + exc_name = _ast.Name( + id=except_instrs[-2].arg, + ctx=_ast.Store(), + lineno=except_instrs[-2].lineno, + col_offset=0, + ) else: - assert except_instrs[-2].opname == 'POP_TOP' + assert except_instrs[-2].opname == "POP_TOP" exc_name = None handler_body = [] @@ -170,36 +180,59 @@ def split_handlers(self, handlers_blocks): handler_body.append(instr) - assert handler_body[-1].opname == 'JUMP_FORWARD' + assert handler_body[-1].opname == "JUMP_FORWARD" ends.append(handler_body[-1].arg) exc_body = self.decompile_block(handler_body[:-1]).stmnt() if not exc_body: - exc_body.append(_ast.Pass(lineno=except_instrs[-2].lineno, col_offset=0)) - #is this for python 3? + exc_body.append( + _ast.Pass(lineno=except_instrs[-2].lineno, col_offset=0) + ) + # is this for python 3? if py3 and exc_name is not None: exc_name = exc_name.id - handlers.append(_ast.ExceptHandler(type=exc_type, name=exc_name, body=exc_body, lineno=instr.lineno, col_offset=0)) + handlers.append( + _ast.ExceptHandler( + type=exc_type, + name=exc_name, + body=exc_body, + lineno=instr.lineno, + col_offset=0, + ) + ) except_instrs = [] - assert except_instrs[-1].opname == 'END_FINALLY' + assert except_instrs[-1].opname == "END_FINALLY" if len(except_instrs) == 1: pass else: - assert except_instrs[0].opname == 'POP_TOP' - assert except_instrs[1].opname == 'POP_TOP' - assert except_instrs[2].opname == 'POP_TOP' - assert except_instrs[-2].opname in ['JUMP_FORWARD', 'JUMP_ABSOLUTE'], except_instrs[-2] + assert except_instrs[0].opname == "POP_TOP" + assert except_instrs[1].opname == "POP_TOP" + assert except_instrs[2].opname == "POP_TOP" + assert except_instrs[-2].opname in [ + "JUMP_FORWARD", + "JUMP_ABSOLUTE", + ], except_instrs[-2] ends.append(except_instrs[-2].arg) exc_body = self.decompile_block(except_instrs[3:-2]).stmnt() if not exc_body: - exc_body.append(_ast.Pass(lineno=except_instrs[-2].lineno, col_offset=0)) - - handlers.append(_ast.ExceptHandler(type=None, name=None, body=exc_body, lineno=except_instrs[0].lineno, col_offset=0)) + exc_body.append( + _ast.Pass(lineno=except_instrs[-2].lineno, col_offset=0) + ) + + handlers.append( + _ast.ExceptHandler( + type=None, + name=None, + body=exc_body, + lineno=except_instrs[0].lineno, + col_offset=0, + ) + ) assert all(e == ends[0] for e in ends) @@ -207,13 +240,12 @@ def split_handlers(self, handlers_blocks): return end, handlers - -# @py3op + # @py3op def do_try_except_block(self, block): while 1: instr = block.pop(-1) - if instr.opname == 'POP_BLOCK': + if instr.opname == "POP_BLOCK": break try_except = self.decompile_block(block).stmnt() @@ -221,7 +253,7 @@ def do_try_except_block(self, block): finally_block = [] while 1: next_instr = self.ilst.pop(0) - if next_instr.opname == 'END_FINALLY': + if next_instr.opname == "END_FINALLY": break finally_block.append(next_instr) @@ -231,59 +263,59 @@ def do_try_except_block(self, block): self.ast_stack.append(try_finally) -# @py3op + # @py3op def do_except_block(self, block): handler_block = [] for instr in block: - if instr.opname == 'POP_BLOCK': + if instr.opname == "POP_BLOCK": break handler_block.append(instr) while 1: instr = self.ilst.pop(0) - if instr.opname == 'END_FINALLY': + if instr.opname == "END_FINALLY": break body = self.decompile_block(handler_block).stmnt() self.ast_stack.extend(body) -# @py2op -# def SETUP_FINALLY(self, instr): -# pass -# -# @SETUP_FINALLY.py3op + # @py2op + # def SETUP_FINALLY(self, instr): + # pass + # + # @SETUP_FINALLY.py3op def SETUP_FINALLY(self, instr): to = instr.arg try_except_block = self.make_block(to, inclusive=False) - if try_except_block[0].opname == 'SETUP_EXCEPT': + if try_except_block[0].opname == "SETUP_EXCEPT": self.do_try_except_block(try_except_block) else: self.do_except_block(try_except_block) -# raise Exception() -# print("try_except_block", try_except_block) -# -# finally_block = [] -# while 1: -# next_instr = self.ilst.pop(0) -# if next_instr.opname == 'END_FINALLY': -# break -# finally_block.append(next_instr) -# -# print("finally_block", finally_block) -# -# finally_ = self.decompile_block(finally_block).stmnt() -# -# print("finally_", finally_) -# print_ast(finally_[0]) -# print() -## print_ast(finally_[1]) -## print("\n\n") -# -# try_except = self.decompile_block(try_except_block).stmnt() + # raise Exception() + # print("try_except_block", try_except_block) + # + # finally_block = [] + # while 1: + # next_instr = self.ilst.pop(0) + # if next_instr.opname == 'END_FINALLY': + # break + # finally_block.append(next_instr) + # + # print("finally_block", finally_block) + # + # finally_ = self.decompile_block(finally_block).stmnt() + # + # print("finally_", finally_) + # print_ast(finally_[0]) + # print() + ## print_ast(finally_[1]) + ## print("\n\n") + # + # try_except = self.decompile_block(try_except_block).stmnt() @py2op def SETUP_EXCEPT(self, instr): @@ -292,17 +324,19 @@ def SETUP_EXCEPT(self, instr): try_block = self.make_block(to, inclusive=False) - assert try_block[-1].opname in ['JUMP_FORWARD', 'JUMP_ABSOLUTE'], try_block[-1] - assert try_block[-2].opname == 'POP_BLOCK', try_block[-2] + assert try_block[-1].opname in ["JUMP_FORWARD", "JUMP_ABSOLUTE"], try_block[-1] + assert try_block[-2].opname == "POP_BLOCK", try_block[-2] try_stmnts = self.decompile_block(try_block[:-2]).stmnt() body = try_stmnts - handlers_blocks = self.make_block(try_block[-1].arg, inclusive=False, raise_=False) + handlers_blocks = self.make_block( + try_block[-1].arg, inclusive=False, raise_=False + ) end, handlers = self.split_handlers(handlers_blocks) - #raise exception in python 3 (python 2 ilst does not include end so else may go beyond) + # raise exception in python 3 (python 2 ilst does not include end so else may go beyond) else_block = self.make_block(end, inclusive=False, raise_=py3) else_stmnts = self.decompile_block(else_block).stmnt() @@ -311,7 +345,13 @@ def SETUP_EXCEPT(self, instr): else: else_ = [] - try_except = _ast.TryExcept(body=body, handlers=handlers, orelse=else_, lineno=instr.lineno, col_offset=0) + try_except = _ast.TryExcept( + body=body, + handlers=handlers, + orelse=else_, + lineno=instr.lineno, + col_offset=0, + ) self.ast_stack.append(try_except) @@ -322,13 +362,15 @@ def SETUP_EXCEPT(self, instr): try_block = self.make_block(to, inclusive=False) - assert try_block[-1].opname in ['JUMP_FORWARD', 'JUMP_ABSOLUTE'] - assert try_block[-2].opname == 'POP_BLOCK' + assert try_block[-1].opname in ["JUMP_FORWARD", "JUMP_ABSOLUTE"] + assert try_block[-2].opname == "POP_BLOCK" try_stmnts = self.decompile_block(try_block[:-2]).stmnt() body = try_stmnts - handlers_blocks = self.make_block(try_block[-1].arg, inclusive=False, raise_=False) + handlers_blocks = self.make_block( + try_block[-1].arg, inclusive=False, raise_=False + ) end, handlers = self.split_handlers(handlers_blocks) @@ -341,7 +383,13 @@ def SETUP_EXCEPT(self, instr): else: else_ = [] - try_except = _ast.TryExcept(body=body, handlers=handlers, orelse=else_, lineno=instr.lineno, col_offset=0) + try_except = _ast.TryExcept( + body=body, + handlers=handlers, + orelse=else_, + lineno=instr.lineno, + col_offset=0, + ) self.ast_stack.append(try_except) @@ -353,7 +401,7 @@ def SETUP_LOOP(self, instr): to = instr.arg loop_block = self.make_block(to, inclusive=False, raise_=False) - if 'FOR_ITER' in [ins.opname for ins in loop_block]: + if "FOR_ITER" in [ins.opname for ins in loop_block]: self.for_loop(loop_block) else: self.while_loop(instr, loop_block) @@ -362,30 +410,35 @@ def BREAK_LOOP(self, instr): self.ast_stack.append(_ast.Break(lineno=instr.lineno, col_offset=0)) def for_loop(self, loop_block): - iter_block, _, body_else_block = split(loop_block, 'GET_ITER') + iter_block, _, body_else_block = split(loop_block, "GET_ITER") -# for_iter = body_else_block[0] + # for_iter = body_else_block[0] for_iter = body_else_block.pop(0) - assert for_iter.opname == 'FOR_ITER' + assert for_iter.opname == "FOR_ITER" - idx = find_index(body_else_block, lambda instr: instr.opname == 'POP_BLOCK' and for_iter.to == instr.i) + idx = find_index( + body_else_block, + lambda instr: instr.opname == "POP_BLOCK" and for_iter.to == instr.i, + ) assert idx is not False body_block = body_else_block[:idx] - else_block = body_else_block[idx + 1:] + else_block = body_else_block[idx + 1 :] jump_abs = body_block.pop() - assert jump_abs.opname == 'JUMP_ABSOLUTE' and jump_abs.to == for_iter.i + assert jump_abs.opname == "JUMP_ABSOLUTE" and jump_abs.to == for_iter.i iter_stmnt = self.decompile_block(iter_block).stmnt() assert len(iter_stmnt) == 1 iter_stmnt = iter_stmnt[0] - body_lst = self.decompile_block(body_block[:], stack_items=[None], jump_map={for_iter.i:for_iter.to}).stmnt() + body_lst = self.decompile_block( + body_block[:], stack_items=[None], jump_map={for_iter.i: for_iter.to} + ).stmnt() assign_ = body_lst.pop(0) body = body_lst @@ -396,7 +449,14 @@ def for_loop(self, loop_block): else_ = self.decompile_block(else_block[:]).stmnt() assign = assign_.targets[0] - for_ = _ast.For(target=assign, iter=iter_stmnt, body=body, orelse=else_, lineno=iter_stmnt.lineno, col_offset=0) + for_ = _ast.For( + target=assign, + iter=iter_stmnt, + body=body, + orelse=else_, + lineno=iter_stmnt.lineno, + col_offset=0, + ) self.ast_stack.append(for_) @@ -406,10 +466,12 @@ def make_list_comp(self, get_iter, for_iter): jump_abs = block.pop() - assert jump_abs.opname == 'JUMP_ABSOLUTE', jump_abs.opname - jump_map = {for_iter.i:for_iter.to} + assert jump_abs.opname == "JUMP_ABSOLUTE", jump_abs.opname + jump_map = {for_iter.i: for_iter.to} - stmnts = self.decompile_block(block, stack_items=[None], jump_map=jump_map).stmnt() + stmnts = self.decompile_block( + block, stack_items=[None], jump_map=jump_map + ).stmnt() if len(stmnts) > 1: @@ -428,14 +490,30 @@ def make_list_comp(self, get_iter, for_iter): elt = refactor_ifs(stmnts[0], ifs) assert len(assign.targets) == 1 - generators = [_ast.comprehension(target=assign.targets[0], iter=list_expr, ifs=ifs, lineno=get_iter.lineno, col_offset=0)] + generators = [ + _ast.comprehension( + target=assign.targets[0], + iter=list_expr, + ifs=ifs, + lineno=get_iter.lineno, + col_offset=0, + ) + ] if isinstance(list_, _ast.Assign): - comp = _ast.comprehension(target=list_.targets[0], iter=None, ifs=ifs, lineno=get_iter.lineno, col_offset=0) + comp = _ast.comprehension( + target=list_.targets[0], + iter=None, + ifs=ifs, + lineno=get_iter.lineno, + col_offset=0, + ) generators.insert(0, comp) - list_comp = _ast.ListComp(elt=elt, generators=generators, lineno=get_iter.lineno, col_offset=0) + list_comp = _ast.ListComp( + elt=elt, generators=generators, lineno=get_iter.lineno, col_offset=0 + ) else: list_expr = self.ast_stack.pop() list_comp = stmnts[0] @@ -445,7 +523,13 @@ def make_list_comp(self, get_iter, for_iter): # empty ast.List object list_ = self.ast_stack.pop() if not isinstance(list_, _ast.Assign): - comp = _ast.comprehension(target=list_.targets[0], iter=None, ifs=[], lineno=get_iter.lineno, col_offset=0) + comp = _ast.comprehension( + target=list_.targets[0], + iter=None, + ifs=[], + lineno=get_iter.lineno, + col_offset=0, + ) generators.insert(0, comp) generators[0].iter = list_expr @@ -463,10 +547,12 @@ def extract_listcomp(self, function, sequence): generators = list(value.generators) for generator in generators: - if generator.iter.id == '.0': + if generator.iter.id == ".0": generator.iter = sequence - setcomp = _ast.ListComp(elt=value.elt, generators=generators, lineno=value.lineno, col_offset=0) + setcomp = _ast.ListComp( + elt=value.elt, generators=generators, lineno=value.lineno, col_offset=0 + ) self.ast_stack.append(setcomp) def extract_setcomp(self, function, sequence): @@ -480,10 +566,12 @@ def extract_setcomp(self, function, sequence): generators = list(value.generators) for generator in generators: - if generator.iter.id == '.0': + if generator.iter.id == ".0": generator.iter = sequence - setcomp = _ast.SetComp(elt=value.elt, generators=generators, lineno=value.lineno, col_offset=0) + setcomp = _ast.SetComp( + elt=value.elt, generators=generators, lineno=value.lineno, col_offset=0 + ) self.ast_stack.append(setcomp) def extract_dictcomp(self, function, sequence): @@ -497,55 +585,61 @@ def extract_dictcomp(self, function, sequence): generators = list(value.generators) for generator in generators: - if generator.iter.id == '.0': + if generator.iter.id == ".0": generator.iter = sequence - setcomp = _ast.DictComp(key=value.elt[0], value=value.elt[1], generators=generators, lineno=value.lineno, col_offset=0) + setcomp = _ast.DictComp( + key=value.elt[0], + value=value.elt[1], + generators=generators, + lineno=value.lineno, + col_offset=0, + ) self.ast_stack.append(setcomp) -# -# assert len(function.code.nodes) == 1 -# assert isinstance(function.code.nodes[0], _ast.Return) -# -# value = function.code.nodes[0].value -# -# assert isinstance(value, _ast.ListComp) -# -# quals = value.quals -# key, value = value.expr -# -# for qual in quals: -# qual.list = sequence -# -# setcomp = _ast.DictComp(key=key, value=value, generators=quals, lineno=value.lineno, col_offset=0) -# self.ast_stack.append(setcomp) + + # + # assert len(function.code.nodes) == 1 + # assert isinstance(function.code.nodes[0], _ast.Return) + # + # value = function.code.nodes[0].value + # + # assert isinstance(value, _ast.ListComp) + # + # quals = value.quals + # key, value = value.expr + # + # for qual in quals: + # qual.list = sequence + # + # setcomp = _ast.DictComp(key=key, value=value, generators=quals, lineno=value.lineno, col_offset=0) + # self.ast_stack.append(setcomp) def GET_ITER(self, instr): for_iter = self.ilst.pop(0) - if for_iter.opname == 'CALL_FUNCTION': + if for_iter.opname == "CALL_FUNCTION": call_function = for_iter assert call_function.oparg == 1 sequence = self.ast_stack.pop() function = self.ast_stack.pop() - if function.name == '': + if function.name == "": self.extract_listcomp(function, sequence) - elif function.name == '': + elif function.name == "": self.extract_setcomp(function, sequence) - elif function.name == '': + elif function.name == "": self.extract_dictcomp(function, sequence) else: assert False, function.name - elif for_iter.opname == 'FOR_ITER': + elif for_iter.opname == "FOR_ITER": self.make_list_comp(instr, for_iter) else: assert False - def LIST_APPEND(self, instr): -# assert instr.oparg == 2 + # assert instr.oparg == 2 pass def MAP_ADD(self, instr): @@ -553,24 +647,24 @@ def MAP_ADD(self, instr): value = self.ast_stack.pop() self.ast_stack.append((key, value)) - 'NOP' + "NOP" def SET_ADD(self, instr): - 'NOP' + "NOP" def FOR_ITER(self, instr): - #set or dict comp + # set or dict comp self.make_list_comp(instr, instr) def while_loop(self, instr, loop_block): kw = dict(lineno=instr.lineno, col_offset=0) - loop_block_map = {instr.i:instr.op for instr in loop_block} + loop_block_map = {instr.i: instr.op for instr in loop_block} first_i = loop_block[0].i - func = lambda instr: instr.opname == 'JUMP_ABSOLUTE' and instr.oparg == first_i + func = lambda instr: instr.opname == "JUMP_ABSOLUTE" and instr.oparg == first_i body_index = rfind_index(loop_block[:-1], func) if body_index is None: @@ -579,7 +673,7 @@ def while_loop(self, instr, loop_block): else: if body_index + 1 < len(loop_block): pop_block = loop_block[body_index + 1] - const_while = pop_block.opname != 'POP_BLOCK' + const_while = pop_block.opname != "POP_BLOCK" const_else = True else: const_while = True @@ -589,7 +683,7 @@ def while_loop(self, instr, loop_block): test = _ast.Num(1, **kw) body_ = self.decompile_block(loop_block[:body_index]).stmnt() - else_block = loop_block[body_index + 1:] + else_block = loop_block[body_index + 1 :] if else_block: else_ = self.decompile_block(else_block).stmnt() else: @@ -597,7 +691,10 @@ def while_loop(self, instr, loop_block): else: pop_block = loop_block[body_index + 1] - func = lambda instr: instr.opname in ['POP_JUMP_IF_FALSE', 'POP_JUMP_IF_TRUE'] and instr.oparg == pop_block.i + func = ( + lambda instr: instr.opname in ["POP_JUMP_IF_FALSE", "POP_JUMP_IF_TRUE"] + and instr.oparg == pop_block.i + ) idx = rfind_index(loop_block[:body_index], func) cond_block = loop_block[:idx] @@ -605,9 +702,9 @@ def while_loop(self, instr, loop_block): assert len(iter_stmnt) == 1 test = iter_stmnt[0] - body_ = self.decompile_block(loop_block[idx + 1:body_index]).stmnt() + body_ = self.decompile_block(loop_block[idx + 1 : body_index]).stmnt() - else_block = loop_block[body_index + 2:] + else_block = loop_block[body_index + 2 :] if else_block: else_ = self.decompile_block(else_block[:]).stmnt() else: @@ -617,10 +714,8 @@ def while_loop(self, instr, loop_block): self.ast_stack.append(while_) - def gather_jumps(self, jump_instr): - to = self.jump_map.get(jump_instr.to, jump_instr.to) assert to > jump_instr.i @@ -644,15 +739,17 @@ def gather_jumps(self, jump_instr): new_block = self.make_block(to=old_max, inclusive=False, raise_=False) and_block.extend(new_block) - #print("and_block", and_block) + # print("and_block", and_block) return and_block def process_logic(self, logic_block): if logic_block[0].opname in JUMPS: jump_instr = logic_block[0] - flag = 'OR' if jump_instr.opname in OR_JUMPS else 'AND' - idx = find_index(logic_block, lambda instr: jump_instr.oparg == instr.i, default=None) + flag = "OR" if jump_instr.opname in OR_JUMPS else "AND" + idx = find_index( + logic_block, lambda instr: jump_instr.oparg == instr.i, default=None + ) if idx is None: if len(logic_block) == 1: @@ -660,19 +757,21 @@ def process_logic(self, logic_block): else: right = self.process_logic(logic_block[1:]) parent = None -# assert False + # assert False else: - right = self.process_logic(logic_block[1:idx - 1]) - parent = self.process_logic(logic_block[idx - 1:]) + right = self.process_logic(logic_block[1 : idx - 1]) + parent = self.process_logic(logic_block[idx - 1 :]) -# if right is None: + # if right is None: return LogicalOp(flag, right, parent, jump_instr.lineno) else: - idx = find_index(logic_block, lambda instr: instr.opname in JUMPS, default=None) + idx = find_index( + logic_block, lambda instr: instr.opname in JUMPS, default=None + ) if idx is None: stmnts = self.decompile_block(logic_block).stmnt() -# assert len(stmnts) == 1 + # assert len(stmnts) == 1 return stmnts[0] else: right = logic_block[idx:] @@ -692,9 +791,8 @@ def process_logic(self, logic_block): right.parent = parent return right - def logic_ast(self, instr, left, hi): -# flag = 'OR' if opname[instr.op] in OR_JUMPS else 'AND' + # flag = 'OR' if opname[instr.op] in OR_JUMPS else 'AND' ast_, insert_into = parse_logic(hi) @@ -713,16 +811,21 @@ def JUMP_IF_TRUE_OR_POP(self, instr): def make_if(self, instr, left, and_block): block = [instr] + and_block[:-1] - maxmax = max(block, key=lambda ins: (0, 0) if (ins.op not in JUMP_OPS) else (self.jump_map.get(ins.oparg, ins.oparg), ins.i)) + maxmax = max( + block, + key=lambda ins: (0, 0) + if (ins.op not in JUMP_OPS) + else (self.jump_map.get(ins.oparg, ins.oparg), ins.i), + ) idx = block.index(maxmax) assert idx is not None - hi = self.process_logic(block[:idx + 1]) + hi = self.process_logic(block[: idx + 1]) if hi.right is None and hi.parent is None: - if instr.opname == 'POP_JUMP_IF_TRUE': + if instr.opname == "POP_JUMP_IF_TRUE": cond = _ast.UnaryOp(op=_ast.Not(), operand=left, lineno=0, col_offset=0) else: cond = left @@ -732,42 +835,43 @@ def make_if(self, instr, left, and_block): jump = and_block[-1] - if jump.opname == 'RETURN_VALUE': - body_block = block[idx + 1:] + [jump] + if jump.opname == "RETURN_VALUE": + body_block = block[idx + 1 :] + [jump] else: - body_block = block[idx + 1:] + body_block = block[idx + 1 :] body = self.decompile_block(body_block).stmnt() if jump.is_jump: else_block = self.make_block(jump.to, inclusive=False, raise_=False) - else: # it is a return + else: # it is a return else_block = [] if len(else_block): else_ = self.decompile_block(else_block).stmnt() -# -# if len(else_lst) == 1 and isinstance(else_lst[0], _ast.If): -# elif_ = else_lst[0] -# tests.extend(elif_.tests) -# else_ = elif_.else_ -# else: -# else_ = else_lst + # + # if len(else_lst) == 1 and isinstance(else_lst[0], _ast.If): + # elif_ = else_lst[0] + # tests.extend(elif_.tests) + # else_ = elif_.else_ + # else: + # else_ = else_lst else: else_ = [] - if_ = _ast.If(test=cond, body=body, orelse=else_, lineno=instr.lineno, col_offset=0) + if_ = _ast.If( + test=cond, body=body, orelse=else_, lineno=instr.lineno, col_offset=0 + ) self.ast_stack.append(if_) - def POP_JUMP_IF_TRUE(self, instr): left = self.ast_stack.pop() and_block = self.gather_jumps(instr) - if and_block[-1].opname in ['JUMP_FORWARD', 'JUMP_ABSOLUTE', 'RETURN_VALUE']: + if and_block[-1].opname in ["JUMP_FORWARD", "JUMP_ABSOLUTE", "RETURN_VALUE"]: self.make_if(instr, left, and_block) return @@ -778,27 +882,27 @@ def POP_JUMP_IF_TRUE(self, instr): def POP_JUMP_IF_FALSE(self, instr): - #print("POP_JUMP_IF_FALSE") + # print("POP_JUMP_IF_FALSE") left = self.ast_stack.pop() and_block = self.gather_jumps(instr) - #This is an IF statement + # This is an IF statement - if and_block[-1].opname in ['JUMP_FORWARD', 'JUMP_ABSOLUTE', 'RETURN_VALUE']: + if and_block[-1].opname in ["JUMP_FORWARD", "JUMP_ABSOLUTE", "RETURN_VALUE"]: - #this happens if the function was going to return anyway - if and_block[-1].opname == 'RETURN_VALUE': + # this happens if the function was going to return anyway + if and_block[-1].opname == "RETURN_VALUE": JUMP_FORWARD = Instruction(and_block[-1].i, 110, lineno=0) JUMP_FORWARD.arg = instr.to and_block.append(JUMP_FORWARD) - #print() - #print("make_if", instr, left, and_block) - #print() + # print() + # print("make_if", instr, left, and_block) + # print() self.make_if(instr, left, and_block) return - else: #This is an expression + else: # This is an expression hi = self.process_logic([instr] + and_block) ast_ = self.logic_ast(instr, left, hi) self.ast_stack.append(ast_) @@ -822,18 +926,17 @@ def SETUP_WITH(self, instr): with_block = self.make_block(to=instr.to, inclusive=False) - assert with_block.pop().opname == 'LOAD_CONST' - assert with_block.pop().opname == 'POP_BLOCK' + assert with_block.pop().opname == "LOAD_CONST" + assert with_block.pop().opname == "POP_BLOCK" with_cleanup = self.ilst.pop(0) - assert with_cleanup.opname == 'WITH_CLEANUP' + assert with_cleanup.opname == "WITH_CLEANUP" end_finally = self.ilst.pop(0) - assert end_finally.opname == 'END_FINALLY' - + assert end_finally.opname == "END_FINALLY" - with_ = self.decompile_block(with_block, stack_items=['WITH_BLOCK']).stmnt() + with_ = self.decompile_block(with_block, stack_items=["WITH_BLOCK"]).stmnt() - if isinstance(with_[0], _ast.Assign) and with_[0].value == 'WITH_BLOCK': + if isinstance(with_[0], _ast.Assign) and with_[0].value == "WITH_BLOCK": assign = with_.pop(0) as_ = assign.targets[0] else: @@ -843,8 +946,13 @@ def SETUP_WITH(self, instr): expr = self.ast_stack.pop() - with_ = _ast.With(context_expr=expr, optional_vars=as_, body=body, - lineno=instr.lineno, col_offset=0) + with_ = _ast.With( + context_expr=expr, + optional_vars=as_, + body=body, + lineno=instr.lineno, + col_offset=0, + ) self.ast_stack.append(with_) diff --git a/src/python/turicreate/meta/decompiler/disassemble.py b/src/python/turicreate/meta/decompiler/disassemble.py index fffdbd84d8..a7e5fc4ddb 100644 --- a/src/python/turicreate/meta/decompiler/disassemble.py +++ b/src/python/turicreate/meta/decompiler/disassemble.py @@ -3,11 +3,11 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Created on Jul 14, 2011 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -20,19 +20,21 @@ py3 = sys.version_info.major >= 3 -co_ord = (lambda c:c) if py3 else ord +co_ord = (lambda c: c) if py3 else ord + def disassemble(co): """Disassemble a code object.""" return list(disassembler(co)) -def print_code(co, lasti= -1, level=0): + +def print_code(co, lasti=-1, level=0): """Disassemble a code object.""" code = co.co_code for constant in co.co_consts: - print( '| |' * level, end=' ') - print( 'constant:', constant) + print("| |" * level, end=" ") + print("constant:", constant) labels = findlabels(code) linestarts = dict(findlinestarts(co)) @@ -48,18 +50,22 @@ def print_code(co, lasti= -1, level=0): if i in linestarts: if i > 0: print() - print( '| |' * level, end=' ') - print( "%3d" % linestarts[i], end=' ') + print("| |" * level, end=" ") + print("%3d" % linestarts[i], end=" ") else: - print( '| |' * level, end=' ') - print(' ', end=' ') + print("| |" * level, end=" ") + print(" ", end=" ") - if i == lasti: print( '-->',end=' ') - else: print( ' ', end=' ') - if i in labels: print( '>>', end=' ') - else: print( ' ',end=' ') - print(repr(i).rjust(4), end=' ') - print(opcode.opname[op].ljust(20), end=' ') + if i == lasti: + print("-->", end=" ") + else: + print(" ", end=" ") + if i in labels: + print(">>", end=" ") + else: + print(" ", end=" ") + print(repr(i).rjust(4), end=" ") + print(opcode.opname[op].ljust(20), end=" ") i = i + 1 if op >= opcode.HAVE_ARGUMENT: oparg = co_ord(code[i]) + co_ord(code[i + 1]) * 256 + extended_arg @@ -67,25 +73,25 @@ def print_code(co, lasti= -1, level=0): i = i + 2 if op == opcode.EXTENDED_ARG: extended_arg = oparg * 65536 - print( repr(oparg).rjust(5), end=' ') + print(repr(oparg).rjust(5), end=" ") if op in opcode.hasconst: - print( '(' + repr(co.co_consts[oparg]) + ')', end=' ') + print("(" + repr(co.co_consts[oparg]) + ")", end=" ") if type(co.co_consts[oparg]) == types.CodeType: have_inner = co.co_consts[oparg] elif op in opcode.hasname: - print( '(' + co.co_names[oparg] + ')',end=' ') + print("(" + co.co_names[oparg] + ")", end=" ") elif op in opcode.hasjrel: - print('(to ' + repr(i + oparg) + ')', end=' ') + print("(to " + repr(i + oparg) + ")", end=" ") elif op in opcode.haslocal: - print('(' + co.co_varnames[oparg] + ')', end=' ') + print("(" + co.co_varnames[oparg] + ")", end=" ") elif op in opcode.hascompare: - print('(' + opcode.cmp_op[oparg] + ')', end=' ') + print("(" + opcode.cmp_op[oparg] + ")", end=" ") elif op in opcode.hasfree: if free is None: free = co.co_cellvars + co.co_freevars - print('(' + free[oparg] + ')', end=' ') + print("(" + free[oparg] + ")", end=" ") print() if have_inner is not False: diff --git a/src/python/turicreate/meta/decompiler/instructions.py b/src/python/turicreate/meta/decompiler/instructions.py index ff143cd527..2c2aed27a7 100644 --- a/src/python/turicreate/meta/decompiler/instructions.py +++ b/src/python/turicreate/meta/decompiler/instructions.py @@ -3,11 +3,11 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Created on Jul 14, 2011 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -18,29 +18,40 @@ from ..asttools import print_ast from ..utils import py3, py3op, py2op -function_ops = ['CALL_FUNCTION', 'CALL_FUNCTION_KW', 'CALL_FUNCTION_VAR', 'CALL_FUNCTION_VAR_KW'] +function_ops = [ + "CALL_FUNCTION", + "CALL_FUNCTION_KW", + "CALL_FUNCTION_VAR", + "CALL_FUNCTION_VAR_KW", +] + def pop_doc(stmnts): - doc = pop_assignment(stmnts, '__doc__') + doc = pop_assignment(stmnts, "__doc__") assert isinstance(doc, _ast.Str) or doc is None return doc + def pop_assignment(stmnts, name): for i in range(len(stmnts)): stmnt = stmnts[i] - if isinstance(stmnt, _ast.Assign) and len(stmnt.targets) == 1 \ - and isinstance(stmnt.targets[0], _ast.Name) \ - and isinstance(stmnt.targets[0].ctx, _ast.Store): + if ( + isinstance(stmnt, _ast.Assign) + and len(stmnt.targets) == 1 + and isinstance(stmnt.targets[0], _ast.Name) + and isinstance(stmnt.targets[0].ctx, _ast.Store) + ): if stmnt.targets[0].id == name: stmnts.pop(i) return stmnt.value return None + def pop_return(stmnts): ns = len(stmnts) @@ -52,216 +63,265 @@ def pop_return(stmnts): def make_module(code): - from ..decompiler.disassemble import disassemble - instructions = Instructions(disassemble(code)) - stmnts = instructions.stmnt() - - doc = pop_doc(stmnts) - pop_return(stmnts) - -# stmnt = ast.Stmt(stmnts, 0) - - if doc is not None: - stmnts = [_ast.Expr(value=doc, lineno=doc.lineno, col_offset=0)] + stmnts - - ast_obj = _ast.Module(body=stmnts, lineno=0, col_offset=0) - - return ast_obj - -@py2op -def make_function(code, defaults=None, lineno=0): - from ..decompiler.disassemble import disassemble - - instructions = Instructions(disassemble(code)) + from ..decompiler.disassemble import disassemble - stmnts = instructions.stmnt() + instructions = Instructions(disassemble(code)) + stmnts = instructions.stmnt() - if code.co_flags & 2: - vararg = None - kwarg = None + doc = pop_doc(stmnts) + pop_return(stmnts) - varnames = list(code.co_varnames[:code.co_argcount]) - co_locals = list(code.co_varnames[code.co_argcount:]) + # stmnt = ast.Stmt(stmnts, 0) - #have var args - if code.co_flags & 4: - vararg = co_locals.pop(0) + if doc is not None: + stmnts = [_ast.Expr(value=doc, lineno=doc.lineno, col_offset=0)] + stmnts - #have kw args - if code.co_flags & 8: - kwarg = co_locals.pop() + ast_obj = _ast.Module(body=stmnts, lineno=0, col_offset=0) - args = [_ast.Name(id=argname, ctx=_ast.Param(), lineno=lineno, col_offset=0) for argname in varnames] + return ast_obj - args = _ast.arguments(args=args, - defaults=defaults if defaults else [], - kwarg=kwarg, - vararg=vararg, - lineno=lineno, col_offset=0 - ) - if code.co_name == '': - if len(stmnts) == 2: - if isinstance(stmnts[0], _ast.If) and isinstance(stmnts[1], _ast.Return): - assert len(stmnts[0].body) == 1 - assert isinstance(stmnts[0].body[0], _ast.Return) - stmnts = [_ast.Return(_ast.IfExp(stmnts[0].test, stmnts[0].body[0].value, stmnts[1].value))] - assert len(stmnts) == 1, stmnts - assert isinstance(stmnts[0], _ast.Return) - - stmnt = stmnts[0].value - ast_obj = _ast.Lambda(args=args, body=stmnt, lineno=lineno, col_offset=0) - else: - - if instructions.seen_yield: - return_ = stmnts[-1] - - assert isinstance(return_, _ast.Return) - assert isinstance(return_.value, _ast.Name) - assert return_.value.id == 'None' - return_.value = None - ast_obj = _ast.FunctionDef(name=code.co_name, args=args, body=stmnts, decorator_list=[], lineno=lineno, col_offset=0) +@py2op +def make_function(code, defaults=None, lineno=0): + from ..decompiler.disassemble import disassemble + + instructions = Instructions(disassemble(code)) + + stmnts = instructions.stmnt() + + if code.co_flags & 2: + vararg = None + kwarg = None + + varnames = list(code.co_varnames[: code.co_argcount]) + co_locals = list(code.co_varnames[code.co_argcount :]) + + # have var args + if code.co_flags & 4: + vararg = co_locals.pop(0) + + # have kw args + if code.co_flags & 8: + kwarg = co_locals.pop() + + args = [ + _ast.Name(id=argname, ctx=_ast.Param(), lineno=lineno, col_offset=0) + for argname in varnames + ] + + args = _ast.arguments( + args=args, + defaults=defaults if defaults else [], + kwarg=kwarg, + vararg=vararg, + lineno=lineno, + col_offset=0, + ) + if code.co_name == "": + if len(stmnts) == 2: + if isinstance(stmnts[0], _ast.If) and isinstance(stmnts[1], _ast.Return): + assert len(stmnts[0].body) == 1 + assert isinstance(stmnts[0].body[0], _ast.Return) + stmnts = [ + _ast.Return( + _ast.IfExp( + stmnts[0].test, stmnts[0].body[0].value, stmnts[1].value + ) + ) + ] + + assert len(stmnts) == 1, stmnts + assert isinstance(stmnts[0], _ast.Return) + + stmnt = stmnts[0].value + ast_obj = _ast.Lambda(args=args, body=stmnt, lineno=lineno, col_offset=0) + else: + + if instructions.seen_yield: + return_ = stmnts[-1] + + assert isinstance(return_, _ast.Return) + assert isinstance(return_.value, _ast.Name) + assert return_.value.id == "None" + return_.value = None + ast_obj = _ast.FunctionDef( + name=code.co_name, + args=args, + body=stmnts, + decorator_list=[], + lineno=lineno, + col_offset=0, + ) + + return ast_obj - return ast_obj @make_function.py3op def make_function(code, defaults=None, annotations=(), kw_defaults=(), lineno=0): - from ..decompiler.disassemble import disassemble + from ..decompiler.disassemble import disassemble - instructions = Instructions(disassemble(code)) + instructions = Instructions(disassemble(code)) - stmnts = instructions.stmnt() + stmnts = instructions.stmnt() - if code.co_flags & 2: - vararg = None - kwarg = None + if code.co_flags & 2: + vararg = None + kwarg = None - varnames = list(code.co_varnames[:code.co_argcount]) - kwonly_varnames = list(code.co_varnames[code.co_argcount:code.co_argcount + code.co_kwonlyargcount]) - co_locals = list(code.co_varnames[code.co_argcount + code.co_kwonlyargcount:]) + varnames = list(code.co_varnames[: code.co_argcount]) + kwonly_varnames = list( + code.co_varnames[code.co_argcount : code.co_argcount + code.co_kwonlyargcount] + ) + co_locals = list(code.co_varnames[code.co_argcount + code.co_kwonlyargcount :]) - assert (len(kw_defaults) % 2) == 0 + assert (len(kw_defaults) % 2) == 0 - kw_defaults = list(kw_defaults) - kw_default_dict = {} + kw_defaults = list(kw_defaults) + kw_default_dict = {} - while kw_defaults: - name = kw_defaults.pop(0) - value = kw_defaults.pop(0) + while kw_defaults: + name = kw_defaults.pop(0) + value = kw_defaults.pop(0) - kw_default_dict[name.s] = value + kw_default_dict[name.s] = value - kw_defaults = [] - for argname in kwonly_varnames: - kw_defaults.append(kw_default_dict.pop(argname)) - - #have var args - if code.co_flags & 4: - vararg = co_locals.pop(0) - - #have kw args - if code.co_flags & 8: - kwarg = co_locals.pop() - - args = [] - annotation_names = [annotation.arg for annotation in annotations] - - for argname in varnames: - if argname in annotation_names: - arg = [annotation for annotation in annotations if annotation.arg == argname][0] - else: - arg = _ast.arg(annotation=None, arg=argname, lineno=lineno, col_offset=0) #@UndefinedVariable + kw_defaults = [] + for argname in kwonly_varnames: + kw_defaults.append(kw_default_dict.pop(argname)) - args.append(arg) + # have var args + if code.co_flags & 4: + vararg = co_locals.pop(0) - kwonlyargs = [] - - for argname in kwonly_varnames: - if argname in annotation_names: - arg = [annotation for annotation in annotations if annotation.arg == argname][0] - else: - arg = _ast.arg(annotation=None, arg=argname, lineno=lineno, col_offset=0) #@UndefinedVariable - - kwonlyargs.append(arg) - - if 'return' in annotation_names: - arg = [annotation for annotation in annotations if annotation.arg == 'return'][0] - returns = arg.annotation - else: - returns = None + # have kw args + if code.co_flags & 8: + kwarg = co_locals.pop() - if vararg in annotation_names: - arg = [annotation for annotation in annotations if annotation.arg == vararg][0] - varargannotation = arg.annotation - else: - varargannotation = None + args = [] + annotation_names = [annotation.arg for annotation in annotations] - if kwarg in annotation_names: - arg = [annotation for annotation in annotations if annotation.arg == kwarg][0] - kwargannotation = arg.annotation - else: - kwargannotation = None - - args = _ast.arguments(args=args, - defaults=defaults if defaults else [], - kwarg=kwarg, - vararg=vararg, - kw_defaults=kw_defaults, - kwonlyargs=kwonlyargs, - kwargannotation=kwargannotation, - varargannotation=varargannotation, - lineno=lineno, col_offset=0 - ) - - - if code.co_name == '': - if len(stmnts) == 2: - if isinstance(stmnts[0], _ast.If) and isinstance(stmnts[1], _ast.Return): - assert len(stmnts[0].body) == 1 - assert isinstance(stmnts[0].body[0], _ast.Return) - stmnts = [_ast.Return(_ast.IfExp(stmnts[0].test, stmnts[0].body[0].value, stmnts[1].value))] - - assert isinstance(stmnts[0], _ast.Return) - - stmnt = stmnts[0].value - ast_obj = _ast.Lambda(args=args, body=stmnt, lineno=lineno, col_offset=0) + for argname in varnames: + if argname in annotation_names: + arg = [ + annotation for annotation in annotations if annotation.arg == argname + ][0] else: + arg = _ast.arg( + annotation=None, arg=argname, lineno=lineno, col_offset=0 + ) # @UndefinedVariable - if instructions.seen_yield: - return_ = stmnts[-1] + args.append(arg) - assert isinstance(return_, _ast.Return) - assert isinstance(return_.value, _ast.Name) - assert return_.value.id == 'None' - return_.value = None + kwonlyargs = [] - ast_obj = _ast.FunctionDef(name=code.co_name, args=args, - body=stmnts, decorator_list=[], - returns=returns, - lineno=lineno, col_offset=0) + for argname in kwonly_varnames: + if argname in annotation_names: + arg = [ + annotation for annotation in annotations if annotation.arg == argname + ][0] + else: + arg = _ast.arg( + annotation=None, arg=argname, lineno=lineno, col_offset=0 + ) # @UndefinedVariable + + kwonlyargs.append(arg) + + if "return" in annotation_names: + arg = [annotation for annotation in annotations if annotation.arg == "return"][ + 0 + ] + returns = arg.annotation + else: + returns = None + + if vararg in annotation_names: + arg = [annotation for annotation in annotations if annotation.arg == vararg][0] + varargannotation = arg.annotation + else: + varargannotation = None + + if kwarg in annotation_names: + arg = [annotation for annotation in annotations if annotation.arg == kwarg][0] + kwargannotation = arg.annotation + else: + kwargannotation = None + + args = _ast.arguments( + args=args, + defaults=defaults if defaults else [], + kwarg=kwarg, + vararg=vararg, + kw_defaults=kw_defaults, + kwonlyargs=kwonlyargs, + kwargannotation=kwargannotation, + varargannotation=varargannotation, + lineno=lineno, + col_offset=0, + ) + + if code.co_name == "": + if len(stmnts) == 2: + if isinstance(stmnts[0], _ast.If) and isinstance(stmnts[1], _ast.Return): + assert len(stmnts[0].body) == 1 + assert isinstance(stmnts[0].body[0], _ast.Return) + stmnts = [ + _ast.Return( + _ast.IfExp( + stmnts[0].test, stmnts[0].body[0].value, stmnts[1].value + ) + ) + ] + + assert isinstance(stmnts[0], _ast.Return) + + stmnt = stmnts[0].value + ast_obj = _ast.Lambda(args=args, body=stmnt, lineno=lineno, col_offset=0) + else: + + if instructions.seen_yield: + return_ = stmnts[-1] + + assert isinstance(return_, _ast.Return) + assert isinstance(return_.value, _ast.Name) + assert return_.value.id == "None" + return_.value = None + + ast_obj = _ast.FunctionDef( + name=code.co_name, + args=args, + body=stmnts, + decorator_list=[], + returns=returns, + lineno=lineno, + col_offset=0, + ) + + return ast_obj - return ast_obj class StackLogger(list): def append(self, object): - print(' + ', end='') - print_ast(object, indent='', newline='') + print(" + ", end="") + print_ast(object, indent="", newline="") print() list.append(self, object) def pop(self, *index): value = list.pop(self, *index) - print(' + ', end='') - print_ast(value, indent='', newline='') + print(" + ", end="") + print_ast(value, indent="", newline="") print() return value + def bitrange(x, start, stop): return ((1 << (stop - start)) - 1) & (x >> start) + level = 0 -class Instructions(CtrlFlowInstructions, SimpleInstructions): + +class Instructions(CtrlFlowInstructions, SimpleInstructions): def __init__(self, ilst, stack_items=None, jump_map=False): self.ilst_processed = [] self.ilst = ilst[:] @@ -273,7 +333,7 @@ def __init__(self, ilst, stack_items=None, jump_map=False): else: self.jump_map = {} -# self.ast_stack = StackLogger() + # self.ast_stack = StackLogger() self.ast_stack = [] if stack_items: @@ -293,27 +353,27 @@ def stmnt(self): def visit(self, instr): global level - name = instr.opname.replace('+', '_') + name = instr.opname.replace("+", "_") method = getattr(self, name, None) if method is None: - raise AttributeError('can not handle instruction %r' % (str(instr))) + raise AttributeError("can not handle instruction %r" % (str(instr))) -# print(' ' * level, "+ visit:", repr(instr)) -# level += 1 + # print(' ' * level, "+ visit:", repr(instr)) + # level += 1 method(instr) -# level -= 1 -# print(' ' * level, "- stack:", self.ast_stack) + # level -= 1 + # print(' ' * level, "- stack:", self.ast_stack) def make_block(self, to, inclusive=True, raise_=True): -# print("make_block", to,) + # print("make_block", to,) block = [] while len(self.ilst): instr = self.ilst.pop(0) block.append(instr) -# instr_i = self.jump_map.get(instr.i, instr.i) + # instr_i = self.jump_map.get(instr.i, instr.i) instr_i = instr.i if to == instr_i: @@ -323,7 +383,7 @@ def make_block(self, to, inclusive=True, raise_=True): break else: if raise_: -# print(block) + # print(block) raise IndexError("no instruction i=%s " % (to,)) return block @@ -349,13 +409,25 @@ def MAKE_FUNCTION(self, instr): for i in range(ndefaults): defaults.insert(0, self.ast_stack.pop()) - function = make_function(code, defaults, lineno=instr.lineno, annotations=annotations, kw_defaults=kw_defaults) + function = make_function( + code, + defaults, + lineno=instr.lineno, + annotations=annotations, + kw_defaults=kw_defaults, + ) doc = code.co_consts[0] if code.co_consts else None if isinstance(doc, str): - function.body.insert(0, _ast.Expr(value=_ast.Str(s=doc, lineno=instr.lineno, col_offset=0), - lineno=instr.lineno, col_offset=0)) + function.body.insert( + 0, + _ast.Expr( + value=_ast.Str(s=doc, lineno=instr.lineno, col_offset=0), + lineno=instr.lineno, + col_offset=0, + ), + ) self.ast_stack.append(function) @@ -375,14 +447,19 @@ def MAKE_FUNCTION(self, instr): doc = code.co_consts[0] if code.co_consts else None if isinstance(doc, str): - function.body.insert(0, _ast.Expr(value=_ast.Str(s=doc, lineno=instr.lineno, col_offset=0), - lineno=instr.lineno, col_offset=0)) - + function.body.insert( + 0, + _ast.Expr( + value=_ast.Str(s=doc, lineno=instr.lineno, col_offset=0), + lineno=instr.lineno, + col_offset=0, + ), + ) self.ast_stack.append(function) def LOAD_LOCALS(self, instr): - self.ast_stack.append('LOAD_LOCALS') + self.ast_stack.append("LOAD_LOCALS") @py3op def LOAD_BUILD_CLASS(self, instr): @@ -421,10 +498,17 @@ def LOAD_BUILD_CLASS(self, instr): assert isinstance(ret, _ast.Return) - class_ = _ast.ClassDef(name=name, bases=bases, body=code, decorator_list=[], - kwargs=kwargs, keywords=keywords, starargs=starargs, - lineno=instr.lineno, col_offset=0, - ) + class_ = _ast.ClassDef( + name=name, + bases=bases, + body=code, + decorator_list=[], + kwargs=kwargs, + keywords=keywords, + starargs=starargs, + lineno=instr.lineno, + col_offset=0, + ) self.ast_stack.append(class_) @@ -440,12 +524,12 @@ def BUILD_CLASS(self, instr): assert isinstance(func, _ast.FunctionDef) code = func.body - pop_assignment(code, '__module__') + pop_assignment(code, "__module__") doc = pop_doc(code) ret = code.pop() - assert isinstance(ret, _ast.Return) and ret.value == 'LOAD_LOCALS' + assert isinstance(ret, _ast.Return) and ret.value == "LOAD_LOCALS" bases = self.ast_stack.pop() @@ -453,13 +537,19 @@ def BUILD_CLASS(self, instr): bases = bases.elts name = self.ast_stack.pop() - class_ = _ast.ClassDef(name=name, bases=bases, body=code, decorator_list=[], - lineno=instr.lineno, col_offset=0) + class_ = _ast.ClassDef( + name=name, + bases=bases, + body=code, + decorator_list=[], + lineno=instr.lineno, + col_offset=0, + ) self.ast_stack.append(class_) def LOAD_CLOSURE(self, instr): - self.ast_stack.append('CLOSURE') + self.ast_stack.append("CLOSURE") def MAKE_CLOSURE(self, instr): return self.MAKE_FUNCTION(instr) diff --git a/src/python/turicreate/meta/decompiler/recompile.py b/src/python/turicreate/meta/decompiler/recompile.py index 78e0eee069..339cafac57 100644 --- a/src/python/turicreate/meta/decompiler/recompile.py +++ b/src/python/turicreate/meta/decompiler/recompile.py @@ -3,11 +3,11 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Created on Nov 3, 2011 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -19,7 +19,7 @@ py3 = sys.version_info.major >= 3 if py3: - import builtins #@UnresolvedImport + import builtins # @UnresolvedImport else: import __builtin__ as builtins @@ -29,14 +29,15 @@ MAGIC = imp.get_magic() + def create_pyc(codestring, cfile, timestamp=None): if timestamp is None: timestamp = time() - codeobject = builtins.compile(codestring, '', 'exec') + codeobject = builtins.compile(codestring, "", "exec") cfile.write(MAGIC) - cfile.write(struct.pack('i', timestamp)) + cfile.write(struct.pack("i", timestamp)) marshal.dump(codeobject, cfile) cfile.flush() diff --git a/src/python/turicreate/meta/decompiler/simple_instructions.py b/src/python/turicreate/meta/decompiler/simple_instructions.py index dece7f5614..bdbb399730 100644 --- a/src/python/turicreate/meta/decompiler/simple_instructions.py +++ b/src/python/turicreate/meta/decompiler/simple_instructions.py @@ -3,11 +3,11 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Created on Jul 14, 2011 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -21,37 +21,51 @@ from ..asttools import cmp_ast if py3: - class _ast_Print: pass + + class _ast_Print: + pass + + else: _ast_Print = _ast.Print + def isNone(node): if node is None: return True - elif isinstance(node, _ast.Name) and (node.id == 'None') and isinstance(node.ctx, _ast.Load): + elif ( + isinstance(node, _ast.Name) + and (node.id == "None") + and isinstance(node.ctx, _ast.Load) + ): return True return False -def BINARY_(OP): +def BINARY_(OP): def BINARY_OP(self, instr): right = self.ast_stack.pop() left = self.ast_stack.pop() - add = _ast.BinOp(left=left, right=right, op=OP(), lineno=instr.lineno, col_offset=0) + add = _ast.BinOp( + left=left, right=right, op=OP(), lineno=instr.lineno, col_offset=0 + ) self.ast_stack.append(add) + return BINARY_OP -def INPLACE_(OP): +def INPLACE_(OP): def INPLACE_OP(self, instr): right = self.ast_stack.pop() left = self.ast_stack.pop() left.ctx = _ast.Store() - aug_assign = _ast.AugAssign(target=left, op=OP(), value=right, lineno=instr.lineno, col_offset=0) + aug_assign = _ast.AugAssign( + target=left, op=OP(), value=right, lineno=instr.lineno, col_offset=0 + ) self.ast_stack.append(aug_assign) @@ -59,7 +73,6 @@ def INPLACE_OP(self, instr): def UNARY_(OP): - def UNARY_OP(self, instr): expr = self.ast_stack.pop() not_ = _ast.UnaryOp(op=OP(), operand=expr, lineno=instr.lineno, col_offset=0) @@ -68,27 +81,30 @@ def UNARY_OP(self, instr): return UNARY_OP -CMP_OPMAP = {'>=' :_ast.GtE, - '<=' :_ast.LtE, - '>' :_ast.Gt, - '<' :_ast.Lt, - '==': _ast.Eq, - '!=': _ast.NotEq, - 'in': _ast.In, - 'not in': _ast.NotIn, - 'is':_ast.Is, - 'is not':_ast.IsNot, - } + +CMP_OPMAP = { + ">=": _ast.GtE, + "<=": _ast.LtE, + ">": _ast.Gt, + "<": _ast.Lt, + "==": _ast.Eq, + "!=": _ast.NotEq, + "in": _ast.In, + "not in": _ast.NotIn, + "is": _ast.Is, + "is not": _ast.IsNot, +} + def make_const(arg, lineno=0, col_offset=0): - kw = {'lineno':lineno, 'col_offset':col_offset} + kw = {"lineno": lineno, "col_offset": col_offset} if isinstance(arg, str): const = _ast.Str(s=arg, **kw) elif isinstance(arg, (int, float, complex)): const = _ast.Num(n=arg, **kw) elif arg is None: - const = _ast.Name(id='None', ctx=_ast.Load(), **kw) + const = _ast.Name(id="None", ctx=_ast.Load(), **kw) elif isinstance(arg, tuple): elts = [] for item in arg: @@ -99,19 +115,23 @@ def make_const(arg, lineno=0, col_offset=0): return const -class SimpleInstructions(object): +class SimpleInstructions(object): def LOAD_CONST(self, instr): const = make_const(instr.arg, lineno=instr.lineno, col_offset=0) self.ast_stack.append(const) def LOAD_NAME(self, instr): - name = _ast.Name(id=instr.arg, ctx=_ast.Load(), lineno=instr.lineno, col_offset=0) + name = _ast.Name( + id=instr.arg, ctx=_ast.Load(), lineno=instr.lineno, col_offset=0 + ) self.ast_stack.append(name) def LOAD_DEREF(self, instr): - name = _ast.Name(id=instr.arg, ctx=_ast.Load(), lineno=instr.lineno, col_offset=0) + name = _ast.Name( + id=instr.arg, ctx=_ast.Load(), lineno=instr.lineno, col_offset=0 + ) self.ast_stack.append(name) def CALL_FUNCTION_VAR(self, instr): @@ -152,7 +172,6 @@ def CALL_FUNCTION(self, instr): nkwargs = instr.oparg >> 8 nargs = (~(nkwargs << 8)) & instr.oparg - args = [] keywords = [] @@ -167,7 +186,6 @@ def CALL_FUNCTION(self, instr): arg = self.ast_stack.pop() args.insert(0, arg) - if len(args) == 1 and isinstance(args[0], (_ast.FunctionDef, _ast.ClassDef)): function = args[0] @@ -180,19 +198,29 @@ def CALL_FUNCTION(self, instr): self.ast_stack.append(function) return - node = self.ast_stack.pop() - callfunc = _ast.Call(func=node, args=args, keywords=keywords, starargs=None, kwargs=None, - lineno=instr.lineno, col_offset=0) + callfunc = _ast.Call( + func=node, + args=args, + keywords=keywords, + starargs=None, + kwargs=None, + lineno=instr.lineno, + col_offset=0, + ) self.ast_stack.append(callfunc) def LOAD_FAST(self, instr): - name = _ast.Name(id=instr.arg, ctx=_ast.Load(), lineno=instr.lineno, col_offset=0) + name = _ast.Name( + id=instr.arg, ctx=_ast.Load(), lineno=instr.lineno, col_offset=0 + ) self.ast_stack.append(name) def LOAD_GLOBAL(self, instr): - name = _ast.Name(id=instr.arg, ctx=_ast.Load(), lineno=instr.lineno, col_offset=0) + name = _ast.Name( + id=instr.arg, ctx=_ast.Load(), lineno=instr.lineno, col_offset=0 + ) self.ast_stack.append(name) def STORE_FAST(self, instr): @@ -221,13 +249,15 @@ def STORE_NAME(self, instr): else: as_name = instr.arg if value.names[0].asname is None: - base_name = value.names[0].name.split('.')[0] + base_name = value.names[0].name.split(".")[0] if base_name != as_name: value.names[0].asname = as_name self.ast_stack.append(value) - elif isinstance(value, (_ast.Attribute)) and isinstance(value.value, (_ast.Import)): + elif isinstance(value, (_ast.Attribute)) and isinstance( + value.value, (_ast.Import) + ): asname = instr.arg value = value.value value.names[0].asname = asname @@ -242,19 +272,25 @@ def STORE_NAME(self, instr): self.ast_stack.append(value) elif isinstance(value, _ast.Assign): _ = self.ast_stack.pop() - assname = _ast.Name(instr.arg, _ast.Store(), lineno=instr.lineno, col_offset=0) + assname = _ast.Name( + instr.arg, _ast.Store(), lineno=instr.lineno, col_offset=0 + ) value.targets.append(assname) self.ast_stack.append(value) else: - assname = _ast.Name(instr.arg, _ast.Store(), lineno=instr.lineno, col_offset=0) + assname = _ast.Name( + instr.arg, _ast.Store(), lineno=instr.lineno, col_offset=0 + ) - assign = _ast.Assign(targets=[assname], value=value, lineno=instr.lineno, col_offset=0) + assign = _ast.Assign( + targets=[assname], value=value, lineno=instr.lineno, col_offset=0 + ) self.ast_stack.append(assign) @py3op def STORE_LOCALS(self, instr): - 'remove Locals from class def' + "remove Locals from class def" self.ast_stack.pop() def STORE_GLOBAL(self, instr): @@ -280,7 +316,9 @@ def LOAD_ATTR(self, instr): attr = instr.arg - get_attr = _ast.Attribute(value=name, attr=attr, ctx=_ast.Load(), lineno=instr.lineno, col_offset=0) + get_attr = _ast.Attribute( + value=name, attr=attr, ctx=_ast.Load(), lineno=instr.lineno, col_offset=0 + ) self.ast_stack.append(get_attr) @@ -291,8 +329,16 @@ def STORE_ATTR(self, instr): expr = self.ast_stack.pop() expr = self.process_ifexpr(expr) - assattr = _ast.Attribute(value=node, attr=attrname, ctx=_ast.Store(), lineno=instr.lineno, col_offset=0) - set_attr = _ast.Assign(targets=[assattr], value=expr, lineno=instr.lineno, col_offset=0) + assattr = _ast.Attribute( + value=node, + attr=attrname, + ctx=_ast.Store(), + lineno=instr.lineno, + col_offset=0, + ) + set_attr = _ast.Assign( + targets=[assattr], value=expr, lineno=instr.lineno, col_offset=0 + ) self.ast_stack.append(set_attr) @@ -314,7 +360,9 @@ def IMPORT_FROM(self, instr): names = [_ast.alias(instr.arg, None)] modname = import_.names[0].name - from_ = _ast.ImportFrom(module=modname, names=names, level=0, lineno=instr.lineno, col_offset=0) + from_ = _ast.ImportFrom( + module=modname, names=names, level=0, lineno=instr.lineno, col_offset=0 + ) self.ast_stack.append(from_) self.ast_stack.append(import_) @@ -323,9 +371,15 @@ def IMPORT_STAR(self, instr): import_ = self.ast_stack.pop() names = import_.names - alias = _ast.alias(name='*', asname=None) + alias = _ast.alias(name="*", asname=None) - from_ = _ast.ImportFrom(module=names[0].name, names=[alias], level=0, lineno=instr.lineno, col_offset=0) + from_ = _ast.ImportFrom( + module=names[0].name, + names=[alias], + level=0, + lineno=instr.lineno, + col_offset=0, + ) self.ast_stack.append(from_) @@ -369,11 +423,11 @@ def ROT_TWO(self, instr): one = self.ast_stack.pop() two = self.ast_stack.pop() - if self.ilst[0].opname == 'STORE_NAME': + if self.ilst[0].opname == "STORE_NAME": kw = dict(lineno=instr.lineno, col_offset=0) stores = [] - while self.ilst[0].opname == 'STORE_NAME': + while self.ilst[0].opname == "STORE_NAME": stores.append(self.ilst.pop(0)) assert len(stores) <= 3, stores @@ -383,12 +437,14 @@ def ROT_TWO(self, instr): tup_load = _ast.Tuple(elts=elts_load[::-1], ctx=_ast.Load(), **kw) - elts_store = [_ast.Name(id=store.arg, ctx=_ast.Store(), **kw) for store in stores] + elts_store = [ + _ast.Name(id=store.arg, ctx=_ast.Store(), **kw) for store in stores + ] tup_store = _ast.Tuple(elts=elts_store, ctx=_ast.Store(), **kw) assgn = _ast.Assign(value=tup_load, targets=[tup_store], **kw) self.ast_stack.append(assgn) -# self.ast_stack.append(tup_store) + # self.ast_stack.append(tup_store) else: self.ast_stack.append(one) self.ast_stack.append(two) @@ -436,11 +492,16 @@ def COMPARE_OP(self, instr): expr = self.ast_stack.pop() OP = CMP_OPMAP[op] - compare = _ast.Compare(left=expr, ops=[OP()], comparators=[right], lineno=instr.lineno, col_offset=0) + compare = _ast.Compare( + left=expr, + ops=[OP()], + comparators=[right], + lineno=instr.lineno, + col_offset=0, + ) self.ast_stack.append(compare) - def YIELD_VALUE(self, instr): value = self.ast_stack.pop() @@ -455,7 +516,9 @@ def BUILD_LIST(self, instr): nitems = instr.oparg nodes = [] - list_ = _ast.List(elts=nodes, ctx=_ast.Load(), lineno=instr.lineno, col_offset=0) + list_ = _ast.List( + elts=nodes, ctx=_ast.Load(), lineno=instr.lineno, col_offset=0 + ) for i in range(nitems): nodes.insert(0, self.ast_stack.pop()) @@ -466,12 +529,14 @@ def BUILD_TUPLE(self, instr): nitems = instr.oparg nodes = [] - list_ = _ast.Tuple(elts=nodes, ctx=_ast.Load(), lineno=instr.lineno, col_offset=0) + list_ = _ast.Tuple( + elts=nodes, ctx=_ast.Load(), lineno=instr.lineno, col_offset=0 + ) for i in range(nitems): nodes.insert(0, self.ast_stack.pop()) - if any([item == 'CLOSURE' for item in nodes]): - assert all([item == 'CLOSURE' for item in nodes]) + if any([item == "CLOSURE" for item in nodes]): + assert all([item == "CLOSURE" for item in nodes]) return self.ast_stack.append(list_) @@ -497,7 +562,7 @@ def BUILD_MAP(self, instr): while 1: new_instr = self.ilst.pop(0) - if new_instr.opname == 'STORE_MAP': + if new_instr.opname == "STORE_MAP": break map_instrs.append(new_instr) @@ -508,7 +573,6 @@ def BUILD_MAP(self, instr): values.append(items[0]) keys.append(items[1]) - list_ = _ast.Dict(keys=keys, values=values, lineno=instr.lineno, col_offset=0) self.ast_stack.append(list_) @@ -516,7 +580,9 @@ def UNPACK_SEQUENCE(self, instr): nargs = instr.oparg nodes = [] - ast_tuple = _ast.Tuple(elts=nodes, ctx=_ast.Store(), lineno=instr.lineno, col_offset=0) + ast_tuple = _ast.Tuple( + elts=nodes, ctx=_ast.Store(), lineno=instr.lineno, col_offset=0 + ) for i in range(nargs): nex_instr = self.ilst.pop(0) self.ast_stack.append(None) @@ -535,19 +601,25 @@ def UNPACK_SEQUENCE(self, instr): assert cmp_ast(assgn.value, value_dup) else: - assgn = _ast.Assign(targets=[ast_tuple], value=expr, lineno=instr.lineno, col_offset=0) + assgn = _ast.Assign( + targets=[ast_tuple], value=expr, lineno=instr.lineno, col_offset=0 + ) self.ast_stack.append(assgn) def DELETE_NAME(self, instr): - name = _ast.Name(id=instr.arg, ctx=_ast.Del(), lineno=instr.lineno, col_offset=0) + name = _ast.Name( + id=instr.arg, ctx=_ast.Del(), lineno=instr.lineno, col_offset=0 + ) delete = _ast.Delete(targets=[name], lineno=instr.lineno, col_offset=0) self.ast_stack.append(delete) def DELETE_FAST(self, instr): - name = _ast.Name(id=instr.arg, ctx=_ast.Del(), lineno=instr.lineno, col_offset=0) + name = _ast.Name( + id=instr.arg, ctx=_ast.Del(), lineno=instr.lineno, col_offset=0 + ) delete = _ast.Delete(targets=[name], lineno=instr.lineno, col_offset=0) self.ast_stack.append(delete) @@ -555,7 +627,13 @@ def DELETE_FAST(self, instr): def DELETE_ATTR(self, instr): expr = self.ast_stack.pop() - attr = _ast.Attribute(value=expr, attr=instr.arg, ctx=_ast.Del(), lineno=instr.lineno, col_offset=0) + attr = _ast.Attribute( + value=expr, + attr=instr.arg, + ctx=_ast.Del(), + lineno=instr.lineno, + col_offset=0, + ) delete = _ast.Delete(targets=[attr], lineno=instr.lineno, col_offset=0) self.ast_stack.append(delete) @@ -568,10 +646,16 @@ def EXEC_STMT(self, instr): if locals_ is globals_: locals_ = None - if isinstance(globals_, _ast.Name) and getattr(globals_, 'id',) == 'None': + if isinstance(globals_, _ast.Name) and getattr(globals_, "id",) == "None": globals_ = None - exec_ = _ast.Exec(body=expr, globals=globals_, locals=locals_, lineno=instr.lineno, col_offset=0) + exec_ = _ast.Exec( + body=expr, + globals=globals_, + locals=locals_, + lineno=instr.lineno, + col_offset=0, + ) self.ast_stack.append(exec_) @@ -593,7 +677,6 @@ def DUP_TOP_TWO(self, instr): self.ast_stack.append(expr2) self.ast_stack.append(expr1) - def DUP_TOPX(self, instr): exprs = [] @@ -613,7 +696,6 @@ def ROT_THREE(self, instr): self.ast_stack.append(expr3) self.ast_stack.append(expr2) - def ROT_FOUR(self, instr): expr1 = self.ast_stack.pop() expr2 = self.ast_stack.pop() @@ -625,9 +707,6 @@ def ROT_FOUR(self, instr): self.ast_stack.append(expr3) self.ast_stack.append(expr2) - - - def PRINT_ITEM(self, instr): item = self.ast_stack.pop() @@ -640,7 +719,9 @@ def PRINT_ITEM(self, instr): if isinstance(print_, _ast_Print) and not print_.nl and print_.dest is None: print_.values.append(item) else: - print_ = _ast_Print(dest=None, values=[item], nl=False, lineno=instr.lineno, col_offset=0) + print_ = _ast_Print( + dest=None, values=[item], nl=False, lineno=instr.lineno, col_offset=0 + ) self.ast_stack.append(print_) def PRINT_NEWLINE(self, instr): @@ -649,7 +730,9 @@ def PRINT_NEWLINE(self, instr): if isinstance(item, _ast_Print) and not item.nl and item.dest is None: item.nl = True else: - print_ = _ast_Print(dest=None, values=[], nl=True, lineno=instr.lineno, col_offset=0) + print_ = _ast_Print( + dest=None, values=[], nl=True, lineno=instr.lineno, col_offset=0 + ) self.ast_stack.append(print_) def PRINT_ITEM_TO(self, instr): @@ -665,7 +748,9 @@ def PRINT_ITEM_TO(self, instr): assert dup_print is print_ self.ast_stack.append(stream) else: - print_ = _ast_Print(dest=stream, values=[], nl=False, lineno=instr.lineno, col_offset=0) + print_ = _ast_Print( + dest=stream, values=[], nl=False, lineno=instr.lineno, col_offset=0 + ) item = self.ast_stack.pop() @@ -682,10 +767,11 @@ def PRINT_NEWLINE_TO(self, instr): if isinstance(item, _ast_Print) and not item.nl and item.dest is stream: item.nl = True else: - print_ = _ast_Print(dest=stream, values=[], nl=True, lineno=instr.lineno, col_offset=0) + print_ = _ast_Print( + dest=stream, values=[], nl=True, lineno=instr.lineno, col_offset=0 + ) self.ast_stack.append(print_) - def format_slice(self, index, kw): if isinstance(index, _ast.Tuple): @@ -721,7 +807,7 @@ def BINARY_SUBSCR(self, instr): self.ast_stack.append(subscr) def SLICE_0(self, instr): - 'obj[:]' + "obj[:]" value = self.ast_stack.pop() kw = dict(lineno=instr.lineno, col_offset=0) @@ -731,7 +817,7 @@ def SLICE_0(self, instr): self.ast_stack.append(subscr) def SLICE_1(self, instr): - 'obj[lower:]' + "obj[lower:]" lower = self.ast_stack.pop() value = self.ast_stack.pop() @@ -742,7 +828,7 @@ def SLICE_1(self, instr): self.ast_stack.append(subscr) def SLICE_2(self, instr): - 'obj[:stop]' + "obj[:stop]" upper = self.ast_stack.pop() value = self.ast_stack.pop() @@ -752,9 +838,8 @@ def SLICE_2(self, instr): self.ast_stack.append(subscr) - def SLICE_3(self, instr): - 'obj[lower:upper]' + "obj[lower:upper]" upper = self.ast_stack.pop() lower = self.ast_stack.pop() value = self.ast_stack.pop() @@ -765,7 +850,6 @@ def SLICE_3(self, instr): self.ast_stack.append(subscr) - def BUILD_SLICE(self, instr): step = None @@ -788,7 +872,7 @@ def BUILD_SLICE(self, instr): self.ast_stack.append(slice) def STORE_SLICE_0(self, instr): - 'obj[:] = expr' + "obj[:] = expr" value = self.ast_stack.pop() expr = self.ast_stack.pop() @@ -800,7 +884,7 @@ def STORE_SLICE_0(self, instr): self.ast_stack.append(assign) def STORE_SLICE_1(self, instr): - 'obj[lower:] = expr' + "obj[lower:] = expr" lower = self.ast_stack.pop() value = self.ast_stack.pop() expr = self.ast_stack.pop() @@ -812,9 +896,8 @@ def STORE_SLICE_1(self, instr): assign = _ast.Assign(targets=[subscr], value=expr, **kw) self.ast_stack.append(assign) - def STORE_SLICE_2(self, instr): - 'obj[:upper] = expr' + "obj[:upper] = expr" upper = self.ast_stack.pop() value = self.ast_stack.pop() expr = self.ast_stack.pop() @@ -827,7 +910,7 @@ def STORE_SLICE_2(self, instr): self.ast_stack.append(assign) def STORE_SLICE_3(self, instr): - 'obj[lower:upper] = expr' + "obj[lower:upper] = expr" upper = self.ast_stack.pop() lower = self.ast_stack.pop() @@ -849,7 +932,7 @@ def STORE_SLICE_3(self, instr): self.ast_stack.append(assign) def DELETE_SLICE_0(self, instr): - 'obj[:] = expr' + "obj[:] = expr" value = self.ast_stack.pop() kw = dict(lineno=instr.lineno, col_offset=0) @@ -860,7 +943,7 @@ def DELETE_SLICE_0(self, instr): self.ast_stack.append(delete) def DELETE_SLICE_1(self, instr): - 'obj[lower:] = expr' + "obj[lower:] = expr" lower = self.ast_stack.pop() value = self.ast_stack.pop() @@ -871,9 +954,8 @@ def DELETE_SLICE_1(self, instr): delete = _ast.Delete(targets=[subscr], **kw) self.ast_stack.append(delete) - def DELETE_SLICE_2(self, instr): - 'obj[:upper] = expr' + "obj[:upper] = expr" upper = self.ast_stack.pop() value = self.ast_stack.pop() @@ -885,7 +967,7 @@ def DELETE_SLICE_2(self, instr): self.ast_stack.append(delete) def DELETE_SLICE_3(self, instr): - 'obj[lower:upper] = expr' + "obj[lower:upper] = expr" upper = self.ast_stack.pop() lower = self.ast_stack.pop() value = self.ast_stack.pop() @@ -943,8 +1025,9 @@ def RAISE_VARARGS(self, instr): if nargs > 0: type = self.ast_stack.pop() - raise_ = _ast.Raise(tback=tback, inst=inst, type=type, - lineno=instr.lineno, col_offset=0) + raise_ = _ast.Raise( + tback=tback, inst=inst, type=type, lineno=instr.lineno, col_offset=0 + ) self.ast_stack.append(raise_) @RAISE_VARARGS.py3op @@ -959,8 +1042,7 @@ def RAISE_VARARGS(self, instr): if nargs > 0: exc = self.ast_stack.pop() - raise_ = _ast.Raise(exc=exc, cause=cause, - lineno=instr.lineno, col_offset=0) + raise_ = _ast.Raise(exc=exc, cause=cause, lineno=instr.lineno, col_offset=0) self.ast_stack.append(raise_) @py3op @@ -973,7 +1055,9 @@ def EXTENDED_ARG(self, instr): kw = dict(lineno=instr.lineno, col_offset=0) for argument_name in argument_names.elts[::-1]: annotation = self.ast_stack.pop() - arg = _ast.arg(annotation=annotation, arg=argument_name.s, **kw) #@UndefinedVariable + arg = _ast.arg( + annotation=annotation, arg=argument_name.s, **kw + ) # @UndefinedVariable args.append(arg) for arg in args: diff --git a/src/python/turicreate/meta/decompiler/tests/__init__.py b/src/python/turicreate/meta/decompiler/tests/__init__.py index 68e3f56ced..b12b27d175 100644 --- a/src/python/turicreate/meta/decompiler/tests/__init__.py +++ b/src/python/turicreate/meta/decompiler/tests/__init__.py @@ -19,10 +19,10 @@ else: from io import StringIO -filename = 'tests.py' +filename = "tests.py" -class Base(unittest.TestCase): +class Base(unittest.TestCase): def assertAstEqual(self, left, right): if not isinstance(left, _ast.AST): @@ -34,33 +34,38 @@ def assertAstEqual(self, left, right): if not result: lstream = StringIO() - print_ast(left, indent='', file=lstream, newline='') + print_ast(left, indent="", file=lstream, newline="") rstream = StringIO() - print_ast(right, indent='', file=rstream, newline='') + print_ast(right, indent="", file=rstream, newline="") lstream.seek(0) rstream.seek(0) - msg = 'Ast Not Equal:\nGenerated: %r\nExpected: %r' % (lstream.read(), rstream.read()) + msg = "Ast Not Equal:\nGenerated: %r\nExpected: %r" % ( + lstream.read(), + rstream.read(), + ) raise self.failureException(msg) - - def statement(self, stmnt, equiv=None, expected_ast=None): - expected_ast = compile(stmnt, filename, 'exec', _ast.PyCF_ONLY_AST) if expected_ast is None else expected_ast - code = compile(expected_ast, filename, 'exec') + expected_ast = ( + compile(stmnt, filename, "exec", _ast.PyCF_ONLY_AST) + if expected_ast is None + else expected_ast + ) + code = compile(expected_ast, filename, "exec") if equiv is None: mod_ast = make_module(code) else: mod_ast = make_module(code) - expected_ast = compile(equiv, filename, 'exec', _ast.PyCF_ONLY_AST) + expected_ast = compile(equiv, filename, "exec", _ast.PyCF_ONLY_AST) self.assertAstEqual(mod_ast, expected_ast) - code = compile(mod_ast, filename, 'exec') - expected_code = compile(expected_ast, filename, 'exec') + code = compile(mod_ast, filename, "exec") + expected_code = compile(expected_ast, filename, "exec") self.assertEqual(code.co_code, expected_code.co_code) diff --git a/src/python/turicreate/meta/decompiler/tests/test_comprehensions.py b/src/python/turicreate/meta/decompiler/tests/test_comprehensions.py index 76d8cbf800..de0ea94207 100644 --- a/src/python/turicreate/meta/decompiler/tests/test_comprehensions.py +++ b/src/python/turicreate/meta/decompiler/tests/test_comprehensions.py @@ -3,188 +3,186 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Created on Nov 6, 2011 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ import unittest from ...decompiler.tests import Base -class ListComprehension(Base): +class ListComprehension(Base): def test_comp1(self): - stmnt = '[a for b in c]' + stmnt = "[a for b in c]" self.statement(stmnt) def test_comp2(self): - stmnt = '[a() +1 for b in c]' + stmnt = "[a() +1 for b in c]" self.statement(stmnt) def test_comp3(self): - stmnt = 'y = [a() +1 for b in c]' + stmnt = "y = [a() +1 for b in c]" self.statement(stmnt) def test_comp_ifs(self): - stmnt = 'y = [a() +1 for b in c if asdf]' + stmnt = "y = [a() +1 for b in c if asdf]" self.statement(stmnt) def test_comp_ifs1(self): - stmnt = 'y = [a() +1 for b in c if asdf if asd]' + stmnt = "y = [a() +1 for b in c if asdf if asd]" self.statement(stmnt) def test_comp_ifs2(self): - stmnt = 'y = [a() +1 for b in c if asdf if not asd]' + stmnt = "y = [a() +1 for b in c if asdf if not asd]" self.statement(stmnt) @unittest.expectedFailure def test_multi_comp1(self): - stmnt = '[a for b in c for d in e]' + stmnt = "[a for b in c for d in e]" self.statement(stmnt) @unittest.expectedFailure def test_multi_comp2(self): - stmnt = '[a() +1 for b in c for d in e]' + stmnt = "[a() +1 for b in c for d in e]" self.statement(stmnt) @unittest.expectedFailure def test_multi_comp3(self): - stmnt = 'y = [a() +1 for b in c for d in e]' + stmnt = "y = [a() +1 for b in c for d in e]" self.statement(stmnt) @unittest.expectedFailure def test_multi_comp_ifs(self): - stmnt = 'y = [a() +1 for b in c if asdf for d in e]' + stmnt = "y = [a() +1 for b in c if asdf for d in e]" self.statement(stmnt) @unittest.expectedFailure def test_multi_comp_ifs1(self): - stmnt = 'y = [a() +1 for b in c if asdf if asd for d in e if this]' + stmnt = "y = [a() +1 for b in c if asdf if asd for d in e if this]" self.statement(stmnt) @unittest.expectedFailure def test_multi_comp_ifs2(self): - stmnt = 'y = [a() +1 for b in c for d in e if adsf]' + stmnt = "y = [a() +1 for b in c for d in e if adsf]" self.statement(stmnt) class SetComprehension(Base): - def test_comp1(self): - stmnt = '{a for b in c}' + stmnt = "{a for b in c}" self.statement(stmnt) def test_comp2(self): - stmnt = '{a() +1 for b in c}' + stmnt = "{a() +1 for b in c}" self.statement(stmnt) def test_comp3(self): - stmnt = 'y = {a() +1 for b in c}' + stmnt = "y = {a() +1 for b in c}" self.statement(stmnt) def test_comp_ifs(self): - stmnt = 'y = {a() +1 for b in c if asdf}' + stmnt = "y = {a() +1 for b in c if asdf}" self.statement(stmnt) def test_comp_ifs1(self): - stmnt = 'y = {a() +1 for b in c if asdf if asd}' + stmnt = "y = {a() +1 for b in c if asdf if asd}" self.statement(stmnt) def test_comp_ifs2(self): - stmnt = 'y = {a() +1 for b in c if asdf if not asd}' + stmnt = "y = {a() +1 for b in c if asdf if not asd}" self.statement(stmnt) @unittest.expectedFailure def test_multi_comp1(self): - stmnt = '{a for b in c for d in e}' + stmnt = "{a for b in c for d in e}" self.statement(stmnt) @unittest.expectedFailure def test_multi_comp2(self): - stmnt = '{a() +1 for b in c for d in e}' + stmnt = "{a() +1 for b in c for d in e}" self.statement(stmnt) @unittest.expectedFailure def test_multi_comp3(self): - stmnt = 'y = {a() +1 for b in c for d in e}' + stmnt = "y = {a() +1 for b in c for d in e}" self.statement(stmnt) @unittest.expectedFailure def test_multi_comp_ifs(self): - stmnt = 'y = {a() +1 for b in c if asdf for d in e}' + stmnt = "y = {a() +1 for b in c if asdf for d in e}" self.statement(stmnt) @unittest.expectedFailure def test_multi_comp_ifs1(self): - stmnt = 'y = {a() +1 for b in c if asdf if asd for d in e if this}' + stmnt = "y = {a() +1 for b in c if asdf if asd for d in e if this}" self.statement(stmnt) @unittest.expectedFailure def test_multi_comp_ifs2(self): - stmnt = 'y = {a() +1 for b in c for d in e if adsf}' + stmnt = "y = {a() +1 for b in c for d in e if adsf}" self.statement(stmnt) class DictComprehension(Base): - def test_comp1(self): - stmnt = '{a:q for b in c}' + stmnt = "{a:q for b in c}" self.statement(stmnt) def test_comp2(self): - stmnt = '{a() +1:q for b in c}' + stmnt = "{a() +1:q for b in c}" self.statement(stmnt) def test_comp3(self): - stmnt = 'y = {a() +1:q for b in c}' + stmnt = "y = {a() +1:q for b in c}" self.statement(stmnt) def test_comp_ifs(self): - stmnt = 'y = {a() +1:q for b in c if asdf}' + stmnt = "y = {a() +1:q for b in c if asdf}" self.statement(stmnt) def test_comp_ifs1(self): - stmnt = 'y = {a() +1:q for b in c if asdf if asd}' + stmnt = "y = {a() +1:q for b in c if asdf if asd}" self.statement(stmnt) def test_comp_ifs2(self): - stmnt = 'y = {a() +1:q for b in c if asdf if not asd}' + stmnt = "y = {a() +1:q for b in c if asdf if not asd}" self.statement(stmnt) @unittest.expectedFailure def test_multi_comp1(self): - stmnt = '{a:q for b in c for d in e}' + stmnt = "{a:q for b in c for d in e}" self.statement(stmnt) @unittest.expectedFailure def test_multi_comp2(self): - stmnt = '{a():q +1 for b in c for d in e}' + stmnt = "{a():q +1 for b in c for d in e}" self.statement(stmnt) @unittest.expectedFailure def test_multi_comp3(self): - stmnt = 'y = {a() +1:q for b in c for d in e}' + stmnt = "y = {a() +1:q for b in c for d in e}" self.statement(stmnt) @unittest.expectedFailure def test_multi_comp_ifs(self): - stmnt = 'y = {a() +1:q for b in c if asdf for d in e}' + stmnt = "y = {a() +1:q for b in c if asdf for d in e}" self.statement(stmnt) @unittest.expectedFailure def test_multi_comp_ifs1(self): - stmnt = 'y = {a() +1:q for b in c if asdf if asd for d in e if this}' + stmnt = "y = {a() +1:q for b in c if asdf if asd for d in e if this}" self.statement(stmnt) @unittest.expectedFailure def test_multi_comp_ifs2(self): - stmnt = 'y = {a() +1:q for b in c for d in e if adsf}' + stmnt = "y = {a() +1:q for b in c for d in e if adsf}" self.statement(stmnt) if __name__ == "__main__": - #import sys;sys.argv = ['', 'Test.testName'] + # import sys;sys.argv = ['', 'Test.testName'] unittest.main() diff --git a/src/python/turicreate/meta/decompiler/tests/test_decompiler.py b/src/python/turicreate/meta/decompiler/tests/test_decompiler.py index aa524ef68c..77ceb5ac76 100644 --- a/src/python/turicreate/meta/decompiler/tests/test_decompiler.py +++ b/src/python/turicreate/meta/decompiler/tests/test_decompiler.py @@ -3,11 +3,11 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Created on Jul 14, 2011 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -16,184 +16,180 @@ from ...decompiler.tests import Base -filename = 'tests.py' +filename = "tests.py" class LogicJumps(Base): - def test_logic1(self): - 'a and b or c' - self.statement('a and b or c') + "a and b or c" + self.statement("a and b or c") def test_logic2(self): - 'a or (b or c)' - self.statement('a or (b or c)') - + "a or (b or c)" + self.statement("a or (b or c)") def test_if_expr_discard(self): - stmnt = 'a if b else c' + stmnt = "a if b else c" self.statement(stmnt) @unittest.skip("I think this may be a bug in python") def test_if_expr_const_bug(self): - stmnt = '0 if 1 else 2' + stmnt = "0 if 1 else 2" self.statement(stmnt) def test_if_expr_assign(self): - stmnt = 'd = a if b else c' + stmnt = "d = a if b else c" self.statement(stmnt) def test_if_expr_assignattr(self): - stmnt = 'd.a = a if b else c' + stmnt = "d.a = a if b else c" self.statement(stmnt) def test_bug010(self): - stmnt = ''' + stmnt = """ def foo(): if a: return 1 else: return 2 - ''' + """ - equiv = ''' + equiv = """ def foo(): if a: return 1 return 2 return None - ''' + """ self.statement(stmnt, equiv=equiv) @unittest.expectedFailure def test_bug011(self): - stmnt = ''' + stmnt = """ def foo(): if a or b or c: return 1 else: return 2 - ''' + """ self.statement(stmnt) -class Function(Base): +class Function(Base): def test_function(self): - stmnt = ''' + stmnt = """ def foo(): return None -''' +""" self.statement(stmnt) def test_function_args(self): - stmnt = ''' + stmnt = """ def foo(a, b, c='asdf'): return None -''' +""" self.statement(stmnt) - def test_function_var_args(self): - stmnt = ''' + stmnt = """ def foo(a, b, *c): return None -''' +""" self.statement(stmnt) - def test_function_varkw_args(self): - stmnt = ''' + stmnt = """ def foo(a, b, *c, **d): return None -''' +""" self.statement(stmnt) def test_function_kw_args(self): - stmnt = ''' + stmnt = """ def foo(a, b, **d): return None -''' +""" self.statement(stmnt) def test_function_yield(self): - stmnt = ''' + stmnt = """ def foo(a, b): yield a + b return -''' +""" self.statement(stmnt) def test_function_decorator(self): - stmnt = ''' + stmnt = """ @bar def foo(a, b): return None -''' +""" self.statement(stmnt) def test_function_decorator2(self): - stmnt = ''' + stmnt = """ @bar @bar2 def foo(a, b): return None -''' +""" self.statement(stmnt) def test_build_lambda(self): - stmnt = 'lambda a: a' + stmnt = "lambda a: a" self.statement(stmnt) def test_build_lambda1(self): - stmnt = 'func = lambda a, b: a+1' + stmnt = "func = lambda a, b: a+1" self.statement(stmnt) def test_build_lambda_var_args(self): - stmnt = 'func = lambda a, *b: a+1' + stmnt = "func = lambda a, *b: a+1" self.statement(stmnt) def test_build_lambda_kw_args(self): - stmnt = 'func = lambda **b: a+1' + stmnt = "func = lambda **b: a+1" self.statement(stmnt) def test_build_lambda_varkw_args(self): - stmnt = 'func = lambda *a, **b: a+1' + stmnt = "func = lambda *a, **b: a+1" self.statement(stmnt) class ClassDef(Base): def test_build_class(self): - stmnt = ''' + stmnt = """ class Bar(object): 'adsf' a = 1 -''' +""" self.statement(stmnt) def test_build_class_wfunc(self): - stmnt = ''' + stmnt = """ class Bar(object): 'adsf' a = 1 def foo(self): return None -''' +""" self.statement(stmnt) def test_build_class_wdec(self): - stmnt = ''' + stmnt = """ @decorator class Bar(object): 'adsf' @@ -201,101 +197,99 @@ class Bar(object): def foo(self): return None -''' +""" self.statement(stmnt) - class ControlFlow(Base): def test_if(self): - self.statement('if a: b') + self.statement("if a: b") def test_if2(self): - self.statement('if a: b or c') + self.statement("if a: b or c") def test_if3(self): - self.statement('if a and b: c') + self.statement("if a and b: c") def test_if4(self): - self.statement('if a or b: c') + self.statement("if a or b: c") def test_if5(self): - self.statement('if not a: c') + self.statement("if not a: c") def test_if6(self): - self.statement('if not a or b: c') + self.statement("if not a or b: c") def test_elif(self): - stmnt = '''if a: + stmnt = """if a: b elif c: - d''' + d""" self.statement(stmnt) def test_if_else(self): - stmnt = '''if a: + stmnt = """if a: b else: - d''' + d""" self.statement(stmnt) def test_if_elif_else(self): - stmnt = '''if a: + stmnt = """if a: b elif f: d else: - d''' + d""" self.statement(stmnt) def test_tryexcept1(self): - stmnt = ''' + stmnt = """ try: foo except: bar -''' +""" self.statement(stmnt) def test_tryexcept_else(self): - stmnt = ''' + stmnt = """ try: foo except: bar else: baz -''' +""" self.statement(stmnt) def test_tryexcept2(self): - stmnt = ''' + stmnt = """ try: foo except Exception: bar else: baz -''' +""" self.statement(stmnt) - def test_tryexcept3(self): - stmnt = ''' + stmnt = """ try: foo except Exception as error: bar else: baz -''' +""" self.statement(stmnt) def test_tryexcept4(self): - stmnt = ''' + stmnt = """ try: foo except Exception as error: @@ -304,179 +298,178 @@ def test_tryexcept4(self): bar else: baz -''' +""" self.statement(stmnt) def test_while(self): - self.statement('while b: a') + self.statement("while b: a") def test_while1(self): - self.statement('while 1: a') - + self.statement("while 1: a") def test_while_logic(self): - self.statement('while a or b: x') + self.statement("while a or b: x") def test_while_logic2(self): - self.statement('while a and b: x') + self.statement("while a and b: x") def test_while_logic3(self): - self.statement('while a >= r and b == c: x') + self.statement("while a >= r and b == c: x") def test_while_else(self): - stmnt = ''' + stmnt = """ while a: break else: a -''' +""" self.statement(stmnt) def test_for(self): - stmnt = ''' + stmnt = """ for i in a: break -''' +""" self.statement(stmnt) def test_for2(self): - stmnt = ''' + stmnt = """ for i in a: b = 3 -''' +""" self.statement(stmnt) def test_for_else(self): - stmnt = ''' + stmnt = """ for i in a: b = 3 else: b= 2 -''' +""" self.statement(stmnt) def test_for_continue(self): - stmnt = ''' + stmnt = """ for i in a: b = 3 continue -''' +""" self.statement(stmnt) def test_for_unpack(self): - stmnt = ''' + stmnt = """ for i,j in a: b = 3 -''' +""" self.statement(stmnt) def test_try_continue(self): - stmnt = ''' + stmnt = """ for x in (1,2): try: continue except: pass -''' +""" self.statement(stmnt) def test_loop_01(self): - stmnt = ''' + stmnt = """ if c > d: if e > f: g h -''' +""" + def test_loop_bug(self): - stmnt = ''' + stmnt = """ for a in b: if c > d: if e > f: g h -''' +""" self.statement(stmnt) def test_while_bug(self): - stmnt = ''' + stmnt = """ while a: q while b: w -''' +""" self.statement(stmnt) @unittest.expectedFailure def test_while_bug02(self): - stmnt = ''' + stmnt = """ while 1: b += y if b < x: break -''' +""" self.statement(stmnt) -class Complex(Base): +class Complex(Base): def test_if_in_for(self): - stmnt = ''' + stmnt = """ for i in j: if i: j =1 -''' +""" self.statement(stmnt) def test_if_in_for2(self): - stmnt = ''' + stmnt = """ for i in j: if i: a else: b -''' +""" self.statement(stmnt) def test_if_in_for3(self): - stmnt = ''' + stmnt = """ for i in j: if i: break else: continue -''' - equiv = ''' +""" + equiv = """ for i in j: if i: break continue -''' +""" self.statement(stmnt, equiv) def test_if_in_while(self): - stmnt = ''' + stmnt = """ while i in j: if i: a else: b -''' +""" self.statement(stmnt) - def test_nested_if(self): - stmnt = ''' + stmnt = """ if a: if b: c else: d -''' +""" self.statement(stmnt) def test_nested_if2(self): - stmnt = ''' + stmnt = """ if a: if b: c @@ -484,20 +477,20 @@ def test_nested_if2(self): d else: b -''' +""" self.statement(stmnt) def test_if_return(self): - stmnt = ''' + stmnt = """ def a(): if b: return None return None -''' +""" self.statement(stmnt) def test_if_return2(self): - stmnt = ''' + stmnt = """ def a(): if b: a @@ -505,22 +498,20 @@ def a(): return b return c -''' +""" self.statement(stmnt) - def test_nested_while_bug(self): - stmnt = ''' + stmnt = """ if gid == 0: output[0] = initial while i < input.size: output[0] += shared[i] -''' +""" self.statement(stmnt) - def test_aug_assign_slice(self): - stmnt = 'c[idx:a:3] += b[idx:a]' + stmnt = "c[idx:a:3] += b[idx:a]" self.statement(stmnt) def test_issue_4(self): @@ -536,5 +527,5 @@ def example(idx): if __name__ == "__main__": - #import sys;sys.argv = ['', 'Test.test_assign'] + # import sys;sys.argv = ['', 'Test.test_assign'] unittest.main() diff --git a/src/python/turicreate/meta/decompiler/tests/test_simple.py b/src/python/turicreate/meta/decompiler/tests/test_simple.py index 43e902c940..25ff3172e2 100644 --- a/src/python/turicreate/meta/decompiler/tests/test_simple.py +++ b/src/python/turicreate/meta/decompiler/tests/test_simple.py @@ -3,11 +3,11 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Created on Nov 9, 2011 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -17,347 +17,344 @@ class Simple(Base): - def test_assign(self): - 'a = b' - self.statement('a = b') + "a = b" + self.statement("a = b") def test_assign2(self): - 'a = b = c' - self.statement('a = b') + "a = b = c" + self.statement("a = b") def test_assign3(self): - 'a = b,d = c' - self.statement('a = b') + "a = b,d = c" + self.statement("a = b") def test_assign4(self): - 'a.y = b,d = c' - self.statement('a = b') + "a.y = b,d = c" + self.statement("a = b") def test_setattr(self): - 'a.b = b' - self.statement('a.b = b') + "a.b = b" + self.statement("a.b = b") def test_getattr(self): - 'a = b.b' - self.statement('a = b.b') + "a = b.b" + self.statement("a = b.b") def test_add(self): - 'a+b' - self.statement('a+b') + "a+b" + self.statement("a+b") def test_sub(self): - 'a-b' - self.statement('a-b') + "a-b" + self.statement("a-b") def test_mul(self): - 'a*b' - self.statement('a*b') + "a*b" + self.statement("a*b") def test_div(self): - 'a/b' - self.statement('a/b') + "a/b" + self.statement("a/b") def test_floordiv(self): - 'a//b' - self.statement('a//b') + "a//b" + self.statement("a//b") def test_pow(self): - 'a**b' - self.statement('a**b') + "a**b" + self.statement("a**b") def test_eq(self): - 'a==b' - self.statement('a==b') + "a==b" + self.statement("a==b") def test_iadd(self): - 'a+=b' - self.statement('a+=b') + "a+=b" + self.statement("a+=b") def test_isub(self): - 'a-=b' - self.statement('a-=b') + "a-=b" + self.statement("a-=b") def test_binary_and(self): - 'a & b' - self.statement('a & b') + "a & b" + self.statement("a & b") def test_binary_lshift(self): - 'a << b' - self.statement('a << b') + "a << b" + self.statement("a << b") def test_binary_rshift(self): - 'a >> b' - self.statement('a >> b') + "a >> b" + self.statement("a >> b") def test_binary_mod(self): - 'a % b' - self.statement('a % b') + "a % b" + self.statement("a % b") def test_binary_or(self): - 'a | b' - self.statement('a | b') + "a | b" + self.statement("a | b") def test_binary_xor(self): - 'a ^ b' - self.statement('a ^ b') + "a ^ b" + self.statement("a ^ b") def test_build_list(self): - '[x,y, 1, None]' - self.statement('[x,y, 1, None]') + "[x,y, 1, None]" + self.statement("[x,y, 1, None]") def test_build_tuple(self): - '(x,y, 1, None)' - self.statement('(x,y, 1, None)') + "(x,y, 1, None)" + self.statement("(x,y, 1, None)") def test_build_set(self): - '{x,y, 1, None}' - self.statement('{x,y, 1, None}') + "{x,y, 1, None}" + self.statement("{x,y, 1, None}") def test_build_dict(self): - '{a:x,b:y, c:1, d:None}' - self.statement('{a:x,b:y, c:1, d:None}') + "{a:x,b:y, c:1, d:None}" + self.statement("{a:x,b:y, c:1, d:None}") def test_unpack_tuple(self): - '(a,b) = c' - self.statement('(a,b) = c') - + "(a,b) = c" + self.statement("(a,b) = c") def test_delete_name(self): - stmnt = 'del a' + stmnt = "del a" self.statement(stmnt) def test_delete_attr(self): - stmnt = 'del a.a' + stmnt = "del a.a" self.statement(stmnt) @py2only def test_exec1(self): - stmnt = 'exec a' + stmnt = "exec a" self.statement(stmnt) @py2only def test_exec2(self): - stmnt = 'exec a in b' + stmnt = "exec a in b" self.statement(stmnt) @py2only def test_exec3(self): - stmnt = 'exec a in b,c' + stmnt = "exec a in b,c" self.statement(stmnt) @py2only def test_exec4(self): - stmnt = 'exec a in {2:1}, { }' + stmnt = "exec a in {2:1}, { }" self.statement(stmnt) def test_import_star(self): - stmnt = 'from a import *' + stmnt = "from a import *" self.statement(stmnt) - stmnt = 'from a.v import *' + stmnt = "from a.v import *" self.statement(stmnt) def test_import(self): - stmnt = 'import a' + stmnt = "import a" self.statement(stmnt) def test_import_as(self): - stmnt = 'import a as b' + stmnt = "import a as b" self.statement(stmnt) def test_import_from(self): - stmnt = 'from c import a as b' + stmnt = "from c import a as b" self.statement(stmnt) def test_import_from2(self): - stmnt = 'from c import a \nimport x' + stmnt = "from c import a \nimport x" self.statement(stmnt) def test_not(self): - stmnt = 'not a' + stmnt = "not a" self.statement(stmnt) - def test_call(self): - stmnt = 'a()' + stmnt = "a()" self.statement(stmnt) def test_call_args(self): - stmnt = 'a(a, b)' + stmnt = "a(a, b)" self.statement(stmnt) def test_call_args1(self): - stmnt = 'a(a, b, c=33)' + stmnt = "a(a, b, c=33)" self.statement(stmnt) def test_call_varargs(self): - stmnt = 'a(*a)' + stmnt = "a(*a)" self.statement(stmnt) def test_call_kwargs(self): - stmnt = 'a(a, b=0, **a)' + stmnt = "a(a, b=0, **a)" self.statement(stmnt) def test_call_var_kwargs(self): - stmnt = 'a(a, b=0, *d, **a)' + stmnt = "a(a, b=0, *d, **a)" self.statement(stmnt) @py2only def test_print(self): - stmnt = 'print foo,' + stmnt = "print foo," self.statement(stmnt) @py2only def test_printnl(self): - stmnt = 'print foo' + stmnt = "print foo" self.statement(stmnt) @py2only def test_printitems(self): - stmnt = 'print foo, bar, bas,' + stmnt = "print foo, bar, bas," self.statement(stmnt) @py2only def test_printitemsnl(self): - stmnt = 'print foo, bar, bas' + stmnt = "print foo, bar, bas" self.statement(stmnt) @py2only def test_print_to(self): - stmnt = 'print >> stream, foo,' + stmnt = "print >> stream, foo," self.statement(stmnt) @py2only def test_print_to_nl(self): - stmnt = 'print >> stream, foo' + stmnt = "print >> stream, foo" self.statement(stmnt) @py2only def test_printitems_to(self): - stmnt = 'print >> stream, foo, bar, bas,' + stmnt = "print >> stream, foo, bar, bas," self.statement(stmnt) @py2only def test_printitems_to_nl(self): - stmnt = 'print >> stream, foo, bar, bas' + stmnt = "print >> stream, foo, bar, bas" self.statement(stmnt) def test_subscr(self): - stmnt = 'x[y]' + stmnt = "x[y]" self.statement(stmnt) def test_subscr_assign(self): - stmnt = 'x[y] =z' + stmnt = "x[y] =z" self.statement(stmnt) def test_subscr_del(self): - stmnt = 'del x[y]' + stmnt = "del x[y]" self.statement(stmnt) def test_subscr0(self): - stmnt = 'x[:]' + stmnt = "x[:]" self.statement(stmnt) def test_subscr_assign0(self): - stmnt = 'x[:] =z' + stmnt = "x[:] =z" self.statement(stmnt) def test_subscr_del0(self): - stmnt = 'del x[:]' + stmnt = "del x[:]" self.statement(stmnt) def test_subscr1(self): - stmnt = 'x[a:]' + stmnt = "x[a:]" self.statement(stmnt) def test_subscr_assign1(self): - stmnt = 'x[a:] =z' + stmnt = "x[a:] =z" self.statement(stmnt) def test_subscr_del1(self): - stmnt = 'del x[a:]' + stmnt = "del x[a:]" self.statement(stmnt) def test_subscr2(self): - stmnt = 'x[:a]' + stmnt = "x[:a]" self.statement(stmnt) def test_subscr_assign2(self): - stmnt = 'x[:a] =z' + stmnt = "x[:a] =z" self.statement(stmnt) def test_subscr_del2(self): - stmnt = 'del x[:a]' + stmnt = "del x[:a]" self.statement(stmnt) def test_subscr3(self): - stmnt = 'x[b:a]' + stmnt = "x[b:a]" self.statement(stmnt) def test_subscr_assign3(self): - stmnt = 'x[b:a] =z' + stmnt = "x[b:a] =z" self.statement(stmnt) def test_subscr_del3(self): - stmnt = 'del x[b:a]' + stmnt = "del x[b:a]" self.statement(stmnt) def test_subscrX(self): - stmnt = 'x[b:a:c]' + stmnt = "x[b:a:c]" self.statement(stmnt) def test_subscr_assignX(self): - stmnt = 'x[b:a:c] =z' + stmnt = "x[b:a:c] =z" self.statement(stmnt) def test_subscr_delX(self): - stmnt = 'del x[b:a:c]' + stmnt = "del x[b:a:c]" self.statement(stmnt) def test_subscrX2(self): - stmnt = 'x[::]' + stmnt = "x[::]" self.statement(stmnt) def test_subscr_assignX2(self): - stmnt = 'x[::] =z' + stmnt = "x[::] =z" self.statement(stmnt) def test_subscr_delX2(self): - stmnt = 'del x[::]' + stmnt = "del x[::]" self.statement(stmnt) def test_subscr_tuple(self): - stmnt = 'x[x,a]' + stmnt = "x[x,a]" self.statement(stmnt) def test_subscr_tuple_set(self): - stmnt = 'x[x,a] =z' + stmnt = "x[x,a] =z" self.statement(stmnt) def test_subscr_tuple_del(self): - stmnt = 'del x[x,a]' + stmnt = "del x[x,a]" self.statement(stmnt) def test_subscrX3(self): - stmnt = 'x[x,:a]' + stmnt = "x[x,:a]" self.statement(stmnt) def test_subscr_assignX3(self): - stmnt = 'x[x,:a] =z' + stmnt = "x[x,:a] =z" self.statement(stmnt) def test_subscr_delX3(self): - stmnt = 'del x[x,:a]' + stmnt = "del x[x,:a]" self.statement(stmnt) def test_bug_001(self): - stmnt = 'a = 1; b = 2; (a, b) = (b, a)' + stmnt = "a = 1; b = 2; (a, b) = (b, a)" self.statement(stmnt) def test_bug_0021(self): - stmnt = '(a, b, c) = (c, b, a)' + stmnt = "(a, b, c) = (c, b, a)" self.statement(stmnt) def test_bug_002(self): @@ -371,8 +368,10 @@ def test_bug_003(self): self.statement(stmnt) def test_bug_004(self): - stmnt = '(a, b, c) = (c, b, a) = (x, y, z)' + stmnt = "(a, b, c) = (c, b, a) = (x, y, z)" self.statement(stmnt) + + if __name__ == "__main__": - #import sys;sys.argv = ['', 'Test.test_assign'] + # import sys;sys.argv = ['', 'Test.test_assign'] unittest.main() diff --git a/src/python/turicreate/meta/decompiler/util.py b/src/python/turicreate/meta/decompiler/util.py index bd315088f5..9f8046a70d 100644 --- a/src/python/turicreate/meta/decompiler/util.py +++ b/src/python/turicreate/meta/decompiler/util.py @@ -3,25 +3,29 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Created on Jul 15, 2011 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ import _ast import sys + py3 = sys.version_info.major >= 3 + def ast_keys(node): return node._fields + def ast_values(node): return [getattr(node, field, None) for field in node._fields] + def ast_items(node): return [(field, getattr(node, field, None)) for field in node._fields] @@ -29,6 +33,7 @@ def ast_items(node): def depth(node): return len(flatten(node)) + def flatten(node): result = [] diff --git a/src/python/turicreate/meta/scripts/depyc.py b/src/python/turicreate/meta/scripts/depyc.py index 838b5597bb..83a5722948 100644 --- a/src/python/turicreate/meta/scripts/depyc.py +++ b/src/python/turicreate/meta/scripts/depyc.py @@ -3,12 +3,12 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Decompile python byte encoded modules code. Created on Jul 19, 2011 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ @@ -28,52 +28,56 @@ py3 = sys.version_info.major >= 3 + def depyc(args): binary = args.input.read() modtime, code = extract(binary) - print("Decompiling module %r compiled on %s" % (args.input.name, modtime,), file=sys.stderr) + print( + "Decompiling module %r compiled on %s" % (args.input.name, modtime,), + file=sys.stderr, + ) - if args.output_type == 'pyc': + if args.output_type == "pyc": if py3 and args.output is sys.stdout: args.output = sys.stdout.buffer args.output.write(binary) return - if args.output_type == 'opcode': + if args.output_type == "opcode": print_code(code) return mod_ast = make_module(code) - if args.output_type == 'ast': + if args.output_type == "ast": print_ast(mod_ast, file=args.output) return - if args.output_type == 'python': + if args.output_type == "python": python_source(mod_ast, file=args.output) return + raise Exception("unknown output type %r" % args.output_type) - raise Exception("unknown output type %r" % args.output_type) def src_tool(args): print("Analysing python module %r" % (args.input.name,), file=sys.stderr) source = args.input.read() mod_ast = ast.parse(source, args.input.name) - code = compile(source, args.input.name, mode='exec', dont_inherit=True) + code = compile(source, args.input.name, mode="exec", dont_inherit=True) - if args.output_type == 'opcode': + if args.output_type == "opcode": print_code(code) return - elif args.output_type == 'ast': + elif args.output_type == "ast": print_ast(mod_ast, file=args.output) return - elif args.output_type == 'python': + elif args.output_type == "python": print(source.decode(), file=args.output) - elif args.output_type == 'pyc': + elif args.output_type == "pyc": if py3 and args.output is sys.stdout: args.output = sys.stdout.buffer @@ -86,32 +90,44 @@ def src_tool(args): args.output = sys.stdout.buffer create_pyc(source, cfile=args.output, timestamp=timestamp) else: - raise Exception("unknown output type %r" % args.output_type) + raise Exception("unknown output type %r" % args.output_type) return + def setup_parser(parser): - parser.add_argument('input', type=FileType('rb')) - parser.add_argument('-t', '--input-type', default='from_filename', dest='input_type') + parser.add_argument("input", type=FileType("rb")) + parser.add_argument( + "-t", "--input-type", default="from_filename", dest="input_type" + ) - parser.add_argument('-o', '--output', default='-', type=FileType('wb')) + parser.add_argument("-o", "--output", default="-", type=FileType("wb")) group = parser.add_mutually_exclusive_group() - group.add_argument('--python', default='python', action='store_const', const='python', - dest='output_type') - group.add_argument('--ast', action='store_const', const='ast', - dest='output_type') - group.add_argument('--opcode', action='store_const', const='opcode', - dest='output_type') - group.add_argument('--pyc', action='store_const', const='pyc', - dest='output_type') + group.add_argument( + "--python", + default="python", + action="store_const", + const="python", + dest="output_type", + ) + group.add_argument("--ast", action="store_const", const="ast", dest="output_type") + group.add_argument( + "--opcode", action="store_const", const="opcode", dest="output_type" + ) + group.add_argument("--pyc", action="store_const", const="pyc", dest="output_type") + def main(): parser = ArgumentParser(description=__doc__) setup_parser(parser) args = parser.parse_args(sys.argv[1:]) - input_python = args.input.name.endswith('.py') if args.input_type == 'from_filename' else args.input_type == 'python' + input_python = ( + args.input.name.endswith(".py") + if args.input_type == "from_filename" + else args.input_type == "python" + ) if input_python: src_tool(args) @@ -120,5 +136,6 @@ def main(): args.input = sys.stdin.buffer depyc(args) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/src/python/turicreate/meta/testing.py b/src/python/turicreate/meta/testing.py index af52b3b061..3c80a3a8a4 100644 --- a/src/python/turicreate/meta/testing.py +++ b/src/python/turicreate/meta/testing.py @@ -3,11 +3,11 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Created on Nov 5, 2011 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ diff --git a/src/python/turicreate/meta/utils.py b/src/python/turicreate/meta/utils.py index b70a9ca847..1ed24ed142 100644 --- a/src/python/turicreate/meta/utils.py +++ b/src/python/turicreate/meta/utils.py @@ -3,11 +3,11 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Created on Nov 4, 2011 @author: sean -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -15,34 +15,37 @@ py3 = sys.version_info.major >= 3 + class Python2(object): @staticmethod def py2op(func): return func - def __init__(self,*args, **kwargs): + def __init__(self, *args, **kwargs): raise NotImplementedError("This function is not implemented in python 2.x") + def py3op(func): if py3: - func.py2op = lambda _:func + func.py2op = lambda _: func return func else: return Python2 -class Python3(object): - def __init__(self,*args, **kwargs): +class Python3(object): + def __init__(self, *args, **kwargs): raise NotImplementedError("This function is not implemented in python 3.x") @staticmethod def py3op(func): return func + def py2op(func): if not py3: - func.py3op = lambda _:func + func.py3op = lambda _: func return func else: return Python3 diff --git a/src/python/turicreate/test/_test_api_visibility.py b/src/python/turicreate/test/_test_api_visibility.py index 35fc3d1734..d8d9e44f98 100644 --- a/src/python/turicreate/test/_test_api_visibility.py +++ b/src/python/turicreate/test/_test_api_visibility.py @@ -9,315 +9,365 @@ import unittest import turicreate -MODULES = ['aws', 'clustering', 'data_structures', 'graph_analytics', - 'data_matching', 'recommender'] -CLUSTER = ['kmeans', 'dbscan'] -RECOMMENDERS = ['popularity_recommender', - 'item_similarity_recommender', - 'factorization_recommender', - 'ranking_factorization_recommender', - 'item_content_recommender'] -ANOMALY_DETECTION = ['local_outlier_factor', 'moving_zscore', 'bayesian_changepoints'] -DATA_MATCHING = ['autotagger', 'deduplication', 'nearest_neighbor_deduplication', - 'nearest_neighbor_autotagger', 'record_linker', 'similarity_search'] -DATA_STRUCTURES = ['SFrame', 'SArray', 'Graph', 'Vertex', 'Edge'] -GRAPH_ANALYTICS = ['connected_components', 'graph_coloring', 'kcore', 'load_sgraph', 'pagerank', - 'triangle_counting', 'shortest_path'] -GENERAL = ['load_model', 'load_sframe', 'load_sarray', 'Model', 'CustomModel'] +MODULES = [ + "aws", + "clustering", + "data_structures", + "graph_analytics", + "data_matching", + "recommender", +] +CLUSTER = ["kmeans", "dbscan"] +RECOMMENDERS = [ + "popularity_recommender", + "item_similarity_recommender", + "factorization_recommender", + "ranking_factorization_recommender", + "item_content_recommender", +] +ANOMALY_DETECTION = ["local_outlier_factor", "moving_zscore", "bayesian_changepoints"] +DATA_MATCHING = [ + "autotagger", + "deduplication", + "nearest_neighbor_deduplication", + "nearest_neighbor_autotagger", + "record_linker", + "similarity_search", +] +DATA_STRUCTURES = ["SFrame", "SArray", "Graph", "Vertex", "Edge"] +GRAPH_ANALYTICS = [ + "connected_components", + "graph_coloring", + "kcore", + "load_sgraph", + "pagerank", + "triangle_counting", + "shortest_path", +] +GENERAL = ["load_model", "load_sframe", "load_sarray", "Model", "CustomModel"] + class TuriTests(unittest.TestCase): def test_top_level(self): - for x in (MODULES + CLUSTER + DATA_STRUCTURES + GRAPH_ANALYTICS + - GENERAL + RECOMMENDERS + ANOMALY_DETECTION + DATA_MATCHING): + for x in ( + MODULES + + CLUSTER + + DATA_STRUCTURES + + GRAPH_ANALYTICS + + GENERAL + + RECOMMENDERS + + ANOMALY_DETECTION + + DATA_MATCHING + ): self.assertTrue(x in dir(turicreate)) # TODO Test whether or not things are NOT visible + def get_visible_items(d): - return [x for x in dir(d) if not x.startswith('_')] + return [x for x in dir(d) if not x.startswith("_")] + def check_visible_modules(actual, expected): - assert set(actual) == set(expected), "API Surface mis-matched. \ - Expected %s. Got %s" % (expected, actual) + assert set(actual) == set(expected), ( + "API Surface mis-matched. \ + Expected %s. Got %s" + % (expected, actual) + ) -class TabCompleteVisibilityTests(unittest.TestCase): +class TabCompleteVisibilityTests(unittest.TestCase): def test_kmeans(self): # Testing an instantiated class that inherits from Model - sf = turicreate.SFrame({ - "d1": [1,2,3,4,5,6], - "d2": [5,4,3,2,1,6]}) + sf = turicreate.SFrame({"d1": [1, 2, 3, 4, 5, 6], "d2": [5, 4, 3, 2, 1, 6]}) m = turicreate.kmeans.create(sf, num_clusters=3, verbose=False) expected = [ - 'batch_size', - 'row_label_name', - 'cluster_id', - 'cluster_info', - 'features', - 'get', - '_list_fields', - '_list_fields', - 'max_iterations', - 'method', - 'name', - 'num_clusters', - 'num_examples', - 'num_features', - 'num_unpacked_features', - 'save', - 'show', - 'summary', - 'training_iterations', - 'training_time', - 'unpacked_features', - 'verbose', - 'predict'] - - actual = [x for x in dir(m) if not x.startswith('_')] + "batch_size", + "row_label_name", + "cluster_id", + "cluster_info", + "features", + "get", + "_list_fields", + "_list_fields", + "max_iterations", + "method", + "name", + "num_clusters", + "num_examples", + "num_features", + "num_unpacked_features", + "save", + "show", + "summary", + "training_iterations", + "training_time", + "unpacked_features", + "verbose", + "predict", + ] + + actual = [x for x in dir(m) if not x.startswith("_")] check_visible_modules(actual, expected) def test_supervised(self): # Testing an instantiated class that inherits from Model - sf = turicreate.SFrame({ - "d1": [1,2,3,4,5,6], - "d2": [5,4,3,2,1,6]}) - m = turicreate.linear_regression.create(sf, target='d1') - - expected = ['coefficients', - 'convergence_threshold', - 'evaluate', - 'export_coreml', - 'feature_rescaling', - 'features', - 'get', - 'l1_penalty', - 'l2_penalty', - 'lbfgs_memory_level', - '_list_fields', - '_list_fields', - 'max_iterations', - 'name', - 'num_coefficients', - 'num_examples', - 'num_features', - 'num_unpacked_features', - 'predict', - 'progress', - 'save', - 'show', - 'solver', - 'step_size', - 'summary', - 'target', - 'training_iterations', - 'training_loss', - 'training_rmse', - 'training_solver_status', - 'training_time', - 'unpacked_features'] - - actual = [x for x in dir(m) if not x.startswith('_')] + sf = turicreate.SFrame({"d1": [1, 2, 3, 4, 5, 6], "d2": [5, 4, 3, 2, 1, 6]}) + m = turicreate.linear_regression.create(sf, target="d1") + + expected = [ + "coefficients", + "convergence_threshold", + "evaluate", + "export_coreml", + "feature_rescaling", + "features", + "get", + "l1_penalty", + "l2_penalty", + "lbfgs_memory_level", + "_list_fields", + "_list_fields", + "max_iterations", + "name", + "num_coefficients", + "num_examples", + "num_features", + "num_unpacked_features", + "predict", + "progress", + "save", + "show", + "solver", + "step_size", + "summary", + "target", + "training_iterations", + "training_loss", + "training_rmse", + "training_solver_status", + "training_time", + "unpacked_features", + ] + + actual = [x for x in dir(m) if not x.startswith("_")] check_visible_modules(actual, expected) def test_churn_predictor(self): # Arrange time = [1453845953 + 20000 * x for x in range(500)] - user = [1,2,3,4,5] * 20 + [1,2,3,4] * 25 + [1,2,3] * 100 - actions = turicreate.SFrame({ - 'user_id': user, - 'timestamp': time, - 'action': [1,2,3,4,5] * 100, - }) + user = [1, 2, 3, 4, 5] * 20 + [1, 2, 3, 4] * 25 + [1, 2, 3] * 100 + actions = turicreate.SFrame( + {"user_id": user, "timestamp": time, "action": [1, 2, 3, 4, 5] * 100,} + ) + def _unix_timestamp_to_datetime(x): import datetime + return datetime.datetime.fromtimestamp(x) - actions['timestamp'] = actions['timestamp'].apply( - _unix_timestamp_to_datetime) - actions = turicreate.TimeSeries(actions, 'timestamp') + + actions["timestamp"] = actions["timestamp"].apply(_unix_timestamp_to_datetime) + actions = turicreate.TimeSeries(actions, "timestamp") # Act m = turicreate.churn_predictor.create(actions) - actual = [x for x in dir(m) if not x.startswith('_')] + actual = [x for x in dir(m) if not x.startswith("_")] # Assert. - expected = ['categorical_features', - 'evaluate', - 'extract_features', - 'get_feature_importance', - 'churn_period', - 'grace_period', - 'features', - 'get', - 'is_data_aggregated', - '_list_fields', - '_list_fields', - 'lookback_periods', - 'model_options', - 'name', - 'num_features', - 'num_observations', - 'num_users', - 'numerical_features', - 'predict', - 'explain', - 'processed_training_data', - 'save', - 'show', - 'summary', - 'time_boundaries', - 'time_period', - 'trained_model', - 'trained_explanation_model', - 'get_churn_report', - 'get_activity_baseline', - 'views', - 'use_advanced_features', - 'user_id'] + expected = [ + "categorical_features", + "evaluate", + "extract_features", + "get_feature_importance", + "churn_period", + "grace_period", + "features", + "get", + "is_data_aggregated", + "_list_fields", + "_list_fields", + "lookback_periods", + "model_options", + "name", + "num_features", + "num_observations", + "num_users", + "numerical_features", + "predict", + "explain", + "processed_training_data", + "save", + "show", + "summary", + "time_boundaries", + "time_period", + "trained_model", + "trained_explanation_model", + "get_churn_report", + "get_activity_baseline", + "views", + "use_advanced_features", + "user_id", + ] check_visible_modules(actual, expected) def test_topic_model(self): - sa = turicreate.SArray([{'a':5, 'b':3}, {'a':1, 'b':5, 'c':3}]) + sa = turicreate.SArray([{"a": 5, "b": 3}, {"a": 1, "b": 5, "c": 3}]) m = turicreate.topic_model.create(sa) - expected = ['alpha', - 'beta', - 'evaluate', - 'get', - 'get_topics', - '_list_fields', - 'name', - 'num_burnin', - 'num_iterations', - 'num_topics', - 'predict', - 'print_interval', - 'save', - 'show', - 'summary', - 'topics', - 'training_iterations', - 'training_time', - 'validation_time', - 'verbose', - 'vocabulary'] - - actual = [x for x in dir(m) if not x.startswith('_')] + expected = [ + "alpha", + "beta", + "evaluate", + "get", + "get_topics", + "_list_fields", + "name", + "num_burnin", + "num_iterations", + "num_topics", + "predict", + "print_interval", + "save", + "show", + "summary", + "topics", + "training_iterations", + "training_time", + "validation_time", + "verbose", + "vocabulary", + ] + + actual = [x for x in dir(m) if not x.startswith("_")] check_visible_modules(actual, expected) def test_local_outlier_factor(self): # Testing a class that inherits from CustomModel and ExposeAttributesFromProxy expected = [ - 'distance', - 'features', - 'get', - '_list_fields', - 'nearest_neighbors_model', - 'num_distance_components', - 'num_examples', - 'num_features', - 'num_neighbors', - 'num_unpacked_features', - 'predict', - 'row_label_name', - 'save', - 'scores', - 'show', - 'summary', - 'threshold_distances', - 'training_time', - 'unpacked_features', - 'verbose'] - sf = turicreate.SFrame({'x0': [0., 1., 1., 0., 1., 0., 5.], - 'x1': [2., 1., 0., 1., 2., 1.5, 2.5]}) + "distance", + "features", + "get", + "_list_fields", + "nearest_neighbors_model", + "num_distance_components", + "num_examples", + "num_features", + "num_neighbors", + "num_unpacked_features", + "predict", + "row_label_name", + "save", + "scores", + "show", + "summary", + "threshold_distances", + "training_time", + "unpacked_features", + "verbose", + ] + sf = turicreate.SFrame( + { + "x0": [0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 5.0], + "x1": [2.0, 1.0, 0.0, 1.0, 2.0, 1.5, 2.5], + } + ) m = turicreate.local_outlier_factor.create(sf, num_neighbors=3) - actual = [x for x in dir(m) if not x.startswith('_')] + actual = [x for x in dir(m) if not x.startswith("_")] check_visible_modules(actual, expected) def test_search(self): # Testing an instantiated class that inherits from SDKModel - sf = turicreate.SFrame({'text': ['Hello my friend', - 'I love this burrito']}) + sf = turicreate.SFrame({"text": ["Hello my friend", "I love this burrito"]}) m = turicreate._internal.search.create(sf) expected = [ - 'average_document_length', - 'bm25_b', - 'bm25_k1', - 'data', - 'elapsed_indexing', - 'elapsed_processing', - 'features', - 'get', - '_list_fields', - 'name', - 'num_documents', - 'num_tokens', - 'packed_sarrays', - 'query', - 'save', - 'show', - 'summary', - 'tfidf_threshold', - 'verbose', - 'vocabulary'] - - actual = [x for x in dir(m) if not x.startswith('_')] + "average_document_length", + "bm25_b", + "bm25_k1", + "data", + "elapsed_indexing", + "elapsed_processing", + "features", + "get", + "_list_fields", + "name", + "num_documents", + "num_tokens", + "packed_sarrays", + "query", + "save", + "show", + "summary", + "tfidf_threshold", + "verbose", + "vocabulary", + ] + + actual = [x for x in dir(m) if not x.startswith("_")] check_visible_modules(actual, expected) class ModuleVisibilityTests(unittest.TestCase): - def test_Image_type(self): expected = ["Image", "show"] - actual = [x for x in dir(turicreate.data_structures.image) if '_'not in x] + actual = [x for x in dir(turicreate.data_structures.image) if "_" not in x] self.assertTrue(set(actual) == set(expected)) def test_recommender(self): - recommenders = ['factorization_recommender', - 'ranking_factorization_recommender', - 'popularity_recommender', - 'item_content_recommender', - 'item_similarity_recommender'] - other = ['util', 'create'] + recommenders = [ + "factorization_recommender", + "ranking_factorization_recommender", + "popularity_recommender", + "item_content_recommender", + "item_similarity_recommender", + ] + other = ["util", "create"] # Check visibility in turicreate.recommender - actual = [x for x in dir(turicreate.recommender) if '__' not in x] - expected = (recommenders + other) + actual = [x for x in dir(turicreate.recommender) if "__" not in x] + expected = recommenders + other check_visible_modules(actual, expected) # For each module, check that there is a create() method. - expected = ['create'] + expected = ["create"] actual = get_visible_items(turicreate.recommender.factorization_recommender) - assert set(actual) == set(expected + ['FactorizationRecommender']) + assert set(actual) == set(expected + ["FactorizationRecommender"]) - actual = get_visible_items(turicreate.recommender.ranking_factorization_recommender) - assert set(actual) == set(expected + ['RankingFactorizationRecommender']) + actual = get_visible_items( + turicreate.recommender.ranking_factorization_recommender + ) + assert set(actual) == set(expected + ["RankingFactorizationRecommender"]) actual = get_visible_items(turicreate.recommender.item_similarity_recommender) - assert set(actual) == set(expected + ['ItemSimilarityRecommender']) + assert set(actual) == set(expected + ["ItemSimilarityRecommender"]) actual = get_visible_items(turicreate.recommender.item_content_recommender) - assert set(actual) == set(expected + ['ItemContentRecommender']) + assert set(actual) == set(expected + ["ItemContentRecommender"]) actual = get_visible_items(turicreate.recommender.popularity_recommender) - assert set(actual) == set(expected + ['PopularityRecommender']) + assert set(actual) == set(expected + ["PopularityRecommender"]) actual = get_visible_items(turicreate.recommender.util) - expected = ['random_split_by_user', 'RecommenderViews', \ - 'precision_recall_by_user', 'compare_models'] + expected = [ + "random_split_by_user", + "RecommenderViews", + "precision_recall_by_user", + "compare_models", + ] check_visible_modules(actual, expected) def test_nearest_neighbors(self): actual = get_visible_items(turicreate.nearest_neighbors) - expected = ['NearestNeighborsModel', 'create'] + expected = ["NearestNeighborsModel", "create"] check_visible_modules(actual, expected) def test_clustering(self): actual = get_visible_items(turicreate.clustering) - expected = ['kmeans', 'dbscan'] + expected = ["kmeans", "dbscan"] check_visible_modules(actual, expected) def test_data_matching(self): @@ -326,38 +376,46 @@ def test_data_matching(self): check_visible_modules(actual, expected) actual = get_visible_items(turicreate.autotagger) - expected = ['create'] + expected = ["create"] check_visible_modules(actual, expected) actual = get_visible_items(turicreate.deduplication) - expected = ['create'] + expected = ["create"] check_visible_modules(actual, expected) actual = get_visible_items(turicreate.nearest_neighbor_deduplication) - expected = ['NearestNeighborDeduplication', 'create'] + expected = ["NearestNeighborDeduplication", "create"] check_visible_modules(actual, expected) def test_anomaly_detection(self): actual = get_visible_items(turicreate.anomaly_detection) - expected = ['local_outlier_factor', 'moving_zscore', 'create', 'bayesian_changepoints'] + expected = [ + "local_outlier_factor", + "moving_zscore", + "create", + "bayesian_changepoints", + ] check_visible_modules(actual, expected) actual = get_visible_items(turicreate.toolkits.anomaly_detection) - expected = ['local_outlier_factor', 'moving_zscore', 'create', 'bayesian_changepoints'] + expected = [ + "local_outlier_factor", + "moving_zscore", + "create", + "bayesian_changepoints", + ] check_visible_modules(actual, expected) - actual = get_visible_items( - turicreate.anomaly_detection.local_outlier_factor) - expected = ['LocalOutlierFactorModel', 'create'] + actual = get_visible_items(turicreate.anomaly_detection.local_outlier_factor) + expected = ["LocalOutlierFactorModel", "create"] check_visible_modules(actual, expected) - actual = get_visible_items( - turicreate.anomaly_detection.moving_zscore) - expected = ['MovingZScoreModel', 'create'] + actual = get_visible_items(turicreate.anomaly_detection.moving_zscore) + expected = ["MovingZScoreModel", "create"] check_visible_modules(actual, expected) def test_lead_scoring(self): - expected = ['LeadScoringModel', 'create'] + expected = ["LeadScoringModel", "create"] actual = get_visible_items(turicreate.lead_scoring) check_visible_modules(actual, expected) @@ -367,180 +425,203 @@ def test_lead_scoring(self): def test_topic_model(self): actual = get_visible_items(turicreate.topic_model) - expected = ['TopicModel', - 'create', - 'perplexity'] + expected = ["TopicModel", "create", "perplexity"] check_visible_modules(actual, expected) def test_churn_predictor(self): actual = get_visible_items(turicreate.churn_predictor) - expected = ['ChurnPredictor', - 'create', - 'random_split'] + expected = ["ChurnPredictor", "create", "random_split"] check_visible_modules(actual, expected) def test_text_analytics(self): actual = get_visible_items(turicreate.text_analytics) - expected = ['tf_idf', - 'bm25', - 'stopwords', - 'count_words', - 'count_ngrams', - 'random_split', - 'parse_sparse', - 'parse_docword', - 'tokenize', - 'trim_rare_words', - 'split_by_sentence', - 'extract_parts_of_speech', - 'PartOfSpeech'] + expected = [ + "tf_idf", + "bm25", + "stopwords", + "count_words", + "count_ngrams", + "random_split", + "parse_sparse", + "parse_docword", + "tokenize", + "trim_rare_words", + "split_by_sentence", + "extract_parts_of_speech", + "PartOfSpeech", + ] check_visible_modules(actual, expected) def test_classifier(self): actual = get_visible_items(turicreate.classifier) - expected = ['create', - 'logistic_classifier', - 'boosted_trees_classifier', - 'random_forest_classifier', - 'decision_tree_classifier', - 'neuralnet_classifier', - 'svm_classifier', - 'nearest_neighbor_classifier'] + expected = [ + "create", + "logistic_classifier", + "boosted_trees_classifier", + "random_forest_classifier", + "decision_tree_classifier", + "neuralnet_classifier", + "svm_classifier", + "nearest_neighbor_classifier", + ] check_visible_modules(actual, expected) def test_regression(self): actual = get_visible_items(turicreate.regression) - expected = ['create', - 'linear_regression', - 'boosted_trees_regression', - 'decision_tree_regression', - 'random_forest_regression'] + expected = [ + "create", + "linear_regression", + "boosted_trees_regression", + "decision_tree_regression", + "random_forest_regression", + ] check_visible_modules(actual, expected) def test_cross_validation(self): actual = get_visible_items(turicreate.cross_validation) - expected = ['cross_val_score', - 'shuffle', - 'KFold'] + expected = ["cross_val_score", "shuffle", "KFold"] check_visible_modules(actual, expected) def test_toolkits(self): actual = get_visible_items(turicreate.toolkits) expected = [ - 'anomaly_detection', - 'churn_predictor', - 'classifier', - 'clustering', - 'comparison', - 'cross_validation', - 'data_matching', - 'deeplearning', - 'distances', - 'evaluation', - 'feature_engineering', - 'graph_analytics', - 'image_analysis', - 'lead_scoring', - 'model_parameter_search', - 'nearest_neighbors', - 'pattern_mining', - 'product_sentiment', - 'recommender', - 'regression', - 'sentiment_analysis', - 'text_analytics', - 'topic_model', - ] + "anomaly_detection", + "churn_predictor", + "classifier", + "clustering", + "comparison", + "cross_validation", + "data_matching", + "deeplearning", + "distances", + "evaluation", + "feature_engineering", + "graph_analytics", + "image_analysis", + "lead_scoring", + "model_parameter_search", + "nearest_neighbors", + "pattern_mining", + "product_sentiment", + "recommender", + "regression", + "sentiment_analysis", + "text_analytics", + "topic_model", + ] check_visible_modules(actual, expected) def test_graph_analytics(self): - common_functions = ['create'] + common_functions = ["create"] special_functions = {} - special_functions[turicreate.toolkits.graph_analytics.connected_components] \ - = ['ConnectedComponentsModel'] - special_functions[turicreate.toolkits.graph_analytics.graph_coloring] \ - = ['GraphColoringModel'] - special_functions[turicreate.toolkits.graph_analytics.kcore] \ - = ['KcoreModel'] - special_functions[turicreate.toolkits.graph_analytics.shortest_path] \ - = ['ShortestPathModel'] - special_functions[turicreate.toolkits.graph_analytics.triangle_counting] \ - = ['TriangleCountingModel'] - special_functions[turicreate.toolkits.graph_analytics.pagerank] \ - = ['PagerankModel'] + special_functions[turicreate.toolkits.graph_analytics.connected_components] = [ + "ConnectedComponentsModel" + ] + special_functions[turicreate.toolkits.graph_analytics.graph_coloring] = [ + "GraphColoringModel" + ] + special_functions[turicreate.toolkits.graph_analytics.kcore] = ["KcoreModel"] + special_functions[turicreate.toolkits.graph_analytics.shortest_path] = [ + "ShortestPathModel" + ] + special_functions[turicreate.toolkits.graph_analytics.triangle_counting] = [ + "TriangleCountingModel" + ] + special_functions[turicreate.toolkits.graph_analytics.pagerank] = [ + "PagerankModel" + ] for module, funcs in special_functions.items(): - actual = get_visible_items(module) - expected = common_functions + funcs - check_visible_modules(actual, expected) + actual = get_visible_items(module) + expected = common_functions + funcs + check_visible_modules(actual, expected) def test_models_with_hyper_parameters(self): - common_functions = ['create'] + common_functions = ["create"] special_functions = {} - special_functions[turicreate.linear_regression] = ['LinearRegression'] - special_functions[turicreate.boosted_trees_regression] = ['BoostedTreesRegression'] - special_functions[turicreate.random_forest_regression] = ['RandomForestRegression'] - special_functions[turicreate.decision_tree_regression] = ['DecisionTreeRegression'] - special_functions[turicreate.logistic_classifier] = ['LogisticClassifier'] - special_functions[turicreate.boosted_trees_classifier] = ['BoostedTreesClassifier'] - special_functions[turicreate.random_forest_classifier] = ['RandomForestClassifier'] - special_functions[turicreate.decision_tree_classifier] = ['DecisionTreeClassifier'] - - special_functions[turicreate.recommender.factorization_recommender] =\ - ['FactorizationRecommender'] - special_functions[turicreate.recommender.item_similarity_recommender] =\ - ['ItemSimilarityRecommender'] - special_functions[turicreate.recommender.item_content_recommender] =\ - ['ItemContentRecommender'] - special_functions[turicreate.recommender.ranking_factorization_recommender] =\ - ['RankingFactorizationRecommender'] - special_functions[turicreate.recommender.popularity_recommender] =\ - ['PopularityRecommender'] + special_functions[turicreate.linear_regression] = ["LinearRegression"] + special_functions[turicreate.boosted_trees_regression] = [ + "BoostedTreesRegression" + ] + special_functions[turicreate.random_forest_regression] = [ + "RandomForestRegression" + ] + special_functions[turicreate.decision_tree_regression] = [ + "DecisionTreeRegression" + ] + special_functions[turicreate.logistic_classifier] = ["LogisticClassifier"] + special_functions[turicreate.boosted_trees_classifier] = [ + "BoostedTreesClassifier" + ] + special_functions[turicreate.random_forest_classifier] = [ + "RandomForestClassifier" + ] + special_functions[turicreate.decision_tree_classifier] = [ + "DecisionTreeClassifier" + ] + + special_functions[turicreate.recommender.factorization_recommender] = [ + "FactorizationRecommender" + ] + special_functions[turicreate.recommender.item_similarity_recommender] = [ + "ItemSimilarityRecommender" + ] + special_functions[turicreate.recommender.item_content_recommender] = [ + "ItemContentRecommender" + ] + special_functions[turicreate.recommender.ranking_factorization_recommender] = [ + "RankingFactorizationRecommender" + ] + special_functions[turicreate.recommender.popularity_recommender] = [ + "PopularityRecommender" + ] for module, funcs in special_functions.items(): - actual = get_visible_items(module) - expected = common_functions + funcs - check_visible_modules(actual, expected) + actual = get_visible_items(module) + expected = common_functions + funcs + check_visible_modules(actual, expected) def test_topic_modelling_models(self): - common_functions = ['create', 'perplexity'] + common_functions = ["create", "perplexity"] special_functions = {} - special_functions[turicreate.topic_model] = ['TopicModel'] + special_functions[turicreate.topic_model] = ["TopicModel"] for module, funcs in special_functions.items(): - actual = get_visible_items(module) - expected = common_functions + funcs - check_visible_modules(actual, expected) + actual = get_visible_items(module) + expected = common_functions + funcs + check_visible_modules(actual, expected) def test_feature_engineering(self): actual = get_visible_items(turicreate.toolkits.feature_engineering) - expected = ['AutoVectorizer', - 'BM25', - 'CategoricalImputer', - 'CountThresholder', - 'CountFeaturizer', - 'DeepFeatureExtractor', - 'FeatureBinner', - 'FeatureHasher', - 'NGramCounter', - 'NumericImputer', - 'OneHotEncoder', - 'QuadraticFeatures', - 'RandomProjection', - 'TFIDF', - 'Tokenizer', - 'TransformerBase', - 'TransformerChain', - 'TransformToFlatDictionary', - 'WordCounter', - 'RareWordTrimmer', - 'SentenceSplitter', - 'PartOfSpeechExtractor', - 'create'] + expected = [ + "AutoVectorizer", + "BM25", + "CategoricalImputer", + "CountThresholder", + "CountFeaturizer", + "DeepFeatureExtractor", + "FeatureBinner", + "FeatureHasher", + "NGramCounter", + "NumericImputer", + "OneHotEncoder", + "QuadraticFeatures", + "RandomProjection", + "TFIDF", + "Tokenizer", + "TransformerBase", + "TransformerChain", + "TransformToFlatDictionary", + "WordCounter", + "RareWordTrimmer", + "SentenceSplitter", + "PartOfSpeechExtractor", + "create", + ] check_visible_modules(actual, expected) diff --git a/src/python/turicreate/test/dbapi2_mock/__init__.py b/src/python/turicreate/test/dbapi2_mock/__init__.py index 916202c998..a6fadba506 100644 --- a/src/python/turicreate/test/dbapi2_mock/__init__.py +++ b/src/python/turicreate/test/dbapi2_mock/__init__.py @@ -7,15 +7,16 @@ from __future__ import division as _ from __future__ import absolute_import as _ + class dbapi2_mock(object): def __init__(self): # Mandated globals self.apilevel = "2.0 " self.threadsafety = 3 - self.paramstyle = 'qmark' + self.paramstyle = "qmark" self.STRING = 41 self.BINARY = 42 self.DATETIME = 43 self.NUMBER = 44 self.ROWID = 45 - self.Error = Exception # StandardError not in python 3 + self.Error = Exception # StandardError not in python 3 diff --git a/src/python/turicreate/test/test_activity_classifier.py b/src/python/turicreate/test/test_activity_classifier.py index 3b29c9642f..80e9f74901 100644 --- a/src/python/turicreate/test/test_activity_classifier.py +++ b/src/python/turicreate/test/test_activity_classifier.py @@ -19,46 +19,75 @@ from turicreate.toolkits._main import ToolkitError as _ToolkitError import uuid -def _load_data(self, num_examples = 1000, num_features = 3, max_num_sessions = 4, - randomize_num_sessions = True, num_labels = 9, prediction_window = 5, - enforce_all_sessions = False): + +def _load_data( + self, + num_examples=1000, + num_features=3, + max_num_sessions=4, + randomize_num_sessions=True, + num_labels=9, + prediction_window=5, + enforce_all_sessions=False, +): random.seed(42) self.num_examples = num_examples self.num_features = num_features - self.num_sessions = random.randint(1, max_num_sessions) if randomize_num_sessions else max_num_sessions + self.num_sessions = ( + random.randint(1, max_num_sessions) + if randomize_num_sessions + else max_num_sessions + ) self.num_labels = num_labels self.prediction_window = prediction_window - self.features = ['X1-r', 'X2-r', 'X3-r'] - self.target = 'activity_label' - self.session_id = 'session_id' + self.features = ["X1-r", "X2-r", "X3-r"] + self.target = "activity_label" + self.session_id = "session_id" - if (enforce_all_sessions): + if enforce_all_sessions: random_session_ids = _random_session_ids(self.num_examples, self.num_sessions) else: - random_session_ids = sorted([random.randint(0, self.num_sessions - 1) for i in range(self.num_examples)]) + random_session_ids = sorted( + [random.randint(0, self.num_sessions - 1) for i in range(self.num_examples)] + ) - random_labels = [random.randint(0, self.num_labels - 1) for i in range(self.num_examples)] + random_labels = [ + random.randint(0, self.num_labels - 1) for i in range(self.num_examples) + ] - self.data = tc.util.generate_random_sframe(column_codes='r' * self.num_features, num_rows=self.num_examples, random_seed=42) + self.data = tc.util.generate_random_sframe( + column_codes="r" * self.num_features, num_rows=self.num_examples, random_seed=42 + ) self.data[self.session_id] = random_session_ids self.data[self.target] = random_labels -''' + +""" Creates a random session_id column, that guarantees that the number of sessions is exactly the requested one. -''' +""" + + def _random_session_ids(num_examples, num_sessions): examples_per_session = num_examples // num_sessions - if (examples_per_session == 0): - raise ValueError("Can't divide {} lines into {} sessions.".format(num_examples, num_sessions)) + if examples_per_session == 0: + raise ValueError( + "Can't divide {} lines into {} sessions.".format(num_examples, num_sessions) + ) min_lines_per_session = int(0.85 * examples_per_session) max_lines_per_session = int(1.15 * examples_per_session) - lines_in_each_session = [random.randint(min_lines_per_session, max_lines_per_session) for i in range(num_sessions)] - lines_in_each_session = [(x * (num_examples)) // sum(lines_in_each_session) for x in lines_in_each_session] + lines_in_each_session = [ + random.randint(min_lines_per_session, max_lines_per_session) + for i in range(num_sessions) + ] + lines_in_each_session = [ + (x * (num_examples)) // sum(lines_in_each_session) + for x in lines_in_each_session + ] lines_in_each_session[-1] += num_examples - sum(lines_in_each_session) session_ids = [] @@ -67,6 +96,7 @@ def _random_session_ids(num_examples, num_sessions): return session_ids + class ActivityClassifierCreateStressTests(unittest.TestCase): @classmethod def setUpClass(self): @@ -75,84 +105,118 @@ def setUpClass(self): def test_create_missing_value(self): sf_label = random.randint(0, self.num_labels - 1) sf_session_id = max(self.data[self.session_id]) - sf = self.data.append(tc.SFrame({self.features[0]: [None], self.features[1]: [3.14], self.features[2]: [5.23], self.target: [sf_label], self.session_id: [sf_session_id]})) + sf = self.data.append( + tc.SFrame( + { + self.features[0]: [None], + self.features[1]: [3.14], + self.features[2]: [5.23], + self.target: [sf_label], + self.session_id: [sf_session_id], + } + ) + ) with self.assertRaises(_ToolkitError): - tc.activity_classifier.create(sf, - features=self.features, - target=self.target, - session_id=self.session_id, - prediction_window=self.prediction_window, - validation_set=None) + tc.activity_classifier.create( + sf, + features=self.features, + target=self.target, + session_id=self.session_id, + prediction_window=self.prediction_window, + validation_set=None, + ) + def test_create_missing_validation_set(self): sf_label = random.randint(0, self.num_labels - 1) sf_session_id = max(self.data[self.session_id]) - sf = self.data.append(tc.SFrame({self.features[0]: [None], self.features[1]: [3.14], self.features[2]: [5.23], self.target: [sf_label], self.session_id: [sf_session_id]})) + sf = self.data.append( + tc.SFrame( + { + self.features[0]: [None], + self.features[1]: [3.14], + self.features[2]: [5.23], + self.target: [sf_label], + self.session_id: [sf_session_id], + } + ) + ) with self.assertRaises(_ToolkitError): - tc.activity_classifier.create(self.data, - features=self.features, - target=self.target, - session_id=self.session_id, - prediction_window=self.prediction_window, - validation_set=sf) + tc.activity_classifier.create( + self.data, + features=self.features, + target=self.target, + session_id=self.session_id, + prediction_window=self.prediction_window, + validation_set=sf, + ) def test_create_invalid_batch_size(self): with self.assertRaises(_ToolkitError): - tc.activity_classifier.create(self.data, - features=self.features, - target=self.target, - session_id=self.session_id, - prediction_window=self.prediction_window, - validation_set=None, - batch_size=-1) + tc.activity_classifier.create( + self.data, + features=self.features, + target=self.target, + session_id=self.session_id, + prediction_window=self.prediction_window, + validation_set=None, + batch_size=-1, + ) with self.assertRaises(_ToolkitError): - tc.activity_classifier.create(self.data, - features=self.features, - target=self.target, - session_id=self.session_id, - prediction_window=self.prediction_window, - validation_set=None, - batch_size='1') + tc.activity_classifier.create( + self.data, + features=self.features, + target=self.target, + session_id=self.session_id, + prediction_window=self.prediction_window, + validation_set=None, + batch_size="1", + ) def test_create_none_validation_set(self): - model = tc.activity_classifier.create(self.data, - features=self.features, - target=self.target, - session_id=self.session_id, - prediction_window=self.prediction_window, - validation_set=None) + model = tc.activity_classifier.create( + self.data, + features=self.features, + target=self.target, + session_id=self.session_id, + prediction_window=self.prediction_window, + validation_set=None, + ) predictions = model.predict(self.data) - def test_create_no_validation_set(self): - model = tc.activity_classifier.create(self.data, - features=self.features, - target=self.target, - session_id=self.session_id, - prediction_window=self.prediction_window) + model = tc.activity_classifier.create( + self.data, + features=self.features, + target=self.target, + session_id=self.session_id, + prediction_window=self.prediction_window, + ) predictions = model.predict(self.data) def test_create_with_verbose_False(self): args = [self.data, self.session_id, self.target] kwargs = { - 'features': self.features, - 'prediction_window': self.prediction_window + "features": self.features, + "prediction_window": self.prediction_window, } test_util.assert_longer_verbose_logs( - tc.activity_classifier.create, args, kwargs) + tc.activity_classifier.create, args, kwargs + ) def test_create_features_target_session(self): - model = tc.activity_classifier.create(self.data, - features=self.features, - target=self.target, - session_id=self.session_id) + model = tc.activity_classifier.create( + self.data, + features=self.features, + target=self.target, + session_id=self.session_id, + ) predictions = model.predict(self.data) - def test_create_target_session(self): - model = tc.activity_classifier.create(self.data, - target=self.target, - session_id=self.session_id) + model = tc.activity_classifier.create( + self.data, target=self.target, session_id=self.session_id + ) predictions = model.predict(self.data) def test_invalid_model(self): @@ -160,12 +224,14 @@ def test_invalid_model(self): Verify that creating a model with wrong fields fails """ with self.assertRaises(RuntimeError): - model = tc.activity_classifier.create(self.data, - features = self.features, - target ='wrong', - session_id=self.session_id, - prediction_window=self.prediction_window, - validation_set=None) + model = tc.activity_classifier.create( + self.data, + features=self.features, + target="wrong", + session_id=self.session_id, + prediction_window=self.prediction_window, + validation_set=None, + ) class ActivityClassifierAutoValdSetTest(unittest.TestCase): @@ -174,9 +240,9 @@ def setUpClass(self): self.fraction = 0.9 self.seed = 42 - def _compute_expect_frac(self,num_sessions): + def _compute_expect_frac(self, num_sessions): if num_sessions > 200000: - return 10000./num_sessions + return 10000.0 / num_sessions elif num_sessions >= 200: return 0.95 elif num_sessions >= 50: @@ -184,79 +250,127 @@ def _compute_expect_frac(self,num_sessions): return 1 def _create_auto_validation_set(self, is_small=False): - model = tc.activity_classifier.create(self.data, - features=self.features, - target=self.target, - session_id=self.session_id, - prediction_window=self.prediction_window, - validation_set='auto') + model = tc.activity_classifier.create( + self.data, + features=self.features, + target=self.target, + session_id=self.session_id, + prediction_window=self.prediction_window, + validation_set="auto", + ) predictions = model.predict(self.data) # Check the size of the auto validation set num_sessions = len(self.data[self.session_id].unique()) valid_num_sessions = num_sessions - model.num_sessions valid_frac = float(valid_num_sessions / num_sessions) - expected_frac = 0.0 if is_small else 1.0 - self._compute_expect_frac(num_sessions) - self.assertAlmostEqual(valid_frac, expected_frac, places=1, - msg="Got {} validation sessions out of {}, which is {:.3f}, and not the expected {}".format(valid_num_sessions, num_sessions, valid_frac, expected_frac)) + expected_frac = ( + 0.0 if is_small else 1.0 - self._compute_expect_frac(num_sessions) + ) + self.assertAlmostEqual( + valid_frac, + expected_frac, + places=1, + msg="Got {} validation sessions out of {}, which is {:.3f}, and not the expected {}".format( + valid_num_sessions, num_sessions, valid_frac, expected_frac + ), + ) def test_random_split_by_session(self): num_sessions = tc.activity_classifier.util._MIN_NUM_SESSIONS_FOR_SPLIT - _load_data(self, num_examples=10000, max_num_sessions=num_sessions, - randomize_num_sessions=False, enforce_all_sessions=True) + _load_data( + self, + num_examples=10000, + max_num_sessions=num_sessions, + randomize_num_sessions=False, + enforce_all_sessions=True, + ) - train, valid = tc.activity_classifier.util.random_split_by_session(self.data, self.session_id, self.fraction, self.seed) + train, valid = tc.activity_classifier.util.random_split_by_session( + self.data, self.session_id, self.fraction, self.seed + ) train_num_sessions = len(train[self.session_id].unique()) train_frac = float(train_num_sessions / num_sessions) expected_frac = self.fraction - self.assertAlmostEqual(train_frac, expected_frac, places=1, - msg= "Got {} train sessions out of {}, which is {:.3f}, and not the expected {}".format( - train_num_sessions, num_sessions, train_frac, expected_frac)) + self.assertAlmostEqual( + train_frac, + expected_frac, + places=1, + msg="Got {} train sessions out of {}, which is {:.3f}, and not the expected {}".format( + train_num_sessions, num_sessions, train_frac, expected_frac + ), + ) valid_num_sessions = len(valid[self.session_id].unique()) valid_frac = float(valid_num_sessions / num_sessions) expected_valid_frac = 1.0 - self.fraction - self.assertAlmostEqual(valid_frac, expected_valid_frac, places=1, - msg= "Got {} train sessions out of {}, which is {:.3f}, and not the expected {}".format( - valid_num_sessions, num_sessions, valid_frac, expected_valid_frac)) + self.assertAlmostEqual( + valid_frac, + expected_valid_frac, + places=1, + msg="Got {} train sessions out of {}, which is {:.3f}, and not the expected {}".format( + valid_num_sessions, num_sessions, valid_frac, expected_valid_frac + ), + ) train_sessions_set = set(train[self.session_id].unique()) valid_sessions_set = set(valid[self.session_id].unique()) - self.assertTrue(train_sessions_set.isdisjoint(valid_sessions_set), - "After train-test split, the train and validation sets should not include the same sessions") + self.assertTrue( + train_sessions_set.isdisjoint(valid_sessions_set), + "After train-test split, the train and validation sets should not include the same sessions", + ) def test_create_auto_validation_set_small(self): min_num_session_for_split = 50 num_sessions = min_num_session_for_split // 2 - _load_data(self, max_num_sessions=num_sessions, randomize_num_sessions=False, enforce_all_sessions=True) + _load_data( + self, + max_num_sessions=num_sessions, + randomize_num_sessions=False, + enforce_all_sessions=True, + ) self._create_auto_validation_set(is_small=True) def test_create_auto_validation_set_typical(self): num_sessions = tc.activity_classifier.util._MIN_NUM_SESSIONS_FOR_SPLIT * 4 - _load_data(self, num_examples=10000, max_num_sessions=num_sessions, randomize_num_sessions=False, - enforce_all_sessions=True) + _load_data( + self, + num_examples=10000, + max_num_sessions=num_sessions, + randomize_num_sessions=False, + enforce_all_sessions=True, + ) self._create_auto_validation_set() def test_create_auto_validation_set_string_session_id(self): num_sessions = tc.activity_classifier.util._MIN_NUM_SESSIONS_FOR_SPLIT * 4 - _load_data(self, num_examples=10000, max_num_sessions=num_sessions, randomize_num_sessions=False, - enforce_all_sessions=True) + _load_data( + self, + num_examples=10000, + max_num_sessions=num_sessions, + randomize_num_sessions=False, + enforce_all_sessions=True, + ) from six.moves import xrange as _xrange + session_ids_dict = {} for i in _xrange(num_sessions): session_ids_dict[i] = uuid.uuid4().hex[:6].upper() - self.data[self.session_id] = self.data[self.session_id].apply(lambda x: session_ids_dict[x]) + self.data[self.session_id] = self.data[self.session_id].apply( + lambda x: session_ids_dict[x] + ) self._create_auto_validation_set() + class ActivityClassifierTest(unittest.TestCase): @classmethod def setUpClass(self): @@ -266,56 +380,69 @@ def setUpClass(self): _load_data(self) # Create the model - self.model = tc.activity_classifier.create(self.data, - features=self.features, - target=self.target, - session_id=self.session_id, - prediction_window=self.prediction_window, - validation_set=None) + self.model = tc.activity_classifier.create( + self.data, + features=self.features, + target=self.target, + session_id=self.session_id, + prediction_window=self.prediction_window, + validation_set=None, + ) self.def_opts = { - 'verbose': True, - 'prediction_window': 100, - 'max_iterations': 10, - 'batch_size' : 32 + "verbose": True, + "prediction_window": 100, + "max_iterations": 10, + "batch_size": 32, } # Answers self.opts = self.def_opts.copy() - self.opts['prediction_window'] = self.prediction_window + self.opts["prediction_window"] = self.prediction_window self.get_ans = { - 'features': lambda x: x == self.features, - 'training_time': lambda x: x > 0, - 'target': lambda x: x == self.target, - 'verbose': lambda x: x == True, - 'session_id': lambda x: x == self.session_id, - 'prediction_window': lambda x: x == self.prediction_window, - 'training_accuracy': lambda x: x >= 0 and x <= 1, - 'training_log_loss': lambda x: isinstance(x, Number), - 'max_iterations': lambda x: x == self.def_opts['max_iterations'], - 'num_sessions': lambda x: x == self.num_sessions, - 'num_features': lambda x: x == self.num_features, - 'num_examples': lambda x: x == self.num_examples, - 'num_classes': lambda x: x == self.num_labels, - 'batch_size' : lambda x: x == self.def_opts['batch_size'], - 'classes': lambda x: sorted(x) == sorted(self.data[self.target].unique()) + "features": lambda x: x == self.features, + "training_time": lambda x: x > 0, + "target": lambda x: x == self.target, + "verbose": lambda x: x == True, + "session_id": lambda x: x == self.session_id, + "prediction_window": lambda x: x == self.prediction_window, + "training_accuracy": lambda x: x >= 0 and x <= 1, + "training_log_loss": lambda x: isinstance(x, Number), + "max_iterations": lambda x: x == self.def_opts["max_iterations"], + "num_sessions": lambda x: x == self.num_sessions, + "num_features": lambda x: x == self.num_features, + "num_examples": lambda x: x == self.num_examples, + "num_classes": lambda x: x == self.num_labels, + "batch_size": lambda x: x == self.def_opts["batch_size"], + "classes": lambda x: sorted(x) == sorted(self.data[self.target].unique()), } self.exposed_fields_ans = list(self.get_ans.keys()) - self.fields_ans = self.exposed_fields_ans + ['training_report_by_class', - 'training_iterations', 'random_seed', - 'training_precision', 'training_confusion_matrix', - 'use_data_augmentation', 'training_f1_score', - 'training_auc', 'training_roc_curve', 'training_recall'] - - - def _calc_expected_predictions_length(self, predict_input, top_k = 1): - - input_sessions = predict_input.groupby(self.session_id, { 'session_len' : tc.aggregate.COUNT()}) + self.fields_ans = self.exposed_fields_ans + [ + "training_report_by_class", + "training_iterations", + "random_seed", + "training_precision", + "training_confusion_matrix", + "use_data_augmentation", + "training_f1_score", + "training_auc", + "training_roc_curve", + "training_recall", + ] + + def _calc_expected_predictions_length(self, predict_input, top_k=1): + + input_sessions = predict_input.groupby( + self.session_id, {"session_len": tc.aggregate.COUNT()} + ) prediction_window = self.model.prediction_window - input_sessions['num_predictions_per_session'] = input_sessions['session_len'].apply( - lambda x: math.ceil(float(x) / prediction_window) ) - total_num_of_prediction = sum(input_sessions['num_predictions_per_session']) * top_k + input_sessions["num_predictions_per_session"] = input_sessions[ + "session_len" + ].apply(lambda x: math.ceil(float(x) / prediction_window)) + total_num_of_prediction = ( + sum(input_sessions["num_predictions_per_session"]) * top_k + ) return total_num_of_prediction @@ -324,9 +451,10 @@ def test_predict(self): Check the predict() function. """ model = self.model - for output_type in ['probability_vector', 'class']: + for output_type in ["probability_vector", "class"]: preds = model.predict( - self.data, output_type=output_type, output_frequency='per_window') + self.data, output_type=output_type, output_frequency="per_window" + ) expected_len = self._calc_expected_predictions_length(self.data) self.assertEqual(len(preds), expected_len) @@ -335,34 +463,41 @@ def test_export_coreml(self): Check the export_coreml() function. """ import coremltools + # Save the model as a CoreML model file - filename = tempfile.mkstemp('ActivityClassifier.mlmodel')[1] + filename = tempfile.mkstemp("ActivityClassifier.mlmodel")[1] self.model.export_coreml(filename) # Load the model back from the CoreML model file coreml_model = coremltools.models.MLModel(filename) import platform - self.assertDictEqual({ - 'com.github.apple.turicreate.version': tc.__version__, - 'com.github.apple.os.platform': platform.platform(), - 'target': self.target, - 'type': 'activity_classifier', - 'prediction_window': str(self.prediction_window), - 'session_id': self.session_id, - 'features': ','.join(self.features), - 'max_iterations': '10', - 'version': '2', - }, dict(coreml_model.user_defined_metadata) + + self.assertDictEqual( + { + "com.github.apple.turicreate.version": tc.__version__, + "com.github.apple.os.platform": platform.platform(), + "target": self.target, + "type": "activity_classifier", + "prediction_window": str(self.prediction_window), + "session_id": self.session_id, + "features": ",".join(self.features), + "max_iterations": "10", + "version": "2", + }, + dict(coreml_model.user_defined_metadata), + ) + expected_result = "Activity classifier created by Turi Create (version %s)" % ( + tc.__version__ ) - expected_result = 'Activity classifier created by Turi Create (version %s)' \ - % (tc.__version__) self.assertEquals(expected_result, coreml_model.short_description) # Create a small dataset, and compare the models' predict() output rs = np.random.RandomState(1234) - dataset = tc.util.generate_random_sframe(column_codes='r' * 3, num_rows=10) - dataset['session_id'] = 0 - dataset[self.target] = random_labels = [rs.randint(0, self.num_labels - 1, ) for i in range(10)] + dataset = tc.util.generate_random_sframe(column_codes="r" * 3, num_rows=10) + dataset["session_id"] = 0 + dataset[self.target] = random_labels = [ + rs.randint(0, self.num_labels - 1,) for i in range(10) + ] if _mac_ver() >= (10, 13): w = self.prediction_window @@ -371,33 +506,35 @@ def test_export_coreml(self): input_features = {} for f in self.features: input_features[f] = dataset[f].to_numpy() - first_input_dict = {} + first_input_dict = {} second_input_dict = {} for key, value in input_features.items(): first_input_dict[key] = value[:w].copy() - second_input_dict[key] = value[w:2*w].copy() + second_input_dict[key] = value[w : 2 * w].copy() first_input_dict["stateIn"] = np.zeros((400)) ret0 = coreml_model.predict(first_input_dict) second_input_dict["stateIn"] = ret0["stateOut"] ret1 = coreml_model.predict(second_input_dict) - pred = self.model.predict(dataset, output_type='probability_vector') + pred = self.model.predict(dataset, output_type="probability_vector") model_time0_values = pred[0] model_time1_values = pred[w] model_predictions = np.array([model_time0_values, model_time1_values]) - coreml_time0_values = [ret0[self.target + 'Probability'][l] for l in labels] - coreml_time1_values = [ret1[self.target + 'Probability'][l] for l in labels] + coreml_time0_values = [ret0[self.target + "Probability"][l] for l in labels] + coreml_time1_values = [ret1[self.target + "Probability"][l] for l in labels] coreml_predictions = np.array([coreml_time0_values, coreml_time1_values]) - np.testing.assert_array_almost_equal(model_predictions, coreml_predictions, decimal=3) + np.testing.assert_array_almost_equal( + model_predictions, coreml_predictions, decimal=3 + ) def test_classify(self): """ Check the classify() function. """ model = self.model - preds = model.classify(self.data, output_frequency='per_window') + preds = model.classify(self.data, output_frequency="per_window") expected_len = self._calc_expected_predictions_length(self.data) self.assertEqual(len(preds), expected_len) @@ -407,21 +544,24 @@ def test_classify_with_incomplete_data(self): with self.assertRaises(_ToolkitError): pred = self.model.classify(data) - def test_predict_topk(self): """ Check the predict_topk function. """ model = self.model - for output_type in ['rank', 'probability']: + for output_type in ["rank", "probability"]: preds = model.predict_topk( - self.data, output_type=output_type, output_frequency='per_window') + self.data, output_type=output_type, output_frequency="per_window" + ) expected_len = self._calc_expected_predictions_length(self.data, top_k=3) self.assertEqual(len(preds), expected_len) preds = model.predict_topk( - self.data.head(100), k=5, output_frequency='per_window') - expected_len = self._calc_expected_predictions_length(self.data.head(100), top_k=5) + self.data.head(100), k=5, output_frequency="per_window" + ) + expected_len = self._calc_expected_predictions_length( + self.data.head(100), top_k=5 + ) self.assertEqual(len(preds), expected_len) def test_predict_topk_invalid_k(self): @@ -435,7 +575,6 @@ def test_predict_topk_invalid_k(self): with self.assertRaises(TypeError): preds = model.predict_topk(self.data, k=[]) - def test_evaluate_with_incomplete_targets(self): """ Check that evaluation does not require the test data to span all labels. @@ -449,8 +588,16 @@ def test_evaluate_with_incomplete_targets(self): evaluation = self.model.evaluate(filtered_data) # Verify that all metrics were computed and included in the result. - for metric in ['accuracy', 'auc', 'precision', 'recall', 'f1_score', - 'log_loss', 'confusion_matrix', 'roc_curve']: + for metric in [ + "accuracy", + "auc", + "precision", + "recall", + "f1_score", + "log_loss", + "confusion_matrix", + "roc_curve", + ]: self.assertIn(metric, evaluation) def test__list_fields(self): @@ -469,8 +616,10 @@ def test_get(self): model = self.model for field in self.exposed_fields_ans: ans = model._get(field) - self.assertTrue(self.get_ans[field](ans), - '''Get failed in field {}. Output was {}.'''.format(field, ans)) + self.assertTrue( + self.get_ans[field](ans), + """Get failed in field {}. Output was {}.""".format(field, ans), + ) def test_summary(self): """ @@ -481,16 +630,16 @@ def test_summary(self): def test_summary_str(self): model = self.model - self.assertTrue(isinstance(model.summary('str'), str)) + self.assertTrue(isinstance(model.summary("str"), str)) def test_summary_dict(self): model = self.model - self.assertTrue(isinstance(model.summary('dict'), dict)) + self.assertTrue(isinstance(model.summary("dict"), dict)) def test_summary_invalid_input(self): model = self.model with self.assertRaises(_ToolkitError): - model.summary(model.summary('invalid')) + model.summary(model.summary("invalid")) with self.assertRaises(_ToolkitError): model.summary(model.summary(0)) @@ -498,7 +647,6 @@ def test_summary_invalid_input(self): with self.assertRaises(_ToolkitError): model.summary(model.summary({})) - def test_repr(self): """ Check the repr function. @@ -512,15 +660,19 @@ def test_save_and_load(self): """ Make sure saving and loading retains everything. """ - test_methods_list = [func for func in dir(self) if callable(getattr(self, func)) and func.startswith("test")] - test_methods_list.remove('test_save_and_load') + test_methods_list = [ + func + for func in dir(self) + if callable(getattr(self, func)) and func.startswith("test") + ] + test_methods_list.remove("test_save_and_load") with test_util.TempDirectory() as filename: self.model.save(filename) self.model = None self.model = tc.load_model(filename) - print ("Repeating all test cases after model delete and reload") + print("Repeating all test cases after model delete and reload") for test_method in test_methods_list: try: getattr(self, test_method)() @@ -528,10 +680,16 @@ def test_save_and_load(self): except unittest.SkipTest: pass except Exception as e: - self.assertTrue(False, "After model save and load, method " + test_method + - " has failed with error: " + str(e)) + self.assertTrue( + False, + "After model save and load, method " + + test_method + + " has failed with error: " + + str(e), + ) + -@unittest.skipIf(tc.util._num_available_gpus() == 0, 'Requires GPU') +@unittest.skipIf(tc.util._num_available_gpus() == 0, "Requires GPU") @pytest.mark.gpu class ActivityClassifierGPUTest(unittest.TestCase): @classmethod @@ -544,14 +702,14 @@ def test_gpu_save_load_export(self): for in_gpus in gpu_options: for out_gpus in gpu_options: tc.config.set_num_gpus(in_gpus) - model = tc.activity_classifier.create(self.data, - target=self.target, - session_id=self.session_id) + model = tc.activity_classifier.create( + self.data, target=self.target, session_id=self.session_id + ) with test_util.TempDirectory() as filename: model.save(filename) model = tc.load_model(filename) - filename = tempfile.mkstemp('ActivityClassifier.mlmodel')[1] + filename = tempfile.mkstemp("ActivityClassifier.mlmodel")[1] model.export_coreml(filename) tc.config.set_num_gpus(old_num_gpus) diff --git a/src/python/turicreate/test/test_audio_functionality.py b/src/python/turicreate/test/test_audio_functionality.py index e29309ac7f..c2605630cf 100644 --- a/src/python/turicreate/test/test_audio_functionality.py +++ b/src/python/turicreate/test/test_audio_functionality.py @@ -30,13 +30,12 @@ class ReadAudioTest(unittest.TestCase): - @classmethod def setUpClass(self): random = np.random.RandomState(1234) - self.noise1 = random.normal(loc=500, scale=100, size=16000).astype('int16') + self.noise1 = random.normal(loc=500, scale=100, size=16000).astype("int16") self.sample_rate1 = 16000 - self.noise2 = random.normal(loc=500, scale=100, size=48000).astype('int16') + self.noise2 = random.normal(loc=500, scale=100, size=48000).astype("int16") self.sample_rate2 = 48000 def test_simple_case(self, random_order=False): @@ -49,9 +48,9 @@ def test_simple_case(self, random_order=False): def test_recursive_dir(self): with TempDirectory() as temp_dir: - file1 = temp_dir + '/1.wav' - mkdir(temp_dir + '/foo') - file2 = temp_dir + '/foo/2.wav' + file1 = temp_dir + "/1.wav" + mkdir(temp_dir + "/foo") + file2 = temp_dir + "/foo/2.wav" wavfile.write(file1, self.sample_rate1, self.noise1) wavfile.write(file2, self.sample_rate2, self.noise2) @@ -66,13 +65,13 @@ def test_no_path(self): sf = tc.load_audio(temp_dir, with_path=False) self.assertEqual(len(sf), 2) - self.assertEqual(sorted(sf.column_names()), ['audio']) + self.assertEqual(sorted(sf.column_names()), ["audio"]) def test_ignore_failure(self): with TempDirectory() as temp_dir: file1, file2 = self._write_audio_files_in_dir(temp_dir) - with open(temp_dir + '/junk.wav', 'w') as f: - f.write('junk, junk, junk. Not audio data!') + with open(temp_dir + "/junk.wav", "w") as f: + f.write("junk, junk, junk. Not audio data!") with self.assertRaises(ToolkitError): tc.load_audio(temp_dir, ignore_failure=False) @@ -90,30 +89,30 @@ def test_single_file(self): sf = tc.load_audio(file1) self.assertEqual(len(sf), 1) - self.assertEqual(sorted(sf.column_names()), ['audio', 'path']) + self.assertEqual(sorted(sf.column_names()), ["audio", "path"]) # Check the audio file - audio1 = sf.filter_by([file1], 'path')['audio'][0] - self.assertEqual(audio1['sample_rate'], self.sample_rate1) - self.assertTrue(all(audio1['data'] == self.noise1)) + audio1 = sf.filter_by([file1], "path")["audio"][0] + self.assertEqual(audio1["sample_rate"], self.sample_rate1) + self.assertTrue(all(audio1["data"] == self.noise1)) def _assert_audio_sframe_correct(self, sf, file1, file2): self.assertEqual(len(sf), 2) - self.assertEqual(sorted(sf.column_names()), ['audio', 'path']) + self.assertEqual(sorted(sf.column_names()), ["audio", "path"]) # Check the first audio file - audio1 = sf.filter_by([file1], 'path')['audio'][0] - self.assertEqual(audio1['sample_rate'], self.sample_rate1) - self.assertTrue(all(audio1['data'] == self.noise1)) + audio1 = sf.filter_by([file1], "path")["audio"][0] + self.assertEqual(audio1["sample_rate"], self.sample_rate1) + self.assertTrue(all(audio1["data"] == self.noise1)) # Check the second audio file - audio2 = sf.filter_by([file2], 'path')['audio'][0] - self.assertEqual(audio2['sample_rate'], self.sample_rate2) - self.assertTrue(all(audio2['data'] == self.noise2)) + audio2 = sf.filter_by([file2], "path")["audio"][0] + self.assertEqual(audio2["sample_rate"], self.sample_rate2) + self.assertTrue(all(audio2["data"] == self.noise2)) def _write_audio_files_in_dir(self, dir_path): - file1 = dir_path + '/1.wav' - file2 = dir_path + '/2.wav' + file1 = dir_path + "/1.wav" + file2 = dir_path + "/2.wav" wavfile.write(file1, self.sample_rate1, self.noise1) wavfile.write(file2, self.sample_rate2, self.noise2) return file1, file2 @@ -126,25 +125,38 @@ def generate_white_noise(length, sample_rate): loc = random.randint(300, 600) scale = random.randint(80, 130) size = int(length * sample_rate) - data = random.normal(loc=loc, scale=scale, size=size).astype('int16') - return {'sample_rate': sample_rate, 'data': data} + data = random.normal(loc=loc, scale=scale, size=size).astype("int16") + return {"sample_rate": sample_rate, "data": data} def generate_sine_wave(length, sample_rate): data = [] volume = random.randint(500, 1500) freq = random.randint(300, 800) for x in range(int(length * sample_rate)): - data.append(volume * math.sin(2 * math.pi * freq * (x / float(sample_rate)))) - return {'sample_rate': sample_rate, 'data': np.asarray(data, dtype='int16')} - - white_noise = [generate_white_noise(3, 16000), generate_white_noise(5.1, 48000), - generate_white_noise(1, 16500)] - - sine_waves = [generate_sine_wave(3, 16000), generate_sine_wave(5.1, 48000), - generate_sine_wave(1, 12000)] - - data = tc.SFrame({'audio': white_noise + sine_waves, - 'labels': ['white noise'] * len(white_noise) + ['sine wave'] * len(sine_waves)}) + data.append( + volume * math.sin(2 * math.pi * freq * (x / float(sample_rate))) + ) + return {"sample_rate": sample_rate, "data": np.asarray(data, dtype="int16")} + + white_noise = [ + generate_white_noise(3, 16000), + generate_white_noise(5.1, 48000), + generate_white_noise(1, 16500), + ] + + sine_waves = [ + generate_sine_wave(3, 16000), + generate_sine_wave(5.1, 48000), + generate_sine_wave(1, 12000), + ] + + data = tc.SFrame( + { + "audio": white_noise + sine_waves, + "labels": ["white noise"] * len(white_noise) + + ["sine wave"] * len(sine_waves), + } + ) return data @@ -152,57 +164,78 @@ def generate_sine_wave(length, sample_rate): class ClassifierTestTwoClassesStringLabels(unittest.TestCase): - @classmethod def setUpClass(self): self.data = copy(binary_test_data) self.is_binary_classification = True - self.model = tc.sound_classifier.create(self.data, 'labels', feature='audio', max_iterations=100) + self.model = tc.sound_classifier.create( + self.data, "labels", feature="audio", max_iterations=100 + ) def test_create_invalid_max_iterations(self): with self.assertRaises(ToolkitError): - model = tc.sound_classifier.create(self.data, 'labels', feature='audio', max_iterations=0) + model = tc.sound_classifier.create( + self.data, "labels", feature="audio", max_iterations=0 + ) with self.assertRaises(TypeError): - model = tc.sound_classifier.create(self.data, 'labels', feature='audio', max_iterations='1') + model = tc.sound_classifier.create( + self.data, "labels", feature="audio", max_iterations="1" + ) def test_create_with_invalid_custom_layers(self): with self.assertRaises(ToolkitError): - model = tc.sound_classifier.create(self.data, 'labels', feature='audio', custom_layer_sizes=[]) + model = tc.sound_classifier.create( + self.data, "labels", feature="audio", custom_layer_sizes=[] + ) with self.assertRaises(ToolkitError): - model = tc.sound_classifier.create(self.data, 'labels', feature='audio', custom_layer_sizes={}) + model = tc.sound_classifier.create( + self.data, "labels", feature="audio", custom_layer_sizes={} + ) with self.assertRaises(ToolkitError): - model = tc.sound_classifier.create(self.data, 'labels', feature='audio', custom_layer_sizes=['1']) + model = tc.sound_classifier.create( + self.data, "labels", feature="audio", custom_layer_sizes=["1"] + ) with self.assertRaises(ToolkitError): - model = tc.sound_classifier.create(self.data, 'labels', feature='audio', custom_layer_sizes=[-1]) + model = tc.sound_classifier.create( + self.data, "labels", feature="audio", custom_layer_sizes=[-1] + ) with self.assertRaises(ToolkitError): - model = tc.sound_classifier.create(self.data, 'labels', feature='audio', custom_layer_sizes=[0,0]) + model = tc.sound_classifier.create( + self.data, "labels", feature="audio", custom_layer_sizes=[0, 0] + ) def test_create_with_invalid_batch_size(self): with self.assertRaises(ValueError): - model = tc.sound_classifier.create(self.data, 'labels', feature='audio', batch_size=-1) + model = tc.sound_classifier.create( + self.data, "labels", feature="audio", batch_size=-1 + ) with self.assertRaises(TypeError): - model = tc.sound_classifier.create(self.data, 'labels', feature='audio', batch_size=[]) + model = tc.sound_classifier.create( + self.data, "labels", feature="audio", batch_size=[] + ) def test_predict(self): # default ('class') output_type - predictions = self.model.predict(self.data['audio']) + predictions = self.model.predict(self.data["audio"]) _raise_error_if_not_sarray(predictions) self.assertEqual(len(predictions), len(self.data)) - for a, b in zip(predictions, self.data['labels']): + for a, b in zip(predictions, self.data["labels"]): self.assertEqual(a, b) # 'probability' output_type if self.is_binary_classification: - predictions = self.model.predict(self.data['audio'], output_type='probability') + predictions = self.model.predict( + self.data["audio"], output_type="probability" + ) _raise_error_if_not_sarray(predictions) self.assertEqual(len(predictions), len(self.data)) - for probabilities, correct_label in zip(predictions, self.data['labels']): + for probabilities, correct_label in zip(predictions, self.data["labels"]): # correct value has highest probability? correct_index = self.model.classes.index(correct_label) self.assertEqual(np.argmax(probabilities), correct_index) @@ -211,13 +244,15 @@ def test_predict(self): else: # 'probability' output type only supported for binary classification with self.assertRaises(ToolkitError): - self.model.predict(self.data['audio'], output_type='probability') + self.model.predict(self.data["audio"], output_type="probability") # 'probability_vector' output_type - predictions = self.model.predict(self.data['audio'], output_type='probability_vector') + predictions = self.model.predict( + self.data["audio"], output_type="probability_vector" + ) _raise_error_if_not_sarray(predictions) self.assertEqual(len(predictions), len(self.data)) - for prob_vector, correct_label in zip(predictions, self.data['labels']): + for prob_vector, correct_label in zip(predictions, self.data["labels"]): # correct value has highest probability? correct_index = self.model.classes.index(correct_label) self.assertEqual(np.argmax(prob_vector), correct_index) @@ -225,18 +260,18 @@ def test_predict(self): self.assertTrue(abs(np.sum(prob_vector) - 1) < 0.00001) # predict with single (dict) example - single_prediction = self.model.predict(self.data['audio'][0]) + single_prediction = self.model.predict(self.data["audio"][0]) _raise_error_if_not_sarray(single_prediction) self.assertEqual(len(single_prediction), 1) - self.assertEqual(single_prediction[0], self.data['labels'][0]) + self.assertEqual(single_prediction[0], self.data["labels"][0]) # predict with SFrame data = self.data.copy() - del data['labels'] + del data["labels"] predictions = self.model.predict(data) _raise_error_if_not_sarray(predictions) self.assertEqual(len(predictions), len(data)) - for a, b in zip(predictions, self.data['labels']): + for a, b in zip(predictions, self.data["labels"]): self.assertEqual(a, b) def test_save_and_load(self): @@ -246,96 +281,122 @@ def test_save_and_load(self): self.assertEqual(self.model.feature, new_model.feature) - old_model_probs = self.model.predict(self.data['audio'], output_type='probability_vector') - new_model_probs = new_model.predict(self.data['audio'], output_type='probability_vector') + old_model_probs = self.model.predict( + self.data["audio"], output_type="probability_vector" + ) + new_model_probs = new_model.predict( + self.data["audio"], output_type="probability_vector" + ) for a, b in zip(old_model_probs, new_model_probs): np.testing.assert_array_almost_equal(a, b, decimal=6) - @unittest.skipIf(_mac_ver() < (10,14), 'Custom models only supported on macOS 10.14+') + @unittest.skipIf( + _mac_ver() < (10, 14), "Custom models only supported on macOS 10.14+" + ) def test_export_coreml_with_prediction(self): import resampy with TempDirectory() as temp_dir: - file_name = temp_dir + '/model.mlmodel' + file_name = temp_dir + "/model.mlmodel" self.model.export_coreml(file_name) core_ml_model = coremltools.models.MLModel(file_name) # Check predictions - for cur_audio in self.data['audio']: - resampled_data = resampy.resample(cur_audio['data'], cur_audio['sample_rate'], 16000) + for cur_audio in self.data["audio"]: + resampled_data = resampy.resample( + cur_audio["data"], cur_audio["sample_rate"], 16000 + ) first_audio_frame = resampled_data[:15600] - tc_x = {'data': first_audio_frame, 'sample_rate': 16000} - tc_prob_vector = self.model.predict(tc_x, output_type='probability_vector')[0] + tc_x = {"data": first_audio_frame, "sample_rate": 16000} + tc_prob_vector = self.model.predict(tc_x, output_type="probability_vector")[ + 0 + ] - coreml_x = np.float32(first_audio_frame / 32768.0) # Convert to [-1.0, +1.0] - coreml_y = core_ml_model.predict({'audio': coreml_x}) + coreml_x = np.float32( + first_audio_frame / 32768.0 + ) # Convert to [-1.0, +1.0] + coreml_y = core_ml_model.predict({"audio": coreml_x}) - core_ml_prob_output_name = self.model.target + 'Probability' + core_ml_prob_output_name = self.model.target + "Probability" for i, cur_class in enumerate(self.model.classes): - self.assertAlmostEquals(tc_prob_vector[i], - coreml_y[core_ml_prob_output_name][cur_class], - delta=0.001) + self.assertAlmostEquals( + tc_prob_vector[i], + coreml_y[core_ml_prob_output_name][cur_class], + delta=0.001, + ) # Check metadata metadata = core_ml_model.get_spec().description.metadata - self.assertTrue('sampleRate' in metadata.userDefined) - self.assertEqual(metadata.userDefined['sampleRate'], '16000') + self.assertTrue("sampleRate" in metadata.userDefined) + self.assertEqual(metadata.userDefined["sampleRate"], "16000") def test_export_core_ml_no_prediction(self): import platform + with TempDirectory() as temp_dir: - file_name = temp_dir + '/model.mlmodel' + file_name = temp_dir + "/model.mlmodel" self.model.export_coreml(file_name) core_ml_model = coremltools.models.MLModel(file_name) # Check metadata metadata = core_ml_model.get_spec().description.metadata - self.assertTrue('sampleRate' in metadata.userDefined) - self.assertEqual(metadata.userDefined['sampleRate'], '16000') - self.assertDictEqual({ - 'com.github.apple.turicreate.version': tc.__version__, - 'com.github.apple.os.platform': platform.platform(), - 'type': 'SoundClassifier', - 'coremltoolsVersion': coremltools.__version__, - 'sampleRate': '16000', - 'version': '1' - }, dict(core_ml_model.user_defined_metadata) + self.assertTrue("sampleRate" in metadata.userDefined) + self.assertEqual(metadata.userDefined["sampleRate"], "16000") + self.assertDictEqual( + { + "com.github.apple.turicreate.version": tc.__version__, + "com.github.apple.os.platform": platform.platform(), + "type": "SoundClassifier", + "coremltoolsVersion": coremltools.__version__, + "sampleRate": "16000", + "version": "1", + }, + dict(core_ml_model.user_defined_metadata), + ) + expected_result = "Sound classifier created by Turi Create (version %s)" % ( + tc.__version__ ) - expected_result = 'Sound classifier created by Turi Create (version %s)' % ( - tc.__version__) self.assertEquals(expected_result, core_ml_model.short_description) def test_evaluate(self): evaluation = self.model.evaluate(self.data) # Verify that all metrics are included in the result. - for metric in ['accuracy', 'auc', 'precision', 'recall', 'f1_score', - 'log_loss', 'confusion_matrix', 'roc_curve']: + for metric in [ + "accuracy", + "auc", + "precision", + "recall", + "f1_score", + "log_loss", + "confusion_matrix", + "roc_curve", + ]: self.assertIn(metric, evaluation) def test_classify(self): classification = self.model.classify(self.data) - for a, b in zip(classification['class'], self.data['labels']): + for a, b in zip(classification["class"], self.data["labels"]): self.assertEqual(a, b) - for p in classification['probability']: + for p in classification["probability"]: if self.is_binary_classification: - self.assertTrue(p > .5) + self.assertTrue(p > 0.5) else: - self.assertTrue(p > .33) + self.assertTrue(p > 0.33) def test_predict_topk(self): topk_predictions = self.model.predict_topk(self.data, k=2) self.assertEqual(len(topk_predictions), len(self.data) * 2) self.assertEqual(3, len(topk_predictions.column_names())) - for column in ['id', 'class', 'probability']: + for column in ["id", "class", "probability"]: self.assertIn(column, topk_predictions.column_names()) - topk_predictions = self.model.predict_topk(self.data, k=1, output_type='rank') + topk_predictions = self.model.predict_topk(self.data, k=1, output_type="rank") self.assertEqual(len(topk_predictions), len(self.data) * 1) self.assertEqual(3, len(topk_predictions.column_names())) - for column in ['id', 'class', 'rank']: + for column in ["id", "class", "rank"]: self.assertIn(column, topk_predictions.column_names()) - unique_ranks = topk_predictions['rank'].unique() + unique_ranks = topk_predictions["rank"].unique() self.assertTrue(len(unique_ranks) == 1) self.assertTrue(unique_ranks[0] == 0) @@ -361,16 +422,16 @@ def test_summary(self): def test_summary_str(self): model = self.model - self.assertTrue(isinstance(model.summary('str'), str)) + self.assertTrue(isinstance(model.summary("str"), str)) def test_summary_dict(self): model = self.model - self.assertTrue(isinstance(model.summary('dict'), dict)) + self.assertTrue(isinstance(model.summary("dict"), dict)) def test_summary_invalid_input(self): model = self.model with self.assertRaises(ToolkitError): - model.summary(model.summary('invalid')) + model.summary(model.summary("invalid")) with self.assertRaises(ToolkitError): model.summary(model.summary(0)) @@ -383,21 +444,31 @@ class ClassifierTestTwoClassesIntLabels(ClassifierTestTwoClassesStringLabels): @classmethod def setUpClass(self): self.data = copy(binary_test_data) - self.data['labels'] = self.data['labels'].apply(lambda x: 0 if x == 'white noise' else 1) + self.data["labels"] = self.data["labels"].apply( + lambda x: 0 if x == "white noise" else 1 + ) self.is_binary_classification = True layer_sizes = [100] - self.model = tc.sound_classifier.create(self.data, 'labels', feature='audio', - custom_layer_sizes = layer_sizes, - validation_set=None) - assert(self.model.custom_layer_sizes == layer_sizes) + self.model = tc.sound_classifier.create( + self.data, + "labels", + feature="audio", + custom_layer_sizes=layer_sizes, + validation_set=None, + ) + assert self.model.custom_layer_sizes == layer_sizes # Remove the following two tests after #2949 is fixed! - @pytest.mark.xfail(reason="Non-deterministic test failure tracked in https://github.com/apple/turicreate/issues/2949") + @pytest.mark.xfail( + reason="Non-deterministic test failure tracked in https://github.com/apple/turicreate/issues/2949" + ) def test_classify(self): pass - @pytest.mark.xfail(reason="Non-deterministic test failure tracked in https://github.com/apple/turicreate/issues/2949") + @pytest.mark.xfail( + reason="Non-deterministic test failure tracked in https://github.com/apple/turicreate/issues/2949" + ) def test_predict(self): pass @@ -406,21 +477,33 @@ class ClassifierTestThreeClassesStringLabels(ClassifierTestTwoClassesStringLabel @classmethod def setUpClass(self): def generate_constant_noise(length, sample_rate): - data = np.ones((int(length * sample_rate))).astype('int16') - return {'sample_rate': sample_rate, 'data': data} - - constant_noise = [generate_constant_noise(2.5, 17000), generate_constant_noise(5, 17000), - generate_constant_noise(1, 17000)] - constant_noise = tc.SFrame({'audio': constant_noise, - 'labels': ['constant noise'] * len(constant_noise)}) + data = np.ones((int(length * sample_rate))).astype("int16") + return {"sample_rate": sample_rate, "data": data} + + constant_noise = [ + generate_constant_noise(2.5, 17000), + generate_constant_noise(5, 17000), + generate_constant_noise(1, 17000), + ] + constant_noise = tc.SFrame( + { + "audio": constant_noise, + "labels": ["constant noise"] * len(constant_noise), + } + ) self.data = copy(binary_test_data).append(constant_noise) self.is_binary_classification = False layer_sizes = [75, 100, 20] - self.model = tc.sound_classifier.create(self.data, 'labels', feature='audio', - custom_layer_sizes = layer_sizes, - validation_set=self.data, max_iterations=100) - assert(self.model.custom_layer_sizes == layer_sizes) + self.model = tc.sound_classifier.create( + self.data, + "labels", + feature="audio", + custom_layer_sizes=layer_sizes, + validation_set=self.data, + max_iterations=100, + ) + assert self.model.custom_layer_sizes == layer_sizes def test_validation_set(self): self.assertTrue(self.model.validation_accuracy is not None) @@ -433,19 +516,22 @@ def setUpClass(self): # Add a half second clip short_clip = binary_test_data[0] - half_second_length = int(short_clip['audio']['sample_rate'] / 2.) - short_clip['audio']['data'] = short_clip['audio']['data'][:half_second_length] - short_clip = tc.SFrame({'audio': [short_clip['audio']], 'labels': [short_clip['labels']]}) + half_second_length = int(short_clip["audio"]["sample_rate"] / 2.0) + short_clip["audio"]["data"] = short_clip["audio"]["data"][:half_second_length] + short_clip = tc.SFrame( + {"audio": [short_clip["audio"]], "labels": [short_clip["labels"]]} + ) self.data = self.data.append(short_clip) def test_get_deep_features(self): - deep_features = tc.sound_classifier.get_deep_features(self.data['audio']) + deep_features = tc.sound_classifier.get_deep_features(self.data["audio"]) self.assertEqual(len(deep_features), len(self.data)) self.assertEqual(deep_features[-1], []) def test_model(self): - model = tc.sound_classifier.create(self.data, 'labels', feature='audio', - validation_set=self.data) + model = tc.sound_classifier.create( + self.data, "labels", feature="audio", validation_set=self.data + ) # A prediction for a clip which is too short should be None predictions = model.predict(self.data) @@ -454,7 +540,7 @@ def test_model(self): for l in predictions[:-1]: self.assertNotEqual(l, None) - predictions = model.predict(self.data, output_type='probability_vector') + predictions = model.predict(self.data, output_type="probability_vector") self.assertEqual(predictions[-1], None) for l in predictions[:-1]: self.assertNotEqual(l, None) @@ -463,63 +549,67 @@ def test_model(self): self.assertIsNotNone(evaluate_results) classify_results = model.classify(self.data) - self.assertEqual(classify_results[-1], {'class': None, 'probability': None}) + self.assertEqual(classify_results[-1], {"class": None, "probability": None}) for i in classify_results[:-1]: - self.assertNotEqual(i['class'], None) - self.assertNotEqual(i['probability'], None) + self.assertNotEqual(i["class"], None) + self.assertNotEqual(i["probability"], None) topk_results = model.predict_topk(self.data) - self.assertEqual(topk_results[-1]['class'], None) - self.assertEqual(topk_results[-1]['probability'], None) + self.assertEqual(topk_results[-1]["class"], None) + self.assertEqual(topk_results[-1]["probability"], None) for r in topk_results[:-1]: - self.assertNotEqual(r['class'], None) - self.assertNotEqual(r['probability'], None) + self.assertNotEqual(r["class"], None) + self.assertNotEqual(r["probability"], None) -@unittest.skipIf(_mac_ver() < (10,14), 'Custom models only supported on macOS 10.14+') +@unittest.skipIf(_mac_ver() < (10, 14), "Custom models only supported on macOS 10.14+") class CoreMlCustomModelPreprocessingTest(unittest.TestCase): sample_rate = 16000 - frame_length = int(.975 * sample_rate) + frame_length = int(0.975 * sample_rate) def test_case(self): from turicreate.toolkits.sound_classifier import vggish_input model = coremltools.proto.Model_pb2.Model() - model.customModel.className = 'TCSoundClassifierPreprocessing' + model.customModel.className = "TCSoundClassifierPreprocessing" model.specificationVersion = 3 # Input - float array with shape (frame_length) x = model.description.input.add() - x.name = 'x' - x.type.multiArrayType.dataType = FeatureTypes_pb2.ArrayFeatureType.ArrayDataType.Value('FLOAT32') + x.name = "x" + x.type.multiArrayType.dataType = FeatureTypes_pb2.ArrayFeatureType.ArrayDataType.Value( + "FLOAT32" + ) x.type.multiArrayType.shape.append(self.frame_length) # Output - double array with shape (1, 96, 64) y = model.description.output.add() - y.name = 'y' - y.type.multiArrayType.dataType = FeatureTypes_pb2.ArrayFeatureType.ArrayDataType.Value('DOUBLE') + y.name = "y" + y.type.multiArrayType.dataType = FeatureTypes_pb2.ArrayFeatureType.ArrayDataType.Value( + "DOUBLE" + ) y.type.multiArrayType.shape.append(1) y.type.multiArrayType.shape.append(96) y.type.multiArrayType.shape.append(64) with TempDirectory() as temp_dir: model = coremltools.models.MLModel(model) - model_path = temp_dir + '/test.mlmodel' + model_path = temp_dir + "/test.mlmodel" model.save(model_path) model = coremltools.models.MLModel(model_path) input_data = np.arange(self.frame_length) * 0.00001 y1 = vggish_input.waveform_to_examples(input_data, self.sample_rate)[0] - y2 = model.predict({'x': np.float32(input_data)})['y'] + y2 = model.predict({"x": np.float32(input_data)})["y"] - self.assertEqual(y2.shape, (1,96,64)) + self.assertEqual(y2.shape, (1, 96, 64)) self.assertTrue(np.isclose(y1, y2, atol=1e-04).all()) class ReuseDeepFeatures(unittest.TestCase): def test_simple_case(self): data = copy(binary_test_data) - deep_features = tc.sound_classifier.get_deep_features(data['audio']) + deep_features = tc.sound_classifier.get_deep_features(data["audio"]) # Verify deep features in correct format self.assertTrue(isinstance(deep_features, tc.SArray)) @@ -531,19 +621,23 @@ def test_simple_case(self): self.assertEqual(len(deep_features[0][0]), 12288) # Test helper methods - self.assertTrue(tc.sound_classifier._is_audio_data_sarray(data['audio'])) + self.assertTrue(tc.sound_classifier._is_audio_data_sarray(data["audio"])) self.assertTrue(tc.sound_classifier._is_deep_feature_sarray(deep_features)) - original_audio_data = data['audio'] - del data['audio'] + original_audio_data = data["audio"] + del data["audio"] # Create a model using the deep features - data['features'] = deep_features - model = tc.sound_classifier.create(data, 'labels', feature='features') + data["features"] = deep_features + model = tc.sound_classifier.create(data, "labels", feature="features") # Test predict - predictions_from_audio = model.predict(original_audio_data, output_type='probability_vector') - predictions_from_deep_features = model.predict(deep_features, output_type='probability_vector') + predictions_from_audio = model.predict( + original_audio_data, output_type="probability_vector" + ) + predictions_from_deep_features = model.predict( + deep_features, output_type="probability_vector" + ) for a, b in zip(predictions_from_audio, predictions_from_deep_features): np.testing.assert_array_almost_equal(a, b, decimal=6) @@ -551,20 +645,29 @@ def test_simple_case(self): predictions_from_audio = model.classify(original_audio_data) predictions_from_deep_features = model.classify(deep_features) for a, b in zip(predictions_from_audio, predictions_from_deep_features): - self.assertEqual(a['class'], b['class']) - np.testing.assert_array_almost_equal(a['probability'], b['probability'], decimal=6) + self.assertEqual(a["class"], b["class"]) + np.testing.assert_array_almost_equal( + a["probability"], b["probability"], decimal=6 + ) # Test predict_topk predictions_from_audio = model.predict_topk(original_audio_data, k=2) predictions_from_deep_features = model.predict_topk(deep_features, k=2) for a, b in zip(predictions_from_audio, predictions_from_deep_features): - self.assertEqual(a['id'], b['id']) - self.assertEqual(a['class'], b['class']) - np.testing.assert_array_almost_equal(a['probability'], b['probability'], decimal=6) + self.assertEqual(a["id"], b["id"]) + self.assertEqual(a["class"], b["class"]) + np.testing.assert_array_almost_equal( + a["probability"], b["probability"], decimal=6 + ) # Test evaluate - predictions_from_audio = model.evaluate(tc.SFrame({'features': original_audio_data, - 'labels': data['labels']})) - predictions_from_deep_features = model.evaluate(tc.SFrame({'features': deep_features, - 'labels': data['labels']})) - self.assertEqual(predictions_from_audio['f1_score'], predictions_from_deep_features['f1_score']) + predictions_from_audio = model.evaluate( + tc.SFrame({"features": original_audio_data, "labels": data["labels"]}) + ) + predictions_from_deep_features = model.evaluate( + tc.SFrame({"features": deep_features, "labels": data["labels"]}) + ) + self.assertEqual( + predictions_from_audio["f1_score"], + predictions_from_deep_features["f1_score"], + ) diff --git a/src/python/turicreate/test/test_boosted_trees.py b/src/python/turicreate/test/test_boosted_trees.py index 8ee3f689c5..168017070e 100644 --- a/src/python/turicreate/test/test_boosted_trees.py +++ b/src/python/turicreate/test/test_boosted_trees.py @@ -22,49 +22,49 @@ import os as _os dirname = _os.path.dirname(__file__) -mushroom_dataset = _os.path.join(dirname, 'mushroom.csv') +mushroom_dataset = _os.path.join(dirname, "mushroom.csv") _DEFAULT_OPTIONS_REGRESSION = { -'step_size': 0.3, -'max_depth': 6, -'max_iterations': 10, -'min_child_weight': 0.1, -'min_loss_reduction': 0.0, -'row_subsample': 1.0, -'column_subsample': 1.0, -'random_seed': None, -'metric': 'auto', -'early_stopping_rounds': None, -'model_checkpoint_interval': 5, -'model_checkpoint_path': None, -'resume_from_checkpoint': None, + "step_size": 0.3, + "max_depth": 6, + "max_iterations": 10, + "min_child_weight": 0.1, + "min_loss_reduction": 0.0, + "row_subsample": 1.0, + "column_subsample": 1.0, + "random_seed": None, + "metric": "auto", + "early_stopping_rounds": None, + "model_checkpoint_interval": 5, + "model_checkpoint_path": None, + "resume_from_checkpoint": None, } _DEFAULT_OPTIONS_CLASSIFIER = copy.deepcopy(_DEFAULT_OPTIONS_REGRESSION) -_DEFAULT_OPTIONS_CLASSIFIER['class_weights'] = None +_DEFAULT_OPTIONS_CLASSIFIER["class_weights"] = None class BoostedTreesRegressionTest(unittest.TestCase): - @classmethod def setUpClass(self): self.data = tc.SFrame.read_csv(mushroom_dataset) - self.data['label'] = (self.data['label'] == 'p') + 20 + self.data["label"] = (self.data["label"] == "p") + 20 self.dtrain, self.dtest = self.data.random_split(0.8, seed=1) - self.param = {'max_depth': 3, - 'step_size': 1, - 'min_loss_reduction': 1, - 'max_iterations': 10, - 'min_child_weight': 1} - self.target = 'label' + self.param = { + "max_depth": 3, + "step_size": 1, + "min_loss_reduction": 1, + "max_iterations": 10, + "min_child_weight": 1, + } + self.target = "label" self.unpacked_features = self.data.column_names() self.unpacked_features.remove(self.target) self.features = self.unpacked_features[:] - self.model = tc.boosted_trees_regression.create(self.dtrain, - target=self.target, - validation_set=self.dtest, - **self.param) + self.model = tc.boosted_trees_regression.create( + self.dtrain, target=self.target, validation_set=self.dtest, **self.param + ) self.def_opts = copy.deepcopy(_DEFAULT_OPTIONS_REGRESSION) self.opts = self.def_opts.copy() @@ -73,61 +73,66 @@ def setUpClass(self): # Answers # ------------------------------------------------------------------------ self.get_ans = { - 'column_subsample': lambda x: self.opts['column_subsample'], - 'unpacked_features': lambda x: x == self.unpacked_features, - 'features': lambda x: x == self.features, - 'max_depth': lambda x: x == self.opts['max_depth'], - 'min_child_weight': lambda x: x == self.opts['min_child_weight'], - 'min_loss_reduction': lambda x: x == self.opts['min_loss_reduction'], - 'num_examples': lambda x: x == self.dtrain.num_rows(), - 'num_unpacked_features': lambda x: x == 22, - 'num_features': lambda x: x == 22, - 'max_iterations': lambda x: x == self.opts['max_iterations'], - 'num_trees': lambda x: x == self.opts['max_iterations'], - 'num_validation_examples': lambda x: x == self.dtest.num_rows(), - 'row_subsample': lambda x: x == self.opts['row_subsample'], - 'step_size': lambda x: x == self.opts['step_size'], - 'target': lambda x: x == self.target, - 'training_rmse': lambda x: x > 0, - 'training_max_error': lambda x: x > 0, - 'training_time': lambda x: x >= 0, - 'trees_json': lambda x: isinstance(x, list), - 'validation_data': lambda x: isinstance(x, tc.SFrame) and len(x) == len(self.dtest), - 'validation_rmse': lambda x: x > 0, - 'validation_max_error': lambda x: x > 0, - 'random_seed': lambda x: x is None, - 'progress': lambda x: isinstance(x, tc.SFrame) or (x is None), - 'metric': lambda x: x == 'auto', - 'early_stopping_rounds': lambda x: x is None, - 'model_checkpoint_interval': lambda x: x == 5, - 'model_checkpoint_path': lambda x: x is None, - 'resume_from_checkpoint': lambda x: x is None, - 'disable_posttrain_evaluation' : lambda x: x == False, - } + "column_subsample": lambda x: self.opts["column_subsample"], + "unpacked_features": lambda x: x == self.unpacked_features, + "features": lambda x: x == self.features, + "max_depth": lambda x: x == self.opts["max_depth"], + "min_child_weight": lambda x: x == self.opts["min_child_weight"], + "min_loss_reduction": lambda x: x == self.opts["min_loss_reduction"], + "num_examples": lambda x: x == self.dtrain.num_rows(), + "num_unpacked_features": lambda x: x == 22, + "num_features": lambda x: x == 22, + "max_iterations": lambda x: x == self.opts["max_iterations"], + "num_trees": lambda x: x == self.opts["max_iterations"], + "num_validation_examples": lambda x: x == self.dtest.num_rows(), + "row_subsample": lambda x: x == self.opts["row_subsample"], + "step_size": lambda x: x == self.opts["step_size"], + "target": lambda x: x == self.target, + "training_rmse": lambda x: x > 0, + "training_max_error": lambda x: x > 0, + "training_time": lambda x: x >= 0, + "trees_json": lambda x: isinstance(x, list), + "validation_data": lambda x: isinstance(x, tc.SFrame) + and len(x) == len(self.dtest), + "validation_rmse": lambda x: x > 0, + "validation_max_error": lambda x: x > 0, + "random_seed": lambda x: x is None, + "progress": lambda x: isinstance(x, tc.SFrame) or (x is None), + "metric": lambda x: x == "auto", + "early_stopping_rounds": lambda x: x is None, + "model_checkpoint_interval": lambda x: x == 5, + "model_checkpoint_path": lambda x: x is None, + "resume_from_checkpoint": lambda x: x is None, + "disable_posttrain_evaluation": lambda x: x == False, + } self.metrics = ["rmse", "max_error"] self.fields_ans = self.get_ans.keys() def test_create(self): - model =tc.boosted_trees_regression.create(self.dtrain, target='label', - validation_set=self.dtest, - **self.param) + model = tc.boosted_trees_regression.create( + self.dtrain, target="label", validation_set=self.dtest, **self.param + ) - rmse = model.evaluate(self.dtest, 'rmse')['rmse'] + rmse = model.evaluate(self.dtest, "rmse")["rmse"] self.assertTrue(model is not None) self.assertTrue(rmse < 0.1) dtrain = self.dtrain - dtrain['label'] = 10 - self.assertRaises(ToolkitError, - lambda:tc.boosted_trees_regression.create(self.dtrain, - target='label_wrong', **self.param)) + dtrain["label"] = 10 + self.assertRaises( + ToolkitError, + lambda: tc.boosted_trees_regression.create( + self.dtrain, target="label_wrong", **self.param + ), + ) + def test__list_fields(self): """ Check the _list_fields function. Compare with the answer. """ model = self.model - fields = model._list_fields() + fields = model._list_fields() self.assertEqual(set(fields), set(self.fields_ans)) def test_get(self): @@ -138,8 +143,10 @@ def test_get(self): model = self.model for field in self.fields_ans: ans = model._get(field) - self.assertTrue(self.get_ans[field](ans), \ - '''Get failed in field {}. Output was {}.'''.format(field, ans)) + self.assertTrue( + self.get_ans[field](ans), + """Get failed in field {}. Output was {}.""".format(field, ans), + ) def test_summary(self): """ @@ -154,14 +161,14 @@ def test_repr(self): Check the repr function. """ model = self.model - ans = str(model) + ans = str(model) self.assertTrue(type(ans) == str) def test_save_and_load(self): """ Make sure saving and loading retains things. """ - filename = 'save_file%s' % (str(uuid.uuid4())) + filename = "save_file%s" % (str(uuid.uuid4())) self.model.save(filename) self.model = tc.load_model(filename) @@ -190,10 +197,11 @@ def test_predict(self): y1 = self.model.predict(self.dtest) self.assertTrue(len(y1) == len(self.dtest)) print(self.model.evaluate(self.dtest)) - print('check the result of evaluate and print history, they should match') + print("check the result of evaluate and print history, they should match") y2 = self.model.predict( - self.dtest[[c for c in self.dtest.column_names() if c != 'label']]) + self.dtest[[c for c in self.dtest.column_names() if c != "label"]] + ) self.assertTrue(all((y1 - y2) * (y1 - y2) < 1e-10)) def test_evaluate(self): @@ -204,23 +212,25 @@ def test_evaluate(self): t = self.dtrain[self.target] p = model.predict(self.dtrain) self.sm_metrics = { - "max_error" : evaluation.max_error(t, p), - "rmse" : evaluation.rmse(t, p) + "max_error": evaluation.max_error(t, p), + "rmse": evaluation.rmse(t, p), } + def check_metric(ans, metric): self.assertTrue(ans is not None) self.assertTrue(metric in ans) - self.assertAlmostEqual(ans[metric], - self.sm_metrics[metric], - places = 4, - msg = "%s = (%s,%s)" % \ - (metric, ans[metric], self.sm_metrics[metric])) + self.assertAlmostEqual( + ans[metric], + self.sm_metrics[metric], + places=4, + msg="%s = (%s,%s)" % (metric, ans[metric], self.sm_metrics[metric]), + ) # Default ans = model.evaluate(self.dtrain) self.assertEqual(sorted(ans.keys()), sorted(self.metrics)) for m in self.metrics: - check_metric(ans, m) + check_metric(ans, m) # Individual for m in self.metrics: @@ -238,9 +248,24 @@ def test_feature_importance(self): self.assertEqual(sf.column_names(), ["name", "index", "count"]) def test_trees_json(self): - tree_0_vert_0 = eval(self.model.trees_json[0])['vertices'][0] - self.assertEquals(set(tree_0_vert_0.keys()), - set(['name','value_hexadecimal','yes_child','cover','missing_child','no_child','type','id','value','gain'])) + tree_0_vert_0 = eval(self.model.trees_json[0])["vertices"][0] + self.assertEquals( + set(tree_0_vert_0.keys()), + set( + [ + "name", + "value_hexadecimal", + "yes_child", + "cover", + "missing_child", + "no_child", + "type", + "id", + "value", + "gain", + ] + ), + ) def test_list_and_dict_type(self): rmse_threshold = 0.2 @@ -250,85 +275,98 @@ def test_list_and_dict_type(self): # make a more complicated dataset containing list and dictionary type columns complex_data = copy.copy(simple_data) - complex_data['random_list_noise'] = \ - tc.SArray([[random.gauss(0, 1) for j in range(3)] for i in range(complex_data.num_rows())]) - complex_data['random_dict_noise'] = \ - tc.SArray([{'x0': random.gauss(0, 1)} for i in range(complex_data.num_rows())]) + complex_data["random_list_noise"] = tc.SArray( + [ + [random.gauss(0, 1) for j in range(3)] + for i in range(complex_data.num_rows()) + ] + ) + complex_data["random_dict_noise"] = tc.SArray( + [{"x0": random.gauss(0, 1)} for i in range(complex_data.num_rows())] + ) complex_train, complex_test = complex_data.random_split(0.8, seed=1) - for (train, test) in [(simple_train, simple_test), (complex_train, complex_test)]: + for (train, test) in [ + (simple_train, simple_test), + (complex_train, complex_test), + ]: self._test_regression_model(train, test, rmse_threshold) - def _test_regression_model(self, train, test, rmse_threshold, target='label'): + def _test_regression_model(self, train, test, rmse_threshold, target="label"): # create - model =tc.boosted_trees_regression.create(train, target=target, - validation_set=test, - **self.param) + model = tc.boosted_trees_regression.create( + train, target=target, validation_set=test, **self.param + ) # predict pred = model.predict(test) rmse = evaluation.rmse(pred, test[target]) self.assertLess(rmse, rmse_threshold) # evaluate - rmse_eval = model.evaluate(test, metric='rmse')['rmse'] + rmse_eval = model.evaluate(test, metric="rmse")["rmse"] self.assertTrue(rmse_eval < rmse_threshold) self.assertAlmostEqual(rmse_eval, rmse, delta=1e-2) - def test_predict_new_category(self): # make new categorical feature new_test = self.dtest[:] # change 'r' cap-color into a new color 'z' - new_test['cap-color'] = new_test['cap-color'].apply(lambda x: 'z' if x == 'r' else x) + new_test["cap-color"] = new_test["cap-color"].apply( + lambda x: "z" if x == "r" else x + ) y1 = self.model.predict(new_test) new_data = self.data[:] - new_data['dict_color_feature'] = \ - new_data['cap-color'].apply(lambda x: {'cap-color': ord(x)}) + new_data["dict_color_feature"] = new_data["cap-color"].apply( + lambda x: {"cap-color": ord(x)} + ) train, test = new_data.random_split(0.8, seed=1) ## add a new key to dictionary in predict time - test['dict_color_feature'] = test['dict_color_feature'].apply( - lambda x: dict(list(x.items()) + [('cap-color2', x['cap-color']+1)])) + test["dict_color_feature"] = test["dict_color_feature"].apply( + lambda x: dict(list(x.items()) + [("cap-color2", x["cap-color"] + 1)]) + ) - model =tc.boosted_trees_regression.create(train, target='label', - **self.param) + model = tc.boosted_trees_regression.create(train, target="label", **self.param) y = self.model.predict(test) + ## --------------------------------------------------------------------------- ## ## Boosted Trees Classifier Test ## ## --------------------------------------------------------------------------- + def test_suite_boosted_trees_classifier(): """ Create a test suite for each test case in the BoostedTreesClassifierTest. """ testCases = [ - binary_classification_integer_target, - binary_classification_string_target, - binary_classification_string_target_misc_input, - multiclass_classification_integer_target, - multiclass_classification_string_target, - multiclass_classification_string_target_misc_input, - ] + binary_classification_integer_target, + binary_classification_string_target, + binary_classification_string_target_misc_input, + multiclass_classification_integer_target, + multiclass_classification_string_target, + multiclass_classification_string_target_misc_input, + ] for t in testCases: testcase_members = {} testcase_members[t.__name__] = classmethod(t) testcase_class = type( - 'BoostedTreesClassifierTest_%s' % t.__name__, + "BoostedTreesClassifierTest_%s" % t.__name__, (BoostedTreesClassifierTest,), - testcase_members + testcase_members, ) testcase_class.__test__ = True getattr(testcase_class, t.__name__)() for method in dir(testcase_class): - if method.startswith('test_'): + if method.startswith("test_"): testcase_instance = testcase_class(method) getattr(testcase_instance, method)() + def binary_classification_integer_target(cls): """ Binary classification with an integer target. @@ -336,21 +374,22 @@ def binary_classification_integer_target(cls): # Get the data from the mushroom dataset. cls.data = tc.SFrame.read_csv(mushroom_dataset) cls.dtrain, cls.dtest = cls.data.random_split(0.8, seed=1) - cls.dtrain['label'] = cls.dtrain['label'] == 'p' - cls.dtest['label'] = cls.dtest['label'] == 'p' - cls.param = {'max_depth': 3, - 'step_size': 1, - 'min_loss_reduction': 1, - 'max_iterations': 2, - 'min_child_weight': 1} - cls.target = 'label' + cls.dtrain["label"] = cls.dtrain["label"] == "p" + cls.dtest["label"] = cls.dtest["label"] == "p" + cls.param = { + "max_depth": 3, + "step_size": 1, + "min_loss_reduction": 1, + "max_iterations": 2, + "min_child_weight": 1, + } + cls.target = "label" cls.unpacked_features = cls.data.column_names() cls.unpacked_features.remove(cls.target) cls.features = cls.unpacked_features[:] - cls.model = tc.boosted_trees_classifier.create(cls.dtrain, - target=cls.target, - validation_set = cls.dtest, - **cls.param) + cls.model = tc.boosted_trees_classifier.create( + cls.dtrain, target=cls.target, validation_set=cls.dtest, **cls.param + ) cls.def_opts = copy.deepcopy(_DEFAULT_OPTIONS_CLASSIFIER) cls.opts = cls.def_opts.copy() @@ -359,85 +398,89 @@ def binary_classification_integer_target(cls): # Answers # ------------------------------------------------------------------------ - if 'classes' in cls.model._list_fields(): - num_examples_per_class = {\ - c:(cls.dtrain[cls.target] == c).sum() for c in cls.model.classes} + if "classes" in cls.model._list_fields(): + num_examples_per_class = { + c: (cls.dtrain[cls.target] == c).sum() for c in cls.model.classes + } cls.get_ans = { - 'column_subsample': lambda x: cls.opts['column_subsample'], - 'unpacked_features': lambda x: x == cls.unpacked_features, - 'features': lambda x: x == cls.features, - 'max_depth': lambda x: x == cls.opts['max_depth'], - 'min_child_weight': lambda x: x == cls.opts['min_child_weight'], - 'min_loss_reduction': lambda x: x == cls.opts['min_loss_reduction'], - 'num_examples': lambda x: x == cls.dtrain.num_rows(), - 'num_examples_per_class': lambda x: x == num_examples_per_class, - 'num_classes': lambda x: x == 2, - 'classes': lambda x: x == [0,1], - 'num_unpacked_features': lambda x: x == 22, - 'num_features': lambda x: x == 22, - 'max_iterations': lambda x: x == cls.opts['max_iterations'], - 'num_trees': lambda x: x == cls.opts['max_iterations'], - 'num_validation_examples': lambda x: x == cls.dtest.num_rows(), - 'row_subsample': lambda x: x == cls.opts['row_subsample'], - 'step_size': lambda x: x == cls.opts['step_size'], - 'target': lambda x: x == cls.target, - 'training_accuracy': lambda x: x > 0, - 'training_log_loss': lambda x: x > 0, - 'training_time': lambda x: x >= 0, - 'class_weights': lambda x: x == {0:1.0, 1:1.0}, - 'trees_json': lambda x: isinstance(x, list), - 'validation_accuracy': lambda x: x > 0, - 'validation_log_loss': lambda x: x > 0, - 'random_seed': lambda x: x is None, - 'progress': lambda x: isinstance(x, tc.SFrame) or (x is None), - 'metric': lambda x: x == 'auto', - 'early_stopping_rounds': lambda x: x is None, - 'model_checkpoint_interval': lambda x: x == 5, - 'model_checkpoint_path': lambda x: x is None, - 'resume_from_checkpoint': lambda x: x is None, - 'training_auc': lambda x: x > 0, - 'training_confusion_matrix': lambda x: len(x) > 0, - 'training_f1_score': lambda x: x > 0, - 'training_precision': lambda x: x > 0, - 'training_recall': lambda x: x > 0, - 'training_report_by_class': lambda x: len(x) > 0, - 'training_roc_curve': lambda x: len(x) > 0, - 'validation_data': lambda x: isinstance(x, tc.SFrame) and len(x) == len(cls.dtest), - 'validation_auc': lambda x: x > 0, - 'validation_confusion_matrix': lambda x: len(x) > 0, - 'validation_f1_score': lambda x: x > 0, - 'validation_precision': lambda x: x > 0, - 'validation_recall': lambda x: x > 0, - 'validation_report_by_class': lambda x: len(x) > 0, - 'validation_roc_curve': lambda x: len(x) > 0, - 'disable_posttrain_evaluation' : lambda x: x == False, - } + "column_subsample": lambda x: cls.opts["column_subsample"], + "unpacked_features": lambda x: x == cls.unpacked_features, + "features": lambda x: x == cls.features, + "max_depth": lambda x: x == cls.opts["max_depth"], + "min_child_weight": lambda x: x == cls.opts["min_child_weight"], + "min_loss_reduction": lambda x: x == cls.opts["min_loss_reduction"], + "num_examples": lambda x: x == cls.dtrain.num_rows(), + "num_examples_per_class": lambda x: x == num_examples_per_class, + "num_classes": lambda x: x == 2, + "classes": lambda x: x == [0, 1], + "num_unpacked_features": lambda x: x == 22, + "num_features": lambda x: x == 22, + "max_iterations": lambda x: x == cls.opts["max_iterations"], + "num_trees": lambda x: x == cls.opts["max_iterations"], + "num_validation_examples": lambda x: x == cls.dtest.num_rows(), + "row_subsample": lambda x: x == cls.opts["row_subsample"], + "step_size": lambda x: x == cls.opts["step_size"], + "target": lambda x: x == cls.target, + "training_accuracy": lambda x: x > 0, + "training_log_loss": lambda x: x > 0, + "training_time": lambda x: x >= 0, + "class_weights": lambda x: x == {0: 1.0, 1: 1.0}, + "trees_json": lambda x: isinstance(x, list), + "validation_accuracy": lambda x: x > 0, + "validation_log_loss": lambda x: x > 0, + "random_seed": lambda x: x is None, + "progress": lambda x: isinstance(x, tc.SFrame) or (x is None), + "metric": lambda x: x == "auto", + "early_stopping_rounds": lambda x: x is None, + "model_checkpoint_interval": lambda x: x == 5, + "model_checkpoint_path": lambda x: x is None, + "resume_from_checkpoint": lambda x: x is None, + "training_auc": lambda x: x > 0, + "training_confusion_matrix": lambda x: len(x) > 0, + "training_f1_score": lambda x: x > 0, + "training_precision": lambda x: x > 0, + "training_recall": lambda x: x > 0, + "training_report_by_class": lambda x: len(x) > 0, + "training_roc_curve": lambda x: len(x) > 0, + "validation_data": lambda x: isinstance(x, tc.SFrame) + and len(x) == len(cls.dtest), + "validation_auc": lambda x: x > 0, + "validation_confusion_matrix": lambda x: len(x) > 0, + "validation_f1_score": lambda x: x > 0, + "validation_precision": lambda x: x > 0, + "validation_recall": lambda x: x > 0, + "validation_report_by_class": lambda x: len(x) > 0, + "validation_roc_curve": lambda x: len(x) > 0, + "disable_posttrain_evaluation": lambda x: x == False, + } cls.fields_ans = cls.get_ans.keys() + def binary_classification_string_target(cls): binary_classification_integer_target(cls) cls.type = str - cls.dtrain['label'] = cls.dtrain['label'].astype(str) - cls.dtest['label'] = cls.dtest['label'].astype(str) - cls.dtrain['label'] = cls.dtrain['label'] + '-cat' - cls.dtest['label'] = cls.dtest['label'] + '-cat' - cls.model = tc.boosted_trees_classifier.create(cls.dtrain, - target=cls.target, - validation_set = cls.dtest, - **cls.param) - - num_examples_per_class = {\ - c:(cls.dtrain[cls.target] == c).sum() for c in cls.model.classes} - cls.get_ans['num_examples_per_class'] = lambda x: x == num_examples_per_class - cls.get_ans['class_weights'] = lambda x: x == {'0-cat':1.0, '1-cat':1.0} - cls.get_ans['classes'] = lambda x: x == ['0-cat', '1-cat'] + cls.dtrain["label"] = cls.dtrain["label"].astype(str) + cls.dtest["label"] = cls.dtest["label"].astype(str) + cls.dtrain["label"] = cls.dtrain["label"] + "-cat" + cls.dtest["label"] = cls.dtest["label"] + "-cat" + cls.model = tc.boosted_trees_classifier.create( + cls.dtrain, target=cls.target, validation_set=cls.dtest, **cls.param + ) + + num_examples_per_class = { + c: (cls.dtrain[cls.target] == c).sum() for c in cls.model.classes + } + cls.get_ans["num_examples_per_class"] = lambda x: x == num_examples_per_class + cls.get_ans["class_weights"] = lambda x: x == {"0-cat": 1.0, "1-cat": 1.0} + cls.get_ans["classes"] = lambda x: x == ["0-cat", "1-cat"] + def binary_classification_string_target_misc_input(cls): binary_classification_string_target(cls) # Add noise columns of categorical, - noise_X = tc.util.generate_random_sframe(cls.data.num_rows(), 'vmdA') + noise_X = tc.util.generate_random_sframe(cls.data.num_rows(), "vmdA") for c in noise_X.column_names(): cls.data[c] = noise_X[c] @@ -446,47 +489,53 @@ def binary_classification_string_target_misc_input(cls): def multiclass_classification_integer_target(cls): binary_classification_integer_target(cls) + def create_multiclass_label(row): - if row['label'] == 0: + if row["label"] == 0: return 0 - elif row['cap-surface'] == 'y': + elif row["cap-surface"] == "y": return 1 else: return 2 - cls.dtrain['label'] = cls.dtrain.apply(create_multiclass_label) - cls.dtest['label'] = cls.dtest.apply(create_multiclass_label) - cls.model = tc.boosted_trees_classifier.create(cls.dtrain, - target=cls.target, - validation_set = cls.dtest, - **cls.param) - - num_examples_per_class = {c:(cls.dtrain[cls.target] == c).sum() for c in cls.model.classes} - cls.get_ans['num_examples_per_class'] = lambda x: x == num_examples_per_class - cls.get_ans['num_classes'] = lambda x: x == 3 - cls.get_ans['num_trees'] = lambda x: x == 6 - cls.get_ans['classes'] = lambda x: set(x) == set([0,1,2]) - cls.get_ans['class_weights'] = lambda x: x == {0:1.0, 1:1.0, 2:1.0} + + cls.dtrain["label"] = cls.dtrain.apply(create_multiclass_label) + cls.dtest["label"] = cls.dtest.apply(create_multiclass_label) + cls.model = tc.boosted_trees_classifier.create( + cls.dtrain, target=cls.target, validation_set=cls.dtest, **cls.param + ) + + num_examples_per_class = { + c: (cls.dtrain[cls.target] == c).sum() for c in cls.model.classes + } + cls.get_ans["num_examples_per_class"] = lambda x: x == num_examples_per_class + cls.get_ans["num_classes"] = lambda x: x == 3 + cls.get_ans["num_trees"] = lambda x: x == 6 + cls.get_ans["classes"] = lambda x: set(x) == set([0, 1, 2]) + cls.get_ans["class_weights"] = lambda x: x == {0: 1.0, 1: 1.0, 2: 1.0} + def multiclass_classification_string_target(cls): multiclass_classification_integer_target(cls) cls.type = str - cls.dtrain['label'] = cls.dtrain['label'].astype(str) - cls.dtest['label'] = cls.dtest['label'].astype(str) - cls.model = tc.boosted_trees_classifier.create(cls.dtrain, - target=cls.target, - validation_set = cls.dtest, - **cls.param) - num_examples_per_class = {c:(cls.dtrain[cls.target] == c).sum() for c in cls.model.classes} - cls.get_ans['num_examples_per_class'] = lambda x: x == num_examples_per_class - cls.get_ans['classes'] = lambda x: set(x) == set(map(str, [0,1,2])) - cls.get_ans['class_weights'] = lambda x: x == {'0':1.0, '1':1.0, '2':1.0} + cls.dtrain["label"] = cls.dtrain["label"].astype(str) + cls.dtest["label"] = cls.dtest["label"].astype(str) + cls.model = tc.boosted_trees_classifier.create( + cls.dtrain, target=cls.target, validation_set=cls.dtest, **cls.param + ) + num_examples_per_class = { + c: (cls.dtrain[cls.target] == c).sum() for c in cls.model.classes + } + cls.get_ans["num_examples_per_class"] = lambda x: x == num_examples_per_class + cls.get_ans["classes"] = lambda x: set(x) == set(map(str, [0, 1, 2])) + cls.get_ans["class_weights"] = lambda x: x == {"0": 1.0, "1": 1.0, "2": 1.0} + def multiclass_classification_string_target_misc_input(cls): multiclass_classification_string_target(cls) # Add noise columns of categorical, - noise_X = tc.util.generate_random_sframe(cls.data.num_rows(), 'vmdA') + noise_X = tc.util.generate_random_sframe(cls.data.num_rows(), "vmdA") for c in noise_X.column_names(): cls.data[c] = noise_X[c] @@ -496,25 +545,28 @@ class BoostedTreesClassifierTest(unittest.TestCase): __test__ = False def test_create(self): - model =tc.boosted_trees_classifier.create(self.dtrain, target='label', - validation_set=self.dtest, - **self.param) + model = tc.boosted_trees_classifier.create( + self.dtrain, target="label", validation_set=self.dtest, **self.param + ) self.assertTrue(model is not None) - self.assertGreater(model.evaluate(self.dtest, 'accuracy')['accuracy'], 0.9) + self.assertGreater(model.evaluate(self.dtest, "accuracy")["accuracy"], 0.9) dtrain = self.dtrain[:] - dtrain['label'] = 10 - self.assertRaises(ToolkitError, - lambda: tc.boosted_trees_classifier.create(self.dtrain, - target='label_wrong', **self.param)) + dtrain["label"] = 10 + self.assertRaises( + ToolkitError, + lambda: tc.boosted_trees_classifier.create( + self.dtrain, target="label_wrong", **self.param + ), + ) def test__list_fields(self): """ Check the _list_fields function. Compare with the answer. """ model = self.model - fields = model._list_fields() + fields = model._list_fields() self.assertEqual(set(fields), set(self.fields_ans)) def test_get(self): @@ -528,8 +580,9 @@ def test_get(self): result = self.get_ans[field](ans) if isinstance(result, tc.SArray): result = result.all() - self.assertTrue(result, \ - '''Get failed in field {}. Output was {}.'''.format(field, ans)) + self.assertTrue( + result, """Get failed in field {}. Output was {}.""".format(field, ans) + ) def test_summary(self): """ @@ -544,14 +597,14 @@ def test_repr(self): Check the repr function. """ model = self.model - ans = str(model) + ans = str(model) self.assertTrue(type(ans) == str, "Repr failed") def test_save_and_load(self): """ Make sure saving and loading retains things. """ - filename = 'save_file%s' % (str(uuid.uuid4())) + filename = "save_file%s" % (str(uuid.uuid4())) self.model.save(filename) self.model = tc.load_model(filename) @@ -576,35 +629,35 @@ def test_save_and_load(self): except: self.assertTrue(False, "Failed during save & load diagnostics") - def test_predict_topk(self): - ks = [self.model.num_classes -1, self.model.num_classes] + ks = [self.model.num_classes - 1, self.model.num_classes] for k in ks: - y1 = self.model.predict_topk(self.dtest, k=k, output_type='rank') - self.assertEqual(y1['class'].dtype, self.type) - self.assertEqual(y1['id'].dtype, int) + y1 = self.model.predict_topk(self.dtest, k=k, output_type="rank") + self.assertEqual(y1["class"].dtype, self.type) + self.assertEqual(y1["id"].dtype, int) self.assertEqual(y1.num_rows(), self.dtest.num_rows() * k) - y2 = self.model.predict_topk(self.dtest, k=k, output_type='probability') - self.assertEqual(y2['id'].dtype, int) + y2 = self.model.predict_topk(self.dtest, k=k, output_type="probability") + self.assertEqual(y2["id"].dtype, int) self.assertEqual(y2.num_rows(), self.dtest.num_rows() * k) - y3 = self.model.predict_topk(self.dtest, k=k, output_type='margin') - self.assertEqual(y3['id'].dtype, int) + y3 = self.model.predict_topk(self.dtest, k=k, output_type="margin") + self.assertEqual(y3["id"].dtype, int) self.assertEqual(y3.num_rows(), self.dtest.num_rows() * k) - self.assertTrue(all(y3[y3['class'] == 0]['margin'] == 0.0)) + self.assertTrue(all(y3[y3["class"] == 0]["margin"] == 0.0)) test_sf = tc.SFrame() - test_sf['rank'] = y1['class'] - test_sf['prob'] = y2['class'] - test_sf['margin'] = y3['class'] - test_sf['error'] = test_sf.apply(lambda x: x['rank'] != x['prob']\ - or x['rank'] != x['margin']) + test_sf["rank"] = y1["class"] + test_sf["prob"] = y2["class"] + test_sf["margin"] = y3["class"] + test_sf["error"] = test_sf.apply( + lambda x: x["rank"] != x["prob"] or x["rank"] != x["margin"] + ) - self.assertEqual(test_sf['error'].sum(), 0) + self.assertEqual(test_sf["error"].sum(), 0) def test_predict(self): @@ -614,12 +667,12 @@ def test_predict(self): self.assertEqual(y1.dtype, self.type) # Default, output_type = class - y1 = self.model.predict(self.dtest, output_type='class') + y1 = self.model.predict(self.dtest, output_type="class") self.assertEqual(len(y1), self.dtest.num_rows()) self.assertEqual(y1.dtype, self.type) # output_type = probability vector - y1 = self.model.predict(self.dtest, output_type='probability_vector') + y1 = self.model.predict(self.dtest, output_type="probability_vector") self.assertEqual(len(y1), self.dtest.num_rows()) self.assertEqual(y1.dtype, array) self.assertTrue(all(y1.apply(lambda x: abs(sum(x) - 1.0)) < 1e-5)) @@ -627,83 +680,93 @@ def test_predict(self): k = self.model.num_classes if k == 2: class_one = sorted(self.model.classes)[1] - y_class = self.model.predict(self.dtest, 'class') == class_one + y_class = self.model.predict(self.dtest, "class") == class_one - y1 = self.model.predict(self.dtest, 'margin') + y1 = self.model.predict(self.dtest, "margin") self.assertEqual(len(y1), self.dtest.num_rows()) self.assertTrue(all(y_class == (y1 > 0.0))) - y1 = self.model.predict(self.dtest, 'probability') + y1 = self.model.predict(self.dtest, "probability") self.assertEqual(len(y1), self.dtest.num_rows()) self.assertTrue(all(y_class == (y1 > 0.5))) def test_classify(self): y1 = self.model.classify(self.dtest) self.assertEqual(len(y1), len(self.dtest)) - self.assertEqual(y1['class'].dtype, self.type) - self.assertEqual(set(y1.column_names()), set(['class', 'probability'])) + self.assertEqual(y1["class"].dtype, self.type) + self.assertEqual(set(y1.column_names()), set(["class", "probability"])) def test_evaluate(self): t = self.dtrain[self.target] c = self.model.predict(self.dtrain, "class") p = self.model.predict(self.dtrain, "probability_vector") - ans_metrics = ["accuracy", "auc", "confusion_matrix", "f1_score", - "log_loss", "precision", "recall", "roc_curve"] + ans_metrics = [ + "accuracy", + "auc", + "confusion_matrix", + "f1_score", + "log_loss", + "precision", + "recall", + "roc_curve", + ] self.sm_metrics = { - "accuracy" : evaluation.accuracy(t, c), - "auc" : evaluation.auc(t, p), - "confusion_matrix" : evaluation.confusion_matrix(t, c), - "f1_score" : evaluation.f1_score(t, c), - "log_loss" : evaluation.log_loss(t, p), - "precision" : evaluation.precision(t, c), - "recall" : evaluation.recall(t, c), - "roc_curve" : evaluation.roc_curve(t, p), - } + "accuracy": evaluation.accuracy(t, c), + "auc": evaluation.auc(t, p), + "confusion_matrix": evaluation.confusion_matrix(t, c), + "f1_score": evaluation.f1_score(t, c), + "log_loss": evaluation.log_loss(t, p), + "precision": evaluation.precision(t, c), + "recall": evaluation.recall(t, c), + "roc_curve": evaluation.roc_curve(t, p), + } model = self.model + def check_cf_matrix(ans): self.assertTrue(ans is not None) - self.assertTrue('confusion_matrix' in ans) - cf = ans['confusion_matrix']\ - .sort(['target_label', 'predicted_label']) - ans_cf = self.sm_metrics['confusion_matrix']\ - .sort(['target_label', 'predicted_label']) - self.assertEqual(list(cf['count']), list(ans_cf['count'])) + self.assertTrue("confusion_matrix" in ans) + cf = ans["confusion_matrix"].sort(["target_label", "predicted_label"]) + ans_cf = self.sm_metrics["confusion_matrix"].sort( + ["target_label", "predicted_label"] + ) + self.assertEqual(list(cf["count"]), list(ans_cf["count"])) def check_roc_curve(ans): self.assertTrue(ans is not None) - self.assertTrue('roc_curve' in ans) - roc = ans['roc_curve'] + self.assertTrue("roc_curve" in ans) + roc = ans["roc_curve"] self.assertEqual(type(roc), tc.SFrame) def check_metric(ans, metric): - if metric == 'confusion_matrix': + if metric == "confusion_matrix": check_cf_matrix(ans) - elif metric == 'roc_curve': + elif metric == "roc_curve": check_roc_curve(ans) else: self.assertTrue(ans is not None) self.assertTrue(metric in ans) - self.assertAlmostEqual(ans[metric], - self.sm_metrics[metric], - places = 4, - msg = "%s = (%s,%s)" % \ - (metric, ans[metric], self.sm_metrics[metric])) + self.assertAlmostEqual( + ans[metric], + self.sm_metrics[metric], + places=4, + msg="%s = (%s,%s)" % (metric, ans[metric], self.sm_metrics[metric]), + ) # Default ans = model.evaluate(self.dtrain) self.assertEqual(sorted(ans.keys()), sorted(ans_metrics)) for m in ans_metrics: - check_metric(ans, m) + check_metric(ans, m) # Individual for m in ans_metrics: - ans = model.evaluate(self.dtrain, metric = m) + ans = model.evaluate(self.dtrain, metric=m) check_metric(ans, m) # Test evaluate with new class test_data = self.dtrain.copy().head() - test_data[self.target] = test_data[self.target].apply(lambda x: str(x) + '-new') + test_data[self.target] = test_data[self.target].apply(lambda x: str(x) + "-new") for m in ans_metrics: ans = model.evaluate(test_data, metric=m) @@ -711,10 +774,12 @@ def test_extract_features(self): y1 = self.model.extract_features(self.dtest) self.assertTrue(len(y1) == len(self.dtest)) for feature in y1: - if (self.model.num_classes == 2): + if self.model.num_classes == 2: self.assertTrue(len(feature) == self.model.max_iterations) else: - self.assertTrue(len(feature) == self.model.max_iterations * self.model.num_classes) + self.assertTrue( + len(feature) == self.model.max_iterations * self.model.num_classes + ) def test_feature_importance(self): sf = self.model.get_feature_importance() @@ -728,44 +793,56 @@ def test_list_and_dict_type(self): # make a more complicated dataset containing list and dictionary type columns complex_data = copy.copy(simple_data) - complex_data['random_list_noise'] = \ - tc.SArray([[random.gauss(0, 1) for j in range(3)] for i in range(complex_data.num_rows())]) - complex_data['random_dict_noise'] = \ - tc.SArray([{'x0': random.gauss(0, 1)} for i in range(complex_data.num_rows())]) + complex_data["random_list_noise"] = tc.SArray( + [ + [random.gauss(0, 1) for j in range(3)] + for i in range(complex_data.num_rows()) + ] + ) + complex_data["random_dict_noise"] = tc.SArray( + [{"x0": random.gauss(0, 1)} for i in range(complex_data.num_rows())] + ) complex_train, complex_test = complex_data.random_split(0.8, seed=1) - for (train, test) in [(simple_train, simple_test), (complex_train, complex_test)]: + for (train, test) in [ + (simple_train, simple_test), + (complex_train, complex_test), + ]: self._test_classifier_model(train, test, accuracy_threshold) - def _test_classifier_model(self, train, test, accuracy_threshold, target='label'): + def _test_classifier_model(self, train, test, accuracy_threshold, target="label"): # create - model = tc.boosted_trees_classifier.create(train, target=target, - validation_set=test, - **self.param) + model = tc.boosted_trees_classifier.create( + train, target=target, validation_set=test, **self.param + ) # predict - pred = model.predict(test, output_type = 'class') - accuracy = model.evaluate(test, metric='accuracy') - self.assertGreater(accuracy['accuracy'], accuracy_threshold) - + pred = model.predict(test, output_type="class") + accuracy = model.evaluate(test, metric="accuracy") + self.assertGreater(accuracy["accuracy"], accuracy_threshold) def test_predict_new_category(self): # make new categorical feature new_test = copy.copy(self.dtest) # change 'r' cap-color into a new color 'z' - new_test['cap-color'] = new_test['cap-color'].apply(lambda x: 'z' if x == 'r' else x) + new_test["cap-color"] = new_test["cap-color"].apply( + lambda x: "z" if x == "r" else x + ) y1 = self.model.predict(new_test) new_data = copy.copy(self.data) - new_data['dict_color_feature'] = \ - new_data['cap-color'].apply(lambda x: {'cap-color': ord(x)}) + new_data["dict_color_feature"] = new_data["cap-color"].apply( + lambda x: {"cap-color": ord(x)} + ) train, test = new_data.random_split(0.8, seed=1) # add a new key to dictionary in predict time - test['dict_color_feature'] = test['dict_color_feature'].apply( - lambda x: dict(list(x.items()) + list({'cap-color2': x['cap-color']+1}.items()))) + test["dict_color_feature"] = test["dict_color_feature"].apply( + lambda x: dict( + list(x.items()) + list({"cap-color2": x["cap-color"] + 1}.items()) + ) + ) - model = tc.boosted_trees_classifier.create(train, target='label', - **self.param) + model = tc.boosted_trees_classifier.create(train, target="label", **self.param) y = self.model.predict(test) def test_metric_none(self): @@ -775,7 +852,9 @@ def test_metric_none(self): simple_data = self.data simple_train, simple_test = simple_data.random_split(0.8, seed=1) - model = tc.boosted_trees_classifier.create(simple_train, target='label', disable_posttrain_evaluation = True) + model = tc.boosted_trees_classifier.create( + simple_train, target="label", disable_posttrain_evaluation=True + ) # These fields should not be present. self.assertTrue("training_confusion_matrix" not in model._list_fields()) @@ -789,25 +868,27 @@ def test_metric_none(self): class TestStringTarget(unittest.TestCase): - def test_cat(self): import numpy as np + # Arrange np.random.seed(8) n, d = 1000, 100 sf = tc.SFrame() for i in range(d): - sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) - target = np.random.randint(2, size=n) - sf['target'] = target + sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) + target = np.random.randint(2, size=n) + sf["target"] = target - sf['target'] = sf['target'].astype(str) - sf['target'] = 'cat-' + sf['target'] - model = tc.boosted_trees_classifier.create(sf, 'target') + sf["target"] = sf["target"].astype(str) + sf["target"] = "cat-" + sf["target"] + model = tc.boosted_trees_classifier.create(sf, "target") # Act evaluation = model.evaluate(sf) # Assert - self.assertEqual(['cat-0', 'cat-1'], - sorted(list(evaluation['confusion_matrix']['target_label'].unique()))) + self.assertEqual( + ["cat-0", "cat-1"], + sorted(list(evaluation["confusion_matrix"]["target_label"].unique())), + ) diff --git a/src/python/turicreate/test/test_boosted_trees_checkpoint.py b/src/python/turicreate/test/test_boosted_trees_checkpoint.py index 22de3a4542..4626f7ea24 100644 --- a/src/python/turicreate/test/test_boosted_trees_checkpoint.py +++ b/src/python/turicreate/test/test_boosted_trees_checkpoint.py @@ -17,12 +17,16 @@ class BoostedTreesRegressionCheckpointTest(unittest.TestCase): @classmethod def setUpClass(cls): - sf = tc.SFrame({'cat[1]': ['1', '1', '2', '2', '2'] * 20, - 'cat[2]': ['1', '3', '3', '1', '1'] * 20, - 'target': [random.random() for i in range(100)]}) + sf = tc.SFrame( + { + "cat[1]": ["1", "1", "2", "2", "2"] * 20, + "cat[2]": ["1", "3", "3", "1", "1"] * 20, + "target": [random.random() for i in range(100)], + } + ) cls.train, cls.test = sf.random_split(0.5, seed=5) cls.model = tc.boosted_trees_regression - cls.metrics = ['rmse', 'max_error'] + cls.metrics = ["rmse", "max_error"] return cls def setUp(self): @@ -36,59 +40,97 @@ def test_default_checkpoint_interval(self): default_interval = 5 # Train 20 iterations, and checkpoint every 5 - m = self.model.create(self.train, 'target', validation_set=self.test, max_depth=2, random_seed=1, - max_iterations=max_iterations, - model_checkpoint_path=self.checkpoint_dir) + m = self.model.create( + self.train, + "target", + validation_set=self.test, + max_depth=2, + random_seed=1, + max_iterations=max_iterations, + model_checkpoint_path=self.checkpoint_dir, + ) # Resume training checkpoint from iterations 5, 10, 15, ... for i in range(default_interval, max_iterations, default_interval): - checkpoint = os.path.join(self.checkpoint_dir, 'model_checkpoint_%d' % i) - m_resume = self.model.create(self.train, 'target', validation_set=self.test, - resume_from_checkpoint=checkpoint) + checkpoint = os.path.join(self.checkpoint_dir, "model_checkpoint_%d" % i) + m_resume = self.model.create( + self.train, + "target", + validation_set=self.test, + resume_from_checkpoint=checkpoint, + ) # Check the progress is the same as the reference model for col in m.progress.column_names(): - if col != 'Elapsed Time': - self.assertListEqual(list(m.progress[col]), list(m_resume.progress[col])) + if col != "Elapsed Time": + self.assertListEqual( + list(m.progress[col]), list(m_resume.progress[col]) + ) def test_non_default_checkpoint_interval(self): max_iterations = 5 default_interval = 2 # Train 5 iterations, and checkpoint every 2 - m = self.model.create(self.train, 'target', validation_set=self.test, max_depth=2, random_seed=1, - max_iterations=max_iterations, - model_checkpoint_path=self.checkpoint_dir, - model_checkpoint_interval=default_interval) + m = self.model.create( + self.train, + "target", + validation_set=self.test, + max_depth=2, + random_seed=1, + max_iterations=max_iterations, + model_checkpoint_path=self.checkpoint_dir, + model_checkpoint_interval=default_interval, + ) # Resume training checkpoint from iterations 2, 4 for i in range(default_interval, max_iterations, default_interval): - checkpoint = os.path.join(self.checkpoint_dir, 'model_checkpoint_%d' % i) - m_resume = self.model.create(self.train, 'target', validation_set=self.test, - resume_from_checkpoint=checkpoint) + checkpoint = os.path.join(self.checkpoint_dir, "model_checkpoint_%d" % i) + m_resume = self.model.create( + self.train, + "target", + validation_set=self.test, + resume_from_checkpoint=checkpoint, + ) # Check the progress is the same as the reference model for col in m.progress.column_names(): - if col != 'Elapsed Time': - self.assertListEqual(list(m.progress[col]), list(m_resume.progress[col])) + if col != "Elapsed Time": + self.assertListEqual( + list(m.progress[col]), list(m_resume.progress[col]) + ) def test_restore_with_different_data(self): max_iterations = 20 default_interval = 5 # Train 20 iterations, and checkpoint every 5 - m = self.model.create(self.train, 'target', validation_set=self.test, max_depth=2, random_seed=1, - max_iterations=max_iterations, - model_checkpoint_path=self.checkpoint_dir) + m = self.model.create( + self.train, + "target", + validation_set=self.test, + max_depth=2, + random_seed=1, + max_iterations=max_iterations, + model_checkpoint_path=self.checkpoint_dir, + ) # Resume training checkpoint from iterations 5, 10, 15, ... using "self.test" for i in range(default_interval, max_iterations, default_interval): - checkpoint = os.path.join(self.checkpoint_dir, 'model_checkpoint_%d' % i) - m_resume = self.model.create(self.test, 'target', validation_set=self.test, - resume_from_checkpoint=checkpoint) + checkpoint = os.path.join(self.checkpoint_dir, "model_checkpoint_%d" % i) + m_resume = self.model.create( + self.test, + "target", + validation_set=self.test, + resume_from_checkpoint=checkpoint, + ) class BoostedTreesClassifierCheckpointTest(BoostedTreesRegressionCheckpointTest): @classmethod def setUpClass(cls): - sf = tc.SFrame({'cat[1]': ['1', '1', '2', '2', '2'] * 20, - 'cat[2]': ['1', '3', '3', '1', '1'] * 20, - 'target': [0, 1] * 50}) + sf = tc.SFrame( + { + "cat[1]": ["1", "1", "2", "2", "2"] * 20, + "cat[2]": ["1", "3", "3", "1", "1"] * 20, + "target": [0, 1] * 50, + } + ) cls.train, cls.test = sf.random_split(0.5, seed=5) cls.model = tc.boosted_trees_classifier return cls diff --git a/src/python/turicreate/test/test_boosted_trees_early_stop.py b/src/python/turicreate/test/test_boosted_trees_early_stop.py index 2ff750f29e..05409b8981 100644 --- a/src/python/turicreate/test/test_boosted_trees_early_stop.py +++ b/src/python/turicreate/test/test_boosted_trees_early_stop.py @@ -15,20 +15,29 @@ class BoostedTreesRegressionEarlyStopTest(unittest.TestCase): @classmethod def setUpClass(cls): - sf = tc.SFrame({'cat[1]': ['1', '1', '2', '2', '2'] * 20, - 'cat[2]': ['1', '3', '3', '1', '1'] * 20, - 'target': [random.random() for i in range(100)]}) + sf = tc.SFrame( + { + "cat[1]": ["1", "1", "2", "2", "2"] * 20, + "cat[2]": ["1", "3", "3", "1", "1"] * 20, + "target": [random.random() for i in range(100)], + } + ) cls.train, cls.test = sf.random_split(0.5, seed=5) cls.model = tc.boosted_trees_regression - cls.metrics = ['rmse', 'max_error'] + cls.metrics = ["rmse", "max_error"] return cls - def _run_test(self, train, valid, early_stopping_rounds, metric='auto'): + def _run_test(self, train, valid, early_stopping_rounds, metric="auto"): max_iterations = 50 - m = self.model.create(train, 'target', validation_set=valid, max_depth=2, - max_iterations=max_iterations, - early_stopping_rounds=early_stopping_rounds, - metric=metric) + m = self.model.create( + train, + "target", + validation_set=valid, + max_depth=2, + max_iterations=max_iterations, + early_stopping_rounds=early_stopping_rounds, + metric=metric, + ) self.assertTrue(m.num_trees < max_iterations) def test_one_round_early_stop(self): @@ -48,16 +57,22 @@ def test_no_validation_exception(self): self.assertRaises(ToolkitError, lambda: self._run_test(self.train, None, 5)) def test_no_metric_exception(self): - self.assertRaises(ToolkitError, lambda: self._run_test(self.train, self.test, 5, metric=[])) + self.assertRaises( + ToolkitError, lambda: self._run_test(self.train, self.test, 5, metric=[]) + ) class BoostedTreesClassifierEarlyStopTest(BoostedTreesRegressionEarlyStopTest): @classmethod def setUpClass(cls): - sf = tc.SFrame({'cat[1]': ['1', '1', '2', '2', '2'] * 20, - 'cat[2]': ['1', '3', '3', '1', '1'] * 20, - 'target': [0, 1] * 50}) + sf = tc.SFrame( + { + "cat[1]": ["1", "1", "2", "2", "2"] * 20, + "cat[2]": ["1", "3", "3", "1", "1"] * 20, + "target": [0, 1] * 50, + } + ) cls.train, cls.test = sf.random_split(0.5, seed=5) cls.model = tc.boosted_trees_classifier - cls.metrics = ['accuracy', 'log_loss'] + cls.metrics = ["accuracy", "log_loss"] return cls diff --git a/src/python/turicreate/test/test_classifier.py b/src/python/turicreate/test/test_classifier.py index 88ae01415e..a9f0e6e500 100644 --- a/src/python/turicreate/test/test_classifier.py +++ b/src/python/turicreate/test/test_classifier.py @@ -16,7 +16,7 @@ class ClassifierCreateTest(unittest.TestCase): Unit test class for testing a classifier model. """ - def _test_create(self, n, d, validation_set = 'auto'): + def _test_create(self, n, d, validation_set="auto"): """ Creation test helper function. """ @@ -29,20 +29,22 @@ def _test_create(self, n, d, validation_set = 'auto'): sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) target = np.random.randn(n) - sf['target'] = target - sf['target'] = sf['target'] > 0 - model = tc.classifier.create(sf, 'target', features=None, - validation_set = validation_set) - self.assertTrue(model is not None, 'Model is None.') + sf["target"] = target + sf["target"] = sf["target"] > 0 + model = tc.classifier.create( + sf, "target", features=None, validation_set=validation_set + ) + self.assertTrue(model is not None, "Model is None.") features = sf.column_names() - features.remove('target') - model = tc.classifier.create(sf, 'target', features = features, - validation_set = validation_set) - self.assertTrue(model is not None, 'Model is None.') - self.assertTrue(isinstance(model, - tc.toolkits._supervised_learning.SupervisedLearningModel)) - + features.remove("target") + model = tc.classifier.create( + sf, "target", features=features, validation_set=validation_set + ) + self.assertTrue(model is not None, "Model is None.") + self.assertTrue( + isinstance(model, tc.toolkits._supervised_learning.SupervisedLearningModel) + ) def test_multi_class_create(self): d = 10 @@ -51,16 +53,16 @@ def test_multi_class_create(self): sf = tc.SFrame() for i in range(d): sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) - sf['target'] = [1,2,3,4]* 25 - model = tc.classifier.create(sf,'target') - self.assertTrue(isinstance(model, - tc.toolkits._supervised_learning.SupervisedLearningModel)) - + sf["target"] = [1, 2, 3, 4] * 25 + model = tc.classifier.create(sf, "target") + self.assertTrue( + isinstance(model, tc.toolkits._supervised_learning.SupervisedLearningModel) + ) def test_create(self): self._test_create(99, 10) self._test_create(100, 100) self._test_create(20000, 10) - self._test_create(99, 10, validation_set = None) - self._test_create(100, 100, validation_set = None) - self._test_create(20000, 10, validation_set = None) + self._test_create(99, 10, validation_set=None) + self._test_create(100, 100, validation_set=None) + self._test_create(20000, 10, validation_set=None) diff --git a/src/python/turicreate/test/test_cloudpickle.py b/src/python/turicreate/test/test_cloudpickle.py index cfe0c56b4f..f274e8ec05 100644 --- a/src/python/turicreate/test/test_cloudpickle.py +++ b/src/python/turicreate/test/test_cloudpickle.py @@ -14,7 +14,6 @@ class CloudPickleTest(unittest.TestCase): - def test_pickle_unity_object_exception(self): sa = tc.SArray() sf = tc.SFrame() @@ -27,12 +26,13 @@ def test_pickle_unity_object_exception(self): def test_memoize_subclass(self): class A(object): def __init__(self): - self.name = 'A' + self.name = "A" class B(A): def __init__(self): super(B, self).__init__() - self.name2 = 'B' + self.name2 = "B" + b = B() self.assertEqual(b.name, "A") self.assertEqual(b.name2, "B") diff --git a/src/python/turicreate/test/test_coreml_export.py b/src/python/turicreate/test/test_coreml_export.py index b831883146..4a3195919f 100644 --- a/src/python/turicreate/test/test_coreml_export.py +++ b/src/python/turicreate/test/test_coreml_export.py @@ -27,35 +27,47 @@ import pytest dirname = os.path.dirname(__file__) -mushroom_dataset = os.path.join(dirname, 'mushroom.csv') - -class CoreMLExportTest(unittest.TestCase): +mushroom_dataset = os.path.join(dirname, "mushroom.csv") +class CoreMLExportTest(unittest.TestCase): def generate_data(self, testtype, n, code_string): # numeric; integer, string categorical, list categorical, dictionary, # array, nd array (1 dim), nd array (4 dim). if testtype == "regression": - sf = tc.util.generate_random_regression_sframe(n, code_string, random_seed = 1) - test_sf = tc.util.generate_random_regression_sframe(n, code_string, random_seed = 2) + sf = tc.util.generate_random_regression_sframe( + n, code_string, random_seed=1 + ) + test_sf = tc.util.generate_random_regression_sframe( + n, code_string, random_seed=2 + ) elif testtype == "classification": - sf = tc.util.generate_random_classification_sframe(n, code_string, 2, random_seed = 1) - test_sf = tc.util.generate_random_classification_sframe(n, code_string, 2, random_seed = 2) + sf = tc.util.generate_random_classification_sframe( + n, code_string, 2, random_seed=1 + ) + test_sf = tc.util.generate_random_classification_sframe( + n, code_string, 2, random_seed=2 + ) elif testtype == "multiclass": - sf = tc.util.generate_random_classification_sframe(n, code_string, 10, random_seed = 1) - test_sf = tc.util.generate_random_classification_sframe(n, code_string, 10, random_seed = 2) + sf = tc.util.generate_random_classification_sframe( + n, code_string, 10, random_seed=1 + ) + test_sf = tc.util.generate_random_classification_sframe( + n, code_string, 10, random_seed=2 + ) else: assert False return sf, test_sf - - def _test_coreml_export(self, model, test_sf, is_regression, has_probability = None, predict_topk = None): + def _test_coreml_export( + self, model, test_sf, is_regression, has_probability=None, predict_topk=None + ): if has_probability is None: has_probability = not is_regression @@ -64,14 +76,16 @@ def _test_coreml_export(self, model, test_sf, is_regression, has_probability = N predict_topk = not is_regression # Act & Assert - with tempfile.NamedTemporaryFile(mode='w', suffix = '.mlmodel') as mlmodel_file: + with tempfile.NamedTemporaryFile(mode="w", suffix=".mlmodel") as mlmodel_file: mlmodel_filename = mlmodel_file.name model.export_coreml(mlmodel_filename) coreml_model = coremltools.models.MLModel(mlmodel_filename) - self.assertDictEqual({ - 'com.github.apple.turicreate.version': tc.__version__, - 'com.github.apple.os.platform': platform.platform(), - }, dict(coreml_model.user_defined_metadata) + self.assertDictEqual( + { + "com.github.apple.turicreate.version": tc.__version__, + "com.github.apple.os.platform": platform.platform(), + }, + dict(coreml_model.user_defined_metadata), ) if _mac_ver() < (10, 13): @@ -82,6 +96,7 @@ def array_to_numpy(row): import array import numpy import copy + row = copy.copy(row) for r in row: if type(row[r]) == array.array: @@ -97,15 +112,21 @@ def array_to_numpy(row): if not has_probability: self.assertEqual(coreml_prediction["target"], tc_prediction) else: - self.assertAlmostEqual(coreml_prediction["target"], tc_prediction, delta = 1e-5) + self.assertAlmostEqual( + coreml_prediction["target"], tc_prediction, delta=1e-5 + ) # If applicable, compare probabilistic output if has_probability and not is_regression: coreml_ret = coreml_prediction["targetProbability"] _, values_tuple = zip(*sorted(coreml_ret.items())) coreml_probs = np.array(values_tuple) - tc_probs = np.array(model.predict(row, output_type='probability_vector')[0]) - np.testing.assert_array_almost_equal(coreml_probs, tc_probs, decimal=5) + tc_probs = np.array( + model.predict(row, output_type="probability_vector")[0] + ) + np.testing.assert_array_almost_equal( + coreml_probs, tc_probs, decimal=5 + ) ############################################################# # Regression @@ -113,133 +134,146 @@ def array_to_numpy(row): def test_linear_regression(self): if _mac_ver() < (10, 14): pytest.xfail("See https://github.com/apple/turicreate/issues/1332") - for code_string in ["b"*40, "nnnn", "v", "d", "A", "bnsCvAd"]: + for code_string in ["b" * 40, "nnnn", "v", "d", "A", "bnsCvAd"]: train, test = self.generate_data("regression", 100, code_string) - model = tc.linear_regression.create(train, "target", validation_set = None) + model = tc.linear_regression.create(train, "target", validation_set=None) model.evaluate(test) # Previous regression -- this caused errors. self._test_coreml_export(model, test, True) def test_decision_tree_regression_simple(self): for code_string in ["nnnn", "v"]: train, test = self.generate_data("regression", 100, code_string) - model = tc.decision_tree_regression.create(train, "target", validation_set = None) + model = tc.decision_tree_regression.create( + train, "target", validation_set=None + ) model.evaluate(test) # Previous regression -- this caused errors. self._test_coreml_export(model, test, True) - @pytest.mark.xfail() def test_decision_tree_regression_advanced(self): - for code_string in ["b"*40, "sss", "d", "v", "Ad", "bnsCvAd"]: + for code_string in ["b" * 40, "sss", "d", "v", "Ad", "bnsCvAd"]: train, test = self.generate_data("regression", 100, code_string) - model = tc.decision_tree_regression.create(train, "target", validation_set = None) + model = tc.decision_tree_regression.create( + train, "target", validation_set=None + ) model.evaluate(test) # Previous regression -- this caused errors. self._test_coreml_export(model, test, True) - def test_boosted_trees_regression_simple(self): for code_string in ["nnnn", "v"]: train, test = self.generate_data("regression", 100, code_string) - model = tc.boosted_trees_regression.create(train, "target", validation_set = None) + model = tc.boosted_trees_regression.create( + train, "target", validation_set=None + ) model.evaluate(test) # Previous regression -- this caused errors. self._test_coreml_export(model, test, True) - @pytest.mark.xfail() def test_boosted_trees_regression_advanced(self): - for code_string in ["b"*40, "sss", "d", "v", "Ad", "bnsCvAd"]: + for code_string in ["b" * 40, "sss", "d", "v", "Ad", "bnsCvAd"]: train, test = self.generate_data("regression", 100, code_string) - model = tc.boosted_trees_regression.create(train, "target", validation_set = None) + model = tc.boosted_trees_regression.create( + train, "target", validation_set=None + ) model.evaluate(test) # Previous regression -- this caused errors. self._test_coreml_export(model, test, True) - @pytest.mark.xfail() def test_random_forest_regression_simple(self): for code_string in ["nnnn", "bns", "sss"]: train, test = self.generate_data("regression", 100, code_string) - model = tc.random_forest_regression.create(train, "target", validation_set = None) + model = tc.random_forest_regression.create( + train, "target", validation_set=None + ) model.evaluate(test) # Previous regression -- this caused errors. self._test_coreml_export(model, test, True) - @pytest.mark.xfail() def test_random_forest_regression_advanced(self): - for code_string in ["b"*40, "d", "v", "Ad", "bnsCvAd"]: + for code_string in ["b" * 40, "d", "v", "Ad", "bnsCvAd"]: train, test = self.generate_data("regression", 100, code_string) - model = tc.random_forest_regression.create(train, "target", validation_set = None) + model = tc.random_forest_regression.create( + train, "target", validation_set=None + ) model.evaluate(test) # Previous regression -- this caused errors. self._test_coreml_export(model, test, True) - ############################################################# # Classification def test_logistic_classifier(self): if _mac_ver() < (10, 14): pytest.xfail("See https://github.com/apple/turicreate/issues/1332") - for code_string in ["b"*40, "nnnn", "v", "d", "A", "bnsCvAd"]: + for code_string in ["b" * 40, "nnnn", "v", "d", "A", "bnsCvAd"]: train, test = self.generate_data("classification", 100, code_string) - model = tc.logistic_classifier.create(train, "target", validation_set = None) + model = tc.logistic_classifier.create(train, "target", validation_set=None) model.evaluate(test) # Previous regression -- this caused errors. self._test_coreml_export(model, test, False) def test_svm_classifier(self): if _mac_ver() < (10, 14): pytest.xfail("See https://github.com/apple/turicreate/issues/1332") - for code_string in ["b"*40, "nnnn", "v", "d", "A", "bnsCvAd"]: + for code_string in ["b" * 40, "nnnn", "v", "d", "A", "bnsCvAd"]: train, test = self.generate_data("classification", 100, code_string) - model = tc.svm_classifier.create(train, "target", validation_set = None) + model = tc.svm_classifier.create(train, "target", validation_set=None) model.evaluate(test) # Previous regression -- this caused errors. - self._test_coreml_export(model, test, False, has_probability = False) + self._test_coreml_export(model, test, False, has_probability=False) def test_decision_tree_classifier_simple(self): for code_string in ["nn"]: train, test = self.generate_data("classification", 100, code_string) - model = tc.decision_tree_classifier.create(train, "target", validation_set = None, max_depth=3) + model = tc.decision_tree_classifier.create( + train, "target", validation_set=None, max_depth=3 + ) model.evaluate(test) # Previous classifier -- this caused errors. self._test_coreml_export(model, test, False) - @pytest.mark.xfail() def test_decision_tree_classifier_advanced(self): - for code_string in ["b"*40, "nnnn", "sss", "d", "v", "Ad", "bnsCvAd"]: + for code_string in ["b" * 40, "nnnn", "sss", "d", "v", "Ad", "bnsCvAd"]: train, test = self.generate_data("classification", 100, code_string) - model = tc.decision_tree_classifier.create(train, "target", validation_set = None, max_depth=3) + model = tc.decision_tree_classifier.create( + train, "target", validation_set=None, max_depth=3 + ) model.evaluate(test) # Previous classifier -- this caused errors. self._test_coreml_export(model, test, False) - def test_boosted_trees_classifier_simple(self): for code_string in ["nn"]: train, test = self.generate_data("classification", 100, code_string) - model = tc.boosted_trees_classifier.create(train, "target", validation_set = None, max_depth=3, max_iterations = 5) + model = tc.boosted_trees_classifier.create( + train, "target", validation_set=None, max_depth=3, max_iterations=5 + ) model.evaluate(test) # Previous classifier -- this caused errors. self._test_coreml_export(model, test, False) - @pytest.mark.xfail() def test_boosted_trees_classifier_advanced(self): - for code_string in ["b"*40, "nnnn", "sss", "d", "v", "Ad", "bnsCvAd"]: + for code_string in ["b" * 40, "nnnn", "sss", "d", "v", "Ad", "bnsCvAd"]: train, test = self.generate_data("classification", 100, code_string) - model = tc.boosted_trees_classifier.create(train, "target", validation_set = None, max_depth=3, max_iterations = 5) + model = tc.boosted_trees_classifier.create( + train, "target", validation_set=None, max_depth=3, max_iterations=5 + ) model.evaluate(test) # Previous classifier -- this caused errors. self._test_coreml_export(model, test, False) - @pytest.mark.xfail() def test_random_forest_classifier_simple(self): for code_string in ["nnnn", "bns", "sss"]: train, test = self.generate_data("classification", 100, code_string) - model = tc.random_forest_classifier.create(train, "target", validation_set = None, max_depth=3, max_iterations = 5) + model = tc.random_forest_classifier.create( + train, "target", validation_set=None, max_depth=3, max_iterations=5 + ) model.evaluate(test) # Previous classifier -- this caused errors. self._test_coreml_export(model, test, False) - @pytest.mark.xfail() def test_random_forest_classifier_advanced(self): - for code_string in ["b"*40, "d", "v", "Ad", "bnsCvAd"]: + for code_string in ["b" * 40, "d", "v", "Ad", "bnsCvAd"]: train, test = self.generate_data("classification", 100, code_string) - model = tc.random_forest_classifier.create(train, "target", validation_set = None, max_depth=3, max_iterations = 5) + model = tc.random_forest_classifier.create( + train, "target", validation_set=None, max_depth=3, max_iterations=5 + ) model.evaluate(test) # Previous classifier -- this caused errors. self._test_coreml_export(model, test, False) @@ -249,135 +283,148 @@ def test_random_forest_classifier_advanced(self): def test_logistic_multiclass(self): if _mac_ver() < (10, 14): pytest.xfail("See https://github.com/apple/turicreate/issues/1332") - for code_string in ["b"*40, "nnnn", "v", "d", "A", "bnsCvAd"]: + for code_string in ["b" * 40, "nnnn", "v", "d", "A", "bnsCvAd"]: train, test = self.generate_data("multiclass", 100, code_string) - model = tc.logistic_classifier.create(train, "target", validation_set = None, max_iterations = 5) + model = tc.logistic_classifier.create( + train, "target", validation_set=None, max_iterations=5 + ) model.evaluate(test) # Previous regression -- this caused errors. self._test_coreml_export(model, test, False) def test_decision_tree_multiclass_simple(self): for code_string in ["nn"]: train, test = self.generate_data("multiclass", 100, code_string) - model = tc.decision_tree_classifier.create(train, "target", validation_set = None, max_depth=3) + model = tc.decision_tree_classifier.create( + train, "target", validation_set=None, max_depth=3 + ) model.evaluate(test) # Previous classifier -- this caused errors. self._test_coreml_export(model, test, False) - @pytest.mark.xfail() def test_decision_tree_multiclass_advanced(self): - for code_string in ["b"*40, "nnnn", "sss", "d", "v", "Ad", "bnsCvAd"]: + for code_string in ["b" * 40, "nnnn", "sss", "d", "v", "Ad", "bnsCvAd"]: train, test = self.generate_data("multiclass", 100, code_string) - model = tc.decision_tree_classifier.create(train, "target", validation_set = None,max_depth=3) + model = tc.decision_tree_classifier.create( + train, "target", validation_set=None, max_depth=3 + ) model.evaluate(test) # Previous classifier -- this caused errors. self._test_coreml_export(model, test, False) - def test_boosted_trees_multiclass_simple(self): for code_string in ["nn"]: train, test = self.generate_data("multiclass", 100, code_string) - model = tc.boosted_trees_classifier.create(train, "target", validation_set = None,max_depth=3, max_iterations = 5) + model = tc.boosted_trees_classifier.create( + train, "target", validation_set=None, max_depth=3, max_iterations=5 + ) model.evaluate(test) # Previous classifier -- this caused errors. self._test_coreml_export(model, test, False) - @pytest.mark.xfail() def test_boosted_trees_multiclass_advanced(self): - for code_string in ["b"*40, "nnnn", "sss", "d", "v", "Ad", "bnsCvAd"]: + for code_string in ["b" * 40, "nnnn", "sss", "d", "v", "Ad", "bnsCvAd"]: train, test = self.generate_data("multiclass", 100, code_string) - model = tc.boosted_trees_classifier.create(train, "target", validation_set = None, max_depth=3, max_iterations = 5) + model = tc.boosted_trees_classifier.create( + train, "target", validation_set=None, max_depth=3, max_iterations=5 + ) model.evaluate(test) # Previous classifier -- this caused errors. self._test_coreml_export(model, test, False) - @pytest.mark.xfail() def test_random_forest_multiclass_simple(self): for code_string in ["nnnn", "bns", "sss"]: train, test = self.generate_data("multiclass", 100, code_string) - model = tc.random_forest_classifier.create(train, "target", validation_set = None,max_depth=3, max_iterations = 5) + model = tc.random_forest_classifier.create( + train, "target", validation_set=None, max_depth=3, max_iterations=5 + ) model.evaluate(test) # Previous classifier -- this caused errors. self._test_coreml_export(model, test, False) - @pytest.mark.xfail() def test_random_forest_multiclass_advanced(self): - for code_string in ["b"*40, "d", "v", "Ad", "bnsCvAd"]: + for code_string in ["b" * 40, "d", "v", "Ad", "bnsCvAd"]: train, test = self.generate_data("multiclass", 100, code_string) - model = tc.random_forest_classifier.create(train, "target", validation_set = None, max_depth=3, max_iterations = 5) + model = tc.random_forest_classifier.create( + train, "target", validation_set=None, max_depth=3, max_iterations=5 + ) model.evaluate(test) # Previous classifier -- this caused errors. self._test_coreml_export(model, test, False) - ############################################################# # Muliclass with few examples; this gaurantees that some # classes in the test set won't overlap. - - def test_logistic_multiclass_tiny(self): if _mac_ver() < (10, 14): pytest.xfail("See https://github.com/apple/turicreate/issues/1332") - for code_string in ["b"*40, "nnnn", "v", "d", "A", "bnsCvAd"]: + for code_string in ["b" * 40, "nnnn", "v", "d", "A", "bnsCvAd"]: train, test = self.generate_data("multiclass", 8, code_string) - model = tc.logistic_classifier.create(train, "target", validation_set = None) + model = tc.logistic_classifier.create(train, "target", validation_set=None) model.evaluate(test) # Previous regression -- this caused errors. self._test_coreml_export(model, test, False) def test_decision_tree_multiclass_simple_tiny(self): for code_string in ["nn"]: train, test = self.generate_data("multiclass", 8, code_string) - model = tc.decision_tree_classifier.create(train, "target", validation_set = None, max_depth=3) + model = tc.decision_tree_classifier.create( + train, "target", validation_set=None, max_depth=3 + ) model.evaluate(test) # Previous classifier -- this caused errors. self._test_coreml_export(model, test, False) - @pytest.mark.xfail() def test_decision_tree_multiclass_advanced_tiny(self): - for code_string in ["b"*40, "nnnn", "sss", "d", "v", "Ad", "bnsCvAd"]: + for code_string in ["b" * 40, "nnnn", "sss", "d", "v", "Ad", "bnsCvAd"]: train, test = self.generate_data("multiclass", 8, code_string) - model = tc.decision_tree_classifier.create(train, "target", validation_set = None,max_depth=3) + model = tc.decision_tree_classifier.create( + train, "target", validation_set=None, max_depth=3 + ) model.evaluate(test) # Previous classifier -- this caused errors. self._test_coreml_export(model, test, False) - def test_boosted_trees_multiclass_simple_tiny(self): for code_string in ["nn"]: train, test = self.generate_data("multiclass", 8, code_string) - model = tc.boosted_trees_classifier.create(train, "target", validation_set = None, max_depth=3, max_iterations = 5) + model = tc.boosted_trees_classifier.create( + train, "target", validation_set=None, max_depth=3, max_iterations=5 + ) model.evaluate(test) # Previous classifier -- this caused errors. self._test_coreml_export(model, test, False) - @pytest.mark.xfail() def test_boosted_trees_multiclass_advanced_tiny(self): - for code_string in ["b"*40, "nnnn", "sss", "d", "v", "Ad", "bnsCvAd"]: + for code_string in ["b" * 40, "nnnn", "sss", "d", "v", "Ad", "bnsCvAd"]: train, test = self.generate_data("multiclass", 8, code_string) - model = tc.boosted_trees_classifier.create(train, "target", validation_set = None, max_depth=3, max_iterations = 5) + model = tc.boosted_trees_classifier.create( + train, "target", validation_set=None, max_depth=3, max_iterations=5 + ) model.evaluate(test) # Previous classifier -- this caused errors. self._test_coreml_export(model, test, False) - def test_random_forest_multiclass_simple_tiny(self): for code_string in ["nnnn", "bns", "sss"]: train, test = self.generate_data("multiclass", 8, code_string) - model = tc.random_forest_classifier.create(train, "target", validation_set = None, max_depth=3, max_iterations = 5) + model = tc.random_forest_classifier.create( + train, "target", validation_set=None, max_depth=3, max_iterations=5 + ) model.evaluate(test) # Previous classifier -- this caused errors. self._test_coreml_export(model, test, False) - @pytest.mark.xfail() def test_random_forest_multiclass_advanced_tiny(self): - for code_string in ["b"*40, "d", "v", "Ad", "bnsCvAd"]: + for code_string in ["b" * 40, "d", "v", "Ad", "bnsCvAd"]: train, test = self.generate_data("multiclass", 8, code_string) - model = tc.random_forest_classifier.create(train, "target", validation_set = None, max_depth=3, max_iterations = 5) + model = tc.random_forest_classifier.create( + train, "target", validation_set=None, max_depth=3, max_iterations=5 + ) model.evaluate(test) # Previous classifier -- this caused errors. self._test_coreml_export(model, test, False) def test_tree_export_issue_1831(self): SEED = 42 data = tc.SFrame.read_csv(mushroom_dataset) - data['target'] = data['label'] - train_data, test_data = data.random_split(0.8,seed=SEED) - model = tc.boosted_trees_classifier.create(train_data, target='target', - max_iterations=2, - max_depth = 3) + data["target"] = data["label"] + train_data, test_data = data.random_split(0.8, seed=SEED) + model = tc.boosted_trees_classifier.create( + train_data, target="target", max_iterations=2, max_depth=3 + ) self._test_coreml_export(model, test_data, False) diff --git a/src/python/turicreate/test/test_dataframe.py b/src/python/turicreate/test/test_dataframe.py index 6e47e6607d..fe0cee25ea 100644 --- a/src/python/turicreate/test/test_dataframe.py +++ b/src/python/turicreate/test/test_dataframe.py @@ -13,31 +13,34 @@ from pandas.util.testing import assert_frame_equal from sys import version_info + class DataFrameTest(unittest.TestCase): def test_empty(self): expected = pandas.DataFrame() assert_frame_equal(SFrame(expected).to_dataframe(), expected) - expected['int'] = [] - expected['float'] = [] - expected['str'] = [] + expected["int"] = [] + expected["float"] = [] + expected["str"] = [] assert_frame_equal(SFrame(expected).to_dataframe(), expected) def test_simple_dataframe(self): expected = pandas.DataFrame() - expected['int'] = [i for i in range(10)] - expected['float'] = [float(i) for i in range(10)] - expected['str'] = [str(i) for i in range(10)] + expected["int"] = [i for i in range(10)] + expected["float"] = [float(i) for i in range(10)] + expected["str"] = [str(i) for i in range(10)] if version_info.major == 2: - expected['unicode'] = [unicode(i) for i in range(10)] - expected['array'] = [array.array('d', [i]) for i in range(10)] - expected['ls'] = [[str(i)] for i in range(10)] + expected["unicode"] = [unicode(i) for i in range(10)] + expected["array"] = [array.array("d", [i]) for i in range(10)] + expected["ls"] = [[str(i)] for i in range(10)] assert_frame_equal(SFrame(expected).to_dataframe(), expected) def test_sparse_dataframe(self): expected = pandas.DataFrame() - expected['sparse_int'] = [i if i % 2 == 0 else None for i in range(10)] - expected['sparse_float'] = [float(i) if i % 2 == 1 else None for i in range(10)] - expected['sparse_str'] = [str(i) if i % 3 == 0 else None for i in range(10)] - expected['sparse_array'] = [array.array('d', [i]) if i % 5 == 0 else None for i in range(10)] - expected['sparse_list'] = [[str(i)] if i % 7 == 0 else None for i in range(10)] + expected["sparse_int"] = [i if i % 2 == 0 else None for i in range(10)] + expected["sparse_float"] = [float(i) if i % 2 == 1 else None for i in range(10)] + expected["sparse_str"] = [str(i) if i % 3 == 0 else None for i in range(10)] + expected["sparse_array"] = [ + array.array("d", [i]) if i % 5 == 0 else None for i in range(10) + ] + expected["sparse_list"] = [[str(i)] if i % 7 == 0 else None for i in range(10)] assert_frame_equal(SFrame(expected).to_dataframe(), expected) diff --git a/src/python/turicreate/test/test_dbscan.py b/src/python/turicreate/test/test_dbscan.py index da374363ce..f24d024570 100644 --- a/src/python/turicreate/test/test_dbscan.py +++ b/src/python/turicreate/test/test_dbscan.py @@ -16,11 +16,13 @@ from .test_knn_classifier import make_classifier_data import sys + if sys.version_info.major == 3: unittest.TestCase.assertItemsEqual = unittest.TestCase.assertCountEqual import os as _os + class CreateTest(unittest.TestCase): """ Test the create method for DBSCAN clustering. @@ -30,45 +32,53 @@ class CreateTest(unittest.TestCase): def setUpClass(self): ## Data generated by np.random.seed(31); tc.SFrame(np.random.rand(30, 2)) - self.sf = tc.SFrame({'X1': [ - [0.286053821661, 0.958105566519], - [0.770312932219, 0.986870003092], - [0.208165461905, 0.136917048844], - [0.90837380229, 0.0686385179771], - [0.0753327223397, 0.543534689487], - [0.0893997165181, 0.382393267526], - [0.668560439681, 0.429169022562], - [0.0439563074864, 0.194285988749], - [0.446659483973, 0.062573278102], - [0.297567282015, 0.943630899918], - [0.282811075761, 0.267693546553], - [0.407219004134, 0.825990402953], - [0.506700663192, 0.269475381046], - [0.340188287419, 0.97447185149], - [0.18430457912, 0.242272172626], - [0.6904593137, 0.383935276414], - [0.461442452896, 0.675224987045], - [0.0857306038525, 0.234016647286], - [0.522458878224, 0.0691166755345], - [0.0902366982884, 0.0839678579833], - [0.3228005527, 0.910903399861], - [0.831990012991, 0.75008026969], - [0.469253814747, 0.867324370425], - [0.279287904686, 0.0816360972888], - [0.14921147693, 0.494767544759], - [0.303711931037, 0.301766873086], - [0.007386922447, 0.579463366777], - [0.718318063984, 0.407263481941], - [0.162964200289, 0.210306678644], - [0.760123026079, 0.357788149323]]}) + self.sf = tc.SFrame( + { + "X1": [ + [0.286053821661, 0.958105566519], + [0.770312932219, 0.986870003092], + [0.208165461905, 0.136917048844], + [0.90837380229, 0.0686385179771], + [0.0753327223397, 0.543534689487], + [0.0893997165181, 0.382393267526], + [0.668560439681, 0.429169022562], + [0.0439563074864, 0.194285988749], + [0.446659483973, 0.062573278102], + [0.297567282015, 0.943630899918], + [0.282811075761, 0.267693546553], + [0.407219004134, 0.825990402953], + [0.506700663192, 0.269475381046], + [0.340188287419, 0.97447185149], + [0.18430457912, 0.242272172626], + [0.6904593137, 0.383935276414], + [0.461442452896, 0.675224987045], + [0.0857306038525, 0.234016647286], + [0.522458878224, 0.0691166755345], + [0.0902366982884, 0.0839678579833], + [0.3228005527, 0.910903399861], + [0.831990012991, 0.75008026969], + [0.469253814747, 0.867324370425], + [0.279287904686, 0.0816360972888], + [0.14921147693, 0.494767544759], + [0.303711931037, 0.301766873086], + [0.007386922447, 0.579463366777], + [0.718318063984, 0.407263481941], + [0.162964200289, 0.210306678644], + [0.760123026079, 0.357788149323], + ] + } + ) self.min_core_neighbors = 3 self.radius = 0.3 - self.distance = [[['X1'], "euclidean", 1]] - self.model = tc.dbscan.create(self.sf, distance=self.distance, - radius=self.radius, - min_core_neighbors=self.min_core_neighbors, - verbose=False) + self.distance = [[["X1"], "euclidean", 1]] + self.model = tc.dbscan.create( + self.sf, + distance=self.distance, + radius=self.radius, + min_core_neighbors=self.min_core_neighbors, + verbose=False, + ) def test_input_mutations(self): """ @@ -79,10 +89,13 @@ def test_input_mutations(self): local_radius = copy.deepcopy(self.radius) local_min_core_neighbors = copy.deepcopy(self.min_core_neighbors) - local_model = tc.dbscan.create(self.sf, distance=self.distance, - radius=self.radius, - min_core_neighbors=self.min_core_neighbors, - verbose=False) + local_model = tc.dbscan.create( + self.sf, + distance=self.distance, + radius=self.radius, + min_core_neighbors=self.min_core_neighbors, + verbose=False, + ) assert_sframe_equal(self.sf, local_sf) self.assertEqual(self.distance, local_dist) @@ -96,32 +109,45 @@ def test_bogus_inputs(self): ## Empty data with self.assertRaises(ToolkitError): - tc.dbscan.create(dataset=tc.SFrame(), radius=1., - min_core_neighbors=5, verbose=False) + tc.dbscan.create( + dataset=tc.SFrame(), radius=1.0, min_core_neighbors=5, verbose=False + ) ## Non-SFrame data with self.assertRaises(ToolkitError): - tc.dbscan.create(dataset=self.sf.to_dataframe(), radius=1., - min_core_neighbors=5, verbose=False) + tc.dbscan.create( + dataset=self.sf.to_dataframe(), + radius=1.0, + min_core_neighbors=5, + verbose=False, + ) ## Neighborhood parameters - for val in [-1, 'fossa', [1., 2., 3.]]: + for val in [-1, "fossa", [1.0, 2.0, 3.0]]: with self.assertRaises(ValueError): - tc.dbscan.create(self.sf, distance='euclidean', radius=val, - min_core_neighbors=self.min_core_neighbors, - verbose=False) + tc.dbscan.create( + self.sf, + distance="euclidean", + radius=val, + min_core_neighbors=self.min_core_neighbors, + verbose=False, + ) with self.assertRaises(ValueError): - tc.dbscan.create(self.sf, distance='euclidean', - radius=self.radius, min_core_neighbors=val, - verbose=False) + tc.dbscan.create( + self.sf, + distance="euclidean", + radius=self.radius, + min_core_neighbors=val, + verbose=False, + ) ## Bad distance names with self.assertRaises(TypeError): tc.dbscan.create(self.sf, distance=3) with self.assertRaises(ValueError): - tc.dbscan.create(self.sf, distance='fossa') + tc.dbscan.create(self.sf, distance="fossa") def test_create_features(self): """ @@ -132,39 +158,54 @@ def test_create_features(self): """ ## Features in list form, default argument - self.assertItemsEqual(self.model.features, ['X1']) - self.assertItemsEqual(self.model.unpacked_features, ['X1[0]', 'X1[1]']) + self.assertItemsEqual(self.model.features, ["X1"]) + self.assertItemsEqual(self.model.unpacked_features, ["X1[0]", "X1[1]"]) ## Separate features, default argument - sf = self.sf.unpack('X1') - m = tc.dbscan.create(sf, distance='euclidean', radius=self.radius, - min_core_neighbors=self.min_core_neighbors, - verbose=False) - self.assertItemsEqual(m.features, ['X1.0', 'X1.1']) + sf = self.sf.unpack("X1") + m = tc.dbscan.create( + sf, + distance="euclidean", + radius=self.radius, + min_core_neighbors=self.min_core_neighbors, + verbose=False, + ) + self.assertItemsEqual(m.features, ["X1.0", "X1.1"]) ## Separate features, specified explicitly - m = tc.dbscan.create(sf, features=['X1.0'], distance='euclidean', - radius=self.radius, - min_core_neighbors=self.min_core_neighbors, - verbose=False) - self.assertItemsEqual(m.features, ['X1.0']) + m = tc.dbscan.create( + sf, + features=["X1.0"], + distance="euclidean", + radius=self.radius, + min_core_neighbors=self.min_core_neighbors, + verbose=False, + ) + self.assertItemsEqual(m.features, ["X1.0"]) ## Features can be specified by the composite distance argument. - test_dist = [[['X1.0'], 'euclidean', 1], - [['X1.1'], 'manhattan', 1]] + test_dist = [[["X1.0"], "euclidean", 1], [["X1.1"], "manhattan", 1]] - m = tc.dbscan.create(sf, distance=test_dist, radius=self.radius, - min_core_neighbors=self.min_core_neighbors, - verbose=False) - self.assertItemsEqual(m.features, ['X1.0', 'X1.1']) + m = tc.dbscan.create( + sf, + distance=test_dist, + radius=self.radius, + min_core_neighbors=self.min_core_neighbors, + verbose=False, + ) + self.assertItemsEqual(m.features, ["X1.0", "X1.1"]) ## Features parameter should be overridden by the composite distance # argument. - m = tc.dbscan.create(sf, features=['X1.0'], distance=test_dist, - radius=self.radius, - min_core_neighbors=self.min_core_neighbors, - verbose=False) - self.assertItemsEqual(m.features, ['X1.0', 'X1.1']) + m = tc.dbscan.create( + sf, + features=["X1.0"], + distance=test_dist, + radius=self.radius, + min_core_neighbors=self.min_core_neighbors, + verbose=False, + ) + self.assertItemsEqual(m.features, ["X1.0", "X1.1"]) def test_distances(self): """ @@ -173,87 +214,131 @@ def test_distances(self): DBSCAN *should* rely entirely on the nearest neighbors toolkit for this. """ sf = make_classifier_data(n=10, d=2, seed=37) - sf.remove_column('class', inplace=True) + sf.remove_column("class", inplace=True) - numeric_features = ['int0', 'int1', 'float0', 'float1'] - array_features = ['array0'] - string_features = ['str0'] - dict_features = ['dict0'] + numeric_features = ["int0", "int1", "float0", "float1"] + array_features = ["array0"] + string_features = ["str0"] + dict_features = ["dict0"] ## Numeric standard distances should work for numeric columns - for d in ['euclidean', 'squared_euclidean', 'manhattan', 'cosine', - 'transformed_dot_product']: + for d in [ + "euclidean", + "squared_euclidean", + "manhattan", + "cosine", + "transformed_dot_product", + ]: try: - m = tc.dbscan.create(sf, features=numeric_features, distance=d, - radius=1, min_core_neighbors=3, - verbose=False) + m = tc.dbscan.create( + sf, + features=numeric_features, + distance=d, + radius=1, + min_core_neighbors=3, + verbose=False, + ) except: assert False, "Standard distance {} failed.".format(d) - ## Numeric standard distances should work for array columns - for d in ['euclidean', 'squared_euclidean', 'manhattan', 'cosine', - 'transformed_dot_product']: + for d in [ + "euclidean", + "squared_euclidean", + "manhattan", + "cosine", + "transformed_dot_product", + ]: try: - m = tc.dbscan.create(sf, features=array_features, distance=d, - radius=1, min_core_neighbors=3, - verbose=False) + m = tc.dbscan.create( + sf, + features=array_features, + distance=d, + radius=1, + min_core_neighbors=3, + verbose=False, + ) except: assert False, "Standard distance {} failed.".format(d) - ## String standard distances should work. - for d in ['levenshtein']: + for d in ["levenshtein"]: try: - m = tc.dbscan.create(sf, features=string_features, distance=d, - radius=1, min_core_neighbors=3, - verbose=False) + m = tc.dbscan.create( + sf, + features=string_features, + distance=d, + radius=1, + min_core_neighbors=3, + verbose=False, + ) except: assert False, "Standard distance {} failed.".format(d) - ## Dictionary standard distances should work. - for d in ['jaccard', 'weighted_jaccard', 'cosine', 'transformed_dot_product']: + for d in ["jaccard", "weighted_jaccard", "cosine", "transformed_dot_product"]: try: - m = tc.dbscan.create(sf, features=dict_features, distance=d, - radius=1, min_core_neighbors=3, - verbose=False) + m = tc.dbscan.create( + sf, + features=dict_features, + distance=d, + radius=1, + min_core_neighbors=3, + verbose=False, + ) except: assert False, "Standard distance {} failed.".format(d) - # Nonsensical combinations of feature types and distances should fail. with self.assertRaises(ValueError): - m = tc.dbscan.create(sf, features=numeric_features, - distance='levenshtein', radius=1, - min_core_neighbors=3, verbose=False) + m = tc.dbscan.create( + sf, + features=numeric_features, + distance="levenshtein", + radius=1, + min_core_neighbors=3, + verbose=False, + ) with self.assertRaises(ToolkitError): - m = tc.dbscan.create(sf, features=dict_features, - distance='levenshtein', radius=1, - min_core_neighbors=3, verbose=False) + m = tc.dbscan.create( + sf, + features=dict_features, + distance="levenshtein", + radius=1, + min_core_neighbors=3, + verbose=False, + ) with self.assertRaises(ToolkitError): - m = tc.dbscan.create(sf, features=string_features, - distance='euclidean', radius=1, - min_core_neighbors=3, verbose=False) - + m = tc.dbscan.create( + sf, + features=string_features, + distance="euclidean", + radius=1, + min_core_neighbors=3, + verbose=False, + ) # If no distance is specified, the automatic distance construction # should kick in and be correct. - correct_dist = [[['str0'], 'levenshtein', 1], - [['str1'], 'levenshtein', 1], - [['dict0'], 'jaccard', 1], - [['int0', 'int1', 'float0', 'float1'], 'euclidean', 1], - [['array0'], 'euclidean', 1]] - - m = tc.dbscan.create(sf, radius=1, distance=None, min_core_neighbors=3, - verbose=False) + correct_dist = [ + [["str0"], "levenshtein", 1], + [["str1"], "levenshtein", 1], + [["dict0"], "jaccard", 1], + [["int0", "int1", "float0", "float1"], "euclidean", 1], + [["array0"], "euclidean", 1], + ] + + m = tc.dbscan.create( + sf, radius=1, distance=None, min_core_neighbors=3, verbose=False + ) self.assertItemsEqual(m.distance, correct_dist) - m = tc.dbscan.create(sf, radius=1, distance='auto', min_core_neighbors=3, - verbose=False) + m = tc.dbscan.create( + sf, radius=1, distance="auto", min_core_neighbors=3, verbose=False + ) self.assertItemsEqual(m.distance, correct_dist) @@ -267,29 +352,34 @@ def setUp(self): sf = tc.SFrame(np.random.rand(30, 2)) self.min_core_neighbors = 3 self.radius = 0.3 - self.distance = [[['X1'], "euclidean", 1]] - self.model = tc.dbscan.create(sf, distance=self.distance, - radius=self.radius, - min_core_neighbors=self.min_core_neighbors, - verbose=False) + self.distance = [[["X1"], "euclidean", 1]] + self.model = tc.dbscan.create( + sf, + distance=self.distance, + radius=self.radius, + min_core_neighbors=self.min_core_neighbors, + verbose=False, + ) def test__list_fields(self): """ Check the model list fields method. """ - correct_fields = ['distance', - 'verbose', - 'min_core_neighbors', - 'num_features', - 'unpacked_features', - 'num_distance_components', - 'training_time', - 'radius', - 'num_unpacked_features', - 'num_examples', - 'cluster_id', - 'num_clusters', - 'features'] + correct_fields = [ + "distance", + "verbose", + "min_core_neighbors", + "num_features", + "unpacked_features", + "num_distance_components", + "training_time", + "radius", + "num_unpacked_features", + "num_examples", + "cluster_id", + "num_clusters", + "features", + ] self.assertItemsEqual(self.model._list_fields(), correct_fields) @@ -297,24 +387,29 @@ def test_get(self): """ Check the various 'get' methods against known answers for each field. """ - simple_fields = {'verbose': False, - 'min_core_neighbors': self.min_core_neighbors, - 'num_features': 1, - 'num_unpacked_features': 2, - 'num_distance_components': 1, - 'radius': self.radius, - 'num_examples': 30} + simple_fields = { + "verbose": False, + "min_core_neighbors": self.min_core_neighbors, + "num_features": 1, + "num_unpacked_features": 2, + "num_distance_components": 1, + "radius": self.radius, + "num_examples": 30, + } for field, ans in simple_fields.items(): self.assertEqual(self.model._get(field), ans, "{} failed".format(field)) - - _list_fields = {'distance': self.distance, - 'unpacked_features': ['X1[0]', 'X1[1]'], - 'features': ['X1']} + _list_fields = { + "distance": self.distance, + "unpacked_features": ["X1[0]", "X1[1]"], + "features": ["X1"], + } for field, ans in _list_fields.items(): - self.assertItemsEqual(self.model._get(field), ans, "{} failed".format(field)) + self.assertItemsEqual( + self.model._get(field), ans, "{} failed".format(field) + ) self.assertGreaterEqual(self.model.training_time, 0) self.assertGreaterEqual(self.model.num_clusters, 0) self.assertEqual(self.model.cluster_id.num_rows(), 30) @@ -378,32 +473,49 @@ def test_extreme_neighborhoods(self): """ ## Radius = 0 ==> all points are noise - m = tc.dbscan.create(self.sf, distance='euclidean', radius=0., - min_core_neighbors=3, verbose=False) + m = tc.dbscan.create( + self.sf, + distance="euclidean", + radius=0.0, + min_core_neighbors=3, + verbose=False, + ) self.assertEqual(m.num_clusters, 0) - self.assertEqual(sum(m.cluster_id['type'] == 'noise'), self.n) - + self.assertEqual(sum(m.cluster_id["type"] == "noise"), self.n) ## Min_neighbors > 30 ==> all points are noise - m = tc.dbscan.create(self.sf, distance='euclidean', radius=0., - min_core_neighbors=31, verbose=False) + m = tc.dbscan.create( + self.sf, + distance="euclidean", + radius=0.0, + min_core_neighbors=31, + verbose=False, + ) self.assertEqual(m.num_clusters, 0) - self.assertEqual(sum(m.cluster_id['type'] == 'noise'), self.n) - + self.assertEqual(sum(m.cluster_id["type"] == "noise"), self.n) ## Radius very large ==> all points are core points - m = tc.dbscan.create(self.sf, distance='euclidean', radius=100., - min_core_neighbors=3, verbose=False) + m = tc.dbscan.create( + self.sf, + distance="euclidean", + radius=100.0, + min_core_neighbors=3, + verbose=False, + ) self.assertEqual(m.num_clusters, 1) - self.assertEqual(sum(m.cluster_id['type'] == 'core'), self.n) - + self.assertEqual(sum(m.cluster_id["type"] == "core"), self.n) ## Min_neighbors = 0 ==> all points are core points - m = tc.dbscan.create(self.sf, distance='euclidean', radius=0.5, - min_core_neighbors=0, verbose=False) + m = tc.dbscan.create( + self.sf, + distance="euclidean", + radius=0.5, + min_core_neighbors=0, + verbose=False, + ) self.assertEqual(m.num_clusters, 1) - self.assertEqual(sum(m.cluster_id['type'] == 'core'), self.n) + self.assertEqual(sum(m.cluster_id["type"] == "core"), self.n) diff --git a/src/python/turicreate/test/test_decision_tree.py b/src/python/turicreate/test/test_decision_tree.py index 711c8bddfa..7b3af74500 100644 --- a/src/python/turicreate/test/test_decision_tree.py +++ b/src/python/turicreate/test/test_decision_tree.py @@ -21,40 +21,36 @@ import os as _os dirname = _os.path.dirname(__file__) -mushroom_dataset = _os.path.join(dirname, 'mushroom.csv') +mushroom_dataset = _os.path.join(dirname, "mushroom.csv") RMSE_CUTOFF = 15 _DEFAULT_OPTIONS_REGRESSION = { -'max_depth': 6, -'min_child_weight': 0.1, -'min_loss_reduction': 0.0, -'random_seed': None, -'metric': 'auto' + "max_depth": 6, + "min_child_weight": 0.1, + "min_loss_reduction": 0.0, + "random_seed": None, + "metric": "auto", } _DEFAULT_OPTIONS_CLASSIFIER = copy.deepcopy(_DEFAULT_OPTIONS_REGRESSION) -_DEFAULT_OPTIONS_CLASSIFIER['class_weights'] = None +_DEFAULT_OPTIONS_CLASSIFIER["class_weights"] = None class DecisionTreeRegressionTest(unittest.TestCase): - @classmethod def setUpClass(self): self.data = tc.SFrame.read_csv(mushroom_dataset) - self.data['label'] = (self.data['label'] == 'p') + 20 + self.data["label"] = (self.data["label"] == "p") + 20 self.dtrain, self.dtest = self.data.random_split(0.8, seed=1) - self.param = {'max_depth': 5, - 'min_loss_reduction': 1, - 'min_child_weight': 1} - self.target = 'label' + self.param = {"max_depth": 5, "min_loss_reduction": 1, "min_child_weight": 1} + self.target = "label" self.unpacked_features = self.data.column_names() self.unpacked_features.remove(self.target) self.features = self.unpacked_features[:] - self.model = tc.decision_tree_regression.create(self.dtrain, - target=self.target, - validation_set=self.dtest, - **self.param) + self.model = tc.decision_tree_regression.create( + self.dtrain, target=self.target, validation_set=self.dtest, **self.param + ) self.def_opts = copy.deepcopy(_DEFAULT_OPTIONS_REGRESSION) self.opts = self.def_opts.copy() @@ -63,52 +59,57 @@ def setUpClass(self): # Answers # ------------------------------------------------------------------------ self.get_ans = { - 'unpacked_features': lambda x: x == self.unpacked_features, - 'features': lambda x: x == self.features, - 'max_depth': lambda x: x == self.opts['max_depth'], - 'min_child_weight': lambda x: x == self.opts['min_child_weight'], - 'min_loss_reduction': lambda x: x == self.opts['min_loss_reduction'], - 'num_examples': lambda x: x == self.dtrain.num_rows(), - 'num_unpacked_features': lambda x: x == 22, - 'num_features': lambda x: x == 22, - 'num_validation_examples': lambda x: x == self.dtest.num_rows(), - 'target': lambda x: x == self.target, - 'num_trees': lambda x: x == 1, - 'training_rmse': lambda x: x > 0, - 'training_max_error': lambda x: x > 0, - 'training_time': lambda x: x >= 0, - 'trees_json': lambda x: isinstance(x, list), - 'validation_data': lambda x: isinstance(x, tc.SFrame) and len(x) == len(self.dtest), - 'validation_rmse': lambda x: x > 0, - 'validation_max_error': lambda x: x > 0, - 'random_seed': lambda x: x is None, - 'progress': lambda x: isinstance(x, tc.SFrame) or (x is None), - 'metric': lambda x: x == 'auto', - 'disable_posttrain_evaluation' : lambda x: x == False, - } + "unpacked_features": lambda x: x == self.unpacked_features, + "features": lambda x: x == self.features, + "max_depth": lambda x: x == self.opts["max_depth"], + "min_child_weight": lambda x: x == self.opts["min_child_weight"], + "min_loss_reduction": lambda x: x == self.opts["min_loss_reduction"], + "num_examples": lambda x: x == self.dtrain.num_rows(), + "num_unpacked_features": lambda x: x == 22, + "num_features": lambda x: x == 22, + "num_validation_examples": lambda x: x == self.dtest.num_rows(), + "target": lambda x: x == self.target, + "num_trees": lambda x: x == 1, + "training_rmse": lambda x: x > 0, + "training_max_error": lambda x: x > 0, + "training_time": lambda x: x >= 0, + "trees_json": lambda x: isinstance(x, list), + "validation_data": lambda x: isinstance(x, tc.SFrame) + and len(x) == len(self.dtest), + "validation_rmse": lambda x: x > 0, + "validation_max_error": lambda x: x > 0, + "random_seed": lambda x: x is None, + "progress": lambda x: isinstance(x, tc.SFrame) or (x is None), + "metric": lambda x: x == "auto", + "disable_posttrain_evaluation": lambda x: x == False, + } self.metrics = ["rmse", "max_error"] self.fields_ans = self.get_ans.keys() def test_create(self): - model = tc.decision_tree_regression.create(self.dtrain, target='label', - validation_set=self.dtest, - **self.param) + model = tc.decision_tree_regression.create( + self.dtrain, target="label", validation_set=self.dtest, **self.param + ) - rmse = model.evaluate(self.dtest, 'rmse')['rmse'] + rmse = model.evaluate(self.dtest, "rmse")["rmse"] self.assertTrue(model is not None) self.assertLess(rmse, RMSE_CUTOFF) dtrain = self.dtrain - dtrain['label'] = 10 - self.assertRaises(ToolkitError, - lambda:tc.decision_tree_regression.create(self.dtrain, - target='label_wrong', **self.param)) + dtrain["label"] = 10 + self.assertRaises( + ToolkitError, + lambda: tc.decision_tree_regression.create( + self.dtrain, target="label_wrong", **self.param + ), + ) + def test__list_fields(self): """ Check the _list_fields function. Compare with the answer. """ model = self.model - fields = model._list_fields() + fields = model._list_fields() self.assertEqual(set(fields), set(self.fields_ans)) def test_get(self): @@ -119,8 +120,10 @@ def test_get(self): model = self.model for field in self.fields_ans: ans = model._get(field) - self.assertTrue(self.get_ans[field](ans), \ - '''Get failed in field {}. Output was {}.'''.format(field, ans)) + self.assertTrue( + self.get_ans[field](ans), + """Get failed in field {}. Output was {}.""".format(field, ans), + ) def test_summary(self): """ @@ -135,14 +138,14 @@ def test_repr(self): Check the repr function. """ model = self.model - ans = str(model) + ans = str(model) self.assertTrue(type(ans) == str) def test_save_and_load(self): """ Make sure saving and loading retains things. """ - filename = 'save_file%s' % (str(uuid.uuid4())) + filename = "save_file%s" % (str(uuid.uuid4())) self.model.save(filename) self.model = tc.load_model(filename) @@ -167,7 +170,6 @@ def test_save_and_load(self): except: self.assertTrue(False, "Failed during save & load diagnostics") - def test_predict(self): # Make predictions from SFrame. @@ -187,27 +189,29 @@ def test_evaluate(self): t = self.dtrain[self.target] p = model.predict(self.dtrain) self.sm_metrics = { - "max_error" : tc.toolkits.evaluation.max_error(t, p), - "rmse" : tc.toolkits.evaluation.rmse(t, p) + "max_error": tc.toolkits.evaluation.max_error(t, p), + "rmse": tc.toolkits.evaluation.rmse(t, p), } + def check_metric(ans, metric): self.assertTrue(ans is not None) self.assertTrue(metric in ans) - self.assertAlmostEqual(ans[metric], - self.sm_metrics[metric], - places = 4, - msg = "%s = (%s,%s)" % \ - (metric, ans[metric], self.sm_metrics[metric])) + self.assertAlmostEqual( + ans[metric], + self.sm_metrics[metric], + places=4, + msg="%s = (%s,%s)" % (metric, ans[metric], self.sm_metrics[metric]), + ) # Default ans = model.evaluate(self.dtrain) self.assertEqual(sorted(ans.keys()), sorted(self.metrics)) for m in self.metrics: - check_metric(ans, m) + check_metric(ans, m) # Individual for m in self.metrics: - ans = model.evaluate(self.dtrain, metric = m) + ans = model.evaluate(self.dtrain, metric=m) check_metric(ans, m) def test_extract_features(self): @@ -228,20 +232,28 @@ def test_list_and_dict_type(self): # make a more complicated dataset containing list and dictionary type columns complex_data = copy.copy(simple_data) - complex_data['random_list_noise'] = \ - tc.SArray([[random.gauss(0, 1) for j in range(3)] for i in range(complex_data.num_rows())]) - complex_data['random_dict_noise'] = \ - tc.SArray([{'x0': random.gauss(0, 1)} for i in range(complex_data.num_rows())]) + complex_data["random_list_noise"] = tc.SArray( + [ + [random.gauss(0, 1) for j in range(3)] + for i in range(complex_data.num_rows()) + ] + ) + complex_data["random_dict_noise"] = tc.SArray( + [{"x0": random.gauss(0, 1)} for i in range(complex_data.num_rows())] + ) complex_train, complex_test = complex_data.random_split(0.8, seed=1) - for (train, test) in [(simple_train, simple_test), (complex_train, complex_test)]: + for (train, test) in [ + (simple_train, simple_test), + (complex_train, complex_test), + ]: self._test_regression_model(train, test, rmse_threshold) - def _test_regression_model(self, train, test, rmse_threshold, target='label'): + def _test_regression_model(self, train, test, rmse_threshold, target="label"): # create - model = tc.decision_tree_regression.create(train, target=target, - validation_set=test, - **self.param) + model = tc.decision_tree_regression.create( + train, target=target, validation_set=test, **self.param + ) # predict pred = model.predict(test) pred_lst = model.predict(list(test)) @@ -249,7 +261,7 @@ def _test_regression_model(self, train, test, rmse_threshold, target='label'): self.assertLess(rmse, rmse_threshold) # evaluate - rmse_eval = model.evaluate(test, metric='rmse')['rmse'] + rmse_eval = model.evaluate(test, metric="rmse")["rmse"] self.assertTrue(rmse_eval < rmse_threshold) self.assertAlmostEqual(rmse_eval, rmse, delta=1e-2) @@ -258,7 +270,9 @@ def test_predict_new_category(self): # Arrange new_test = copy.copy(self.dtest) # change 'r' cap-color into a new color 'z' - new_test['cap-color'] = new_test['cap-color'].apply(lambda x: 'z'if x == 'r' else x) + new_test["cap-color"] = new_test["cap-color"].apply( + lambda x: "z" if x == "r" else x + ) # Act y1 = self.model.predict(new_test) @@ -270,16 +284,19 @@ def test_predict_new_category(self): def test_predict_new_dictionary_key(self): # Arrange new_data = copy.copy(self.data) - new_data['dict_color_feature'] = \ - new_data['cap-color'].apply(lambda x: {'cap-color': ord(x)}) + new_data["dict_color_feature"] = new_data["cap-color"].apply( + lambda x: {"cap-color": ord(x)} + ) train, test = new_data.random_split(0.8, seed=1) # add a new key to dictionary in predict time - test['dict_color_feature'] = test['dict_color_feature'].apply( - lambda x: dict(list(x.items()) + list({'cap-color2': x['cap-color']+1}.items()))) + test["dict_color_feature"] = test["dict_color_feature"].apply( + lambda x: dict( + list(x.items()) + list({"cap-color2": x["cap-color"] + 1}.items()) + ) + ) - model = tc.decision_tree_regression.create(train, target='label', - **self.param) + model = tc.decision_tree_regression.create(train, target="label", **self.param) # Act. y1 = model.predict(test) y2 = model.predict(list(test)) @@ -287,38 +304,41 @@ def test_predict_new_dictionary_key(self): # Assert self.assertEqual(list(y1), list(y2)) + ## --------------------------------------------------------------------------- ## ## Decision Trees Suites ## ## --------------------------------------------------------------------------- + def test_suite_decision_tree_classifier(): """ Create a test suite for each test case in the DecisionTreeClassifierTest. """ testCases = [ - binary_classification_integer_target, - binary_classification_string_target, - multiclass_classification_integer_target, - multiclass_classification_string_target, - ] + binary_classification_integer_target, + binary_classification_string_target, + multiclass_classification_integer_target, + multiclass_classification_string_target, + ] for t in testCases: testcase_members = {} testcase_members[t.__name__] = classmethod(t) testcase_class = type( - 'DecisionTreeClassifierTest_%s' % t.__name__, + "DecisionTreeClassifierTest_%s" % t.__name__, (DecisionTreeClassifierTest,), - testcase_members + testcase_members, ) testcase_class.__test__ = True getattr(testcase_class, t.__name__)() for method in dir(testcase_class): - if method.startswith('test_'): + if method.startswith("test_"): testcase_instance = testcase_class(method) getattr(testcase_instance, method)() + def binary_classification_integer_target(cls): """ Binary classification with an integer target. @@ -326,19 +346,16 @@ def binary_classification_integer_target(cls): # Get the data from the mushroom dataset. cls.data = tc.SFrame.read_csv(mushroom_dataset) cls.dtrain, cls.dtest = cls.data.random_split(0.8, seed=1) - cls.dtrain['label'] = cls.dtrain['label'] == 'p' - cls.dtest['label'] = cls.dtest['label'] == 'p' - cls.param = {'max_depth': 3, - 'min_loss_reduction': 1, - 'min_child_weight': 1} - cls.target = 'label' + cls.dtrain["label"] = cls.dtrain["label"] == "p" + cls.dtest["label"] = cls.dtest["label"] == "p" + cls.param = {"max_depth": 3, "min_loss_reduction": 1, "min_child_weight": 1} + cls.target = "label" cls.unpacked_features = cls.data.column_names() cls.unpacked_features.remove(cls.target) cls.features = cls.unpacked_features[:] - cls.model = tc.decision_tree_classifier.create(cls.dtrain, - target=cls.target, - validation_set = cls.dtest, - **cls.param) + cls.model = tc.decision_tree_classifier.create( + cls.dtrain, target=cls.target, validation_set=cls.dtest, **cls.param + ) cls.def_opts = copy.deepcopy(_DEFAULT_OPTIONS_CLASSIFIER) cls.opts = cls.def_opts.copy() @@ -347,138 +364,149 @@ def binary_classification_integer_target(cls): # Answers # ------------------------------------------------------------------------ - if 'classes' in cls.model._list_fields(): - num_examples_per_class = { - c:(cls.dtrain[cls.target] == c).sum() for c in cls.model.classes} + if "classes" in cls.model._list_fields(): + num_examples_per_class = { + c: (cls.dtrain[cls.target] == c).sum() for c in cls.model.classes + } cls.get_ans = { - 'unpacked_features': lambda x: x == cls.unpacked_features, - 'features': lambda x: x == cls.features, - 'max_depth': lambda x: x == cls.opts['max_depth'], - 'min_child_weight': lambda x: x == cls.opts['min_child_weight'], - 'min_loss_reduction': lambda x: x == cls.opts['min_loss_reduction'], - 'num_examples': lambda x: x == cls.dtrain.num_rows(), - 'num_examples_per_class': lambda x: x == num_examples_per_class, - 'num_classes': lambda x: x == 2, - 'classes': lambda x: x == [0,1], - 'num_unpacked_features': lambda x: x == 22, - 'num_features': lambda x: x == 22, - 'num_trees': lambda x: x == 1, - 'num_validation_examples': lambda x: x == cls.dtest.num_rows(), - 'target': lambda x: x == cls.target, - 'training_accuracy': lambda x: x > 0, - 'training_log_loss': lambda x: x > 0, - 'training_time': lambda x: x >= 0, - 'class_weights': lambda x: x == {0:1.0, 1:1.0}, - 'trees_json': lambda x: isinstance(x, list), - 'validation_accuracy': lambda x: x > 0, - 'validation_log_loss': lambda x: x > 0, - 'random_seed': lambda x: x is None, - 'progress': lambda x: isinstance(x, tc.SFrame) or (x is None), - 'metric': lambda x: x == 'auto', - 'training_auc': lambda x: x > 0, - 'training_confusion_matrix': lambda x: len(x) > 0, - 'training_f1_score': lambda x: x > 0, - 'training_precision': lambda x: x > 0, - 'training_recall': lambda x: x > 0, - 'training_report_by_class': lambda x: len(x) > 0, - 'training_roc_curve': lambda x: len(x) > 0, - 'validation_data': lambda x: isinstance(x, tc.SFrame) and len(x) == len(cls.dtest), - 'validation_auc': lambda x: x > 0, - 'validation_confusion_matrix': lambda x: len(x) > 0, - 'validation_f1_score': lambda x: x > 0, - 'validation_precision': lambda x: x > 0, - 'validation_recall': lambda x: x > 0, - 'validation_report_by_class': lambda x: len(x) > 0, - 'validation_roc_curve': lambda x: len(x) > 0, - 'disable_posttrain_evaluation' : lambda x: x == False, - } + "unpacked_features": lambda x: x == cls.unpacked_features, + "features": lambda x: x == cls.features, + "max_depth": lambda x: x == cls.opts["max_depth"], + "min_child_weight": lambda x: x == cls.opts["min_child_weight"], + "min_loss_reduction": lambda x: x == cls.opts["min_loss_reduction"], + "num_examples": lambda x: x == cls.dtrain.num_rows(), + "num_examples_per_class": lambda x: x == num_examples_per_class, + "num_classes": lambda x: x == 2, + "classes": lambda x: x == [0, 1], + "num_unpacked_features": lambda x: x == 22, + "num_features": lambda x: x == 22, + "num_trees": lambda x: x == 1, + "num_validation_examples": lambda x: x == cls.dtest.num_rows(), + "target": lambda x: x == cls.target, + "training_accuracy": lambda x: x > 0, + "training_log_loss": lambda x: x > 0, + "training_time": lambda x: x >= 0, + "class_weights": lambda x: x == {0: 1.0, 1: 1.0}, + "trees_json": lambda x: isinstance(x, list), + "validation_accuracy": lambda x: x > 0, + "validation_log_loss": lambda x: x > 0, + "random_seed": lambda x: x is None, + "progress": lambda x: isinstance(x, tc.SFrame) or (x is None), + "metric": lambda x: x == "auto", + "training_auc": lambda x: x > 0, + "training_confusion_matrix": lambda x: len(x) > 0, + "training_f1_score": lambda x: x > 0, + "training_precision": lambda x: x > 0, + "training_recall": lambda x: x > 0, + "training_report_by_class": lambda x: len(x) > 0, + "training_roc_curve": lambda x: len(x) > 0, + "validation_data": lambda x: isinstance(x, tc.SFrame) + and len(x) == len(cls.dtest), + "validation_auc": lambda x: x > 0, + "validation_confusion_matrix": lambda x: len(x) > 0, + "validation_f1_score": lambda x: x > 0, + "validation_precision": lambda x: x > 0, + "validation_recall": lambda x: x > 0, + "validation_report_by_class": lambda x: len(x) > 0, + "validation_roc_curve": lambda x: len(x) > 0, + "disable_posttrain_evaluation": lambda x: x == False, + } cls.fields_ans = cls.get_ans.keys() + def binary_classification_string_target(cls): binary_classification_integer_target(cls) cls.type = str - cls.dtrain['label'] = cls.dtrain['label'].astype(str) - cls.dtest['label'] = cls.dtest['label'].astype(str) - cls.dtrain['label'] = cls.dtrain['label'] + '-cat' - cls.dtest['label'] = cls.dtest['label'] + '-cat' - cls.model = tc.decision_tree_classifier.create(cls.dtrain, - target=cls.target, - validation_set = cls.dtest, - **cls.param) - - num_examples_per_class = {c:(cls.dtrain[cls.target] == c).sum() for c in cls.model.classes} - cls.get_ans['num_examples_per_class'] = lambda x: x == num_examples_per_class - cls.get_ans['class_weights'] = lambda x: x == {'0-cat':1.0, '1-cat':1.0} - cls.get_ans['classes'] = lambda x: x == ['0-cat', '1-cat'] + cls.dtrain["label"] = cls.dtrain["label"].astype(str) + cls.dtest["label"] = cls.dtest["label"].astype(str) + cls.dtrain["label"] = cls.dtrain["label"] + "-cat" + cls.dtest["label"] = cls.dtest["label"] + "-cat" + cls.model = tc.decision_tree_classifier.create( + cls.dtrain, target=cls.target, validation_set=cls.dtest, **cls.param + ) + + num_examples_per_class = { + c: (cls.dtrain[cls.target] == c).sum() for c in cls.model.classes + } + cls.get_ans["num_examples_per_class"] = lambda x: x == num_examples_per_class + cls.get_ans["class_weights"] = lambda x: x == {"0-cat": 1.0, "1-cat": 1.0} + cls.get_ans["classes"] = lambda x: x == ["0-cat", "1-cat"] def multiclass_classification_integer_target(cls): binary_classification_integer_target(cls) + def create_multiclass_label(row): - if row['label'] == 0: + if row["label"] == 0: return 0 - elif row['cap-surface'] == 'y': + elif row["cap-surface"] == "y": return 1 else: return 2 - cls.dtrain['label'] = cls.dtrain.apply(create_multiclass_label) - cls.dtest['label'] = cls.dtest.apply(create_multiclass_label) - cls.model = tc.decision_tree_classifier.create(cls.dtrain, - target=cls.target, - validation_set = cls.dtest, - **cls.param) - - num_examples_per_class = {c:(cls.dtrain[cls.target] == c).sum() for c in - cls.model.classes} - cls.get_ans['num_examples_per_class'] = lambda x: x == num_examples_per_class - cls.get_ans['num_trees'] = lambda x: x == 3 - cls.get_ans['num_classes'] = lambda x: x == 3 - cls.get_ans['classes'] = lambda x: set(x) == set([0,1,2]) - cls.get_ans['class_weights'] = lambda x: x == {0:1.0, 1:1.0, 2:1.0} + + cls.dtrain["label"] = cls.dtrain.apply(create_multiclass_label) + cls.dtest["label"] = cls.dtest.apply(create_multiclass_label) + cls.model = tc.decision_tree_classifier.create( + cls.dtrain, target=cls.target, validation_set=cls.dtest, **cls.param + ) + + num_examples_per_class = { + c: (cls.dtrain[cls.target] == c).sum() for c in cls.model.classes + } + cls.get_ans["num_examples_per_class"] = lambda x: x == num_examples_per_class + cls.get_ans["num_trees"] = lambda x: x == 3 + cls.get_ans["num_classes"] = lambda x: x == 3 + cls.get_ans["classes"] = lambda x: set(x) == set([0, 1, 2]) + cls.get_ans["class_weights"] = lambda x: x == {0: 1.0, 1: 1.0, 2: 1.0} + def multiclass_classification_string_target(cls): multiclass_classification_integer_target(cls) cls.type = str - cls.dtrain['label'] = cls.dtrain['label'].astype(str) - cls.dtest['label'] = cls.dtest['label'].astype(str) - cls.model = tc.decision_tree_classifier.create(cls.dtrain, - target=cls.target, - validation_set = cls.dtest, - **cls.param) - - num_examples_per_class = {c:(cls.dtrain[cls.target] == c).sum() for c in - cls.model.classes} - cls.get_ans['num_examples_per_class'] = lambda x: x == num_examples_per_class - cls.get_ans['classes'] = lambda x: set(x) == set(map(str, [0,1,2])) - cls.get_ans['num_trees'] = lambda x: x == 3 - cls.get_ans['class_weights'] = lambda x: x == {'0':1.0, '1':1.0, '2':1.0} + cls.dtrain["label"] = cls.dtrain["label"].astype(str) + cls.dtest["label"] = cls.dtest["label"].astype(str) + cls.model = tc.decision_tree_classifier.create( + cls.dtrain, target=cls.target, validation_set=cls.dtest, **cls.param + ) + + num_examples_per_class = { + c: (cls.dtrain[cls.target] == c).sum() for c in cls.model.classes + } + cls.get_ans["num_examples_per_class"] = lambda x: x == num_examples_per_class + cls.get_ans["classes"] = lambda x: set(x) == set(map(str, [0, 1, 2])) + cls.get_ans["num_trees"] = lambda x: x == 3 + cls.get_ans["class_weights"] = lambda x: x == {"0": 1.0, "1": 1.0, "2": 1.0} + class DecisionTreeClassifierTest(unittest.TestCase): __test__ = False def test_create(self): - model = tc.decision_tree_classifier.create(self.dtrain, target='label', - validation_set=self.dtest, - **self.param) + model = tc.decision_tree_classifier.create( + self.dtrain, target="label", validation_set=self.dtest, **self.param + ) self.assertTrue(model is not None) - self.assertGreater(model.evaluate(self.dtest, 'accuracy')['accuracy'], 0.9) + self.assertGreater(model.evaluate(self.dtest, "accuracy")["accuracy"], 0.9) dtrain = self.dtrain[:] - dtrain['label'] = 10 - self.assertRaises(ToolkitError, - lambda: tc.decision_tree_classifier.create(self.dtrain, - target='label_wrong', **self.param)) + dtrain["label"] = 10 + self.assertRaises( + ToolkitError, + lambda: tc.decision_tree_classifier.create( + self.dtrain, target="label_wrong", **self.param + ), + ) def test__list_fields(self): """ Check the _list_fields function. Compare with the answer. """ model = self.model - fields = model._list_fields() + fields = model._list_fields() self.assertEqual(set(fields), set(self.fields_ans)) def test_get(self): @@ -489,8 +517,10 @@ def test_get(self): model = self.model for field in self.fields_ans: ans = model._get(field) - self.assertTrue(self.get_ans[field](ans), \ - '''Get failed in field {}. Output was {}.'''.format(field, ans)) + self.assertTrue( + self.get_ans[field](ans), + """Get failed in field {}. Output was {}.""".format(field, ans), + ) def test_summary(self): """ @@ -505,14 +535,14 @@ def test_repr(self): Check the repr function. """ model = self.model - ans = str(model) + ans = str(model) self.assertTrue(type(ans) == str, "Repr failed") def test_save_and_load(self): """ Make sure saving and loading retains things. """ - filename = 'save_file%s' % (str(uuid.uuid4())) + filename = "save_file%s" % (str(uuid.uuid4())) self.model.save(filename) self.model = tc.load_model(filename) @@ -537,21 +567,20 @@ def test_save_and_load(self): except: self.assertTrue(False, "Failed during save & load diagnostics") - def test_predict_topk(self): k = self.model.num_classes - y1 = self.model.predict_topk(self.dtest, k = k, output_type = 'rank') - self.assertEqual(y1['class'].dtype, self.type) - self.assertEqual(y1['id'].dtype, int) + y1 = self.model.predict_topk(self.dtest, k=k, output_type="rank") + self.assertEqual(y1["class"].dtype, self.type) + self.assertEqual(y1["id"].dtype, int) self.assertEqual(y1.num_rows(), self.dtest.num_rows() * k) - y1 = self.model.predict_topk(self.dtest, k = k, output_type = 'margin') - self.assertEqual(y1['id'].dtype, int) + y1 = self.model.predict_topk(self.dtest, k=k, output_type="margin") + self.assertEqual(y1["id"].dtype, int) self.assertEqual(y1.num_rows(), self.dtest.num_rows() * k) - y1 = self.model.predict_topk(self.dtest, k = k, output_type = 'probability') - self.assertEqual(y1['id'].dtype, int) + y1 = self.model.predict_topk(self.dtest, k=k, output_type="probability") + self.assertEqual(y1["id"].dtype, int) self.assertEqual(y1.num_rows(), self.dtest.num_rows() * k) def test_predict(self): @@ -559,20 +588,20 @@ def test_predict(self): self.assertEqual(len(y1), self.dtest.num_rows()) self.assertEqual(y1.dtype, self.type) - y1 = self.model.predict(self.dtest, output_type = 'probability_vector') + y1 = self.model.predict(self.dtest, output_type="probability_vector") self.assertEqual(len(y1), self.dtest.num_rows()) self.assertEqual(y1.dtype, array) k = self.model.num_classes if k == 2: - y1 = self.model.predict(self.dtest, 'margin') - y2 = self.model.predict(list(self.dtest), 'margin') + y1 = self.model.predict(self.dtest, "margin") + y2 = self.model.predict(list(self.dtest), "margin") self.assertEqual(len(y1), self.dtest.num_rows()) self.assertEqual(len(y2), self.dtest.num_rows()) self.assertEqual(list(y1), list(y2)) - y1 = self.model.predict(self.dtest, 'probability') - y2 = self.model.predict(list(self.dtest), 'probability') + y1 = self.model.predict(self.dtest, "probability") + y2 = self.model.predict(list(self.dtest), "probability") self.assertEqual(len(y1), self.dtest.num_rows()) self.assertEqual(len(y2), self.dtest.num_rows()) self.assertEqual(list(y1), list(y2)) @@ -580,68 +609,76 @@ def test_predict(self): def test_classify(self): y1 = self.model.classify(self.dtest) self.assertEqual(len(y1), len(self.dtest)) - self.assertEqual(y1['class'].dtype, self.type) - self.assertEqual(set(y1.column_names()), set(['class', 'probability'])) + self.assertEqual(y1["class"].dtype, self.type) + self.assertEqual(set(y1.column_names()), set(["class", "probability"])) def test_evaluate(self): t = self.dtrain[self.target] c = self.model.predict(self.dtrain, "class") p = self.model.predict(self.dtrain, "probability_vector") - ans_metrics = ["accuracy", "auc", "confusion_matrix", "f1_score", - "log_loss", "precision", "recall", "roc_curve"] + ans_metrics = [ + "accuracy", + "auc", + "confusion_matrix", + "f1_score", + "log_loss", + "precision", + "recall", + "roc_curve", + ] self.sm_metrics = { - "accuracy" : tc.toolkits.evaluation.accuracy(t, c), - "auc" : tc.toolkits.evaluation.auc(t, p), - "confusion_matrix" : tc.toolkits.evaluation.confusion_matrix(t, c), - "f1_score" : tc.toolkits.evaluation.f1_score(t, c), - "log_loss" : tc.toolkits.evaluation.log_loss(t, p), - "precision" : tc.toolkits.evaluation.precision(t, c), - "recall" : tc.toolkits.evaluation.recall(t, c), - "roc_curve" : tc.toolkits.evaluation.roc_curve(t, p), - } + "accuracy": tc.toolkits.evaluation.accuracy(t, c), + "auc": tc.toolkits.evaluation.auc(t, p), + "confusion_matrix": tc.toolkits.evaluation.confusion_matrix(t, c), + "f1_score": tc.toolkits.evaluation.f1_score(t, c), + "log_loss": tc.toolkits.evaluation.log_loss(t, p), + "precision": tc.toolkits.evaluation.precision(t, c), + "recall": tc.toolkits.evaluation.recall(t, c), + "roc_curve": tc.toolkits.evaluation.roc_curve(t, p), + } model = self.model + def check_cf_matrix(ans): self.assertTrue(ans is not None) - self.assertTrue('confusion_matrix' in ans) - cf = ans['confusion_matrix']\ - .sort(['target_label', 'predicted_label']) - ans_cf = self.sm_metrics['confusion_matrix']\ - .sort(['target_label', 'predicted_label']) - self.assertEqual(list(cf['count']), list(ans_cf['count'])) + self.assertTrue("confusion_matrix" in ans) + cf = ans["confusion_matrix"].sort(["target_label", "predicted_label"]) + ans_cf = self.sm_metrics["confusion_matrix"].sort( + ["target_label", "predicted_label"] + ) + self.assertEqual(list(cf["count"]), list(ans_cf["count"])) def check_roc_curve(ans): self.assertTrue(ans is not None) - self.assertTrue('roc_curve' in ans) - roc = ans['roc_curve'] + self.assertTrue("roc_curve" in ans) + roc = ans["roc_curve"] self.assertEqual(type(roc), tc.SFrame) def check_metric(ans, metric): - if metric == 'confusion_matrix': + if metric == "confusion_matrix": check_cf_matrix(ans) - elif metric == 'roc_curve': + elif metric == "roc_curve": check_roc_curve(ans) else: self.assertTrue(ans is not None) self.assertTrue(metric in ans) - self.assertAlmostEqual(ans[metric], - self.sm_metrics[metric], - places = 4, - msg = "%s = (%s,%s)" % \ - (metric, ans[metric], self.sm_metrics[metric])) + self.assertAlmostEqual( + ans[metric], + self.sm_metrics[metric], + places=4, + msg="%s = (%s,%s)" % (metric, ans[metric], self.sm_metrics[metric]), + ) # Default ans = model.evaluate(self.dtrain) self.assertEqual(sorted(ans.keys()), sorted(ans_metrics)) for m in ans_metrics: - check_metric(ans, m) + check_metric(ans, m) # Individual for m in ans_metrics: - ans = model.evaluate(self.dtrain, metric = m) + ans = model.evaluate(self.dtrain, metric=m) check_metric(ans, m) - - def test_extract_features(self): y1 = self.model.extract_features(self.dtest) self.assertTrue(len(y1) == len(self.dtest)) @@ -660,34 +697,43 @@ def test_list_and_dict_type(self): # make a more complicated dataset containing list and dictionary type columns complex_data = copy.copy(simple_data) - complex_data['random_list_noise'] = \ - tc.SArray([[random.gauss(0, 1) for j in range(3)] for i in range(complex_data.num_rows())]) - complex_data['random_dict_noise'] = \ - tc.SArray([{'x0': random.gauss(0, 1)} for i in range(complex_data.num_rows())]) + complex_data["random_list_noise"] = tc.SArray( + [ + [random.gauss(0, 1) for j in range(3)] + for i in range(complex_data.num_rows()) + ] + ) + complex_data["random_dict_noise"] = tc.SArray( + [{"x0": random.gauss(0, 1)} for i in range(complex_data.num_rows())] + ) complex_train, complex_test = complex_data.random_split(0.8, seed=1) - for (train, test) in [(simple_train, simple_test), (complex_train, complex_test)]: + for (train, test) in [ + (simple_train, simple_test), + (complex_train, complex_test), + ]: self._test_classifier_model(train, test, accuracy_threshold) - def _test_classifier_model(self, train, test, accuracy_threshold, target='label'): + def _test_classifier_model(self, train, test, accuracy_threshold, target="label"): # create - model = tc.decision_tree_classifier.create(train, target=target, - validation_set=test, - **self.param) + model = tc.decision_tree_classifier.create( + train, target=target, validation_set=test, **self.param + ) # predict - pred = model.predict(test, output_type = 'class') - pred_lst = model.predict(list(test), output_type = 'class') + pred = model.predict(test, output_type="class") + pred_lst = model.predict(list(test), output_type="class") self.assertEqual(list(pred), list(pred_lst)) - accuracy = model.evaluate(test, metric='accuracy') - self.assertGreater(accuracy['accuracy'], accuracy_threshold) - + accuracy = model.evaluate(test, metric="accuracy") + self.assertGreater(accuracy["accuracy"], accuracy_threshold) def test_predict_new_category(self): # Arrange new_test = copy.copy(self.dtest) # change 'r' cap-color into a new color 'z' - new_test['cap-color'] = new_test['cap-color'].apply(lambda x: 'z'if x == 'r' else x) + new_test["cap-color"] = new_test["cap-color"].apply( + lambda x: "z" if x == "r" else x + ) # Act y1 = self.model.predict(new_test) @@ -699,16 +745,19 @@ def test_predict_new_category(self): def test_predict_new_dictionary_key(self): # Arrange new_data = copy.copy(self.data) - new_data['dict_color_feature'] = \ - new_data['cap-color'].apply(lambda x: {'cap-color': ord(x)}) + new_data["dict_color_feature"] = new_data["cap-color"].apply( + lambda x: {"cap-color": ord(x)} + ) train, test = new_data.random_split(0.8, seed=1) # add a new key to dictionary in predict time - test['dict_color_feature'] = test['dict_color_feature'].apply( - lambda x: dict(list(x.items()) + list({'cap-color2': x['cap-color']+1}.items()))) + test["dict_color_feature"] = test["dict_color_feature"].apply( + lambda x: dict( + list(x.items()) + list({"cap-color2": x["cap-color"] + 1}.items()) + ) + ) - model = tc.decision_tree_classifier.create(train, target='label', - **self.param) + model = tc.decision_tree_classifier.create(train, target="label", **self.param) # Act. y1 = model.predict(test) y2 = model.predict(list(test)) @@ -716,26 +765,29 @@ def test_predict_new_dictionary_key(self): # Assert self.assertEqual(list(y1), list(y2)) -class TestStringTarget(unittest.TestCase): +class TestStringTarget(unittest.TestCase): def test_cat(self): import numpy as np + # Arrange np.random.seed(8) n, d = 1000, 100 sf = tc.SFrame() for i in range(d): - sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) - target = np.random.randint(2, size=n) - sf['target'] = target + sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) + target = np.random.randint(2, size=n) + sf["target"] = target - sf['target'] = sf['target'].astype(str) - sf['target'] = 'cat-' + sf['target'] - model = tc.decision_tree_classifier.create(sf, 'target') + sf["target"] = sf["target"].astype(str) + sf["target"] = "cat-" + sf["target"] + model = tc.decision_tree_classifier.create(sf, "target") # Act evaluation = model.evaluate(sf) # Assert - self.assertEqual(['cat-0', 'cat-1'], - sorted(list(evaluation['confusion_matrix']['target_label'].unique()))) + self.assertEqual( + ["cat-0", "cat-1"], + sorted(list(evaluation["confusion_matrix"]["target_label"].unique())), + ) diff --git a/src/python/turicreate/test/test_deps.py b/src/python/turicreate/test/test_deps.py index c1c2b012e4..06372d6044 100644 --- a/src/python/turicreate/test/test_deps.py +++ b/src/python/turicreate/test/test_deps.py @@ -12,25 +12,24 @@ class VersionTest(unittest.TestCase): - def test_min_version(self): - MIN_VERSION = StrictVersion('1.8.1') - self.assertEqual(get_version('1.8.1'), MIN_VERSION) - self.assertEqual(get_version('1.8.1-dev'), MIN_VERSION) - self.assertEqual(get_version('1.8.1rc'), MIN_VERSION) + MIN_VERSION = StrictVersion("1.8.1") + self.assertEqual(get_version("1.8.1"), MIN_VERSION) + self.assertEqual(get_version("1.8.1-dev"), MIN_VERSION) + self.assertEqual(get_version("1.8.1rc"), MIN_VERSION) - self.assertLess(get_version('1.8.0'), MIN_VERSION) - self.assertLess(get_version('1.8.0-dev'), MIN_VERSION) - self.assertLess(get_version('1.8.0rc'), MIN_VERSION) + self.assertLess(get_version("1.8.0"), MIN_VERSION) + self.assertLess(get_version("1.8.0-dev"), MIN_VERSION) + self.assertLess(get_version("1.8.0rc"), MIN_VERSION) - self.assertLess(get_version('1.6.2'), MIN_VERSION) - self.assertLess(get_version('1.6.2-dev'), MIN_VERSION) - self.assertLess(get_version('1.6.2rc'), MIN_VERSION) + self.assertLess(get_version("1.6.2"), MIN_VERSION) + self.assertLess(get_version("1.6.2-dev"), MIN_VERSION) + self.assertLess(get_version("1.6.2rc"), MIN_VERSION) - self.assertGreater(get_version('1.9.0'), MIN_VERSION) - self.assertGreater(get_version('1.9.0-dev'), MIN_VERSION) - self.assertGreater(get_version('1.9.0rc'), MIN_VERSION) + self.assertGreater(get_version("1.9.0"), MIN_VERSION) + self.assertGreater(get_version("1.9.0-dev"), MIN_VERSION) + self.assertGreater(get_version("1.9.0rc"), MIN_VERSION) - self.assertGreater(get_version('1.8.2'), MIN_VERSION) - self.assertGreater(get_version('1.8.2-dev'), MIN_VERSION) - self.assertGreater(get_version('1.8.2rc'), MIN_VERSION) + self.assertGreater(get_version("1.8.2"), MIN_VERSION) + self.assertGreater(get_version("1.8.2-dev"), MIN_VERSION) + self.assertGreater(get_version("1.8.2rc"), MIN_VERSION) diff --git a/src/python/turicreate/test/test_distances.py b/src/python/turicreate/test/test_distances.py index ce9edd7abc..e7b2652741 100644 --- a/src/python/turicreate/test/test_distances.py +++ b/src/python/turicreate/test/test_distances.py @@ -14,6 +14,7 @@ from collections import Counter import sys + if sys.version_info.major > 2: unittest.TestCase.assertItemsEqual = unittest.TestCase.assertCountEqual @@ -21,83 +22,113 @@ class StandardDistancesTest(unittest.TestCase): @classmethod def setUpClass(self): - self.a = {"a":.5, "b":.7} - self.b = {"b":1.0, "c":.1, "d":.5} + self.a = {"a": 0.5, "b": 0.7} + self.b = {"b": 1.0, "c": 0.1, "d": 0.5} self.av = [3, 4, 1] self.bv = [1, 2, 3] - self.al = ['a', 'b', 'b', 'c'] - self.bl = ['a', 'b'] + self.al = ["a", "b", "b", "c"] + self.bl = ["a", "b"] def test_euclidean(self): - self.assertAlmostEqual(euclidean(self.a, self.b), - tc.distances.euclidean(self.a, self.b)) - self.assertAlmostEqual((2*2 + 2*2 + 2*2)**.5, - tc.distances.euclidean(self.av, self.bv)) + self.assertAlmostEqual( + euclidean(self.a, self.b), tc.distances.euclidean(self.a, self.b) + ) + self.assertAlmostEqual( + (2 * 2 + 2 * 2 + 2 * 2) ** 0.5, tc.distances.euclidean(self.av, self.bv) + ) def test_squared_euclidean(self): - self.assertAlmostEqual(euclidean(self.a, self.b) ** 2, - tc.distances.squared_euclidean(self.a, self.b)) + self.assertAlmostEqual( + euclidean(self.a, self.b) ** 2, + tc.distances.squared_euclidean(self.a, self.b), + ) def test_manhattan(self): - self.assertAlmostEqual(manhattan(self.a, self.b), - tc.distances.manhattan(self.a, self.b)) + self.assertAlmostEqual( + manhattan(self.a, self.b), tc.distances.manhattan(self.a, self.b) + ) def test_cosine(self): - self.assertAlmostEqual(cosine(self.a, self.b), - tc.distances.cosine(self.a, self.b)) + self.assertAlmostEqual( + cosine(self.a, self.b), tc.distances.cosine(self.a, self.b) + ) def test_transformed_dot_product(self): - self.assertAlmostEqual(transformed_dot_product(self.a, self.b), - tc.distances.transformed_dot_product(self.a, self.b)) + self.assertAlmostEqual( + transformed_dot_product(self.a, self.b), + tc.distances.transformed_dot_product(self.a, self.b), + ) def test_jaccard(self): - self.assertAlmostEqual(jaccard(self.a, self.b), - tc.distances.jaccard(self.a, self.b)) - self.assertAlmostEqual(jaccard(self.al, self.bl), - tc.distances.jaccard(self.al, self.bl)) + self.assertAlmostEqual( + jaccard(self.a, self.b), tc.distances.jaccard(self.a, self.b) + ) + self.assertAlmostEqual( + jaccard(self.al, self.bl), tc.distances.jaccard(self.al, self.bl) + ) + def test_weighted_jaccard(self): - self.assertAlmostEqual(weighted_jaccard(self.a, self.b), - tc.distances.weighted_jaccard(self.a, self.b)) - self.assertAlmostEqual(weighted_jaccard(self.al, self.bl), - tc.distances.weighted_jaccard(self.al, self.bl)) + self.assertAlmostEqual( + weighted_jaccard(self.a, self.b), + tc.distances.weighted_jaccard(self.a, self.b), + ) + self.assertAlmostEqual( + weighted_jaccard(self.al, self.bl), + tc.distances.weighted_jaccard(self.al, self.bl), + ) def test_edge_cases(self): self.assertAlmostEqual(tc.distances.euclidean({}, {}), 0.0) - self.assertAlmostEqual(tc.distances.euclidean({}, {'a': 1.0}), 1.0) + self.assertAlmostEqual(tc.distances.euclidean({}, {"a": 1.0}), 1.0) self.assertAlmostEqual(tc.distances.jaccard({}, {}), 0.0) - dists = ['euclidean', 'squared_euclidean', 'manhattan', - 'cosine', 'jaccard', 'weighted_jaccard', - 'levenshtein'] + dists = [ + "euclidean", + "squared_euclidean", + "manhattan", + "cosine", + "jaccard", + "weighted_jaccard", + "levenshtein", + ] for d in dists: dist_fn = tc.distances.__dict__[d] with self.assertRaises(ToolkitError): - dist_fn([1.0], {'a': 1.0}) + dist_fn([1.0], {"a": 1.0}) with self.assertRaises(ToolkitError): dist_fn(5.0, 7.0) class DistanceUtilsTest(unittest.TestCase): - @classmethod def setUpClass(self): - self.x = {'a': 1., 'b': 1., 'c': 1, - 'd': [1. ,2., 3.], - 'e':{'cat': 10, 'dog': 11, 'fossa': 12}, - 'f': 'what on earth is a fossa?'} - - self.y = {'a': 2., 'b': 3., 'c': 4., - 'd': [4., 5., 6.], - 'e': {'eel': 5, 'dog': 12, 'fossa': 10}, - 'f': 'a fossa is the best animal on earth'} - - self.dist = [[('a', 'b', 'c'), 'euclidean', 1], - [('d',), 'manhattan', 2], - [('e',), 'jaccard', 1.5], - [('f',), 'levenshtein', 0.3]] + self.x = { + "a": 1.0, + "b": 1.0, + "c": 1, + "d": [1.0, 2.0, 3.0], + "e": {"cat": 10, "dog": 11, "fossa": 12}, + "f": "what on earth is a fossa?", + } + + self.y = { + "a": 2.0, + "b": 3.0, + "c": 4.0, + "d": [4.0, 5.0, 6.0], + "e": {"eel": 5, "dog": 12, "fossa": 10}, + "f": "a fossa is the best animal on earth", + } + + self.dist = [ + [("a", "b", "c"), "euclidean", 1], + [("d",), "manhattan", 2], + [("e",), "jaccard", 1.5], + [("f",), "levenshtein", 0.3], + ] def test_composite_dist_validation(self): """ @@ -121,35 +152,35 @@ def test_composite_dist_validation(self): ## Empty feature list dist = copy.deepcopy(self.dist) - dist.append([[], 'euclidean', 13]) + dist.append([[], "euclidean", 13]) with self.assertRaises(ValueError): tc.distances._util._validate_composite_distance(dist) ## Feature list with non-strings dist = copy.deepcopy(self.dist) - dist.append([['test', 17], 'manhattan', 13]) + dist.append([["test", 17], "manhattan", 13]) with self.assertRaises(TypeError): tc.distances._util._validate_composite_distance(dist) ## Distance function in the wrong form dist = copy.deepcopy(self.dist) - dist.append([['d'], 17, 13]) + dist.append([["d"], 17, 13]) with self.assertRaises(ValueError): tc.distances._util._validate_composite_distance(dist) ## Non-existent distance function dist = copy.deepcopy(self.dist) - dist.append([['d'], 'haversine', 13]) + dist.append([["d"], "haversine", 13]) with self.assertRaises(ValueError): tc.distances._util._validate_composite_distance(dist) ## Weight of the wrong type dist = copy.deepcopy(self.dist) - dist.append([['d'], 'euclidean', 'a lot']) + dist.append([["d"], "euclidean", "a lot"]) with self.assertRaises(ValueError): tc.distances._util._validate_composite_distance(dist) @@ -159,38 +190,48 @@ def test_composite_feature_scrub(self): Make sure excluded features are properly removed from a composite distance specification. """ - dist = [[('a', 'b', 'c', 'goat'), 'euclidean', 1], - [('d', 'horse', 'goat'), 'manhattan', 2], - [('e', 'ibex', 'ibex'), 'jaccard', 1.5], - [('f',), 'levenshtein', 0.3]] + dist = [ + [("a", "b", "c", "goat"), "euclidean", 1], + [("d", "horse", "goat"), "manhattan", 2], + [("e", "ibex", "ibex"), "jaccard", 1.5], + [("f",), "levenshtein", 0.3], + ] ## Test basic functionality - feature_blacklist = ['goat', 'horse', 'ibex'] - ans = tc.distances._util._scrub_composite_distance_features(dist, - feature_blacklist) + feature_blacklist = ["goat", "horse", "ibex"] + ans = tc.distances._util._scrub_composite_distance_features( + dist, feature_blacklist + ) for d, d_ans in zip(self.dist, ans): self.assertSequenceEqual(d[0], d_ans[0]) ## Test removal of an entire distance component - feature_blacklist.append('f') # should remove the entire last component - ans = tc.distances._util._scrub_composite_distance_features(dist, - feature_blacklist) + feature_blacklist.append("f") # should remove the entire last component + ans = tc.distances._util._scrub_composite_distance_features( + dist, feature_blacklist + ) self.assertEqual(len(ans), 3) - self.assertItemsEqual(tc.distances._util._get_composite_distance_features(ans), - ['a', 'b', 'c', 'd', 'e']) + self.assertItemsEqual( + tc.distances._util._get_composite_distance_features(ans), + ["a", "b", "c", "d", "e"], + ) def test_composite_dist_type_convert(self): """ Make sure the utility to convert distance names to function handles works properly. """ - converted_dist = tc.distances._util._convert_distance_names_to_functions(self.dist) + converted_dist = tc.distances._util._convert_distance_names_to_functions( + self.dist + ) - ans = [tc.distances.euclidean, - tc.distances.manhattan, - tc.distances.jaccard, - tc.distances.levenshtein] + ans = [ + tc.distances.euclidean, + tc.distances.manhattan, + tc.distances.jaccard, + tc.distances.levenshtein, + ] self.assertSequenceEqual(ans, [x[1] for x in converted_dist]) @@ -201,10 +242,10 @@ def test_composite_dist_compute(self): ## Check that d(x, x) = 0 d = tc.distances.compute_composite_distance(self.dist, self.x, self.x) - self.assertAlmostEqual(d, 0.) + self.assertAlmostEqual(d, 0.0) d = tc.distances.compute_composite_distance(self.dist, self.y, self.y) - self.assertAlmostEqual(d, 0.) + self.assertAlmostEqual(d, 0.0) ## Check the distance between two data points against the hard-coded # answer. @@ -212,10 +253,10 @@ def test_composite_dist_compute(self): self.assertAlmostEqual(d, 30.29165739, places=5) ## Check the distance against the nearest neighbors toolkit - sf = tc.SFrame([self.x, self.y]).unpack('X1', column_name_prefix='') + sf = tc.SFrame([self.x, self.y]).unpack("X1", column_name_prefix="") m = tc.nearest_neighbors.create(sf, distance=self.dist, verbose=False) knn = m.query(sf[:1], k=2, verbose=False) - self.assertAlmostEqual(d, knn['distance'][1], places=5) + self.assertAlmostEqual(d, knn["distance"][1], places=5) def test_composite_features_extract(self): """ @@ -223,11 +264,12 @@ def test_composite_features_extract(self): distance. """ dist = copy.deepcopy(self.dist) - dist.append([['a', 'b', 'a'], 'cosine', 13]) - ans = ['a', 'b', 'c', 'd', 'e', 'f'] + dist.append([["a", "b", "a"], "cosine", 13]) + ans = ["a", "b", "c", "d", "e", "f"] - self.assertItemsEqual(ans, - tc.distances._util._get_composite_distance_features(dist)) + self.assertItemsEqual( + ans, tc.distances._util._get_composite_distance_features(dist) + ) class LocalDistancesTest(unittest.TestCase): @@ -237,63 +279,64 @@ class LocalDistancesTest(unittest.TestCase): @classmethod def setUpClass(self): - self.a = {"a":.5, "b":.7} - self.b = {"b":1.0, "c":.1, "d":.5} + self.a = {"a": 0.5, "b": 0.7} + self.b = {"b": 1.0, "c": 0.1, "d": 0.5} self.S = "fossa" self.T = "fossil" def test_local_jaccard(self): - self.assertAlmostEqual(jaccard(self.a, self.b), 1 - 1.0/4) + self.assertAlmostEqual(jaccard(self.a, self.b), 1 - 1.0 / 4) self.assertAlmostEqual(jaccard(self.a, {}), 1) self.assertAlmostEqual(jaccard(self.a, self.a), 0) def test_local_weighted_jaccard(self): - ans = 1 - (0. + 0.7 + 0. + 0.) / (0.5 + 1.0 + 0.1 + 0.5) + ans = 1 - (0.0 + 0.7 + 0.0 + 0.0) / (0.5 + 1.0 + 0.1 + 0.5) self.assertAlmostEqual(weighted_jaccard(self.a, self.b), ans) self.assertAlmostEqual(weighted_jaccard(self.a, {}), 1) self.assertAlmostEqual(weighted_jaccard(self.a, self.a), 0) def test_local_cosine(self): - ans = 1 - (.7 / ((.5**2 + .7**2) ** .5 * - (1**2 + .1**2 + .5**2) ** .5)) + ans = 1 - ( + 0.7 / ((0.5 ** 2 + 0.7 ** 2) ** 0.5 * (1 ** 2 + 0.1 ** 2 + 0.5 ** 2) ** 0.5) + ) self.assertAlmostEqual(cosine(self.a, self.b), ans) self.assertAlmostEqual(cosine(self.a, {}), 1) self.assertAlmostEqual(cosine(self.a, self.a), 0) def test_local_transformed_dot_product(self): - ans = np.log(1. + np.exp(-0.7)) + ans = np.log(1.0 + np.exp(-0.7)) self.assertAlmostEqual(transformed_dot_product(self.a, self.b), ans) - ans = np.log(1 + np.exp(-1 * (0.5**2 + 0.7**2))) + ans = np.log(1 + np.exp(-1 * (0.5 ** 2 + 0.7 ** 2))) self.assertAlmostEqual(transformed_dot_product(self.a, self.a), ans) def test_local_euclidean(self): self.assertAlmostEqual(euclidean(self.a, self.a), 0) - ans = ((0.5)**2 + (1.0 - 0.7)**2 + (0.1)**2 + (0.5)**2)**0.5 + ans = ((0.5) ** 2 + (1.0 - 0.7) ** 2 + (0.1) ** 2 + (0.5) ** 2) ** 0.5 self.assertAlmostEqual(euclidean(self.a, self.b), ans) - ans = ((0.5)**2 + (0.7)**2)**0.5 + ans = ((0.5) ** 2 + (0.7) ** 2) ** 0.5 self.assertAlmostEqual(euclidean(self.a, {}), ans) def test_local_squared_euclidean(self): self.assertAlmostEqual(squared_euclidean(self.a, self.a), 0) - ans = ((0.5)**2 + (1.0 - 0.7)**2 + (0.1)**2 + (0.5)**2) + ans = (0.5) ** 2 + (1.0 - 0.7) ** 2 + (0.1) ** 2 + (0.5) ** 2 self.assertAlmostEqual(squared_euclidean(self.a, self.b), ans) - ans = ((0.5)**2 + (0.7)**2) + ans = (0.5) ** 2 + (0.7) ** 2 self.assertAlmostEqual(squared_euclidean(self.a, {}), ans) def test_local_manhattan(self): self.assertAlmostEqual(manhattan(self.a, self.a), 0) - ans = (0.5 + (1.0 - 0.7) + (0.1) + (0.5)) + ans = 0.5 + (1.0 - 0.7) + (0.1) + (0.5) self.assertAlmostEqual(manhattan(self.a, self.b), ans) - ans = ((0.5) + (0.7)) + ans = (0.5) + (0.7) self.assertAlmostEqual(manhattan(self.a, {}), ans) def test_local_levenshtein(self): @@ -316,6 +359,7 @@ def jaccard(a, b): ans = 1.0 - float(len(a.intersection(b))) / len(a.union(b)) return ans + def weighted_jaccard(a, b): if isinstance(a, list) and isinstance(b, list): a = dict(Counter(a)) @@ -332,25 +376,28 @@ def weighted_jaccard(a, b): b2.setdefault(k, 0) numer += min(a2[k], b2[k]) denom += max(a2[k], b2[k]) - return 1. - float(numer) / denom + return 1.0 - float(numer) / denom + def cosine(a, b): ks = set(a.keys()).intersection(set(b.keys())) num = sum([a[k] * b[k] for k in ks]) - den = sum([v ** 2 for k, v in a.items()]) * \ - sum([v ** 2 for k, v in b.items()]) - den = den ** .5 + den = sum([v ** 2 for k, v in a.items()]) * sum([v ** 2 for k, v in b.items()]) + den = den ** 0.5 if den == 0: den = 0.0001 - return 1 - num/den + return 1 - num / den + def transformed_dot_product(a, b): ks = set(a.keys()).intersection(set(b.keys())) dotprod = sum([a[k] * b[k] for k in ks]) return np.log(1 + np.exp(-1 * dotprod)) + def euclidean(a, b): - return squared_euclidean(a, b)**0.5 + return squared_euclidean(a, b) ** 0.5 + def squared_euclidean(a, b): a2 = a.copy() @@ -361,9 +408,10 @@ def squared_euclidean(a, b): for k in keys: a2.setdefault(k, 0) b2.setdefault(k, 0) - ans += (a2[k] - b2[k])**2 + ans += (a2[k] - b2[k]) ** 2 return ans + def manhattan(a, b): a2 = a.copy() b2 = b.copy() @@ -376,6 +424,7 @@ def manhattan(a, b): ans += abs(a2[k] - b2[k]) return ans + def levenshtein(a, b): m = len(a) n = len(b) @@ -389,8 +438,6 @@ def levenshtein(a, b): if a[i - 1] == b[j - 1]: D[i, j] = D[i - 1, j - 1] else: - D[i, j] = min(D[i-1, j] + 1, - D[i, j-1] + 1, - D[i-1, j-1] + 1) + D[i, j] = min(D[i - 1, j] + 1, D[i, j - 1] + 1, D[i - 1, j - 1] + 1) return D[m, n] diff --git a/src/python/turicreate/test/test_drawing_classifier.py b/src/python/turicreate/test/test_drawing_classifier.py index 10189a1470..2093fe496c 100644 --- a/src/python/turicreate/test/test_drawing_classifier.py +++ b/src/python/turicreate/test/test_drawing_classifier.py @@ -21,13 +21,15 @@ import unittest import pytest + def _build_bitmap_data(): - ''' + """ Build an SFrame from 10 saved drawings. - ''' + """ from os.path import join as _join, realpath as _realpath from os.path import splitext as _splitext, basename as _basename from os.path import dirname as _dirname + drawings_dir = _join(_dirname(_realpath(__file__)), "drawings") sf = _tc.image_analysis.load_images(drawings_dir, with_path=True) sf = sf.rename({"image": "drawing", "path": "label"}) @@ -35,18 +37,20 @@ def _build_bitmap_data(): lambda filepath: _splitext(_basename(filepath))[0][:-1] # Extract the class name from the filename, "check1.png" -> "check" # [:-1] is to get "check" out of "check1" - ) + ) return sf + def _build_stroke_data(): - ''' + """ Build an SFrame by generating 10 random stroke-based drawings. Each stroke is generated by doing a random walk on a canvas. - ''' + """ num_rows_in_sframe = 10 drawings, labels = [], [] random = _np.random.RandomState(100) - def _generate_random_point(point = None): + + def _generate_random_point(point=None): if point is not None: dx = random.choice([-1, 0, 1]) dy = random.choice([-1, 0, 1]) @@ -85,18 +89,27 @@ def setUpClass(self, warm_start=None): self.target, feature=self.feature, max_iterations=self.max_iterations, - warm_start=warm_start) + warm_start=warm_start, + ) self.stroke_model = _tc.drawing_classifier.create( self.stroke_sf, self.target, feature=self.feature, max_iterations=1, - warm_start=warm_start) + warm_start=warm_start, + ) self.trains = [self.check_cross_sf, self.stroke_sf] self.models = [self.check_cross_model, self.stroke_model] def test_create_with_missing_value_bitmap(self): - sf = self.check_cross_sf.append(_tc.SFrame({self.feature: _tc.SArray([None], dtype=_tc.Image), self.target: ["check"]})) + sf = self.check_cross_sf.append( + _tc.SFrame( + { + self.feature: _tc.SArray([None], dtype=_tc.Image), + self.target: ["check"], + } + ) + ) try: _tc.drawing_classifier.create(sf, self.target) except _ToolkitError as e: @@ -105,8 +118,7 @@ def test_create_with_missing_value_bitmap(self): def test_create_with_missing_value_in_label(self): sf = self.check_cross_sf sf = sf.remove_column(self.target) - sf = sf.add_column(_tc.SArray( - [None] * len(sf), dtype=str), self.target) + sf = sf.add_column(_tc.SArray([None] * len(sf), dtype=str), self.target) try: _tc.drawing_classifier.create(sf, self.target) except _ToolkitError as e: @@ -115,8 +127,7 @@ def test_create_with_missing_value_in_label(self): def test_create_with_missing_feature(self): for sf in self.trains: with self.assertRaises(_ToolkitError): - _tc.drawing_classifier.create(sf, self.target, - feature="wrong_feature") + _tc.drawing_classifier.create(sf, self.target, feature="wrong_feature") def test_create_with_missing_target(self): for sf in self.trains: @@ -126,82 +137,87 @@ def test_create_with_missing_target(self): def test_create_with_empty_dataset(self): for sf in self.trains: with self.assertRaises(_ToolkitError): - _tc.drawing_classifier.create(sf[:0], self.target, - feature=self.feature) + _tc.drawing_classifier.create(sf[:0], self.target, feature=self.feature) def test_create_with_missing_coordinates_in_stroke_input(self): drawing = [[{"x": 1.0, "y": 1.0}], [{"x": 0.0}, {"y": 0.0}]] - sf = _tc.SFrame({ - self.feature: [drawing], - self.target: ["missing_coordinates"] - }) + sf = _tc.SFrame({self.feature: [drawing], self.target: ["missing_coordinates"]}) with self.assertRaises(_ToolkitError): _tc.drawing_classifier.create(sf, self.target) def test_create_with_wrongly_typed_coordinates_in_stroke_input(self): drawing = [[{"x": 1.0, "y": 0}], [{"x": "string_x?!", "y": 0.1}]] - sf = _tc.SFrame({ - self.feature: [drawing], - self.target: ["string_x_coordinate"] - }) + sf = _tc.SFrame({self.feature: [drawing], self.target: ["string_x_coordinate"]}) with self.assertRaises(_ToolkitError): _tc.drawing_classifier.create(sf, self.target) def test_create_with_None_coordinates_in_stroke_input(self): drawing = [[{"x": 1.0, "y": None}], [{"x": 1.1, "y": 0.1}]] - sf = _tc.SFrame({ - self.feature: [drawing], - self.target: ["none_y_coordinate"] - }) + sf = _tc.SFrame({self.feature: [drawing], self.target: ["none_y_coordinate"]}) with self.assertRaises(_ToolkitError): _tc.drawing_classifier.create(sf, self.target, feature=self.feature) def test_create_with_validation_set_None(self): for data in self.trains: _tc.drawing_classifier.create( - data, self.target, feature=self.feature, validation_set=None, max_iterations=1) + data, + self.target, + feature=self.feature, + validation_set=None, + max_iterations=1, + ) def test_create_with_verbose_False(self): for data in self.trains: args = [data, self.target] kwargs = { - 'feature': self.feature, - 'max_iterations': 1, + "feature": self.feature, + "max_iterations": 1, } test_util.assert_longer_verbose_logs( - _tc.drawing_classifier.create, args, kwargs) + _tc.drawing_classifier.create, args, kwargs + ) def test_create_with_no_validation_set(self): for data in self.trains: _tc.drawing_classifier.create( - data, self.target, feature=self.feature, max_iterations=1) + data, self.target, feature=self.feature, max_iterations=1 + ) def test_create_with_empty_drawing_in_stroke_input(self): drawing = [] - sf = _tc.SFrame({ - self.feature: [drawing], - self.target: ["empty_drawing"] - }) + sf = _tc.SFrame({self.feature: [drawing], self.target: ["empty_drawing"]}) # Should not error out, it should silently ignore the empty drawing - _tc.drawing_classifier.create(sf, self.target, feature=self.feature, - max_iterations=1) + _tc.drawing_classifier.create( + sf, self.target, feature=self.feature, max_iterations=1 + ) def test_create_with_empty_stroke_in_stroke_input(self): drawing = [[{"x": 1.0, "y": 0.0}], [], [{"x": 1.1, "y": 0.1}]] - sf = _tc.SFrame({ - self.feature: [drawing], - self.target: ["empty_drawing"] - }) + sf = _tc.SFrame({self.feature: [drawing], self.target: ["empty_drawing"]}) # Should not error out, it should silently ignore the empty stroke - _tc.drawing_classifier.create(sf, self.target, feature=self.feature, - max_iterations=1) + _tc.drawing_classifier.create( + sf, self.target, feature=self.feature, max_iterations=1 + ) def test_create_with_fixed_random_seed(self): for data in self.trains: model_1 = _tc.drawing_classifier.create( - data, self.target, feature=self.feature, validation_set=None, max_iterations=3, random_seed=86) + data, + self.target, + feature=self.feature, + validation_set=None, + max_iterations=3, + random_seed=86, + ) model_2 = _tc.drawing_classifier.create( - data, self.target, feature=self.feature, validation_set=None, max_iterations=3, random_seed=86) + data, + self.target, + feature=self.feature, + validation_set=None, + max_iterations=3, + random_seed=86, + ) pred_1 = model_1.predict(data) pred_2 = model_2.predict(data) for i in range(len(pred_1)): @@ -214,10 +230,10 @@ def test_predict_with_sframe(self): for output_type in ["class", "probability_vector"]: preds = model.predict(sf, output_type=output_type) if output_type == "class": - assert(preds.dtype == sf[self.target].dtype) + assert preds.dtype == sf[self.target].dtype else: - assert(preds.dtype == _array) - assert(len(preds) == len(sf)) + assert preds.dtype == _array + assert len(preds) == len(sf) def test_predict_with_sarray(self): for index in range(len(self.models)): @@ -226,10 +242,10 @@ def test_predict_with_sarray(self): for output_type in ["class", "probability_vector"]: preds = model.predict(sf[self.feature], output_type=output_type) if output_type == "class": - assert(preds.dtype == sf[self.target].dtype) + assert preds.dtype == sf[self.target].dtype else: - assert(preds.dtype == _array) - assert(len(preds) == len(sf)) + assert preds.dtype == _array + assert len(preds) == len(sf) def test_predict_topk(self): k = 2 @@ -238,15 +254,15 @@ def test_predict_topk(self): sf = self.trains[index] for output_type in ["rank", "probability"]: preds = model.predict_topk(sf, k=k, output_type=output_type) - assert("id" in preds.column_names()) - assert("class" in preds.column_names()) + assert "id" in preds.column_names() + assert "class" in preds.column_names() if output_type == "rank": - assert(preds["rank"].dtype == int) - assert(sorted(preds["rank"].unique()) == [0, 1]) + assert preds["rank"].dtype == int + assert sorted(preds["rank"].unique()) == [0, 1] else: - assert(output_type == "probability") - assert(preds["probability"].dtype == float) - assert(len(preds) == k*len(sf)) + assert output_type == "probability" + assert preds["probability"].dtype == float + assert len(preds) == k * len(sf) def test_predict_output_type_probability_with_sframe(self): for index in range(len(self.models)): @@ -257,7 +273,7 @@ def test_predict_output_type_probability_with_sframe(self): model.predict(sf, output_type="probability") else: preds = model.predict(sf, output_type="probability") - assert(preds.dtype == float) + assert preds.dtype == float def test_predict_output_type_probability_with_sarray(self): for index in range(len(self.models)): @@ -267,9 +283,8 @@ def test_predict_output_type_probability_with_sarray(self): with self.assertRaises(_ToolkitError): model.predict(sf[self.feature], output_type="probability") else: - preds = model.predict( - sf[self.feature], output_type="probability") - assert(preds.dtype == float) + preds = model.predict(sf[self.feature], output_type="probability") + assert preds.dtype == float def test_evaluate_without_ground_truth(self): for index in range(len(self.trains)): @@ -280,26 +295,33 @@ def test_evaluate_without_ground_truth(self): model.evaluate(sf_without_ground_truth) def test_evaluate_with_ground_truth(self): - all_metrics = ["accuracy", "auc", "precision", "recall", - "f1_score", "log_loss", "confusion_matrix", "roc_curve"] + all_metrics = [ + "accuracy", + "auc", + "precision", + "recall", + "f1_score", + "log_loss", + "confusion_matrix", + "roc_curve", + ] for index in range(len(self.models)): model = self.models[index] sf = self.trains[index] individual_run_results = dict() for metric in all_metrics: evaluation = model.evaluate(sf, metric=metric) - assert(metric in evaluation) + assert metric in evaluation individual_run_results[metric] = evaluation[metric] evaluation = model.evaluate(sf, metric="auto") for metric in all_metrics: if metric in ["confusion_matrix", "roc_curve"]: test_util.SFrameComparer()._assert_sframe_equal( - individual_run_results[metric], - evaluation[metric]) + individual_run_results[metric], evaluation[metric] + ) else: - assert(metric in evaluation) - assert( - individual_run_results[metric] == evaluation[metric]) + assert metric in evaluation + assert individual_run_results[metric] == evaluation[metric] def test_evaluate_with_unsupported_metric(self): for index in range(len(self.trains)): @@ -316,33 +338,40 @@ def test_save_and_load(self): new_model = _tc.load_model(filename) old_preds = old_model.predict(data) new_preds = new_model.predict(data) - assert(new_preds.dtype == old_preds.dtype - and (new_preds == old_preds).all()) + assert ( + new_preds.dtype == old_preds.dtype + and (new_preds == old_preds).all() + ) def test_export_coreml(self): import coremltools import platform - max_iters_ans = [str(self.max_iterations), '1'] - warm_start_ans = '' if self.warm_start is None else self.warm_start + + max_iters_ans = [str(self.max_iterations), "1"] + warm_start_ans = "" if self.warm_start is None else self.warm_start for i, model in enumerate(self.models): filename = _mkstemp("bingo.mlmodel")[1] model.export_coreml(filename) # Load the model back from the CoreML model file coreml_model = coremltools.models.MLModel(filename) - self.assertDictEqual({ - 'com.github.apple.turicreate.version': _tc.__version__, - 'com.github.apple.os.platform': platform.platform(), - 'target': self.target, - 'feature': self.feature, - 'type': 'drawing_classifier', - 'warm_start': warm_start_ans, - 'max_iterations': max_iters_ans[i], - 'version': '2', - }, dict(coreml_model.user_defined_metadata) + self.assertDictEqual( + { + "com.github.apple.turicreate.version": _tc.__version__, + "com.github.apple.os.platform": platform.platform(), + "target": self.target, + "feature": self.feature, + "type": "drawing_classifier", + "warm_start": warm_start_ans, + "max_iterations": max_iters_ans[i], + "version": "2", + }, + dict(coreml_model.user_defined_metadata), ) - expected_result = 'Drawing classifier created by Turi Create (version %s)' \ + expected_result = ( + "Drawing classifier created by Turi Create (version %s)" % (_tc.__version__) + ) self.assertEquals(expected_result, coreml_model.short_description) @unittest.skipIf(_sys.platform != "darwin", "Core ML only supported on Mac") @@ -362,37 +391,38 @@ def test_export_coreml_with_predict(self): if test_number == 1: # stroke input - sf[feature] = _tc.drawing_classifier.util.draw_strokes( - sf[self.feature]) + sf[feature] = _tc.drawing_classifier.util.draw_strokes(sf[self.feature]) for row_number in range(len(sf)): - core_ml_preds = mlmodel.predict({ - "drawing": sf[feature][row_number]._to_pil_image() - }) - assert(core_ml_preds[self.target] == tc_preds[row_number]) + core_ml_preds = mlmodel.predict( + {"drawing": sf[feature][row_number]._to_pil_image()} + ) + assert core_ml_preds[self.target] == tc_preds[row_number] if test_number == 1: sf = sf.remove_column(feature) def test_draw_strokes_sframe(self): sf = self.stroke_sf - sf["rendered"] = _tc.drawing_classifier.util.draw_strokes( - sf[self.feature]) + sf["rendered"] = _tc.drawing_classifier.util.draw_strokes(sf[self.feature]) for index in range(len(sf["rendered"])): rendered = sf["rendered"][index] - assert(type(rendered) == _tc.Image - and rendered.channels == 1 - and rendered.width == 28 - and rendered.height == 28) + assert ( + type(rendered) == _tc.Image + and rendered.channels == 1 + and rendered.width == 28 + and rendered.height == 28 + ) def test_draw_strokes_single_input(self): sf = self.stroke_sf - single_bitmap = _tc.drawing_classifier.util.draw_strokes( - sf[self.feature][0]) - assert(type(single_bitmap) == _tc.Image - and single_bitmap.channels == 1 - and single_bitmap.width == 28 - and single_bitmap.height == 28) + single_bitmap = _tc.drawing_classifier.util.draw_strokes(sf[self.feature][0]) + assert ( + type(single_bitmap) == _tc.Image + and single_bitmap.channels == 1 + and single_bitmap.width == 28 + and single_bitmap.height == 28 + ) def test_repr(self): for model in self.models: @@ -405,17 +435,17 @@ def test_summary(self): def test_summary_str(self): for model in self.models: - self.assertTrue(isinstance(model.summary('str'), str)) + self.assertTrue(isinstance(model.summary("str"), str)) def test_summary_dict(self): for model in self.models: - self.assertTrue(isinstance(model.summary('dict'), dict)) + self.assertTrue(isinstance(model.summary("dict"), dict)) def test_summary_invalid_input(self): for model in self.models: with self.assertRaises(_ToolkitError): - model.summary(model.summary('invalid')) + model.summary(model.summary("invalid")) with self.assertRaises(_ToolkitError): model.summary(model.summary(0)) @@ -427,11 +457,12 @@ def test_summary_invalid_input(self): class DrawingClassifierFromScratchTest(DrawingClassifierTest): @classmethod def setUpClass(self): - super(DrawingClassifierFromScratchTest, self).setUpClass( - warm_start=None) + super(DrawingClassifierFromScratchTest, self).setUpClass(warm_start=None) + class DrawingClassifierUsingQuickdraw245(DrawingClassifierTest): @classmethod def setUpClass(self): super(DrawingClassifierUsingQuickdraw245, self).setUpClass( - warm_start="quickdraw_245_v0") + warm_start="quickdraw_245_v0" + ) diff --git a/src/python/turicreate/test/test_environment_config.py b/src/python/turicreate/test/test_environment_config.py index ef26107c58..3e2fe0f091 100644 --- a/src/python/turicreate/test/test_environment_config.py +++ b/src/python/turicreate/test/test_environment_config.py @@ -20,9 +20,8 @@ import os import shutil -class EnvironmentConfigTester(unittest.TestCase): - +class EnvironmentConfigTester(unittest.TestCase): def test_config_basic_write(self): test_dir = tempfile.mkdtemp() diff --git a/src/python/turicreate/test/test_evaluation.py b/src/python/turicreate/test/test_evaluation.py index 133eef8a2e..5b4fa1f9a5 100644 --- a/src/python/turicreate/test/test_evaluation.py +++ b/src/python/turicreate/test/test_evaluation.py @@ -8,8 +8,14 @@ from __future__ import absolute_import as _ import unittest import turicreate -from sklearn.metrics import (fbeta_score, recall_score, precision_score, - accuracy_score, f1_score, roc_auc_score) +from sklearn.metrics import ( + fbeta_score, + recall_score, + precision_score, + accuracy_score, + f1_score, + roc_auc_score, +) from turicreate.toolkits._main import ToolkitError import math from numpy import inf @@ -26,11 +32,10 @@ def _round_scores(p): not happen during our unit tests, we snap numbers to the centers between thresholds. """ - return (np.round(p, decimals=5) + 0.5/100001).clip(max=1) + return (np.round(p, decimals=5) + 0.5 / 100001).clip(max=1) -def _generate_classes_and_scores(num_classes, n, seed=42, - hard_predictions=False): +def _generate_classes_and_scores(num_classes, n, seed=42, hard_predictions=False): rs = np.random.RandomState(seed) t = rs.randint(num_classes, size=n) if hard_predictions: @@ -43,140 +48,140 @@ def _generate_classes_and_scores(num_classes, n, seed=42, class MetricsTest(unittest.TestCase): - def test_rmse(self): - y = turicreate.SArray([1,2,1,2]) - yhat = turicreate.SArray([3,-1,1,0]) + y = turicreate.SArray([1, 2, 1, 2]) + yhat = turicreate.SArray([3, -1, 1, 0]) rmse = turicreate.toolkits.evaluation.rmse(y, yhat) - true_rmse = (float(2*2 + 3*3 + 0 + 2*2)/4)**.5 + true_rmse = (float(2 * 2 + 3 * 3 + 0 + 2 * 2) / 4) ** 0.5 self.assertAlmostEqual(rmse, true_rmse) def test_log_loss(self): # Binary classification - y = turicreate.SArray([ 1, 1, 0, 1, 1]) - yhat = turicreate.SArray([.5, .2, .8, .3, .9]) + y = turicreate.SArray([1, 1, 0, 1, 1]) + yhat = turicreate.SArray([0.5, 0.2, 0.8, 0.3, 0.9]) logp = yhat.apply(lambda x: math.log(x)) log1mp = yhat.apply(lambda x: math.log(1 - x)) - expected = - (y * logp + (1-y) * log1mp).mean() + expected = -(y * logp + (1 - y) * log1mp).mean() observed = turicreate.toolkits.evaluation.log_loss(y, yhat) self.assertAlmostEqual(expected, observed) # Binary classification, string - y = turicreate.SArray([ 1, 1, 0, 1, 1]) - yhat = turicreate.SArray([.5, .2, .8, .3, .9]) + y = turicreate.SArray([1, 1, 0, 1, 1]) + yhat = turicreate.SArray([0.5, 0.2, 0.8, 0.3, 0.9]) logp = yhat.apply(lambda x: math.log(x)) log1mp = yhat.apply(lambda x: math.log(1 - x)) - expected = - (y * logp + (1-y) * log1mp).mean() + expected = -(y * logp + (1 - y) * log1mp).mean() observed = turicreate.toolkits.evaluation.log_loss(y.astype(str), yhat) self.assertAlmostEqual(expected, observed) - # Binary classification vs sklearn example - y = turicreate.SArray([ 1, 0, 0, 1]) - yhat = turicreate.SArray([[.1, .9], - [.9, .1], - [.8, .2], - [.35, .65]]) + y = turicreate.SArray([1, 0, 0, 1]) + yhat = turicreate.SArray([[0.1, 0.9], [0.9, 0.1], [0.8, 0.2], [0.35, 0.65]]) - expected = - (math.log(.9) + math.log(.9) + math.log(.8) + math.log(.65)) / 4.0 + expected = ( + -(math.log(0.9) + math.log(0.9) + math.log(0.8) + math.log(0.65)) / 4.0 + ) observed = turicreate.toolkits.evaluation.log_loss(y, yhat) self.assertAlmostEqual(expected, observed) # Binary classification with missing data - y = turicreate.SArray([ 1, None, None, 1]) - yhat = turicreate.SArray([[.1, .9], - [.9, .1], - [.8, .2], - [.35, .65]]) + y = turicreate.SArray([1, None, None, 1]) + yhat = turicreate.SArray([[0.1, 0.9], [0.9, 0.1], [0.8, 0.2], [0.35, 0.65]]) - expected = - (math.log(.9) + math.log(.9) + math.log(.8) + math.log(.65)) / 4.0 + expected = ( + -(math.log(0.9) + math.log(0.9) + math.log(0.8) + math.log(0.65)) / 4.0 + ) observed = turicreate.toolkits.evaluation.log_loss(y, yhat) self.assertAlmostEqual(expected, observed) # Multiclass - y = turicreate.SArray([ 0, 1, 2, 0]) - yhat = turicreate.SArray([[.5, .1, .4], - [.5, .1, .4], - [.1, .5, .4], - [.2, .3, .5]]) + y = turicreate.SArray([0, 1, 2, 0]) + yhat = turicreate.SArray( + [[0.5, 0.1, 0.4], [0.5, 0.1, 0.4], [0.1, 0.5, 0.4], [0.2, 0.3, 0.5]] + ) true_probs = [yhat[0][0], yhat[1][1], yhat[2][2], yhat[3][0]] - expected = - sum([math.log(x) for x in true_probs]) / 4.0 + expected = -sum([math.log(x) for x in true_probs]) / 4.0 observed = turicreate.toolkits.evaluation.log_loss(y, yhat) self.assertAlmostEqual(expected, observed) # Multiclass with strings - y = turicreate.SArray([ "a", "b", "c", "a"]) - yhat = turicreate.SArray([[.5, .1, .4], - [.5, .1, .4], - [.1, .5, .4], - [.2, .3, .5]]) + y = turicreate.SArray(["a", "b", "c", "a"]) + yhat = turicreate.SArray( + [[0.5, 0.1, 0.4], [0.5, 0.1, 0.4], [0.1, 0.5, 0.4], [0.2, 0.3, 0.5]] + ) true_probs = [yhat[0][0], yhat[1][1], yhat[2][2], yhat[3][0]] - expected = - sum([math.log(x) for x in true_probs])/4.0 + expected = -sum([math.log(x) for x in true_probs]) / 4.0 observed = turicreate.toolkits.evaluation.log_loss(y, yhat) self.assertAlmostEqual(expected, observed) # Multiclass with strings and explicit index map - y = turicreate.SArray([ "a", "c", "d", "a"]) - yhat = turicreate.SArray([[.5, .0, .1, .4], - [.5, .0, .1, .4], - [.1, .0, .5, .4], - [.2, .0, .3, .5]]) - index_map = {"a" : 0, "b" : 1, "c" : 2, "d" : 3} + y = turicreate.SArray(["a", "c", "d", "a"]) + yhat = turicreate.SArray( + [ + [0.5, 0.0, 0.1, 0.4], + [0.5, 0.0, 0.1, 0.4], + [0.1, 0.0, 0.5, 0.4], + [0.2, 0.0, 0.3, 0.5], + ] + ) + index_map = {"a": 0, "b": 1, "c": 2, "d": 3} true_probs = [yhat[0][0], yhat[1][2], yhat[2][3], yhat[3][0]] - expected = - sum([math.log(x) for x in true_probs])/4.0 + expected = -sum([math.log(x) for x in true_probs]) / 4.0 observed = turicreate.toolkits.evaluation.log_loss(y, yhat, index_map=index_map) self.assertAlmostEqual(expected, observed) def test_logloss_clipping(self): - y = turicreate.SArray([0, 1, 2, 0]) - yhat = turicreate.SArray([[0.9, 0.0, 0.1], - [0.8, 0.1, 0.1], - [0.1, 0.1, 0.8], - [0.1, 0.1, 0.8]]) + y = turicreate.SArray([0, 1, 2, 0]) + yhat = turicreate.SArray( + [[0.9, 0.0, 0.1], [0.8, 0.1, 0.1], [0.1, 0.1, 0.8], [0.1, 0.1, 0.8]] + ) log_loss = turicreate.toolkits.evaluation.log_loss(y, yhat) self.assertTrue(log_loss != inf) - y = turicreate.SArray([0, 1, 2, 0]) - yhat = turicreate.SArray([[1.0, 0.0, 0.0], - [0.8, 0.1, 0.1], - [0.1, 0.1, 0.8], - [0.1, 0.1, 0.8]]) + y = turicreate.SArray([0, 1, 2, 0]) + yhat = turicreate.SArray( + [[1.0, 0.0, 0.0], [0.8, 0.1, 0.1], [0.1, 0.1, 0.8], [0.1, 0.1, 0.8]] + ) log_loss = turicreate.toolkits.evaluation.log_loss(y, yhat) self.assertTrue(log_loss != inf) - y = turicreate.SArray([0, 1, 0, 0]) + y = turicreate.SArray([0, 1, 0, 0]) yhat = turicreate.SArray([0.0, 0.9, 0.1, 0.1]) log_loss = turicreate.toolkits.evaluation.log_loss(y, yhat) self.assertTrue(log_loss != inf) - y = turicreate.SArray([0, 1, 0, 0]) + y = turicreate.SArray([0, 1, 0, 0]) yhat = turicreate.SArray([0.1, 1.0, 0.1, 0.1]) log_loss = turicreate.toolkits.evaluation.log_loss(y, yhat) self.assertTrue(log_loss != inf) def test_probabilities_with_index_map(self): - y = turicreate.SArray([0, 2, 3, 0]) - yhat = turicreate.SArray([[0.9, 0.0, 0.0, 0.1], - [0.8, 0.0, 0.1, 0.1], - [0.1, 0.0, 0.1, 0.8], - [0.1, 0.0, 0.1, 0.8]]) + y = turicreate.SArray([0, 2, 3, 0]) + yhat = turicreate.SArray( + [ + [0.9, 0.0, 0.0, 0.1], + [0.8, 0.0, 0.1, 0.1], + [0.1, 0.0, 0.1, 0.8], + [0.1, 0.0, 0.1, 0.8], + ] + ) # The evaluation toolkit must know that 1 is a possible class label, and # corresponds to index 1 in each probability vector. @@ -184,22 +189,21 @@ def test_probabilities_with_index_map(self): log_loss = turicreate.toolkits.evaluation.log_loss(y, yhat, index_map=index_map) auc = turicreate.toolkits.evaluation.auc(y, yhat, index_map=index_map) - roc_curve = turicreate.toolkits.evaluation.roc_curve(y, yhat, index_map=index_map) + roc_curve = turicreate.toolkits.evaluation.roc_curve( + y, yhat, index_map=index_map + ) def test_integer_probabilities(self): - y = turicreate.SArray([ 0, 1, 2, 0]) - yhat = turicreate.SArray([[1, 0, 1], - [1, 0, 1], - [0, 1, 1], - [0, 0, 1]]) + y = turicreate.SArray([0, 1, 2, 0]) + yhat = turicreate.SArray([[1, 0, 1], [1, 0, 1], [0, 1, 1], [0, 0, 1]]) log_loss = turicreate.toolkits.evaluation.log_loss(y, yhat) auc = turicreate.toolkits.evaluation.auc(y, yhat) roc_curve = turicreate.toolkits.evaluation.roc_curve(y, yhat) - y = turicreate.SArray([ 0, 1, 0, 0]) - yhat = turicreate.SArray([ 0, 1, 0, 0]) + y = turicreate.SArray([0, 1, 0, 0]) + yhat = turicreate.SArray([0, 1, 0, 0]) turicreate.toolkits.evaluation.log_loss(y, yhat) turicreate.toolkits.evaluation.auc(y, yhat) @@ -208,12 +212,10 @@ def test_integer_probabilities(self): def test_none_probabilities(self): # Test the case when probabilities are integer (sigh!) - y = turicreate.SArray([ 0, 1, 2, 0]) - yhat = turicreate.SArray([[1, 0, None], - [1, 0, None], - [0, 1, None], - [0, 0, None]]) - + y = turicreate.SArray([0, 1, 2, 0]) + yhat = turicreate.SArray( + [[1, 0, None], [1, 0, None], [0, 1, None], [0, 0, None]] + ) with self.assertRaises(TypeError): log_loss = turicreate.toolkits.evaluation.log_loss(y, yhat) @@ -223,12 +225,10 @@ def test_none_probabilities(self): roc_curve = turicreate.toolkits.evaluation.roc_curve(y, yhat) # Test the case when probabilities are integer (sigh!) - y = turicreate.SArray([ 0, 1, 2, 0]) - yhat = turicreate.SArray([[0.9, 0.0, 0.0], - [0.9, 0.1, 0.0], - None, - [0.0, 0.1, 0.9]]) - + y = turicreate.SArray([0, 1, 2, 0]) + yhat = turicreate.SArray( + [[0.9, 0.0, 0.0], [0.9, 0.1, 0.0], None, [0.0, 0.1, 0.9]] + ) with self.assertRaises(ToolkitError): turicreate.toolkits.evaluation.log_loss(y, yhat) @@ -238,8 +238,8 @@ def test_none_probabilities(self): turicreate.toolkits.evaluation.roc_curve(y, yhat) # Test the case when probabilities are integer (sigh!) - y = turicreate.SArray([ 0, 1, 0, 0]) - yhat = turicreate.SArray([ 0, 1, 0, None]) + y = turicreate.SArray([0, 1, 0, 0]) + yhat = turicreate.SArray([0, 1, 0, None]) with self.assertRaises(ToolkitError): turicreate.toolkits.evaluation.log_loss(y, yhat) @@ -249,8 +249,8 @@ def test_none_probabilities(self): turicreate.toolkits.evaluation.roc_curve(y, yhat) # Test the case when probabilities are integer (sigh!) - y = turicreate.SArray([ 0, 1, 0, 0]) - yhat = turicreate.SArray([ 0.1, 0.1, 0.9, None]) + y = turicreate.SArray([0, 1, 0, 0]) + yhat = turicreate.SArray([0.1, 0.1, 0.9, None]) with self.assertRaises(ToolkitError): turicreate.toolkits.evaluation.log_loss(y, yhat) @@ -259,17 +259,16 @@ def test_none_probabilities(self): with self.assertRaises(ToolkitError): turicreate.toolkits.evaluation.roc_curve(y, yhat) - def test_none_prec_recall_scores_binary(self): # Arrange - y = turicreate.SArray([0, 1]) + y = turicreate.SArray([0, 1]) yhat = turicreate.SArray([0, 0]) # Act - pr = turicreate.toolkits.evaluation.precision(y, yhat) - rec = turicreate.toolkits.evaluation.recall(y, yhat) - f1 = turicreate.toolkits.evaluation.f1_score(y, yhat) - fbeta = turicreate.toolkits.evaluation.fbeta_score(y, yhat, beta = 2.0) + pr = turicreate.toolkits.evaluation.precision(y, yhat) + rec = turicreate.toolkits.evaluation.recall(y, yhat) + f1 = turicreate.toolkits.evaluation.f1_score(y, yhat) + fbeta = turicreate.toolkits.evaluation.fbeta_score(y, yhat, beta=2.0) # Assert self.assertEqual(pr, None) @@ -278,14 +277,14 @@ def test_none_prec_recall_scores_binary(self): self.assertEqual(fbeta, 0.0) # Arrange - y = turicreate.SArray([0, 0]) + y = turicreate.SArray([0, 0]) yhat = turicreate.SArray([0, 1]) # Act - pr = turicreate.toolkits.evaluation.precision(y, yhat) - rec = turicreate.toolkits.evaluation.recall(y, yhat) - f1 = turicreate.toolkits.evaluation.f1_score(y, yhat) - fbeta = turicreate.toolkits.evaluation.fbeta_score(y, yhat, beta = 2.0) + pr = turicreate.toolkits.evaluation.precision(y, yhat) + rec = turicreate.toolkits.evaluation.recall(y, yhat) + f1 = turicreate.toolkits.evaluation.f1_score(y, yhat) + fbeta = turicreate.toolkits.evaluation.fbeta_score(y, yhat, beta=2.0) # Assert self.assertEqual(pr, 0.0) @@ -294,14 +293,14 @@ def test_none_prec_recall_scores_binary(self): self.assertEqual(fbeta, 0.0) # Arrange - y = turicreate.SArray(["0", "1"]) + y = turicreate.SArray(["0", "1"]) yhat = turicreate.SArray(["0", "0"]) # Act - pr = turicreate.toolkits.evaluation.precision(y, yhat) - rec = turicreate.toolkits.evaluation.recall(y, yhat) - f1 = turicreate.toolkits.evaluation.f1_score(y, yhat) - fbeta = turicreate.toolkits.evaluation.fbeta_score(y, yhat, beta = 2.0) + pr = turicreate.toolkits.evaluation.precision(y, yhat) + rec = turicreate.toolkits.evaluation.recall(y, yhat) + f1 = turicreate.toolkits.evaluation.f1_score(y, yhat) + fbeta = turicreate.toolkits.evaluation.fbeta_score(y, yhat, beta=2.0) # Assert self.assertEqual(pr, None) @@ -310,14 +309,14 @@ def test_none_prec_recall_scores_binary(self): self.assertEqual(fbeta, 0.0) # Arrange - y = turicreate.SArray(["0", "0"]) + y = turicreate.SArray(["0", "0"]) yhat = turicreate.SArray(["0", "1"]) # Act - pr = turicreate.toolkits.evaluation.precision(y, yhat) - rec = turicreate.toolkits.evaluation.recall(y, yhat) - f1 = turicreate.toolkits.evaluation.f1_score(y, yhat) - fbeta = turicreate.toolkits.evaluation.fbeta_score(y, yhat, beta = 2.0) + pr = turicreate.toolkits.evaluation.precision(y, yhat) + rec = turicreate.toolkits.evaluation.recall(y, yhat) + f1 = turicreate.toolkits.evaluation.f1_score(y, yhat) + fbeta = turicreate.toolkits.evaluation.fbeta_score(y, yhat, beta=2.0) # Assert self.assertEqual(pr, 0.0) @@ -327,26 +326,29 @@ def test_none_prec_recall_scores_binary(self): def test_none_prec_recall_scores(self): # Arrange - y = turicreate.SArray([0, 1, 2]) + y = turicreate.SArray([0, 1, 2]) yhat = turicreate.SArray([0, 0, 2]) # Act avg_cases = ["micro", "macro", None] - pr = {}; rec = {}; f1 = {}; fbeta = {} + pr = {} + rec = {} + f1 = {} + fbeta = {} for avg in avg_cases: - pr[avg] = turicreate.toolkits.evaluation.precision(y, yhat, avg) - rec[avg] = turicreate.toolkits.evaluation.recall(y, yhat, avg) - f1[avg] = turicreate.toolkits.evaluation.f1_score(y, yhat, avg) + pr[avg] = turicreate.toolkits.evaluation.precision(y, yhat, avg) + rec[avg] = turicreate.toolkits.evaluation.recall(y, yhat, avg) + f1[avg] = turicreate.toolkits.evaluation.f1_score(y, yhat, avg) fbeta[avg] = turicreate.toolkits.evaluation.fbeta_score(y, yhat, 2.0, avg) # Assert - self.assertAlmostEqual(pr["micro"], 2.0/3) - self.assertAlmostEqual(rec["micro"], 2.0/3) - self.assertAlmostEqual(f1["micro"], 2.0/3) - self.assertAlmostEqual(fbeta["micro"], 2.0/3) + self.assertAlmostEqual(pr["micro"], 2.0 / 3) + self.assertAlmostEqual(rec["micro"], 2.0 / 3) + self.assertAlmostEqual(f1["micro"], 2.0 / 3) + self.assertAlmostEqual(fbeta["micro"], 2.0 / 3) self.assertAlmostEqual(pr["macro"], 0.75) - self.assertAlmostEqual(rec["macro"], 2.0/3) + self.assertAlmostEqual(rec["macro"], 2.0 / 3) self.assertAlmostEqual(f1["macro"], 0.5555555555555555) self.assertAlmostEqual(fbeta["macro"], 0.6111111111111112) @@ -356,26 +358,29 @@ def test_none_prec_recall_scores(self): self.assertEqual(fbeta[None][1], 0.0) # Arrange - y = turicreate.SArray(["0", "1", "2"]) + y = turicreate.SArray(["0", "1", "2"]) yhat = turicreate.SArray(["0", "0", "2"]) # Act avg_cases = ["micro", "macro", None] - pr = {}; rec = {}; f1 = {}; fbeta = {} + pr = {} + rec = {} + f1 = {} + fbeta = {} for avg in avg_cases: - pr[avg] = turicreate.toolkits.evaluation.precision(y, yhat, avg) - rec[avg] = turicreate.toolkits.evaluation.recall(y, yhat, avg) - f1[avg] = turicreate.toolkits.evaluation.f1_score(y, yhat, avg) + pr[avg] = turicreate.toolkits.evaluation.precision(y, yhat, avg) + rec[avg] = turicreate.toolkits.evaluation.recall(y, yhat, avg) + f1[avg] = turicreate.toolkits.evaluation.f1_score(y, yhat, avg) fbeta[avg] = turicreate.toolkits.evaluation.fbeta_score(y, yhat, 2.0, avg) # Assert - self.assertAlmostEqual(pr["micro"], 2.0/3) - self.assertAlmostEqual(rec["micro"], 2.0/3) - self.assertAlmostEqual(f1["micro"], 2.0/3) - self.assertAlmostEqual(fbeta["micro"], 2.0/3) + self.assertAlmostEqual(pr["micro"], 2.0 / 3) + self.assertAlmostEqual(rec["micro"], 2.0 / 3) + self.assertAlmostEqual(f1["micro"], 2.0 / 3) + self.assertAlmostEqual(fbeta["micro"], 2.0 / 3) self.assertAlmostEqual(pr["macro"], 0.75) - self.assertAlmostEqual(rec["macro"], 2.0/3) + self.assertAlmostEqual(rec["macro"], 2.0 / 3) self.assertAlmostEqual(f1["macro"], 0.5555555555555555) self.assertAlmostEqual(fbeta["macro"], 0.6111111111111112) @@ -385,25 +390,28 @@ def test_none_prec_recall_scores(self): self.assertEqual(fbeta[None]["1"], 0.0) # Arrange - y = turicreate.SArray([0, 0, 2]) + y = turicreate.SArray([0, 0, 2]) yhat = turicreate.SArray([0, 1, 2]) # Act avg_cases = ["micro", "macro", None] - pr = {}; rec = {}; f1 = {}; fbeta = {} + pr = {} + rec = {} + f1 = {} + fbeta = {} for avg in avg_cases: - pr[avg] = turicreate.toolkits.evaluation.precision(y, yhat, avg) - rec[avg] = turicreate.toolkits.evaluation.recall(y, yhat, avg) - f1[avg] = turicreate.toolkits.evaluation.f1_score(y, yhat, avg) + pr[avg] = turicreate.toolkits.evaluation.precision(y, yhat, avg) + rec[avg] = turicreate.toolkits.evaluation.recall(y, yhat, avg) + f1[avg] = turicreate.toolkits.evaluation.f1_score(y, yhat, avg) fbeta[avg] = turicreate.toolkits.evaluation.fbeta_score(y, yhat, 2.0, avg) # Assert - self.assertAlmostEqual(pr["micro"], 2.0/3) - self.assertAlmostEqual(rec["micro"], 2.0/3) - self.assertAlmostEqual(f1["micro"], 2.0/3) - self.assertAlmostEqual(fbeta["micro"], 2.0/3) + self.assertAlmostEqual(pr["micro"], 2.0 / 3) + self.assertAlmostEqual(rec["micro"], 2.0 / 3) + self.assertAlmostEqual(f1["micro"], 2.0 / 3) + self.assertAlmostEqual(fbeta["micro"], 2.0 / 3) - self.assertAlmostEqual(pr["macro"], 2.0/3) + self.assertAlmostEqual(pr["macro"], 2.0 / 3) self.assertAlmostEqual(rec["macro"], 0.75) self.assertAlmostEqual(f1["macro"], 0.5555555555555555) self.assertAlmostEqual(fbeta["macro"], 0.5185185185185185) @@ -414,25 +422,28 @@ def test_none_prec_recall_scores(self): self.assertEqual(fbeta[None][1], 0.0) # Arrange - y = turicreate.SArray(["0", "0", "2"]) + y = turicreate.SArray(["0", "0", "2"]) yhat = turicreate.SArray(["0", "1", "2"]) # Act avg_cases = ["micro", "macro", None] - pr = {}; rec = {}; f1 = {}; fbeta = {} + pr = {} + rec = {} + f1 = {} + fbeta = {} for avg in avg_cases: - pr[avg] = turicreate.toolkits.evaluation.precision(y, yhat, avg) - rec[avg] = turicreate.toolkits.evaluation.recall(y, yhat, avg) - f1[avg] = turicreate.toolkits.evaluation.f1_score(y, yhat, avg) + pr[avg] = turicreate.toolkits.evaluation.precision(y, yhat, avg) + rec[avg] = turicreate.toolkits.evaluation.recall(y, yhat, avg) + f1[avg] = turicreate.toolkits.evaluation.f1_score(y, yhat, avg) fbeta[avg] = turicreate.toolkits.evaluation.fbeta_score(y, yhat, 2.0, avg) # Assert - self.assertAlmostEqual(pr["micro"], 2.0/3) - self.assertAlmostEqual(rec["micro"], 2.0/3) - self.assertAlmostEqual(f1["micro"], 2.0/3) - self.assertAlmostEqual(fbeta["micro"], 2.0/3) + self.assertAlmostEqual(pr["micro"], 2.0 / 3) + self.assertAlmostEqual(rec["micro"], 2.0 / 3) + self.assertAlmostEqual(f1["micro"], 2.0 / 3) + self.assertAlmostEqual(fbeta["micro"], 2.0 / 3) - self.assertAlmostEqual(pr["macro"], 2.0/3) + self.assertAlmostEqual(pr["macro"], 2.0 / 3) self.assertAlmostEqual(rec["macro"], 0.75) self.assertAlmostEqual(f1["macro"], 0.5555555555555555) self.assertAlmostEqual(fbeta["macro"], 0.5185185185185185) @@ -443,66 +454,134 @@ def test_none_prec_recall_scores(self): self.assertEqual(fbeta[None]["1"], 0.0) def test_confusion_matrix(self): - y = turicreate.SArray([ 1, 1, 0, 1, 1, 0, 1]) - yhat = turicreate.SArray([0, 1, 0, 0, 1, 1, 0]) + y = turicreate.SArray([1, 1, 0, 1, 1, 0, 1]) + yhat = turicreate.SArray([0, 1, 0, 0, 1, 1, 0]) res = turicreate.toolkits.evaluation.confusion_matrix(y, yhat) - res = res.sort(["predicted_label", "target_label"])['count'] - self.assertTrue((res == turicreate.SArray([1,3,1,2])).all()) + res = res.sort(["predicted_label", "target_label"])["count"] + self.assertTrue((res == turicreate.SArray([1, 3, 1, 2])).all()) def test_roc_curve(self): # Example from p.864 # https://ccrma.stanford.edu/workshops/mir2009/references/ROCintro.pdf - y = turicreate.SArray( - [1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0]) + y = turicreate.SArray( + [1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0] + ) yhat = turicreate.SArray( - [.9, .8, .7, .6, .55, .54, .53, .52, .51, .505, .4, .39, .38, - .37, .36, .35, .34, .33, .30, .1]) - - true_fpr = turicreate.SArray([0, 0, 0, .1, .1, .1, .1, .2, .3, .3, .4, - .4, .5, .5, .6, .7, .8, .8, .9, .9, 1.0]) - true_tpr = turicreate.SArray([0, .1, .2, .2, .3, .4, .5, .5, .5, .6, .6, - .7, .7, .8, .8, .8, .8, .9, .9, 1.0, 1.0]) + [ + 0.9, + 0.8, + 0.7, + 0.6, + 0.55, + 0.54, + 0.53, + 0.52, + 0.51, + 0.505, + 0.4, + 0.39, + 0.38, + 0.37, + 0.36, + 0.35, + 0.34, + 0.33, + 0.30, + 0.1, + ] + ) + + true_fpr = turicreate.SArray( + [ + 0, + 0, + 0, + 0.1, + 0.1, + 0.1, + 0.1, + 0.2, + 0.3, + 0.3, + 0.4, + 0.4, + 0.5, + 0.5, + 0.6, + 0.7, + 0.8, + 0.8, + 0.9, + 0.9, + 1.0, + ] + ) + true_tpr = turicreate.SArray( + [ + 0, + 0.1, + 0.2, + 0.2, + 0.3, + 0.4, + 0.5, + 0.5, + 0.5, + 0.6, + 0.6, + 0.7, + 0.7, + 0.8, + 0.8, + 0.8, + 0.8, + 0.9, + 0.9, + 1.0, + 1.0, + ] + ) res = turicreate.toolkits.evaluation.roc_curve(y, yhat) - points = res[['fpr', 'tpr']].unique().sort(['fpr', 'tpr']) - self.assertTrue(all(points['fpr'] == true_fpr)) - self.assertTrue(all(points['tpr'] == true_tpr)) + points = res[["fpr", "tpr"]].unique().sort(["fpr", "tpr"]) + self.assertTrue(all(points["fpr"] == true_fpr)) + self.assertTrue(all(points["tpr"] == true_tpr)) def test_roc_curve_str(self): - y = turicreate.SArray(['a', 'b', 'a', 'b']) - yhat = turicreate.SArray([.1, .2, .3, .4]) + y = turicreate.SArray(["a", "b", "a", "b"]) + yhat = turicreate.SArray([0.1, 0.2, 0.3, 0.4]) res = turicreate.toolkits.evaluation.roc_curve(y, yhat) - points = res[['fpr', 'tpr']].unique().sort(['fpr', 'tpr']) - self.assertTrue(all(res['tpr'] >= 0) and all(res['tpr'] <= 1)) - self.assertTrue(all(res['fpr'] >= 0) and all(res['fpr'] <= 1)) + points = res[["fpr", "tpr"]].unique().sort(["fpr", "tpr"]) + self.assertTrue(all(res["tpr"] >= 0) and all(res["tpr"] <= 1)) + self.assertTrue(all(res["fpr"] >= 0) and all(res["fpr"] <= 1)) def test_grouped_precision_recall(self): data = turicreate.SFrame() - data['user_id'] = ["a", "b", "b", "c", "c", "c"] - data['item_id'] = ['x', 'x', 'y', 'v', 'w', 'z'] - data['rating'] = [0, 1, 2, 3, 4, 5] + data["user_id"] = ["a", "b", "b", "c", "c", "c"] + data["item_id"] = ["x", "x", "y", "v", "w", "z"] + data["rating"] = [0, 1, 2, 3, 4, 5] m = turicreate.recommender.item_similarity_recommender.create(data) recs = m.recommend() test_data = turicreate.SFrame() - test_data['user_id'] = ['a', 'b'] - test_data['item_id'] = ['v', 'z'] - test_data['rating'] = [7, 8] + test_data["user_id"] = ["a", "b"] + test_data["item_id"] = ["v", "z"] + test_data["rating"] = [7, 8] - pr = turicreate.recommender.util.precision_recall_by_user(test_data, - recs, cutoffs=[3]) + pr = turicreate.recommender.util.precision_recall_by_user( + test_data, recs, cutoffs=[3] + ) self.assertEqual(type(pr), turicreate.SFrame) - self.assertEqual(pr.column_names(), ['user_id', - 'cutoff', - 'precision', - 'recall', - 'count']) - self.assertEqual(list(pr['user_id']), list(turicreate.SArray(['a', 'b', 'c']))) - pr = turicreate.recommender.util.precision_recall_by_user(test_data, - recs, cutoffs=[5, 10, 15]) + self.assertEqual( + pr.column_names(), ["user_id", "cutoff", "precision", "recall", "count"] + ) + self.assertEqual(list(pr["user_id"]), list(turicreate.SArray(["a", "b", "c"]))) + pr = turicreate.recommender.util.precision_recall_by_user( + test_data, recs, cutoffs=[5, 10, 15] + ) self.assertEqual(pr.num_rows(), 9) def test_fbeta_binary_score(self): @@ -514,13 +593,15 @@ def test_fbeta_binary_score(self): str_predictions = predictions.astype(str) # Act - skl_beta = fbeta_score(list(targets), list(predictions), beta=2.0, - average='binary') - beta = turicreate.toolkits.evaluation.fbeta_score(targets, predictions, - beta=2.0) - str_beta = turicreate.toolkits.evaluation.fbeta_score(str_targets, - str_predictions, - beta=2.0) + skl_beta = fbeta_score( + list(targets), list(predictions), beta=2.0, average="binary" + ) + beta = turicreate.toolkits.evaluation.fbeta_score( + targets, predictions, beta=2.0 + ) + str_beta = turicreate.toolkits.evaluation.fbeta_score( + str_targets, str_predictions, beta=2.0 + ) # Assert self.assertAlmostEqual(skl_beta, beta) @@ -528,11 +609,12 @@ def test_fbeta_binary_score(self): # Act skl_beta = fbeta_score(list(targets), list(predictions), beta=0.5) - beta = turicreate.toolkits.evaluation.fbeta_score(targets, predictions, - beta=0.5) - str_beta = turicreate.toolkits.evaluation.fbeta_score(str_targets, - str_predictions, - beta=0.5) + beta = turicreate.toolkits.evaluation.fbeta_score( + targets, predictions, beta=0.5 + ) + str_beta = turicreate.toolkits.evaluation.fbeta_score( + str_targets, str_predictions, beta=0.5 + ) # Assert self.assertAlmostEqual(skl_beta, beta) @@ -547,77 +629,73 @@ def test_fbeta_multi_class_score(self): str_predictions = predictions.astype(str) # Act [ beta = 2] - skl_beta = fbeta_score(list(targets), list(predictions), - beta=2.0, average = 'macro') - beta = turicreate.toolkits.evaluation.fbeta_score(targets, predictions, - beta=2.0) - str_beta = turicreate.toolkits.evaluation.fbeta_score(str_targets, - str_predictions, - beta=2.0) + skl_beta = fbeta_score( + list(targets), list(predictions), beta=2.0, average="macro" + ) + beta = turicreate.toolkits.evaluation.fbeta_score( + targets, predictions, beta=2.0 + ) + str_beta = turicreate.toolkits.evaluation.fbeta_score( + str_targets, str_predictions, beta=2.0 + ) # Assert self.assertAlmostEqual(skl_beta, beta) self.assertAlmostEqual(skl_beta, str_beta) # Act [ beta = 0.5] - skl_beta = fbeta_score(list(targets), list(predictions), - beta=0.5, average = 'micro') - beta = turicreate.toolkits.evaluation.fbeta_score(targets, - predictions, - beta=0.5, - average = 'micro') - str_beta = turicreate.toolkits.evaluation.fbeta_score(str_targets, - str_predictions, - beta=0.5, - average = 'micro') + skl_beta = fbeta_score( + list(targets), list(predictions), beta=0.5, average="micro" + ) + beta = turicreate.toolkits.evaluation.fbeta_score( + targets, predictions, beta=0.5, average="micro" + ) + str_beta = turicreate.toolkits.evaluation.fbeta_score( + str_targets, str_predictions, beta=0.5, average="micro" + ) # Assert self.assertAlmostEqual(skl_beta, beta) self.assertAlmostEqual(skl_beta, str_beta) # Act [Average = 'macro'] - skl_beta = fbeta_score(list(targets), list(predictions), - beta=2.0, average = 'macro') - beta= turicreate.toolkits.evaluation.fbeta_score(targets, - predictions, - beta=2.0, - average = 'macro') - str_beta= turicreate.toolkits.evaluation.fbeta_score(str_targets, - str_predictions, - beta=2.0, - average = 'macro') + skl_beta = fbeta_score( + list(targets), list(predictions), beta=2.0, average="macro" + ) + beta = turicreate.toolkits.evaluation.fbeta_score( + targets, predictions, beta=2.0, average="macro" + ) + str_beta = turicreate.toolkits.evaluation.fbeta_score( + str_targets, str_predictions, beta=2.0, average="macro" + ) ## Assert self.assertAlmostEqual(skl_beta, beta) self.assertAlmostEqual(skl_beta, str_beta) ## Act [Average = 'micro'] - skl_beta = fbeta_score(list(targets), list(predictions), - beta=2.0, average = 'micro') - beta= turicreate.toolkits.evaluation.fbeta_score(targets, - predictions, - beta=2.0, - average = 'micro') - str_beta= turicreate.toolkits.evaluation.fbeta_score(str_targets, - str_predictions, - beta=2.0, - average = 'micro') + skl_beta = fbeta_score( + list(targets), list(predictions), beta=2.0, average="micro" + ) + beta = turicreate.toolkits.evaluation.fbeta_score( + targets, predictions, beta=2.0, average="micro" + ) + str_beta = turicreate.toolkits.evaluation.fbeta_score( + str_targets, str_predictions, beta=2.0, average="micro" + ) # Assert self.assertAlmostEqual(skl_beta, beta) self.assertAlmostEqual(skl_beta, str_beta) # Act [Average = None] - skl_beta = fbeta_score(list(targets), list(predictions), - beta=2.0, average = None) - beta = turicreate.toolkits.evaluation.fbeta_score(targets, - predictions, - beta=2.0, - average = None) - str_beta= turicreate.toolkits.evaluation.fbeta_score(str_targets, - str_predictions, - beta=2.0, - average = None) + skl_beta = fbeta_score(list(targets), list(predictions), beta=2.0, average=None) + beta = turicreate.toolkits.evaluation.fbeta_score( + targets, predictions, beta=2.0, average=None + ) + str_beta = turicreate.toolkits.evaluation.fbeta_score( + str_targets, str_predictions, beta=2.0, average=None + ) # Assert self.assertEqual(type(beta), dict) self.assertEqual(set(beta.keys()), set([0, 1, 2])) @@ -626,11 +704,11 @@ def test_fbeta_multi_class_score(self): # Note: Explicitly not putting it into a for loop for ease of # debugging when the tests fail. self.assertAlmostEqual(skl_beta[0], beta[0]) - self.assertAlmostEqual(skl_beta[0], str_beta['0']) + self.assertAlmostEqual(skl_beta[0], str_beta["0"]) self.assertAlmostEqual(skl_beta[1], beta[1]) - self.assertAlmostEqual(skl_beta[1], str_beta['1']) + self.assertAlmostEqual(skl_beta[1], str_beta["1"]) self.assertAlmostEqual(skl_beta[2], beta[2]) - self.assertAlmostEqual(skl_beta[2], str_beta['2']) + self.assertAlmostEqual(skl_beta[2], str_beta["2"]) def test_f1_binary_score(self): @@ -644,8 +722,9 @@ def test_f1_binary_score(self): # Act skl_score = f1_score(list(targets), list(predictions)) score = turicreate.toolkits.evaluation.f1_score(targets, predictions) - str_score = turicreate.toolkits.evaluation.f1_score(str_targets, - str_predictions) + str_score = turicreate.toolkits.evaluation.f1_score( + str_targets, str_predictions + ) # Assert self.assertAlmostEqual(skl_score, score) @@ -660,53 +739,50 @@ def test_f1_multi_class_score(self): str_predictions = predictions.astype(str) # Act - skl_score = f1_score(list(targets), list(predictions), - average = 'macro') + skl_score = f1_score(list(targets), list(predictions), average="macro") score = turicreate.toolkits.evaluation.f1_score(targets, predictions) - str_score = turicreate.toolkits.evaluation.f1_score(str_targets, - str_predictions) + str_score = turicreate.toolkits.evaluation.f1_score( + str_targets, str_predictions + ) # Assert self.assertAlmostEqual(skl_score, score) self.assertAlmostEqual(skl_score, str_score) # Act [Average = 'macro'] - skl_score = f1_score(list(targets), list(predictions), - average = 'macro') - score = turicreate.toolkits.evaluation.f1_score(targets, - predictions, - average = 'macro') - str_score= turicreate.toolkits.evaluation.f1_score(str_targets, - str_predictions, - average = 'macro') + skl_score = f1_score(list(targets), list(predictions), average="macro") + score = turicreate.toolkits.evaluation.f1_score( + targets, predictions, average="macro" + ) + str_score = turicreate.toolkits.evaluation.f1_score( + str_targets, str_predictions, average="macro" + ) # Assert self.assertAlmostEqual(skl_score, score) self.assertAlmostEqual(skl_score, str_score) # Act [Average = 'micro'] - skl_score = f1_score(list(targets), list(predictions), - average = 'micro') - score= turicreate.toolkits.evaluation.f1_score(targets, - predictions, - average = 'micro') - str_score= turicreate.toolkits.evaluation.f1_score(str_targets, - str_predictions, - average = 'micro') + skl_score = f1_score(list(targets), list(predictions), average="micro") + score = turicreate.toolkits.evaluation.f1_score( + targets, predictions, average="micro" + ) + str_score = turicreate.toolkits.evaluation.f1_score( + str_targets, str_predictions, average="micro" + ) # Assert self.assertAlmostEqual(skl_score, score) self.assertAlmostEqual(skl_score, str_score) # Act [Average = None] - skl_score = f1_score(list(targets), list(predictions), - average = None) - score = turicreate.toolkits.evaluation.f1_score(targets, - predictions, - average = None) - str_score= turicreate.toolkits.evaluation.f1_score(str_targets, - str_predictions, - average = None) + skl_score = f1_score(list(targets), list(predictions), average=None) + score = turicreate.toolkits.evaluation.f1_score( + targets, predictions, average=None + ) + str_score = turicreate.toolkits.evaluation.f1_score( + str_targets, str_predictions, average=None + ) # Assert self.assertEqual(type(score), dict) self.assertEqual(set(score.keys()), set([0, 1, 2])) @@ -715,11 +791,11 @@ def test_f1_multi_class_score(self): # Note: Explicitly not putting it into a for loop for ease of # debugging when the tests fail. self.assertAlmostEqual(skl_score[0], score[0]) - self.assertAlmostEqual(skl_score[0], str_score['0']) + self.assertAlmostEqual(skl_score[0], str_score["0"]) self.assertAlmostEqual(skl_score[1], score[1]) - self.assertAlmostEqual(skl_score[1], str_score['1']) + self.assertAlmostEqual(skl_score[1], str_score["1"]) self.assertAlmostEqual(skl_score[2], score[2]) - self.assertAlmostEqual(skl_score[2], str_score['2']) + self.assertAlmostEqual(skl_score[2], str_score["2"]) def test_precision_binary_score(self): # Arrange @@ -732,8 +808,9 @@ def test_precision_binary_score(self): # Act skl_score = precision_score(list(targets), list(predictions)) score = turicreate.toolkits.evaluation.precision(targets, predictions) - str_score = turicreate.toolkits.evaluation.precision(str_targets, - str_predictions) + str_score = turicreate.toolkits.evaluation.precision( + str_targets, str_predictions + ) # Assert self.assertAlmostEqual(skl_score, score) @@ -748,53 +825,50 @@ def test_precision_multi_class_score(self): str_predictions = predictions.astype(str) # Act - skl_score = precision_score(list(targets), list(predictions), - average = 'macro') + skl_score = precision_score(list(targets), list(predictions), average="macro") score = turicreate.toolkits.evaluation.precision(targets, predictions) - str_score = turicreate.toolkits.evaluation.precision(str_targets, - str_predictions) + str_score = turicreate.toolkits.evaluation.precision( + str_targets, str_predictions + ) # Assert self.assertAlmostEqual(skl_score, score) self.assertAlmostEqual(skl_score, str_score) # Act [Average = 'macro'] - skl_score = precision_score(list(targets), list(predictions), - average = 'macro') - score = turicreate.toolkits.evaluation.precision(targets, - predictions, - average = 'macro') - str_score= turicreate.toolkits.evaluation.precision(str_targets, - str_predictions, - average = 'macro') + skl_score = precision_score(list(targets), list(predictions), average="macro") + score = turicreate.toolkits.evaluation.precision( + targets, predictions, average="macro" + ) + str_score = turicreate.toolkits.evaluation.precision( + str_targets, str_predictions, average="macro" + ) ## Assert self.assertAlmostEqual(skl_score, score) self.assertAlmostEqual(skl_score, str_score) ## Act [Average = 'micro'] - skl_score = precision_score(list(targets), list(predictions), - average = 'micro') - score= turicreate.toolkits.evaluation.precision(targets, - predictions, - average = 'micro') - str_score= turicreate.toolkits.evaluation.precision(str_targets, - str_predictions, - average = 'micro') + skl_score = precision_score(list(targets), list(predictions), average="micro") + score = turicreate.toolkits.evaluation.precision( + targets, predictions, average="micro" + ) + str_score = turicreate.toolkits.evaluation.precision( + str_targets, str_predictions, average="micro" + ) # Assert self.assertAlmostEqual(skl_score, score) self.assertAlmostEqual(skl_score, str_score) # Act [Average = None] - skl_score = precision_score(list(targets), list(predictions), - average = None) - score = turicreate.toolkits.evaluation.precision(targets, - predictions, - average = None) - str_score= turicreate.toolkits.evaluation.precision(str_targets, - str_predictions, - average = None) + skl_score = precision_score(list(targets), list(predictions), average=None) + score = turicreate.toolkits.evaluation.precision( + targets, predictions, average=None + ) + str_score = turicreate.toolkits.evaluation.precision( + str_targets, str_predictions, average=None + ) # Assert self.assertEqual(type(score), dict) self.assertEqual(set(score.keys()), set([0, 1, 2])) @@ -803,11 +877,11 @@ def test_precision_multi_class_score(self): # Note: Explicitly not putting it into a for loop for ease of # debugging when the tests fail. self.assertAlmostEqual(skl_score[0], score[0]) - self.assertAlmostEqual(skl_score[0], str_score['0']) + self.assertAlmostEqual(skl_score[0], str_score["0"]) self.assertAlmostEqual(skl_score[1], score[1]) - self.assertAlmostEqual(skl_score[1], str_score['1']) + self.assertAlmostEqual(skl_score[1], str_score["1"]) self.assertAlmostEqual(skl_score[2], score[2]) - self.assertAlmostEqual(skl_score[2], str_score['2']) + self.assertAlmostEqual(skl_score[2], str_score["2"]) def test_recall_binary_score(self): # Arrange @@ -820,8 +894,7 @@ def test_recall_binary_score(self): # Act skl_score = recall_score(list(targets), list(predictions)) score = turicreate.toolkits.evaluation.recall(targets, predictions) - str_score = turicreate.toolkits.evaluation.recall(str_targets, - str_predictions) + str_score = turicreate.toolkits.evaluation.recall(str_targets, str_predictions) # Assert self.assertAlmostEqual(skl_score, score) @@ -836,53 +909,48 @@ def test_recall_multi_class_score(self): str_predictions = predictions.astype(str) # Act - skl_score = recall_score(list(targets), list(predictions), - average = 'macro') + skl_score = recall_score(list(targets), list(predictions), average="macro") score = turicreate.toolkits.evaluation.recall(targets, predictions) - str_score = turicreate.toolkits.evaluation.recall(str_targets, - str_predictions) + str_score = turicreate.toolkits.evaluation.recall(str_targets, str_predictions) # Assert self.assertAlmostEqual(skl_score, score) self.assertAlmostEqual(skl_score, str_score) # Act [Average = 'macro'] - skl_score = recall_score(list(targets), list(predictions), - average = 'macro') - score = turicreate.toolkits.evaluation.recall(targets, - predictions, - average = 'macro') - str_score= turicreate.toolkits.evaluation.recall(str_targets, - str_predictions, - average = 'macro') + skl_score = recall_score(list(targets), list(predictions), average="macro") + score = turicreate.toolkits.evaluation.recall( + targets, predictions, average="macro" + ) + str_score = turicreate.toolkits.evaluation.recall( + str_targets, str_predictions, average="macro" + ) # Assert self.assertAlmostEqual(skl_score, score) self.assertAlmostEqual(skl_score, str_score) # Act [Average = 'micro'] - skl_score = recall_score(list(targets), list(predictions), - average = 'micro') - score= turicreate.toolkits.evaluation.recall(targets, - predictions, - average = 'micro') - str_score= turicreate.toolkits.evaluation.recall(str_targets, - str_predictions, - average = 'micro') + skl_score = recall_score(list(targets), list(predictions), average="micro") + score = turicreate.toolkits.evaluation.recall( + targets, predictions, average="micro" + ) + str_score = turicreate.toolkits.evaluation.recall( + str_targets, str_predictions, average="micro" + ) # Assert self.assertAlmostEqual(skl_score, score) self.assertAlmostEqual(skl_score, str_score) # Act [Average = None] - skl_score = recall_score(list(targets), list(predictions), - average = None) - score = turicreate.toolkits.evaluation.recall(targets, - predictions, - average = None) - str_score= turicreate.toolkits.evaluation.recall(str_targets, - str_predictions, - average = None) + skl_score = recall_score(list(targets), list(predictions), average=None) + score = turicreate.toolkits.evaluation.recall( + targets, predictions, average=None + ) + str_score = turicreate.toolkits.evaluation.recall( + str_targets, str_predictions, average=None + ) # Assert self.assertEqual(type(score), dict) self.assertEqual(set(score.keys()), set([0, 1, 2])) @@ -891,11 +959,11 @@ def test_recall_multi_class_score(self): # Note: Explicitly not putting it into a for loop for ease of # debugging when the tests fail. self.assertAlmostEqual(skl_score[0], score[0]) - self.assertAlmostEqual(skl_score[0], str_score['0']) + self.assertAlmostEqual(skl_score[0], str_score["0"]) self.assertAlmostEqual(skl_score[1], score[1]) - self.assertAlmostEqual(skl_score[1], str_score['1']) + self.assertAlmostEqual(skl_score[1], str_score["1"]) self.assertAlmostEqual(skl_score[2], score[2]) - self.assertAlmostEqual(skl_score[2], str_score['2']) + self.assertAlmostEqual(skl_score[2], str_score["2"]) def test_accuracy_binary_score(self): # Arrange @@ -908,8 +976,9 @@ def test_accuracy_binary_score(self): # Act skl_score = accuracy_score(list(targets), list(predictions)) score = turicreate.toolkits.evaluation.accuracy(targets, predictions) - str_score = turicreate.toolkits.evaluation.accuracy(str_targets, - str_predictions) + str_score = turicreate.toolkits.evaluation.accuracy( + str_targets, str_predictions + ) # Assert self.assertAlmostEqual(skl_score, score) @@ -926,8 +995,9 @@ def test_accuracy_multi_class_score(self): # Act skl_score = accuracy_score(list(targets), list(predictions)) score = turicreate.toolkits.evaluation.accuracy(targets, predictions) - str_score = turicreate.toolkits.evaluation.accuracy(str_targets, - str_predictions) + str_score = turicreate.toolkits.evaluation.accuracy( + str_targets, str_predictions + ) # Assert self.assertAlmostEqual(skl_score, score) @@ -941,12 +1011,12 @@ def test_accuracy_multi_class_score(self): cls_skl_score[i] = accuracy_score(list(t), list(p)) macro_avg = sum(cls_skl_score.values()) * 1.0 / 3 - score = turicreate.toolkits.evaluation.accuracy(targets, - predictions, - average = 'macro') - str_score= turicreate.toolkits.evaluation.accuracy(str_targets, - str_predictions, - average = 'macro') + score = turicreate.toolkits.evaluation.accuracy( + targets, predictions, average="macro" + ) + str_score = turicreate.toolkits.evaluation.accuracy( + str_targets, str_predictions, average="macro" + ) ## Assert self.assertAlmostEqual(macro_avg, score) @@ -954,26 +1024,25 @@ def test_accuracy_multi_class_score(self): ## Act [Average = 'micro'] skl_score = accuracy_score(list(targets), list(predictions)) - score= turicreate.toolkits.evaluation.accuracy(targets, - predictions, - average = 'micro') - str_score= turicreate.toolkits.evaluation.accuracy(str_targets, - str_predictions, - average = 'micro') + score = turicreate.toolkits.evaluation.accuracy( + targets, predictions, average="micro" + ) + str_score = turicreate.toolkits.evaluation.accuracy( + str_targets, str_predictions, average="micro" + ) # Assert self.assertAlmostEqual(skl_score, score) self.assertAlmostEqual(skl_score, str_score) # Act [Average = None] - prec_score = precision_score(list(targets), list(predictions), - average = None) - score = turicreate.toolkits.evaluation.accuracy(targets, - predictions, - average = None) - str_score= turicreate.toolkits.evaluation.accuracy(str_targets, - str_predictions, - average = None) + prec_score = precision_score(list(targets), list(predictions), average=None) + score = turicreate.toolkits.evaluation.accuracy( + targets, predictions, average=None + ) + str_score = turicreate.toolkits.evaluation.accuracy( + str_targets, str_predictions, average=None + ) # Assert self.assertEqual(type(score), dict) self.assertEqual(set(score.keys()), set([0, 1, 2])) @@ -982,11 +1051,11 @@ def test_accuracy_multi_class_score(self): # Note: Explicitly not putting it into a for loop for ease of # debugging when the tests fail. self.assertAlmostEqual(prec_score[0], score[0]) - self.assertAlmostEqual(prec_score[0], str_score['0']) + self.assertAlmostEqual(prec_score[0], str_score["0"]) self.assertAlmostEqual(prec_score[1], score[1]) - self.assertAlmostEqual(prec_score[1], str_score['1']) + self.assertAlmostEqual(prec_score[1], str_score["1"]) self.assertAlmostEqual(prec_score[2], score[2]) - self.assertAlmostEqual(prec_score[2], str_score['2']) + self.assertAlmostEqual(prec_score[2], str_score["2"]) def test_missing_values(self): # Arrange @@ -1003,34 +1072,58 @@ def test_missing_values(self): self.assertAlmostEqual(skl_score, score) # Act & Assert [precision] - skl_score = precision_score(t, p, average = 'macro') + skl_score = precision_score(t, p, average="macro") score = turicreate.toolkits.evaluation.precision(targets, predictions) self.assertAlmostEqual(skl_score, score) # Act & Assert [recall] - skl_score = recall_score(t, p, average = 'macro') + skl_score = recall_score(t, p, average="macro") score = turicreate.toolkits.evaluation.recall(targets, predictions) self.assertAlmostEqual(skl_score, score) # Act & Assert [f1_score] - skl_score = f1_score(t, p, average = 'macro') + skl_score = f1_score(t, p, average="macro") score = turicreate.toolkits.evaluation.f1_score(targets, predictions) self.assertAlmostEqual(skl_score, score) # Act & Assert [fbeta_score] - skl_score = fbeta_score(t, p, beta = 2.0, average = 'macro') - score = turicreate.toolkits.evaluation.fbeta_score(targets, predictions, beta = 2.0) + skl_score = fbeta_score(t, p, beta=2.0, average="macro") + score = turicreate.toolkits.evaluation.fbeta_score( + targets, predictions, beta=2.0 + ) self.assertAlmostEqual(skl_score, score) def test_auc_basic(self): # Arrange # Example from p.864 # https://ccrma.stanford.edu/workshops/mir2009/references/ROCintro.pdf - y = turicreate.SArray( - [1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0]) + y = turicreate.SArray( + [1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0] + ) yhat = turicreate.SArray( - [.9, .8, .7, .6, .55, .54, .53, .52, .51, .505, - .4, .39, .38, .37, .36, .35, .34, .33, .30, .1]) + [ + 0.9, + 0.8, + 0.7, + 0.6, + 0.55, + 0.54, + 0.53, + 0.52, + 0.51, + 0.505, + 0.4, + 0.39, + 0.38, + 0.37, + 0.36, + 0.35, + 0.34, + 0.33, + 0.30, + 0.1, + ] + ) scikit_auc = roc_auc_score(list(y), list(yhat)) @@ -1050,16 +1143,18 @@ def test_auc_binary(self): yhat = turicreate.SArray(rs.uniform(size=n)) # Act & Assert - scikit_auc = roc_auc_score(list(y), list(yhat), average = "macro") + scikit_auc = roc_auc_score(list(y), list(yhat), average="macro") glc_int_auc = turicreate.toolkits.evaluation.auc(y, yhat, "macro") glc_str_auc = turicreate.toolkits.evaluation.auc(y.astype(str), yhat, "macro") self.assertAlmostEqual(glc_int_auc, scikit_auc) self.assertAlmostEqual(glc_str_auc, scikit_auc) # Act & Assert - scikit_auc = roc_auc_score(list(y), list(yhat), average = None) - glc_int_auc = turicreate.toolkits.evaluation.auc(y, yhat, average = None) - glc_str_auc = turicreate.toolkits.evaluation.auc(y.astype(str), yhat, average = None) + scikit_auc = roc_auc_score(list(y), list(yhat), average=None) + glc_int_auc = turicreate.toolkits.evaluation.auc(y, yhat, average=None) + glc_str_auc = turicreate.toolkits.evaluation.auc( + y.astype(str), yhat, average=None + ) self.assertAlmostEqual(glc_int_auc, scikit_auc) self.assertAlmostEqual(glc_str_auc, scikit_auc) @@ -1081,12 +1176,10 @@ def test_auc_multi_class_score(self): sk_score[i] = roc_auc_score(sk_t[i], sk_p[i]) # Act [Average = None] - score = turicreate.toolkits.evaluation.auc(targets, - predictions, - average = None) - str_score= turicreate.toolkits.evaluation.auc(str_targets, - predictions, - average = None) + score = turicreate.toolkits.evaluation.auc(targets, predictions, average=None) + str_score = turicreate.toolkits.evaluation.auc( + str_targets, predictions, average=None + ) # Assert self.assertEqual(type(score), dict) self.assertEqual(set(score.keys()), set([0, 1, 2])) @@ -1095,19 +1188,19 @@ def test_auc_multi_class_score(self): # Note: Explicitly not putting it into a for loop for ease of # debugging when the tests fail. self.assertAlmostEqual(sk_score[0], score[0]) - self.assertAlmostEqual(sk_score[0], str_score['0']) + self.assertAlmostEqual(sk_score[0], str_score["0"]) self.assertAlmostEqual(sk_score[1], score[1]) - self.assertAlmostEqual(sk_score[1], str_score['1']) + self.assertAlmostEqual(sk_score[1], str_score["1"]) self.assertAlmostEqual(sk_score[2], score[2]) - self.assertAlmostEqual(sk_score[2], str_score['2']) + self.assertAlmostEqual(sk_score[2], str_score["2"]) # Act [Average = 'macro'] - score = turicreate.toolkits.evaluation.auc(targets, - predictions, - average = 'macro') - str_score = turicreate.toolkits.evaluation.auc(str_targets, - predictions, - average = 'macro') + score = turicreate.toolkits.evaluation.auc( + targets, predictions, average="macro" + ) + str_score = turicreate.toolkits.evaluation.auc( + str_targets, predictions, average="macro" + ) avg_score = 0.0 for i in range(3): avg_score += sk_score[i] @@ -1124,61 +1217,64 @@ def test_bogus_input_prob_evaluators(self): float_predictions = predictions.apply(lambda x: x[0]) with self.assertRaises(ToolkitError): - score = turicreate.toolkits.evaluation.auc(targets, - predictions) + score = turicreate.toolkits.evaluation.auc(targets, predictions) with self.assertRaises(ToolkitError): - score = turicreate.toolkits.evaluation.roc_curve(targets, - predictions) + score = turicreate.toolkits.evaluation.roc_curve(targets, predictions) with self.assertRaises(ToolkitError): - score = turicreate.toolkits.evaluation.log_loss(targets, - predictions) + score = turicreate.toolkits.evaluation.log_loss(targets, predictions) with self.assertRaises(ToolkitError): - score = turicreate.toolkits.evaluation.auc(targets, - float_predictions) + score = turicreate.toolkits.evaluation.auc(targets, float_predictions) with self.assertRaises(ToolkitError): - score = turicreate.toolkits.evaluation.roc_curve(targets, - float_predictions) + score = turicreate.toolkits.evaluation.roc_curve(targets, float_predictions) with self.assertRaises(ToolkitError): - score = turicreate.toolkits.evaluation.log_loss(targets, - float_predictions) + score = turicreate.toolkits.evaluation.log_loss(targets, float_predictions) bad_range_targets = turicreate.SArray([0, 1, 0, 1]) bad_range_predictions = turicreate.SArray([1, 2, 3, 4]) with self.assertRaises(ToolkitError): - score = turicreate.toolkits.evaluation.log_loss(bad_range_targets, - bad_range_predictions) + score = turicreate.toolkits.evaluation.log_loss( + bad_range_targets, bad_range_predictions + ) with self.assertRaises(ToolkitError): - score = turicreate.toolkits.evaluation.roc_curve(bad_range_targets, - bad_range_predictions) + score = turicreate.toolkits.evaluation.roc_curve( + bad_range_targets, bad_range_predictions + ) with self.assertRaises(ToolkitError): - score = turicreate.toolkits.evaluation.auc(bad_range_targets, - bad_range_predictions) + score = turicreate.toolkits.evaluation.auc( + bad_range_targets, bad_range_predictions + ) bad_range_targets = turicreate.SArray([0, 1, 0, 1]) - bad_range_predictions = turicreate.SArray([[1.0, 2.0], - [2.0, 3.0], - [3.0, 4.0], - [4.0, 5.0]]) + bad_range_predictions = turicreate.SArray( + [[1.0, 2.0], [2.0, 3.0], [3.0, 4.0], [4.0, 5.0]] + ) with self.assertRaises(ToolkitError): - score = turicreate.toolkits.evaluation.log_loss(bad_range_targets, - bad_range_predictions) + score = turicreate.toolkits.evaluation.log_loss( + bad_range_targets, bad_range_predictions + ) with self.assertRaises(ToolkitError): - score = turicreate.toolkits.evaluation.roc_curve(bad_range_targets, - bad_range_predictions) + score = turicreate.toolkits.evaluation.roc_curve( + bad_range_targets, bad_range_predictions + ) with self.assertRaises(ToolkitError): - score = turicreate.toolkits.evaluation.auc(bad_range_targets, - bad_range_predictions) + score = turicreate.toolkits.evaluation.auc( + bad_range_targets, bad_range_predictions + ) targets = turicreate.SArray([0, 1, 2, 0]) - predictions = turicreate.SArray([[0.9, 0.0, 0.1, 0.0], - [0.8, 0.1, 0.1, 0.0], - [0.1, 0.1, 0.8, 0.0], - [0.1, 0.1, 0.8, 0.0]]) + predictions = turicreate.SArray( + [ + [0.9, 0.0, 0.1, 0.0], + [0.8, 0.1, 0.1, 0.0], + [0.1, 0.1, 0.8, 0.0], + [0.1, 0.1, 0.8, 0.0], + ] + ) good_index_map = {i: i for i in range(4)} incomplete_index_map = {i: i for i in range(3)} invalid_range_index_map = {0: 1, 1: 2, 2: 3, 3: 4} @@ -1186,41 +1282,53 @@ def test_bogus_input_prob_evaluators(self): # No exception with a correct index map score = turicreate.toolkits.evaluation.auc( - targets, predictions, index_map=good_index_map) + targets, predictions, index_map=good_index_map + ) score = turicreate.toolkits.evaluation.roc_curve( - targets, predictions, index_map=good_index_map) + targets, predictions, index_map=good_index_map + ) score = turicreate.toolkits.evaluation.log_loss( - targets, predictions, index_map=good_index_map) + targets, predictions, index_map=good_index_map + ) # Exception if index_map size does not match prediction vector size with self.assertRaises(ToolkitError): score = turicreate.toolkits.evaluation.auc( - targets, predictions, index_map=incomplete_index_map) + targets, predictions, index_map=incomplete_index_map + ) with self.assertRaises(ToolkitError): score = turicreate.toolkits.evaluation.roc_curve( - targets, predictions, index_map=incomplete_index_map) + targets, predictions, index_map=incomplete_index_map + ) with self.assertRaises(ToolkitError): score = turicreate.toolkits.evaluation.log_loss( - targets, predictions, index_map=incomplete_index_map) + targets, predictions, index_map=incomplete_index_map + ) # Exception if index_map values do not span prediction vector indices with self.assertRaises(ToolkitError): score = turicreate.toolkits.evaluation.auc( - targets, predictions, index_map=invalid_range_index_map) + targets, predictions, index_map=invalid_range_index_map + ) with self.assertRaises(ToolkitError): score = turicreate.toolkits.evaluation.roc_curve( - targets, predictions, index_map=invalid_range_index_map) + targets, predictions, index_map=invalid_range_index_map + ) with self.assertRaises(ToolkitError): score = turicreate.toolkits.evaluation.log_loss( - targets, predictions, index_map=invalid_range_index_map) + targets, predictions, index_map=invalid_range_index_map + ) # Exception if index_map uses the same index for two labels with self.assertRaises(ToolkitError): score = turicreate.toolkits.evaluation.auc( - targets, predictions, index_map=non_injective_index_map) + targets, predictions, index_map=non_injective_index_map + ) with self.assertRaises(ToolkitError): score = turicreate.toolkits.evaluation.roc_curve( - targets, predictions, index_map=non_injective_index_map) + targets, predictions, index_map=non_injective_index_map + ) with self.assertRaises(ToolkitError): score = turicreate.toolkits.evaluation.log_loss( - targets, predictions, index_map=non_injective_index_map) + targets, predictions, index_map=non_injective_index_map + ) diff --git a/src/python/turicreate/test/test_explore.py b/src/python/turicreate/test/test_explore.py index 180d4fdd55..b40989634f 100644 --- a/src/python/turicreate/test/test_explore.py +++ b/src/python/turicreate/test/test_explore.py @@ -23,11 +23,16 @@ import turicreate as tc from turicreate.toolkits._internal_utils import _mac_ver -class ExploreTest(unittest.TestCase): - @unittest.skipIf(_mac_ver() < (10, 12), "macOS-only test; UISoup doesn't work on Linux") - @unittest.skipIf(_mac_ver() > (10, 13), "macOS 10.14 appears to have broken the UX flow to prompt for accessibility access") - @unittest.skipIf(not(six.PY2), "Python 2.7-only test; UISoup doesn't work on 3.x") +class ExploreTest(unittest.TestCase): + @unittest.skipIf( + _mac_ver() < (10, 12), "macOS-only test; UISoup doesn't work on Linux" + ) + @unittest.skipIf( + _mac_ver() > (10, 13), + "macOS 10.14 appears to have broken the UX flow to prompt for accessibility access", + ) + @unittest.skipIf(not (six.PY2), "Python 2.7-only test; UISoup doesn't work on 3.x") def test_sanity_on_macOS(self): """ Create a simple SFrame, containing a very unique string. @@ -40,7 +45,7 @@ def test_sanity_on_macOS(self): # Generate some test data unique_str = repr(uuid.uuid4()) - sf = tc.SFrame({'a': [1,2,3], 'b': ['hello', 'world', unique_str]}) + sf = tc.SFrame({"a": [1, 2, 3], "b": ["hello", "world", unique_str]}) # Run the explore view and make sure we can see our unique string sf.explore() @@ -48,7 +53,7 @@ def test_sanity_on_macOS(self): window = None try: - window = uisoup.get_window('Turi*Create*Visualization') + window = uisoup.get_window("Turi*Create*Visualization") result = window.findall(value=unique_str) self.assertEqual( len(result), @@ -56,7 +61,8 @@ def test_sanity_on_macOS(self): ( "Expected to find exactly one element containing the unique" "string %s." - ) % unique_str + ) + % unique_str, ) first = result[0] self.assertEqual( @@ -65,10 +71,11 @@ def test_sanity_on_macOS(self): ( "Expected to find the unique string %s as the name of the found" "element. Instead, got %s." - ) % (unique_str, first.acc_name) + ) + % (unique_str, first.acc_name), ) finally: if window is not None: # Kill the explore process - os.kill(window.proc_id, signal.SIGTERM) \ No newline at end of file + os.kill(window.proc_id, signal.SIGTERM) diff --git a/src/python/turicreate/test/test_extensions.py b/src/python/turicreate/test/test_extensions.py index 6f3c831d2b..499d3b355f 100644 --- a/src/python/turicreate/test/test_extensions.py +++ b/src/python/turicreate/test/test_extensions.py @@ -13,6 +13,7 @@ import turicreate as tc import sys + if sys.version_info.major > 2: long = int import random @@ -21,7 +22,6 @@ class VariantCheckTest(unittest.TestCase): - def identical(self, reference, b): if type(reference) in [int, long]: self.assertIn(type(b), [int, long]) @@ -44,45 +44,51 @@ def variant_turnaround(self, reference, expected_result=None): if expected_result is None: expected_result = reference from ..extensions import _demo_identity + self.identical(expected_result, _demo_identity(reference)) def test_variant_check(self): - sa = SArray([1,2,3,4,5]) - sf = SFrame({'a':sa}) + sa = SArray([1, 2, 3, 4, 5]) + sf = SFrame({"a": sa}) import array + self.variant_turnaround(1) self.variant_turnaround(1.0) - self.variant_turnaround(array.array('d', [1.0, 2.0, 3.0])) - # numeric lists currently converts to array - self.variant_turnaround([1, 2, 3], array.array('d',[1,.0,2.0,3.0])) + self.variant_turnaround(array.array("d", [1.0, 2.0, 3.0])) + # numeric lists currently converts to array + self.variant_turnaround([1, 2, 3], array.array("d", [1, 0.0, 2.0, 3.0])) self.variant_turnaround("abc") self.variant_turnaround(["abc", "def"]) - self.variant_turnaround({'a':1,'b':'c'}) - self.variant_turnaround({'a':[1,2,'d'],'b':['a','b','c']}) - # numeric lists currently converts to array - self.variant_turnaround({'a':[1,2,3],'b':['a','b','c']}, - {'a':array.array('d',[1,2,3]),'b':['a','b','c']}) + self.variant_turnaround({"a": 1, "b": "c"}) + self.variant_turnaround({"a": [1, 2, "d"], "b": ["a", "b", "c"]}) + # numeric lists currently converts to array + self.variant_turnaround( + {"a": [1, 2, 3], "b": ["a", "b", "c"]}, + {"a": array.array("d", [1, 2, 3]), "b": ["a", "b", "c"]}, + ) self.variant_turnaround(sa) self.variant_turnaround(sf) - self.variant_turnaround([sa,sf]) - self.variant_turnaround([sa,sa]) - self.variant_turnaround([sf,sf]) - self.variant_turnaround({'a':sa, 'b':sf, 'c':['a','b','c','d']}) - self.variant_turnaround({'a':[{'a':1, 'b':2}], 'b':[{'a':3}]}) - self.variant_turnaround({'a':[{'a':sa, 'b': sa}], 'b':[{'a':sa}]}) - self.variant_turnaround({'a': [sa, {'c':sa, 'd': sa}], 'e':[{'f':sa}]}) - self.variant_turnaround({'a':[sa,{'a':sa}]}) - self.variant_turnaround({'a':[{'a':sa,'b':'c'}]}) - self.variant_turnaround({'a':[sa,{'a':sa,'b':'c'}]}) - self.variant_turnaround({'a':[sa,sf,{'a':sa,'b':'c'}], - 'b':sf, 'c':['a','b','c','d']}) + self.variant_turnaround([sa, sf]) + self.variant_turnaround([sa, sa]) + self.variant_turnaround([sf, sf]) + self.variant_turnaround({"a": sa, "b": sf, "c": ["a", "b", "c", "d"]}) + self.variant_turnaround({"a": [{"a": 1, "b": 2}], "b": [{"a": 3}]}) + self.variant_turnaround({"a": [{"a": sa, "b": sa}], "b": [{"a": sa}]}) + self.variant_turnaround({"a": [sa, {"c": sa, "d": sa}], "e": [{"f": sa}]}) + self.variant_turnaround({"a": [sa, {"a": sa}]}) + self.variant_turnaround({"a": [{"a": sa, "b": "c"}]}) + self.variant_turnaround({"a": [sa, {"a": sa, "b": "c"}]}) + self.variant_turnaround( + {"a": [sa, sf, {"a": sa, "b": "c"}], "b": sf, "c": ["a", "b", "c", "d"]} + ) def test_stress(self): random.seed(0) + class A: + pass - class A: pass A.flextype_encodable = True def _make(depth): @@ -95,25 +101,30 @@ def _make(depth): if s == 0: return str(random.randint(0, 100)) elif s == 1: - return random.randint(0,100000) + return random.randint(0, 100000) elif s == 2: A.flextype_encodable = False - return SArray([random.randint(0,100000) for i in range(2)]) + return SArray([random.randint(0, 100000) for i in range(2)]) elif s == 3: A.flextype_encodable = False - return SFrame({'a' : [random.randint(0,100000) for i in range(2)], - 'b' : [str(random.randint(0,100000)) for i in range(2)]}) + return SFrame( + { + "a": [random.randint(0, 100000) for i in range(2)], + "b": [str(random.randint(0, 100000)) for i in range(2)], + } + ) elif s == 4: length = random.randint(3, 8) # The ['a'] needed so it doesn't get translated to a string. - return ['a'] + [_make(depth - 1) for i in range(length)] + return ["a"] + [_make(depth - 1) for i in range(length)] elif s == 5: length = random.randint(3, 8) - return {str(random.randint(0, 100)) : _make(depth - 1) - for i in range(length)} + return { + str(random.randint(0, 100)): _make(depth - 1) for i in range(length) + } - for depth in [2,3,4,5,10]: + for depth in [2, 3, 4, 5, 10]: for i in range(10): A.flextype_encodable = True @@ -129,7 +140,6 @@ def _make(depth): else: self.assertFalse(A.flextype_encodable) - def test_futures_1(self): future = tc.extensions._demo_addone.run_background(1) @@ -138,22 +148,20 @@ def test_futures_1(self): self.assertEqual(result, 2) - def test_futures_stress_1(self): n = 50 X = tc.util.generate_random_sframe(n, "CS") rows = list(X) - futures = [None]*n + futures = [None] * n for i in range(n): - futures[i] = tc.extensions._demo_extract_row.run_background(X, i) + futures[i] = tc.extensions._demo_extract_row.run_background(X, i) for i in range(n): self.assertEqual(futures[i].result(), rows[i]) - def test_futures_stress_2(self): X = tc.util.generate_random_sframe(1000, "CS") @@ -174,12 +182,10 @@ def test_futures_stress_2(self): test_indices = list(range(n)) random.shuffle(test_indices) - futures = [None]*n + futures = [None] * n for i in start_indices: - futures[i] = tc.extensions._demo_extract_row.run_background(X, indices[i]) + futures[i] = tc.extensions._demo_extract_row.run_background(X, indices[i]) for i in test_indices: self.assertEqual(futures[i].result(), rows[indices[i]]) - - diff --git a/src/python/turicreate/test/test_external_memory_tree.py b/src/python/turicreate/test/test_external_memory_tree.py index 4121835223..9c739d53e4 100644 --- a/src/python/turicreate/test/test_external_memory_tree.py +++ b/src/python/turicreate/test/test_external_memory_tree.py @@ -10,30 +10,33 @@ import turicreate as tc from array import array + def _get_data(n): t = [1] * (n - n // 2) + [0] * (n // 2) - sf = tc.SFrame({'target': t}) + sf = tc.SFrame({"target": t}) sf = sf.add_row_number() - sf['id'] = sf['id'].apply(lambda x: {x:1} if x != 0 else {i:1 for i in range(n)}) + sf["id"] = sf["id"].apply(lambda x: {x: 1} if x != 0 else {i: 1 for i in range(n)}) return sf -class TreeExtractFeaturesTest(unittest.TestCase): - def _run_test(self, train_sf, test_sf, target = 'target'): +class TreeExtractFeaturesTest(unittest.TestCase): + def _run_test(self, train_sf, test_sf, target="target"): - for model in [tc.classifier.decision_tree_classifier, - tc.classifier.random_forest_classifier, - tc.classifier.boosted_trees_classifier]: - m = model.create(train_sf, target = target, validation_set = None) + for model in [ + tc.classifier.decision_tree_classifier, + tc.classifier.random_forest_classifier, + tc.classifier.boosted_trees_classifier, + ]: + m = model.create(train_sf, target=target, validation_set=None) for leaf in m.extract_features(test_sf)[-1]: - self.assertTrue(leaf > 1e-5) # leaf_id should not be zero. + self.assertTrue(leaf > 1e-5) # leaf_id should not be zero. def test_multiple_cache_files_in_memory(self): N = 10000 sf = _get_data(N) - self._run_test(sf, sf, 'target') + self._run_test(sf, sf, "target") def test_multiple_cache_files_external_memory(self): N = 20000 sf = _get_data(N) - self._run_test(sf, sf, 'target') + self._run_test(sf, sf, "target") diff --git a/src/python/turicreate/test/test_fast_path_prediction.py b/src/python/turicreate/test/test_fast_path_prediction.py index eec83a71a9..7bed63eaef 100644 --- a/src/python/turicreate/test/test_fast_path_prediction.py +++ b/src/python/turicreate/test/test_fast_path_prediction.py @@ -14,8 +14,8 @@ import shutil import numpy as np -class FastPathPredictionTest(unittest.TestCase): +class FastPathPredictionTest(unittest.TestCase): @classmethod def setUpClass(self): """ @@ -33,8 +33,8 @@ def setUpClass(self): target[1] = 1 ## Create the model - self.sf['target'] = target - self.target = 'target' + self.sf["target"] = target + self.target = "target" self.model = None self.regression = False @@ -49,7 +49,7 @@ def test_save_and_load(self): if self.model is None: return - filename = 'save_file%s' % (str(uuid.uuid4())) + filename = "save_file%s" % (str(uuid.uuid4())) old_model = self.model self.model.save(filename) @@ -81,7 +81,7 @@ def test_classify(self): lp = model.classify(list(sf)) dp = model.classify(sf[0]) sf_new = sf[0].copy() - sf_new['new_column'] = 1 + sf_new["new_column"] = 1 dp_new = model.classify(sf_new) self.assertEqual(len(dp), 1) @@ -102,12 +102,12 @@ def test_predict_topk(self): # Act & Assert if self.has_predict_topk: # Act & Assert - output_type = 'rank' + output_type = "rank" bp = model.predict_topk(sf, output_type, k) lp = model.predict_topk(list(sf), output_type, k) dp = model.predict_topk(sf[0], output_type, k) sf_new = sf[0].copy() - sf_new['new_column'] = 1 + sf_new["new_column"] = 1 dp_new = model.predict_topk(sf_new, output_type, k) self.assertEqual(len(dp), 2) @@ -119,7 +119,7 @@ def test_predict_topk(self): self.assertEqual(lp[1], dp[1]) # Act & Assert - output_type = 'probability' + output_type = "probability" bp = model.predict_topk(sf, output_type, k) lp = model.predict_topk(list(sf), output_type, k) dp = model.predict_topk(sf[0], output_type, k) @@ -134,7 +134,7 @@ def test_predict_topk(self): self.assertEqual(lp[1], dp[1]) # Act & Assert - output_type = 'margin' + output_type = "margin" bp = model.predict_topk(sf, output_type, k) lp = model.predict_topk(list(sf), output_type, k) dp = model.predict_topk(sf[0], output_type, k) @@ -160,12 +160,12 @@ def test_predict(self): if not self.regression: # Act & Assert - output_type = 'class' + output_type = "class" bp = model.predict(sf, output_type) lp = model.predict(list(sf), output_type) dp = model.predict(sf[0], output_type) sf_new = sf[0].copy() - sf_new['new_column'] = 1 + sf_new["new_column"] = 1 dp_new = model.predict(sf_new, output_type) self.assertEqual(len(dp), 1) @@ -175,7 +175,7 @@ def test_predict(self): self.assertEqual(lp[0], dp[0]) # act & assert - output_type = 'margin' + output_type = "margin" bp = model.predict(sf, output_type) lp = model.predict(list(sf), output_type) dp = model.predict(sf[0], output_type) @@ -189,7 +189,7 @@ def test_predict(self): # act & assert if self.has_probability_vector: - output_type = 'probability_vector' + output_type = "probability_vector" bp = model.predict(sf, output_type) lp = model.predict(list(sf), output_type) dp = model.predict(sf[0], output_type) @@ -203,7 +203,7 @@ def test_predict(self): # act & assert if self.has_probability: - output_type = 'probability' + output_type = "probability" bp = model.predict(sf, output_type) lp = model.predict(list(sf), output_type) dp = model.predict(sf[0], output_type) @@ -222,7 +222,7 @@ def test_predict(self): lp = model.predict(list(sf)) dp = model.predict(sf[0]) sf_new = sf[0].copy() - sf_new['new_column'] = 1 + sf_new["new_column"] = 1 dp_new = model.predict(sf_new) self.assertEqual(len(dp), 1) @@ -253,98 +253,119 @@ class LinearRegressionTest(FastPathPredictionTest): @classmethod def setUpClass(self): super(LinearRegressionTest, self).setUpClass() - self.model = tc.linear_regression.create(self.sf, - self.target, validation_set=None) + self.model = tc.linear_regression.create( + self.sf, self.target, validation_set=None + ) self.regression = True + class RandomForestRegressionTest(FastPathPredictionTest): @classmethod def setUpClass(self): super(RandomForestRegressionTest, self).setUpClass() - self.model = tc.random_forest_regression.create(self.sf, - self.target, validation_set=None) + self.model = tc.random_forest_regression.create( + self.sf, self.target, validation_set=None + ) self.regression = True + class DecisionTreeRegressionTest(FastPathPredictionTest): @classmethod def setUpClass(self): super(DecisionTreeRegressionTest, self).setUpClass() - self.model = tc.decision_tree_regression.create(self.sf, - self.target, validation_set=None) + self.model = tc.decision_tree_regression.create( + self.sf, self.target, validation_set=None + ) self.regression = True + class BoostedTreesRegressionTest(FastPathPredictionTest): @classmethod def setUpClass(self): super(BoostedTreesRegressionTest, self).setUpClass() - self.model = tc.boosted_trees_regression.create(self.sf, - self.target, validation_set=None) + self.model = tc.boosted_trees_regression.create( + self.sf, self.target, validation_set=None + ) self.regression = True + class LogisticRegressionTest(FastPathPredictionTest): @classmethod def setUpClass(self): super(LogisticRegressionTest, self).setUpClass() - self.model = tc.logistic_classifier.create(self.sf, - self.target, validation_set=None) + self.model = tc.logistic_classifier.create( + self.sf, self.target, validation_set=None + ) self.has_predict_topk = True + class SVMClassifierTest(FastPathPredictionTest): @classmethod def setUpClass(self): super(SVMClassifierTest, self).setUpClass() - self.model = tc.svm_classifier.create(self.sf, self.target, - validation_set=None) + self.model = tc.svm_classifier.create(self.sf, self.target, validation_set=None) self.has_probability = False self.has_probability_vector = False + class RandomForestClassifierTest(FastPathPredictionTest): @classmethod def setUpClass(self): super(RandomForestClassifierTest, self).setUpClass() - self.model = tc.random_forest_classifier.create(self.sf, self.target, - validation_set=None) + self.model = tc.random_forest_classifier.create( + self.sf, self.target, validation_set=None + ) self.has_predict_topk = True + class DecisionTreeClassifierTest(FastPathPredictionTest): @classmethod def setUpClass(self): super(DecisionTreeClassifierTest, self).setUpClass() - self.model = tc.decision_tree_classifier.create(self.sf, self.target, - validation_set=None) + self.model = tc.decision_tree_classifier.create( + self.sf, self.target, validation_set=None + ) self.has_predict_topk = True + class BoostedTreesClassifierTest(FastPathPredictionTest): @classmethod def setUpClass(self): super(BoostedTreesClassifierTest, self).setUpClass() - self.model = tc.boosted_trees_classifier.create(self.sf, - self.target, validation_set=None) + self.model = tc.boosted_trees_classifier.create( + self.sf, self.target, validation_set=None + ) self.has_predict_topk = True + class RandomForestClassifierStringClassTest(FastPathPredictionTest): @classmethod def setUpClass(self): super(RandomForestClassifierStringClassTest, self).setUpClass() self.sf[self.target] = self.sf[self.target].astype(str) - self.model = tc.random_forest_classifier.create(self.sf, self.target, - validation_set=None) + self.model = tc.random_forest_classifier.create( + self.sf, self.target, validation_set=None + ) self.has_predict_topk = True + class DecisionTreeClassifierStringClassTest(FastPathPredictionTest): @classmethod def setUpClass(self): super(DecisionTreeClassifierStringClassTest, self).setUpClass() self.sf[self.target] = self.sf[self.target].astype(str) - self.model = tc.decision_tree_classifier.create(self.sf, self.target, - validation_set=None) + self.model = tc.decision_tree_classifier.create( + self.sf, self.target, validation_set=None + ) self.has_predict_topk = True + class BoostedTreesClassifierStringClassTest(FastPathPredictionTest): @classmethod def setUpClass(self): super(BoostedTreesClassifierStringClassTest, self).setUpClass() self.sf[self.target] = self.sf[self.target].astype(str) - self.model = tc.boosted_trees_classifier.create(self.sf, - self.target, validation_set=None) + self.model = tc.boosted_trees_classifier.create( + self.sf, self.target, validation_set=None + ) self.has_predict_topk = True diff --git a/src/python/turicreate/test/test_file_util.py b/src/python/turicreate/test/test_file_util.py index c52ed8a671..d4e6f1faa4 100644 --- a/src/python/turicreate/test/test_file_util.py +++ b/src/python/turicreate/test/test_file_util.py @@ -11,27 +11,30 @@ import tempfile from ..util import _file_util as fu + class FileUtilTests(unittest.TestCase): def setUp(self): - self.local_path = 'tmp/a/b/c' - self.s3_path = 's3://a/b/c' - self.http_path = 'http://a.b.c/d' + self.local_path = "tmp/a/b/c" + self.s3_path = "s3://a/b/c" + self.http_path = "http://a.b.c/d" self._get_env() def _get_env(self): - self.run_s3_test = ('FILE_UTIL_TEST_S3_BUCKET' in os.environ) and \ - 'AWS_ACCESS_KEY_ID' in os.environ and \ - 'AWS_SECRET_ACCESS_KEY' in os.environ + self.run_s3_test = ( + ("FILE_UTIL_TEST_S3_BUCKET" in os.environ) + and "AWS_ACCESS_KEY_ID" in os.environ + and "AWS_SECRET_ACCESS_KEY" in os.environ + ) if self.run_s3_test: - self.s3_test_path = os.environ['FILE_UTIL_TEST_S3_BUCKET'] + self.s3_test_path = os.environ["FILE_UTIL_TEST_S3_BUCKET"] else: self.s3_test_path = None def test_get_protocol(self): - self.assertEqual(fu.get_protocol(self.local_path), '') - self.assertEqual(fu.get_protocol(self.s3_path), 's3') - self.assertEqual(fu.get_protocol(self.http_path), 'http') + self.assertEqual(fu.get_protocol(self.local_path), "") + self.assertEqual(fu.get_protocol(self.s3_path), "s3") + self.assertEqual(fu.get_protocol(self.http_path), "http") def test_is_local_path(self): self.assertTrue(fu.is_local_path(self.local_path)) @@ -39,9 +42,11 @@ def test_is_local_path(self): self.assertFalse(fu.is_local_path(self.http_path)) def test_expand_full_path(self): - if not 'HOME' in os.environ: - raise RuntimeError('warning: cannot find $HOME key in environment') + if not "HOME" in os.environ: + raise RuntimeError("warning: cannot find $HOME key in environment") else: - home = os.environ['HOME'] - self.assertTrue(fu.expand_full_path('~/tmp'), os.path.join(home, 'tmp')) - self.assertTrue(fu.expand_full_path('tmp'), os.path.join(os.getcwd(), 'tmp')) + home = os.environ["HOME"] + self.assertTrue(fu.expand_full_path("~/tmp"), os.path.join(home, "tmp")) + self.assertTrue( + fu.expand_full_path("tmp"), os.path.join(os.getcwd(), "tmp") + ) diff --git a/src/python/turicreate/test/test_flexible_type.py b/src/python/turicreate/test/test_flexible_type.py index 7544ef3a5e..ecbef0ca56 100644 --- a/src/python/turicreate/test/test_flexible_type.py +++ b/src/python/turicreate/test/test_flexible_type.py @@ -14,7 +14,9 @@ from ..data_structures import image from .. import SArray import os -from .._cython.cy_flexible_type import _translate_through_flexible_type as _flexible_type +from .._cython.cy_flexible_type import ( + _translate_through_flexible_type as _flexible_type, +) from .._cython.cy_flexible_type import _translate_through_flex_list as _tr_flex_list from .._cython.cy_flexible_type import infer_type_of_list from .._cython.cy_flexible_type import _get_inferred_column_type, _all_convertable @@ -26,6 +28,7 @@ from copy import copy import sys + if sys.version_info.major > 2: long = int unicode = str @@ -34,108 +37,121 @@ current_file_dir = os.path.dirname(os.path.realpath(__file__)) + def from_lambda(v): from .._connect import main as glconnect + u = glconnect.get_unity() return u.eval_lambda(lambda x: x, v) + special_types = set() -IntegerValue = ( - [int(0), long(1)] - + [_dt(0) for _dt in (np.sctypes['int'] + np.sctypes['uint'] - + [np.bool, bool, np.bool_])]) +IntegerValue = [int(0), long(1)] + [ + _dt(0) + for _dt in (np.sctypes["int"] + np.sctypes["uint"] + [np.bool, bool, np.bool_]) +] special_types.add(id(IntegerValue)) # 2**63 and -2**63-1 are not representable by a C int64_t, so it's # treated as a float. -FloatValue = [float(0)] + [_dt(0) for _dt in np.sctypes['float']] + [2**63, -2**63 - 1] +FloatValue = ( + [float(0)] + [_dt(0) for _dt in np.sctypes["float"]] + [2 ** 63, -(2 ** 63) - 1] +) special_types.add(id(FloatValue)) -StringValue = ([str('bork'), unicode('bork'), b'bork', b''] - + [_dt('bork') for _dt in - [np.unicode, np.unicode_, str, unicode, np.str, - np.str_, np.string_]] - + [str(''), unicode('')] - + [_dt('') for _dt in - [np.unicode, np.unicode_, str, unicode, np.str, - np.str_, np.string_]]) +StringValue = ( + [str("bork"), unicode("bork"), b"bork", b""] + + [ + _dt("bork") + for _dt in [np.unicode, np.unicode_, str, unicode, np.str, np.str_, np.string_] + ] + + [str(""), unicode("")] + + [ + _dt("") + for _dt in [np.unicode, np.unicode_, str, unicode, np.str, np.str_, np.string_] + ] +) special_types.add(id(StringValue)) -DictValue = [{'a' : 12}, dict()] +DictValue = [{"a": 12}, dict()] special_types.add(id(DictValue)) -DatetimeValue = [datetime.date(2000, 6, 12), - datetime.date(1100, 1, 1), - datetime.datetime(2000, 6, 12)] +DatetimeValue = [ + datetime.date(2000, 6, 12), + datetime.date(1100, 1, 1), + datetime.datetime(2000, 6, 12), +] special_types.add(id(DatetimeValue)) AnyValue = IntegerValue + FloatValue + StringValue + DatetimeValue + DictValue special_types.add(id(AnyValue)) # All the different types of float sequences we support -FloatSequence = ( - [[0.5, 1.5, 2.5], (0.5, 1.5, 2.5), - {0.5, 1.5, 2.5}, frozenset([0.5, 1.5, 2.5])] - + [array.array(c, [0.5, 1.5, 2.5]) for c in 'fd']) +FloatSequence = [ + [0.5, 1.5, 2.5], + (0.5, 1.5, 2.5), + {0.5, 1.5, 2.5}, + frozenset([0.5, 1.5, 2.5]), +] + [array.array(c, [0.5, 1.5, 2.5]) for c in "fd"] special_types.add(id(FloatSequence)) # All the different types of float sequences we support -FloatSequenceWithNAN = ( - [[0.5, 1.5, 2.5, nan], (0.5, 1.5, 2.5, nan), - {0.5, 1.5, 2.5, nan}, frozenset([0.5, 1.5, 2.5, nan])] - + [array.array(c, [0.5, 1.5, 2.5, nan]) for c in 'fd']) +FloatSequenceWithNAN = [ + [0.5, 1.5, 2.5, nan], + (0.5, 1.5, 2.5, nan), + {0.5, 1.5, 2.5, nan}, + frozenset([0.5, 1.5, 2.5, nan]), +] + [array.array(c, [0.5, 1.5, 2.5, nan]) for c in "fd"] special_types.add(id(FloatSequenceWithNAN)) # All the different types of float sequences we support -FloatSequenceWithNone = ( - [[0.5, 1.5, 2.5, None], (0.5, 1.5, 2.5, None)]) +FloatSequenceWithNone = [[0.5, 1.5, 2.5, None], (0.5, 1.5, 2.5, None)] special_types.add(id(FloatSequenceWithNone)) # All the different integer sequences we support -IntegerSequence = ( - [[int(i) for i in range(3)] - , [long(i) for i in range(3)] - , tuple(range(3)) - , tuple(long(i) for i in range(3)) - , set(range(3)) - , frozenset(range(3)) - ] - + [array.array(c, range(3)) for c in 'bBhHiIlL']) +IntegerSequence = [ + [int(i) for i in range(3)], + [long(i) for i in range(3)], + tuple(range(3)), + tuple(long(i) for i in range(3)), + set(range(3)), + frozenset(range(3)), +] + [array.array(c, range(3)) for c in "bBhHiIlL"] special_types.add(id(IntegerSequence)) # All the different integer sequences we support, with a Nan -IntegerSequenceWithNAN = ( - [[int(i) for i in range(3)] + [nan] - , [long(i) for i in range(3)] + [nan] - , tuple(range(3)) + (nan,) - , tuple(long(i) for i in range(3)) + (nan,) - , set([long(i) for i in range(3)] + [nan]) - , frozenset([long(i) for i in range(3)] + [nan])]) +IntegerSequenceWithNAN = [ + [int(i) for i in range(3)] + [nan], + [long(i) for i in range(3)] + [nan], + tuple(range(3)) + (nan,), + tuple(long(i) for i in range(3)) + (nan,), + set([long(i) for i in range(3)] + [nan]), + frozenset([long(i) for i in range(3)] + [nan]), +] special_types.add(id(IntegerSequenceWithNAN)) # All the different types of string -IntegerSequenceWithNone = ( - [[int(i) for i in range(3)] + [None] - , [long(i) for i in range(3)] + [None] - , tuple(range(3)) + (None,) - , tuple(long(i) for i in range(3)) + (None,) - , set([long(i) for i in range(3)] + [None]) - , frozenset([long(i) for i in range(3)] + [None])]) +IntegerSequenceWithNone = [ + [int(i) for i in range(3)] + [None], + [long(i) for i in range(3)] + [None], + tuple(range(3)) + (None,), + tuple(long(i) for i in range(3)) + (None,), + set([long(i) for i in range(3)] + [None]), + frozenset([long(i) for i in range(3)] + [None]), +] special_types.add(id(IntegerSequenceWithNone)) # Empty but typed float arrays -EmptyFloatArray = ( - [array.array(c, []) for c in 'fd']) +EmptyFloatArray = [array.array(c, []) for c in "fd"] special_types.add(id(EmptyFloatArray)) # Empty but typed integer arrays -type_codes = 'bBhHiIlL' +type_codes = "bBhHiIlL" if sys.version_info.major == 2: - type_codes += 'c' -EmptyIntegerArray = ( - [array.array(c, []) for c in type_codes]) + type_codes += "c" +EmptyIntegerArray = [array.array(c, []) for c in type_codes] special_types.add(id(EmptyIntegerArray)) # All empty arrays @@ -146,26 +162,39 @@ def from_lambda(v): special_types.add(id(EmptySequence)) # Boolean Sequences -BooleanSequence = ( - [ list( (i%2 == 0) for i in range(3)) - , tuple( (i%2 == 0) for i in range(3)) - , set([True]), set([False]), set([True, False])]) +BooleanSequence = [ + list((i % 2 == 0) for i in range(3)), + tuple((i % 2 == 0) for i in range(3)), + set([True]), + set([False]), + set([True, False]), +] special_types.add(id(BooleanSequence)) # String sequences -StringSequence = ( - [ list( str(i) for i in range(3)) - , tuple( str(i) for i in range(3)) - , set( str(i) for i in range(3)) - , frozenset( str(i) for i in range(3))]) +StringSequence = [ + list(str(i) for i in range(3)), + tuple(str(i) for i in range(3)), + set(str(i) for i in range(3)), + frozenset(str(i) for i in range(3)), +] special_types.add(id(StringSequence)) -AnySequence = (EmptySequence + BooleanSequence + StringSequence - + IntegerSequence + IntegerSequenceWithNone + IntegerSequenceWithNAN - + FloatSequence + FloatSequenceWithNone + FloatSequenceWithNAN - + EmptyArray) +AnySequence = ( + EmptySequence + + BooleanSequence + + StringSequence + + IntegerSequence + + IntegerSequenceWithNone + + IntegerSequenceWithNAN + + FloatSequence + + FloatSequenceWithNone + + FloatSequenceWithNAN + + EmptyArray +) special_types.add(id(AnySequence)) + def verify_inference(values, expected_type): # Go through and build a list of all the possible value enumerations that need to be tested. @@ -198,23 +227,23 @@ def get_value(values, idx_set): inferred_type, result = _get_inferred_column_type(v_list) if inferred_type != expected_type: - assert False, ("Expected type %s, got type %s; input value = %s." - % (str(expected_type), str(inferred_type), str(v_list))) + assert False, "Expected type %s, got type %s; input value = %s." % ( + str(expected_type), + str(inferred_type), + str(v_list), + ) if inferred_type != NoneType: reconverted_result = _tr_flex_list(result, inferred_type) - assert str(result) == str(reconverted_result), \ - (("Values in type translated inconsistently: " + assert str(result) == str(reconverted_result), ( + "Values in type translated inconsistently: " "\nInput value = %s" "\nOutput value = %s" - "\nReconverted = %s") - % (str(v_list), str(result), reconverted_result)) - - + "\nReconverted = %s" + ) % (str(v_list), str(result), reconverted_result) class FlexibleTypeInference(unittest.TestCase): - def test_int_float(self): verify_inference([IntegerValue], int) verify_inference([IntegerValue, IntegerValue], int) @@ -248,66 +277,64 @@ def test_mixed_types(self): def test_array_list(self): tests = [ - - # Individual types - ([EmptySequence], list), - ([IntegerSequence], array.array), - ([IntegerSequenceWithNone], list), - ([IntegerSequenceWithNAN], array.array), - ([FloatSequence], array.array), - ([FloatSequenceWithNAN], array.array), - ([FloatSequenceWithNone], list), - ([EmptyIntegerArray], array.array), - ([EmptyFloatArray], array.array), - ([BooleanSequence], array.array), - ([StringSequence], list), - - # Multiple types - ([IntegerSequence, FloatSequence], array.array), - ([IntegerSequence, FloatSequence], array.array), - - # Multiple types - ([EmptySequence, EmptyFloatArray], array.array), - ([EmptySequence, EmptyIntegerArray], array.array), - ([EmptySequence, IntegerSequence], array.array), - ([EmptySequence, FloatSequence], array.array), - - # Multiple types - ([EmptySequence, EmptyFloatArray], array.array), - ([EmptySequence, EmptyIntegerArray], array.array), - ([EmptySequence, IntegerSequence], array.array), - ([EmptySequence, FloatSequence], array.array), - - # Arrays and lists - ([StringSequence, EmptyFloatArray], list), - ([StringSequence, EmptyIntegerArray], list), - ([StringSequence, IntegerSequence], list), - ([StringSequence, FloatSequence], list)] + # Individual types + ([EmptySequence], list), + ([IntegerSequence], array.array), + ([IntegerSequenceWithNone], list), + ([IntegerSequenceWithNAN], array.array), + ([FloatSequence], array.array), + ([FloatSequenceWithNAN], array.array), + ([FloatSequenceWithNone], list), + ([EmptyIntegerArray], array.array), + ([EmptyFloatArray], array.array), + ([BooleanSequence], array.array), + ([StringSequence], list), + # Multiple types + ([IntegerSequence, FloatSequence], array.array), + ([IntegerSequence, FloatSequence], array.array), + # Multiple types + ([EmptySequence, EmptyFloatArray], array.array), + ([EmptySequence, EmptyIntegerArray], array.array), + ([EmptySequence, IntegerSequence], array.array), + ([EmptySequence, FloatSequence], array.array), + # Multiple types + ([EmptySequence, EmptyFloatArray], array.array), + ([EmptySequence, EmptyIntegerArray], array.array), + ([EmptySequence, IntegerSequence], array.array), + ([EmptySequence, FloatSequence], array.array), + # Arrays and lists + ([StringSequence, EmptyFloatArray], list), + ([StringSequence, EmptyIntegerArray], list), + ([StringSequence, IntegerSequence], list), + ([StringSequence, FloatSequence], list), + ] # Add in additional rules for testing for tv, res in copy(tests): - tests.append( (tv + [EmptySequence], res) ) + tests.append((tv + [EmptySequence], res)) for tv, res in copy(tests): - tests.append( (tv + [[None]], list) ) + tests.append((tv + [[None]], list)) for tv, res in copy(tests): - tests.append( (tv + [StringSequence], list) ) + tests.append((tv + [StringSequence], list)) # Run the tests for tv, res in tests: verify_inference(tv, res) def test_nparray(self): - NPSequence = ([np.array(range(3),'d'), None], - [np.array(range(3),'i'), None], - [np.array(range(3),'f'), None], - [np.array(range(3),'d'), array.array('d',[1,2,3])], - [np.array(range(3),'i'), array.array('d',[1,2,3])], - [np.array(range(3),'f'), array.array('d',[1,2,3])], - [np.array(range(3),'d'), array.array('d',[1,2,3]), None], - [np.array(range(3),'i'), array.array('d',[1,2,3]), None], - [np.array(range(3),'f'), array.array('d',[1,2,3]), None]) + NPSequence = ( + [np.array(range(3), "d"), None], + [np.array(range(3), "i"), None], + [np.array(range(3), "f"), None], + [np.array(range(3), "d"), array.array("d", [1, 2, 3])], + [np.array(range(3), "i"), array.array("d", [1, 2, 3])], + [np.array(range(3), "f"), array.array("d", [1, 2, 3])], + [np.array(range(3), "d"), array.array("d", [1, 2, 3]), None], + [np.array(range(3), "i"), array.array("d", [1, 2, 3]), None], + [np.array(range(3), "f"), array.array("d", [1, 2, 3]), None], + ) # Run the tests for seq in NPSequence: @@ -321,9 +348,13 @@ class FlexibleTypeTest(unittest.TestCase): # On lambda return, if the return value is a non-empty of list of # all numerical values, we try hard to use array.array def numeric_list_to_array(self, v): - if (type(v) is list) and (len(v) > 0) and all((type(x) is int) or (type(x) is float) for x in v): - return array.array('d', v) - elif (type(v) is list): + if ( + (type(v) is list) + and (len(v) > 0) + and all((type(x) is int) or (type(x) is float) for x in v) + ): + return array.array("d", v) + elif type(v) is list: return [self.numeric_list_to_array(x) for x in v] else: return v @@ -334,9 +365,11 @@ def assert_equal_with_lambda_check(self, translated, correct): def test_none(self): self.assert_equal_with_lambda_check(_flexible_type(None), None) + def test_date_time(self): d = datetime.datetime(2010, 10, 10, 10, 10, 10) - self.assert_equal_with_lambda_check(_flexible_type(d),d) + self.assert_equal_with_lambda_check(_flexible_type(d), d) + def test_int(self): self.assert_equal_with_lambda_check(_flexible_type(1), 1) self.assert_equal_with_lambda_check(_flexible_type(long(1)), 1) @@ -376,11 +409,11 @@ def test_string(self): def test_array(self): # float array - expected = array.array('d', [.1, .2, .3]) + expected = array.array("d", [0.1, 0.2, 0.3]) self.assert_equal_with_lambda_check(_flexible_type(expected), expected) # int array - expected = array.array('d', [1, 2, 3]) + expected = array.array("d", [1, 2, 3]) self.assert_equal_with_lambda_check(_flexible_type([1, 2, 3]), expected) self.assert_equal_with_lambda_check(_flexible_type([1.0, 2.0, 3.0]), expected) self.assert_equal_with_lambda_check(_flexible_type([1, 2, 3.0]), expected) @@ -388,52 +421,91 @@ def test_array(self): # numpy ndarray expected = np.asarray([1, 2, 3]) self.assertSequenceEqual(list(_flexible_type(expected)), list(expected)) - self.assertSequenceEqual(list(from_lambda(expected)), array.array('d', expected)) + self.assertSequenceEqual( + list(from_lambda(expected)), array.array("d", expected) + ) - expected = np.asarray([.1, .2, .3]) + expected = np.asarray([0.1, 0.2, 0.3]) self.assertSequenceEqual(list(_flexible_type(expected)), list(expected)) - self.assertSequenceEqual(list(from_lambda(expected)), array.array('d', expected)) + self.assertSequenceEqual( + list(from_lambda(expected)), array.array("d", expected) + ) def test_dict(self): d = dt.datetime(2010, 10, 10, 10, 10, 10) - img = image.Image(current_file_dir + "/images/nested/sample_grey.jpg","JPG") - expected = {'int': 0, 'float': 0.1, 'str': 'str', - 'list': ['a', 'b', 'c'], 'array': array.array('d', [1, 2, 3]),'datetime':[d], - 'image': img ,'none': None} + img = image.Image(current_file_dir + "/images/nested/sample_grey.jpg", "JPG") + expected = { + "int": 0, + "float": 0.1, + "str": "str", + "list": ["a", "b", "c"], + "array": array.array("d", [1, 2, 3]), + "datetime": [d], + "image": img, + "none": None, + } self.assert_equal_with_lambda_check(_flexible_type(expected), expected) self.assert_equal_with_lambda_check(_flexible_type({}), {}) - expected = [{'a': 1, 'b': 20, 'c': None}, {"b": 4, None: 5}, None, {'a': 0}] + expected = [{"a": 1, "b": 20, "c": None}, {"b": 4, None: 5}, None, {"a": 0}] self.assert_equal_with_lambda_check(_flexible_type(expected), expected) def test_list(self): d = dt.datetime(2010, 10, 10, 10, 10, 10) - img = image.Image(current_file_dir + "/images/nested/sample_grey.jpg","JPG") - expected = [None, img, 1, 0.1, '1',d,array.array('d', [1, 2, 3]), {'foo': array.array('d', [1, 2,3])}] + img = image.Image(current_file_dir + "/images/nested/sample_grey.jpg", "JPG") + expected = [ + None, + img, + 1, + 0.1, + "1", + d, + array.array("d", [1, 2, 3]), + {"foo": array.array("d", [1, 2, 3])}, + ] self.assert_equal_with_lambda_check(_flexible_type(expected), expected) self.assert_equal_with_lambda_check(_flexible_type([]), []) self.assert_equal_with_lambda_check(_flexible_type([[], []]), [[], []]) def test_image(self): - img_gray_jpg = image.Image(current_file_dir + "/images/nested/sample_grey.jpg","JPG") - img_gray_png = image.Image(current_file_dir + "/images/nested/sample_grey.png","PNG") - img_gray_auto_jpg = image.Image(current_file_dir + "/images/nested/sample_grey.jpg") - img_gray_auto_png = image.Image(current_file_dir + "/images/nested/sample_grey.png") - img_color_jpg = image.Image(current_file_dir + "/images/sample.jpg","JPG") - img_color_png = image.Image(current_file_dir + "/images/sample.png","PNG") + img_gray_jpg = image.Image( + current_file_dir + "/images/nested/sample_grey.jpg", "JPG" + ) + img_gray_png = image.Image( + current_file_dir + "/images/nested/sample_grey.png", "PNG" + ) + img_gray_auto_jpg = image.Image( + current_file_dir + "/images/nested/sample_grey.jpg" + ) + img_gray_auto_png = image.Image( + current_file_dir + "/images/nested/sample_grey.png" + ) + img_color_jpg = image.Image(current_file_dir + "/images/sample.jpg", "JPG") + img_color_png = image.Image(current_file_dir + "/images/sample.png", "PNG") img_color_auto_jpg = image.Image(current_file_dir + "/images/sample.jpg") img_color_auto_png = image.Image(current_file_dir + "/images/sample.png") - - self.assert_equal_with_lambda_check(_flexible_type(img_gray_jpg),img_gray_jpg) - self.assert_equal_with_lambda_check(_flexible_type(img_gray_png),img_gray_png) - self.assert_equal_with_lambda_check(_flexible_type(img_gray_auto_jpg),img_gray_auto_jpg) - self.assert_equal_with_lambda_check(_flexible_type(img_gray_auto_png),img_gray_png) - self.assert_equal_with_lambda_check(_flexible_type(img_color_jpg),img_color_jpg) - self.assert_equal_with_lambda_check(_flexible_type(img_color_png),img_color_png) - self.assert_equal_with_lambda_check(_flexible_type(img_color_auto_jpg),img_color_auto_jpg) - self.assert_equal_with_lambda_check(_flexible_type(img_color_auto_png),img_color_auto_png) + self.assert_equal_with_lambda_check(_flexible_type(img_gray_jpg), img_gray_jpg) + self.assert_equal_with_lambda_check(_flexible_type(img_gray_png), img_gray_png) + self.assert_equal_with_lambda_check( + _flexible_type(img_gray_auto_jpg), img_gray_auto_jpg + ) + self.assert_equal_with_lambda_check( + _flexible_type(img_gray_auto_png), img_gray_png + ) + self.assert_equal_with_lambda_check( + _flexible_type(img_color_jpg), img_color_jpg + ) + self.assert_equal_with_lambda_check( + _flexible_type(img_color_png), img_color_png + ) + self.assert_equal_with_lambda_check( + _flexible_type(img_color_auto_jpg), img_color_auto_jpg + ) + self.assert_equal_with_lambda_check( + _flexible_type(img_color_auto_png), img_color_auto_png + ) def test_tr_flex_list(self): expected = [] @@ -443,63 +515,118 @@ def test_tr_flex_list(self): expected = [1, 2, 3, 4, 5, None] self.assert_equal_with_lambda_check(_tr_flex_list(expected), expected) self.assert_equal_with_lambda_check(_tr_flex_list(expected, int), expected) - self.assert_equal_with_lambda_check(_tr_flex_list(expected, int, ignore_cast_failure=True), expected) + self.assert_equal_with_lambda_check( + _tr_flex_list(expected, int, ignore_cast_failure=True), expected + ) # test datetime list from_zone = GMT(0) to_zone = GMT(4.5) d1 = dt.datetime(2010, 10, 10, 10, 10, 10).replace(tzinfo=from_zone) d2 = d1.astimezone(to_zone) - expected = [d1,d2, None] + expected = [d1, d2, None] self.assert_equal_with_lambda_check(_tr_flex_list(expected), expected) - self.assert_equal_with_lambda_check(_tr_flex_list(expected, dt.datetime), expected) - self.assert_equal_with_lambda_check(_tr_flex_list(expected, dt.datetime, ignore_cast_failure=True), expected) + self.assert_equal_with_lambda_check( + _tr_flex_list(expected, dt.datetime), expected + ) + self.assert_equal_with_lambda_check( + _tr_flex_list(expected, dt.datetime, ignore_cast_failure=True), expected + ) # test image list - img_gray_auto_png = image.Image(current_file_dir + "/images/nested/sample_grey.png") - img_color_jpg = image.Image(current_file_dir + "/images/sample.jpg","JPG") + img_gray_auto_png = image.Image( + current_file_dir + "/images/nested/sample_grey.png" + ) + img_color_jpg = image.Image(current_file_dir + "/images/sample.jpg", "JPG") expected = [img_gray_auto_png, img_color_jpg, None] self.assert_equal_with_lambda_check(_tr_flex_list(expected), expected) - self.assert_equal_with_lambda_check(_tr_flex_list(expected, image.Image), expected) - self.assert_equal_with_lambda_check(_tr_flex_list(expected, image.Image, ignore_cast_failure=True), expected) + self.assert_equal_with_lambda_check( + _tr_flex_list(expected, image.Image), expected + ) + self.assert_equal_with_lambda_check( + _tr_flex_list(expected, image.Image, ignore_cast_failure=True), expected + ) # test str list - expected = ['a', 'b', 'c', None] + expected = ["a", "b", "c", None] self.assert_equal_with_lambda_check(_tr_flex_list(expected), expected) self.assert_equal_with_lambda_check(_tr_flex_list(expected, str), expected) # test array list - expected = [array.array('d', range(5)), array.array('d', range(5)), None] + expected = [array.array("d", range(5)), array.array("d", range(5)), None] self.assert_equal_with_lambda_check(_tr_flex_list(expected), expected) - self.assert_equal_with_lambda_check(_tr_flex_list(expected, array.array), expected) + self.assert_equal_with_lambda_check( + _tr_flex_list(expected, array.array), expected + ) expected = [[float(i) for i in range(5)], range(5), None] - self.assert_equal_with_lambda_check(_tr_flex_list(expected), [array.array('d', range(5)), - array.array('d', range(5)), None]) + self.assert_equal_with_lambda_check( + _tr_flex_list(expected), + [array.array("d", range(5)), array.array("d", range(5)), None], + ) # test int array - expected = array.array('d', range(5)) + expected = array.array("d", range(5)) self.assert_equal_with_lambda_check(_tr_flex_list(expected), list(range(5))) - expected = [1, 1.0, '1', [1., 1., 1.], ['a', 'b', 'c'], {}, {'a': 1}, None] - self.assert_equal_with_lambda_check(_tr_flex_list(expected, int, ignore_cast_failure=True), [1, 1, None]) - self.assert_equal_with_lambda_check(_tr_flex_list(expected, float, ignore_cast_failure=True), [1.0, 1.0, None]) + expected = [1, 1.0, "1", [1.0, 1.0, 1.0], ["a", "b", "c"], {}, {"a": 1}, None] + self.assert_equal_with_lambda_check( + _tr_flex_list(expected, int, ignore_cast_failure=True), [1, 1, None] + ) + self.assert_equal_with_lambda_check( + _tr_flex_list(expected, float, ignore_cast_failure=True), [1.0, 1.0, None] + ) # Anything can be cast to a string # self.assert_equal_with_lambda_check(_tr_flex_list(expected, str, ignore_cast_failure=True), ['1', '1', None]) - self.assert_equal_with_lambda_check(_tr_flex_list(expected, array.array, ignore_cast_failure=True), [array.array('d', [1., 1., 1.]), None]) - self.assert_equal_with_lambda_check(_tr_flex_list(expected, list, ignore_cast_failure=True), - [[1., 1., 1.], ['a', 'b', 'c'], None]) - self.assert_equal_with_lambda_check(_tr_flex_list(expected, dict, ignore_cast_failure=True), - [{}, {'a': 1}, None]) + self.assert_equal_with_lambda_check( + _tr_flex_list(expected, array.array, ignore_cast_failure=True), + [array.array("d", [1.0, 1.0, 1.0]), None], + ) + self.assert_equal_with_lambda_check( + _tr_flex_list(expected, list, ignore_cast_failure=True), + [[1.0, 1.0, 1.0], ["a", "b", "c"], None], + ) + self.assert_equal_with_lambda_check( + _tr_flex_list(expected, dict, ignore_cast_failure=True), + [{}, {"a": 1}, None], + ) def test_infer_list_type(self): - self.assertEqual(infer_type_of_list([image.Image(current_file_dir + "/images/nested/sample_grey.png"), image.Image(current_file_dir + "/images/sample.jpg","JPG"), image.Image(current_file_dir + "/images/sample.png") -]), image.Image) - self.assertEqual(infer_type_of_list([dt.datetime(2010, 10, 10, 10, 10, 10), dt.datetime(2000, 5, 7, 10, 4, 10),dt.datetime(1845, 5, 7, 4, 4, 10)]), dt.datetime) + self.assertEqual( + infer_type_of_list( + [ + image.Image(current_file_dir + "/images/nested/sample_grey.png"), + image.Image(current_file_dir + "/images/sample.jpg", "JPG"), + image.Image(current_file_dir + "/images/sample.png"), + ] + ), + image.Image, + ) + self.assertEqual( + infer_type_of_list( + [ + dt.datetime(2010, 10, 10, 10, 10, 10), + dt.datetime(2000, 5, 7, 10, 4, 10), + dt.datetime(1845, 5, 7, 4, 4, 10), + ] + ), + dt.datetime, + ) self.assertEqual(infer_type_of_list([0, 1, 2]), int) self.assertEqual(infer_type_of_list([0, 1, 2.0]), float) - self.assertEqual(infer_type_of_list(['foo', u'bar']), str) - self.assertEqual(infer_type_of_list([array.array('d', [1, 2, 3]), array.array('d', [1, 2, 3])]), array.array) - self.assertEqual(infer_type_of_list([[], [1.0, 2.0, 3.0], array.array('d', [1, 2, 3])]), array.array) - self.assertEqual(infer_type_of_list([[], [1, 2, 3], array.array('d', [1, 2, 3])]), array.array) - self.assertEqual(infer_type_of_list([{'a': 1}, {'b': 2}]), dict) + self.assertEqual(infer_type_of_list(["foo", u"bar"]), str) + self.assertEqual( + infer_type_of_list( + [array.array("d", [1, 2, 3]), array.array("d", [1, 2, 3])] + ), + array.array, + ) + self.assertEqual( + infer_type_of_list([[], [1.0, 2.0, 3.0], array.array("d", [1, 2, 3])]), + array.array, + ) + self.assertEqual( + infer_type_of_list([[], [1, 2, 3], array.array("d", [1, 2, 3])]), + array.array, + ) + self.assertEqual(infer_type_of_list([{"a": 1}, {"b": 2}]), dict) def test_datetime_lambda(self): d = dt.datetime.now() @@ -528,11 +655,11 @@ def test_flexible_type_hint(self): _check_ft_pyobject_hint_path([], list) _check_ft_pyobject_hint_path([1], list) - _check_ft_pyobject_hint_path((1,2), list) + _check_ft_pyobject_hint_path((1, 2), list) - _check_ft_pyobject_hint_path({1:1}, dict) - _check_ft_pyobject_hint_path(array.array('i', [1,2]), array.array) - _check_ft_pyobject_hint_path(array.array('d', [1,2]), array.array) + _check_ft_pyobject_hint_path({1: 1}, dict) + _check_ft_pyobject_hint_path(array.array("i", [1, 2]), array.array) + _check_ft_pyobject_hint_path(array.array("d", [1, 2]), array.array) def test_pytype_from_type_name(self): @@ -549,7 +676,7 @@ def test_pytype_from_type_name(self): def test_type_conversions(self): # testing valid sarray of inf's (inf is a float) sa_all_inf = SArray(["inf", "Inf", "iNf", "inF", "INF"]) - sa_all_inf.astype(float) # should not raise error so we good + sa_all_inf.astype(float) # should not raise error so we good # testing invalid sarray of float words sa_float_words = SArray(["inf", "infiltrate", "nanana", "2.0version"]) with self.assertRaises(RuntimeError): @@ -561,7 +688,7 @@ def test_type_conversions(self): def test_hashable_dict_keys(self): # Make sure that the keys of a dictionary are actually expressable as keys. - sa_dictionary = SArray([{(1,2) : 3}]) + sa_dictionary = SArray([{(1, 2): 3}]) out = list(sa_dictionary) - self.assertEqual(out[0][(1,2)], 3) + self.assertEqual(out[0][(1, 2)], 3) diff --git a/src/python/turicreate/test/test_gl_pickler.py b/src/python/turicreate/test/test_gl_pickler.py index 4a89bc96e3..f618027fb4 100644 --- a/src/python/turicreate/test/test_gl_pickler.py +++ b/src/python/turicreate/test/test_gl_pickler.py @@ -23,7 +23,6 @@ class GLPicklingTest(unittest.TestCase): - def setUp(self): self.filename = str(uuid.uuid4()) self.dir_mode = False @@ -37,10 +36,16 @@ def tearDown(self): def test_pickling_simple_types(self): obj_list = [ - 1, "hello", 5.0, - (1, 2), ("i", "love", "cricket"), - [1, 2, "hello"], [1.3, (1,2), "foo"], ["bar", {"foo": "bar"}], - {"cricket": "best-sport", "test": [1,2,3]}, {"foo": 1.3}, + 1, + "hello", + 5.0, + (1, 2), + ("i", "love", "cricket"), + [1, 2, "hello"], + [1.3, (1, 2), "foo"], + ["bar", {"foo": "bar"}], + {"cricket": "best-sport", "test": [1, 2, 3]}, + {"foo": 1.3}, ] for obj in obj_list: pickler = gl_pickle.GLPickler(self.filename) @@ -49,13 +54,16 @@ def test_pickling_simple_types(self): unpickler = gl_pickle.GLUnpickler(self.filename) obj_ret = unpickler.load() unpickler.close() - assert obj == obj_ret, "Failed pickling in %s (Got back %s)" % (obj, obj_ret) + assert obj == obj_ret, "Failed pickling in %s (Got back %s)" % ( + obj, + obj_ret, + ) def test_pickling_sarray_types(self): sarray_list = [ - tc.SArray([1,2,3]), - tc.SArray([1.0,2.0,3.5]), + tc.SArray([1, 2, 3]), + tc.SArray([1.0, 2.0, 3.5]), tc.SArray(["foo", "bar"]), ] for obj in sarray_list: @@ -65,14 +73,16 @@ def test_pickling_sarray_types(self): unpickler = gl_pickle.GLUnpickler(self.filename) obj_ret = unpickler.load() unpickler.close() - assert list(obj) == list(obj_ret), \ - "Failed pickling in %s (Got back %s)" % (obj, obj_ret) + assert list(obj) == list(obj_ret), "Failed pickling in %s (Got back %s)" % ( + obj, + obj_ret, + ) def test_pickling_sframe_types(self): sarray_list = [ - tc.SFrame([1,2,3]), - tc.SFrame([1.0,2.0,3.5]), + tc.SFrame([1, 2, 3]), + tc.SFrame([1.0, 2.0, 3.5]), tc.SFrame(["foo", "bar"]), ] for obj in sarray_list: @@ -86,23 +96,25 @@ def test_pickling_sframe_types(self): def test_pickling_sgraph_types(self): - sg_test_1 = tc.SGraph().add_vertices([ - tc.Vertex(0, {'fluffy': 1}), - tc.Vertex(1, {'fluffy': 1, 'woof': 1}), - tc.Vertex(2, {})]) + sg_test_1 = tc.SGraph().add_vertices( + [ + tc.Vertex(0, {"fluffy": 1}), + tc.Vertex(1, {"fluffy": 1, "woof": 1}), + tc.Vertex(2, {}), + ] + ) sg_test_2 = tc.SGraph() - sg_test_2 = sg_test_2.add_vertices([ - tc.Vertex(x) for x in [0, 1, 2]]) - sg_test_2 = sg_test_2.add_edges([ - tc.Edge(0, 1, attr={'relationship': 'dislikes'}), - tc.Edge(1, 2, attr={'relationship': 'likes'}), - tc.Edge(1, 0, attr={'relationship': 'likes'})]) - - sarray_list = [ - sg_test_1, - sg_test_2 - ] + sg_test_2 = sg_test_2.add_vertices([tc.Vertex(x) for x in [0, 1, 2]]) + sg_test_2 = sg_test_2.add_edges( + [ + tc.Edge(0, 1, attr={"relationship": "dislikes"}), + tc.Edge(1, 2, attr={"relationship": "likes"}), + tc.Edge(1, 0, attr={"relationship": "likes"}), + ] + ) + + sarray_list = [sg_test_1, sg_test_2] for obj in sarray_list: pickler = gl_pickle.GLPickler(self.filename) pickler.dump(obj) @@ -113,19 +125,21 @@ def test_pickling_sgraph_types(self): assert_sframe_equal(obj.get_vertices(), obj_ret.get_vertices()) assert_sframe_equal(obj.get_edges(), obj_ret.get_edges()) - def test_combination_gl_python_types(self): - sg_test_1 = tc.SGraph().add_vertices([ - tc.Vertex(1, {'fluffy': 1}), - tc.Vertex(2, {'fluffy': 1, 'woof': 1}), - tc.Vertex(3, {})]) - sarray_test_1 = tc.SArray([1,2,3]) - sframe_test_1 = tc.SFrame([1,2,3]) + sg_test_1 = tc.SGraph().add_vertices( + [ + tc.Vertex(1, {"fluffy": 1}), + tc.Vertex(2, {"fluffy": 1, "woof": 1}), + tc.Vertex(3, {}), + ] + ) + sarray_test_1 = tc.SArray([1, 2, 3]) + sframe_test_1 = tc.SFrame([1, 2, 3]) obj_list = [ [sg_test_1, sframe_test_1, sarray_test_1], - {0:sg_test_1, 1:sframe_test_1, 2:sarray_test_1} + {0: sg_test_1, 1: sframe_test_1, 2: sarray_test_1}, ] for obj in obj_list: @@ -138,48 +152,64 @@ def test_combination_gl_python_types(self): assert_sframe_equal(obj[0].get_vertices(), obj_ret[0].get_vertices()) assert_sframe_equal(obj[0].get_edges(), obj_ret[0].get_edges()) assert_sframe_equal(obj[1], obj_ret[1]) - assert list(obj[2]) == list(obj_ret[2]) + assert list(obj[2]) == list(obj_ret[2]) def test_pickle_compatibility(self): obj_list = [ - 1, "hello", 5.0, - (1, 2), ("i", "love", "cricket"), - [1, 2, "hello"], [1.3, (1,2), "foo"], ["bar", {"foo": "bar"}], - {"cricket": "best-sport", "test": [1,2,3]}, {"foo": 1.3}, + 1, + "hello", + 5.0, + (1, 2), + ("i", "love", "cricket"), + [1, 2, "hello"], + [1.3, (1, 2), "foo"], + ["bar", {"foo": "bar"}], + {"cricket": "best-sport", "test": [1, 2, 3]}, + {"foo": 1.3}, ] for obj in obj_list: - file = open(self.filename, 'wb') + file = open(self.filename, "wb") pickler = pickle.Pickler(file) pickler.dump(obj) file.close() unpickler = gl_pickle.GLUnpickler(self.filename) obj_ret = unpickler.load() unpickler.close() - assert obj == obj_ret, \ - "Failed pickling in %s (Got back %s)" % (obj, obj_ret) + assert obj == obj_ret, "Failed pickling in %s (Got back %s)" % ( + obj, + obj_ret, + ) def test_cloud_pickle_compatibility(self): obj_list = [ - 1, "hello", 5.0, - (1, 2), ("i", "love", "cricket"), - [1, 2, "hello"], [1.3, (1,2), "foo"], ["bar", {"foo": "bar"}], - {"cricket": "best-sport", "test": [1,2,3]}, {"foo": 1.3}, + 1, + "hello", + 5.0, + (1, 2), + ("i", "love", "cricket"), + [1, 2, "hello"], + [1.3, (1, 2), "foo"], + ["bar", {"foo": "bar"}], + {"cricket": "best-sport", "test": [1, 2, 3]}, + {"foo": 1.3}, ] for obj in obj_list: - file = open(self.filename, 'wb') + file = open(self.filename, "wb") pickler = _cloudpickle.CloudPickler(file) pickler.dump(obj) file.close() unpickler = gl_pickle.GLUnpickler(self.filename) obj_ret = unpickler.load() unpickler.close() - assert obj == obj_ret, \ - "Failed pickling in %s (Got back %s)" % (obj, obj_ret) + assert obj == obj_ret, "Failed pickling in %s (Got back %s)" % ( + obj, + obj_ret, + ) def test_relative_path(self): # Arrange sf1 = tc.SFrame(range(10)) - relative_path = 'tmp/%s' % self.filename + relative_path = "tmp/%s" % self.filename # Act pickler = gl_pickle.GLPickler(relative_path) @@ -198,8 +228,8 @@ def test_relative_path(self): def test_save_over_previous(self): sarray_list = [ - tc.SFrame([1,2,3]), - tc.SFrame([1.0,2.0,3.5]), + tc.SFrame([1, 2, 3]), + tc.SFrame([1.0, 2.0, 3.5]), tc.SFrame(["foo", "bar"]), ] for obj in sarray_list: diff --git a/src/python/turicreate/test/test_graph.py b/src/python/turicreate/test/test_graph.py index d10982bcd2..5c1d3bd780 100644 --- a/src/python/turicreate/test/test_graph.py +++ b/src/python/turicreate/test/test_graph.py @@ -19,72 +19,139 @@ import os import sys + if sys.version_info.major > 2: unittest.TestCase.assertItemsEqual = unittest.TestCase.assertCountEqual + class GraphTests(unittest.TestCase): def setUp(self): - self.vertices = pd.DataFrame({ - 'vid': ['1', '2', '3'], - 'color': ['g', None, 'b'], - 'vec': [[.1, .1, .1], [.1, .1, .1], [.1, .1, .1]]}) - self.edges = pd.DataFrame({ - 'src_id': ['1', '2', '3'], - 'dst_id': ['2', '3', '4'], - 'weight': [0., None, 1.]}) + self.vertices = pd.DataFrame( + { + "vid": ["1", "2", "3"], + "color": ["g", None, "b"], + "vec": [[0.1, 0.1, 0.1], [0.1, 0.1, 0.1], [0.1, 0.1, 0.1]], + } + ) + self.edges = pd.DataFrame( + { + "src_id": ["1", "2", "3"], + "dst_id": ["2", "3", "4"], + "weight": [0.0, None, 1.0], + } + ) def test_empty_graph(self): g = SGraph() - self.assertEqual(g.summary(), {'num_vertices': 0, 'num_edges': 0}) + self.assertEqual(g.summary(), {"num_vertices": 0, "num_edges": 0}) self.assertEqual(len(g.get_fields()), 3) - self.assertTrue(g.get_vertices(format='sframe').shape, (0, 1)) - self.assertTrue(g.get_edges(format='sframe').shape, (0, 2)) + self.assertTrue(g.get_vertices(format="sframe").shape, (0, 1)) + self.assertTrue(g.get_edges(format="sframe").shape, (0, 2)) self.assertTrue(g.vertices.shape, (0, 1)) self.assertTrue(g.edges.shape, (0, 2)) - self.assertTrue(len(g.get_vertices(format='list')) == 0) - self.assertTrue(len(g.get_edges(format='list')) == 0) + self.assertTrue(len(g.get_vertices(format="list")) == 0) + self.assertTrue(len(g.get_edges(format="list")) == 0) def test_graph_constructor(self): - g = SGraph().add_vertices(self.vertices, 'vid').add_edges(self.edges, 'src_id', 'dst_id') + g = ( + SGraph() + .add_vertices(self.vertices, "vid") + .add_edges(self.edges, "src_id", "dst_id") + ) g2 = SGraph(g.vertices, g.edges) - g3 = SGraph(g.vertices, g.edges, src_field="__dst_id", dst_field="__src_id") #flip around src and dst - assert_frame_equal(g.vertices.to_dataframe().sort_values('__id').reset_index(drop=True), - g2.vertices.to_dataframe().sort_values('__id').reset_index(drop=True)) - assert_frame_equal(g.edges.to_dataframe().sort_values(['__src_id', '__dst_id']).reset_index(drop=True), - g2.edges.to_dataframe().sort_values(['__src_id', '__dst_id']).reset_index(drop=True)) - self.assertRaises(ValueError, lambda: SGraph(SFrame(self.vertices), SFrame(self.edges))) - self.assertRaises(ValueError, lambda: SGraph(SFrame(self.vertices), SFrame(self.edges), 'vid', '__src_id', '__dst_id')) - self.assertRaises(ValueError, lambda: SGraph(SFrame(self.vertices), SFrame(self.edges), vid_field=None, src_field='src_id', dst_field='dst_id')) + g3 = SGraph( + g.vertices, g.edges, src_field="__dst_id", dst_field="__src_id" + ) # flip around src and dst + assert_frame_equal( + g.vertices.to_dataframe().sort_values("__id").reset_index(drop=True), + g2.vertices.to_dataframe().sort_values("__id").reset_index(drop=True), + ) + assert_frame_equal( + g.edges.to_dataframe() + .sort_values(["__src_id", "__dst_id"]) + .reset_index(drop=True), + g2.edges.to_dataframe() + .sort_values(["__src_id", "__dst_id"]) + .reset_index(drop=True), + ) + self.assertRaises( + ValueError, lambda: SGraph(SFrame(self.vertices), SFrame(self.edges)) + ) + self.assertRaises( + ValueError, + lambda: SGraph( + SFrame(self.vertices), SFrame(self.edges), "vid", "__src_id", "__dst_id" + ), + ) + self.assertRaises( + ValueError, + lambda: SGraph( + SFrame(self.vertices), + SFrame(self.edges), + vid_field=None, + src_field="src_id", + dst_field="dst_id", + ), + ) def test_simple_graph(self): for input_type in [pd.DataFrame, SFrame, list]: g = SGraph() if input_type is list: - vertices = [Vertex(x[1]['vid'], {'color': x[1]['color'], 'vec': x[1]['vec']}) for x in self.vertices.iterrows()] - edges = [Edge(x[1]['src_id'], x[1]['dst_id'], {'weight': x[1]['weight']}) for x in self.edges.iterrows()] + vertices = [ + Vertex(x[1]["vid"], {"color": x[1]["color"], "vec": x[1]["vec"]}) + for x in self.vertices.iterrows() + ] + edges = [ + Edge(x[1]["src_id"], x[1]["dst_id"], {"weight": x[1]["weight"]}) + for x in self.edges.iterrows() + ] g = g.add_vertices(vertices) g = g.add_edges(edges) else: - g = g.add_vertices(input_type(self.vertices), vid_field='vid') - g = g.add_edges(input_type(self.edges), src_field='src_id', dst_field='dst_id') - self.assertEqual(g.summary(), {'num_vertices': 4, 'num_edges': 3}) - self.assertItemsEqual(g.get_fields(), ['__id', '__src_id', '__dst_id', 'color', 'vec', 'weight']) - self.assertItemsEqual(g.get_vertices(format='dataframe').columns.values, ['color', 'vec']) - self.assertItemsEqual(g.get_edges(format='dataframe').columns.values, ['__src_id', '__dst_id', 'weight']) - self.assertTrue(g.get_edges(format='dataframe').shape, (3, 3)) - self.assertTrue(g.get_vertices(format='dataframe').shape, (4, 3)) - self.assertTrue(g.get_vertices(format='dataframe', fields={'color': 'g'}).shape, (1, 2)) - self.assertTrue(g.get_edges(format='dataframe', fields={'weight': 0.}).shape, (1, 3)) - - self.assertItemsEqual(g.get_vertices(format='sframe').column_names(), ['__id', 'color', 'vec']) - self.assertItemsEqual(g.get_edges(format='sframe').column_names(), ['__src_id', '__dst_id', 'weight']) - self.assertTrue(g.get_edges(format='sframe').shape, (3, 3)) - self.assertTrue(g.get_vertices(format='sframe').shape, (4, 3)) - self.assertTrue(g.get_vertices(format='sframe', fields={'color': 'g'}).shape, (1, 2)) - self.assertTrue(g.get_edges(format='sframe', fields={'weight': 0.}).shape, (1, 3)) - - vertices = g.get_vertices(format='list') - edges = g.get_edges(format='list') + g = g.add_vertices(input_type(self.vertices), vid_field="vid") + g = g.add_edges( + input_type(self.edges), src_field="src_id", dst_field="dst_id" + ) + self.assertEqual(g.summary(), {"num_vertices": 4, "num_edges": 3}) + self.assertItemsEqual( + g.get_fields(), + ["__id", "__src_id", "__dst_id", "color", "vec", "weight"], + ) + self.assertItemsEqual( + g.get_vertices(format="dataframe").columns.values, ["color", "vec"] + ) + self.assertItemsEqual( + g.get_edges(format="dataframe").columns.values, + ["__src_id", "__dst_id", "weight"], + ) + self.assertTrue(g.get_edges(format="dataframe").shape, (3, 3)) + self.assertTrue(g.get_vertices(format="dataframe").shape, (4, 3)) + self.assertTrue( + g.get_vertices(format="dataframe", fields={"color": "g"}).shape, (1, 2) + ) + self.assertTrue( + g.get_edges(format="dataframe", fields={"weight": 0.0}).shape, (1, 3) + ) + + self.assertItemsEqual( + g.get_vertices(format="sframe").column_names(), ["__id", "color", "vec"] + ) + self.assertItemsEqual( + g.get_edges(format="sframe").column_names(), + ["__src_id", "__dst_id", "weight"], + ) + self.assertTrue(g.get_edges(format="sframe").shape, (3, 3)) + self.assertTrue(g.get_vertices(format="sframe").shape, (4, 3)) + self.assertTrue( + g.get_vertices(format="sframe", fields={"color": "g"}).shape, (1, 2) + ) + self.assertTrue( + g.get_edges(format="sframe", fields={"weight": 0.0}).shape, (1, 3) + ) + + vertices = g.get_vertices(format="list") + edges = g.get_edges(format="list") self.assertEqual(len(vertices), 4) self.assertEqual(len(edges), 3) @@ -93,91 +160,125 @@ def test_simple_graph(self): self.assertFalse(edges.__is_materialized__()) def test_vertex_query(self): - df = pd.DataFrame({'src': ['a', 'c', 'b', 'd', 'c', 'e', 'g', 'f'], - 'dst': ['b', 'b', 'd', 'c', 'e', 'g', 'f', 'e']}) - g = SGraph().add_edges(df, src_field='src', dst_field='dst') + df = pd.DataFrame( + { + "src": ["a", "c", "b", "d", "c", "e", "g", "f"], + "dst": ["b", "b", "d", "c", "e", "g", "f", "e"], + } + ) + g = SGraph().add_edges(df, src_field="src", dst_field="dst") # basic check - g2 = g.get_neighborhood(ids=['b'], radius=1, full_subgraph=False) - out = g2.get_edges(format='dataframe') - out.sort_values(by=['__src_id', '__dst_id'], axis=0, inplace=True) + g2 = g.get_neighborhood(ids=["b"], radius=1, full_subgraph=False) + out = g2.get_edges(format="dataframe") + out.sort_values(by=["__src_id", "__dst_id"], axis=0, inplace=True) out.index = range(len(out)) - correct = pd.DataFrame.from_records([('b', 'd'), - ('a', 'b'), - ('c', 'b')], - columns=['__src_id', '__dst_id']) - correct.sort_values(by=['__src_id', '__dst_id'], axis=0, inplace=True) + correct = pd.DataFrame.from_records( + [("b", "d"), ("a", "b"), ("c", "b")], columns=["__src_id", "__dst_id"] + ) + correct.sort_values(by=["__src_id", "__dst_id"], axis=0, inplace=True) correct.index = range(len(correct)) assert_frame_equal(out, correct, check_dtype=False) # check larger radius, full subgraph, and multiple vertices - g2 = g.get_neighborhood(ids=['a', 'g'], radius=2, full_subgraph=True) - out = g2.get_edges(format='dataframe') - out.sort_values(by=['__src_id', '__dst_id'], axis=0, inplace=True) + g2 = g.get_neighborhood(ids=["a", "g"], radius=2, full_subgraph=True) + out = g2.get_edges(format="dataframe") + out.sort_values(by=["__src_id", "__dst_id"], axis=0, inplace=True) out.index = range(len(out)) - correct = pd.DataFrame.from_records([('a', 'b'), - ('b', 'd'), - ('c', 'b'), - ('c', 'e'), - ('d', 'c'), - ('e', 'g'), - ('f', 'e'), - ('g', 'f')], - columns=['__src_id', '__dst_id']) - correct.sort_values(by=['__src_id', '__dst_id'], axis=0, inplace=True) + correct = pd.DataFrame.from_records( + [ + ("a", "b"), + ("b", "d"), + ("c", "b"), + ("c", "e"), + ("d", "c"), + ("e", "g"), + ("f", "e"), + ("g", "f"), + ], + columns=["__src_id", "__dst_id"], + ) + correct.sort_values(by=["__src_id", "__dst_id"], axis=0, inplace=True) correct.index = range(len(correct)) assert_frame_equal(out, correct, check_dtype=False) def test_select_query(self): g = SGraph() - g = g.add_vertices(self.vertices, 'vid').add_edges(self.edges, 'src_id', 'dst_id') + g = g.add_vertices(self.vertices, "vid").add_edges( + self.edges, "src_id", "dst_id" + ) g2 = g.select_fields(["color", "weight"]) - self.assertSequenceEqual((g2.get_fields()), ['__id', 'color', '__src_id', '__dst_id', 'weight']) + self.assertSequenceEqual( + (g2.get_fields()), ["__id", "color", "__src_id", "__dst_id", "weight"] + ) g2 = g.select_fields(["color"]) - self.assertSequenceEqual((g2.get_fields()), ['__id', 'color', '__src_id', '__dst_id']) - del g.edges['weight'] - del g.vertices['vec'] - g.vertices['color2'] = g.vertices['color'] - self.assertSequenceEqual((g.get_fields()), ['__id', 'color', 'color2', '__src_id', '__dst_id']) + self.assertSequenceEqual( + (g2.get_fields()), ["__id", "color", "__src_id", "__dst_id"] + ) + del g.edges["weight"] + del g.vertices["vec"] + g.vertices["color2"] = g.vertices["color"] + self.assertSequenceEqual( + (g.get_fields()), ["__id", "color", "color2", "__src_id", "__dst_id"] + ) g2 = g.select_fields([]) - self.assertSequenceEqual((g2.get_fields()), ['__id', '__src_id', '__dst_id']) + self.assertSequenceEqual((g2.get_fields()), ["__id", "__src_id", "__dst_id"]) def test_select_query_with_same_vertex_edge_field(self): - vertices = SFrame({'__id': range(10)}) - edges = SFrame({'__src_id': range(10), '__dst_id': range(1, 11)}) + vertices = SFrame({"__id": range(10)}) + edges = SFrame({"__src_id": range(10), "__dst_id": range(1, 11)}) g = SGraph(vertices, edges) - g.vertices['weight'] = 0 - g.vertices['v'] = 0 - g.edges['weight'] = 0 - g.edges['e'] = 0 - self.assertItemsEqual(g.get_fields(), ['v', 'e', 'weight', 'weight', '__id', '__src_id', '__dst_id']) - g2 = g.select_fields('weight') - self.assertItemsEqual(g2.get_fields(), ['weight', 'weight', '__id', '__src_id', '__dst_id']) + g.vertices["weight"] = 0 + g.vertices["v"] = 0 + g.edges["weight"] = 0 + g.edges["e"] = 0 + self.assertItemsEqual( + g.get_fields(), + ["v", "e", "weight", "weight", "__id", "__src_id", "__dst_id"], + ) + g2 = g.select_fields("weight") + self.assertItemsEqual( + g2.get_fields(), ["weight", "weight", "__id", "__src_id", "__dst_id"] + ) def test_save_load(self): - g = SGraph().add_vertices(self.vertices, 'vid').add_edges(self.edges, 'src_id', 'dst_id') + g = ( + SGraph() + .add_vertices(self.vertices, "vid") + .add_edges(self.edges, "src_id", "dst_id") + ) with util.TempDirectory() as f: g.save(f) - g2 = load_sgraph(f, 'binary') - self.assertEqual(g2.summary(), {'num_vertices': 4, 'num_edges': 3}) - self.assertItemsEqual(g2.get_fields(), {'__id', '__src_id', '__dst_id', 'color', 'vec', 'weight'}) + g2 = load_sgraph(f, "binary") + self.assertEqual(g2.summary(), {"num_vertices": 4, "num_edges": 3}) + self.assertItemsEqual( + g2.get_fields(), + {"__id", "__src_id", "__dst_id", "color", "vec", "weight"}, + ) with util.TempDirectory() as f: - g.save(f, format='csv') + g.save(f, format="csv") vertices = SFrame.read_csv(f + "/vertices.csv") edges = SFrame.read_csv(f + "/edges.csv") - g2 = SGraph().add_edges(edges, '__src_id', '__dst_id').add_vertices(vertices, '__id') - self.assertEqual(g2.summary(), {'num_vertices': 4, 'num_edges': 3}) - self.assertItemsEqual(g2.get_fields(), {'__id', '__src_id', '__dst_id', 'color', 'vec', 'weight'}) + g2 = ( + SGraph() + .add_edges(edges, "__src_id", "__dst_id") + .add_vertices(vertices, "__id") + ) + self.assertEqual(g2.summary(), {"num_vertices": 4, "num_edges": 3}) + self.assertItemsEqual( + g2.get_fields(), + {"__id", "__src_id", "__dst_id", "color", "vec", "weight"}, + ) temp_fn = None # The delete=False is for Windows sake - with tempfile.NamedTemporaryFile(suffix='.json', delete=False) as f: + with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as f: temp_fn = f.name g.save(f.name) - with open(f.name, 'r') as f2: + with open(f.name, "r") as f2: data = f2.read() g2 = json.loads(data) self.assertTrue("vertices" in g2) @@ -209,16 +310,24 @@ def test_load_graph_from_text(self): 3,2""" temp_fnames = [] - with tempfile.NamedTemporaryFile(mode="w", delete=False) as fsnap, tempfile.NamedTemporaryFile(mode="w", delete=False) as ftsv, tempfile.NamedTemporaryFile(mode="w", delete=False) as fcsv: + with tempfile.NamedTemporaryFile( + mode="w", delete=False + ) as fsnap, tempfile.NamedTemporaryFile( + mode="w", delete=False + ) as ftsv, tempfile.NamedTemporaryFile( + mode="w", delete=False + ) as fcsv: fsnap.write(toy_graph_snap) fsnap.file.flush() ftsv.write(toy_graph_tsv) ftsv.file.flush() fcsv.write(toy_graph_csv) fcsv.file.flush() - for (fname, fmt) in zip([fsnap.name, ftsv.name, fcsv.name], ['snap', 'tsv', 'csv']): + for (fname, fmt) in zip( + [fsnap.name, ftsv.name, fcsv.name], ["snap", "tsv", "csv"] + ): g = load_sgraph(fname, fmt) - self.assertEqual(g.summary(), {'num_vertices': 3, 'num_edges': 6}) + self.assertEqual(g.summary(), {"num_vertices": 3, "num_edges": 6}) temp_fnames.append(fname) for name in temp_fnames: @@ -226,41 +335,56 @@ def test_load_graph_from_text(self): os.remove(name) def test_robust_parse(self): - df = pd.DataFrame({'int': [1, 2, 3], - 'float': [1., 2., 3.], - 'str': ['one', 'two', 'three'], - 'nan': [np.nan, np.nan, np.nan], - 'sparse_int': [1, 2, np.nan], - 'sparse_float': [np.nan, 2., 3.], - 'sparse_str': [None, 'two', None] - }) + df = pd.DataFrame( + { + "int": [1, 2, 3], + "float": [1.0, 2.0, 3.0], + "str": ["one", "two", "three"], + "nan": [np.nan, np.nan, np.nan], + "sparse_int": [1, 2, np.nan], + "sparse_float": [np.nan, 2.0, 3.0], + "sparse_str": [None, "two", None], + } + ) g = SGraph().add_vertices(df) - self.assertItemsEqual(g.get_fields(), df.columns.tolist() + ['__id', '__src_id', '__dst_id']) + self.assertItemsEqual( + g.get_fields(), df.columns.tolist() + ["__id", "__src_id", "__dst_id"] + ) - df2 = g.get_vertices(format='dataframe') - sf = g.get_vertices(format='sframe') + df2 = g.get_vertices(format="dataframe") + sf = g.get_vertices(format="sframe") for col in df.columns: # potential bug: df2 is missing the 'nan' column. - if (col != 'nan'): - self.assertItemsEqual(sorted(list(df2[col].dropna())), sorted(list(df[col].dropna()))) - self.assertItemsEqual(sorted(list(sf[col].dropna())), sorted(list(df[col].dropna()))) + if col != "nan": + self.assertItemsEqual( + sorted(list(df2[col].dropna())), sorted(list(df[col].dropna())) + ) + self.assertItemsEqual( + sorted(list(sf[col].dropna())), sorted(list(df[col].dropna())) + ) def test_missing_value_vids(self): vertices = SFrame() - vertices['vid'] = [1, 2, 3, None] + vertices["vid"] = [1, 2, 3, None] edges = SFrame() - edges['src'] = [1, 2, 3, None] - edges['dst'] = [4, 4, 4, 4] - self.assertRaises(RuntimeError, lambda : SGraph().add_vertices(vertices, 'vid').summary()) - self.assertRaises(RuntimeError, lambda : SGraph().add_edges(edges, 'src', 'dst').summary()) - self.assertRaises(RuntimeError, lambda : SGraph().add_edges(edges, 'dst', 'src').summary()) + edges["src"] = [1, 2, 3, None] + edges["dst"] = [4, 4, 4, 4] + self.assertRaises( + RuntimeError, lambda: SGraph().add_vertices(vertices, "vid").summary() + ) + self.assertRaises( + RuntimeError, lambda: SGraph().add_edges(edges, "src", "dst").summary() + ) + self.assertRaises( + RuntimeError, lambda: SGraph().add_edges(edges, "dst", "src").summary() + ) def test_gframe(self): g = SGraph() v = g.vertices - self.assertSequenceEqual(v.column_names(), ['__id']) + self.assertSequenceEqual(v.column_names(), ["__id"]) e = g.edges - self.assertSequenceEqual(e.column_names(), ['__src_id', '__dst_id']) + self.assertSequenceEqual(e.column_names(), ["__src_id", "__dst_id"]) # Test vertices and edge attributes cannot be modified def set_vertices_empty(g): @@ -284,37 +408,61 @@ def remove_edge_column(gf, name): self.assertRaises(AttributeError, lambda: set_edges_empty(g)) # Test gframe operations has the same effect as its sframe+graph equivalent - g = SGraph().add_vertices(self.vertices, 'vid').add_edges(self.edges, 'src_id', 'dst_id') + g = ( + SGraph() + .add_vertices(self.vertices, "vid") + .add_edges(self.edges, "src_id", "dst_id") + ) v = g.vertices - v['id_col'] = v['__id'] + v["id_col"] = v["__id"] e = g.edges - e['src_id_col'] = e['__src_id'] - e['dst_id_col'] = e['__dst_id'] - g2 = SGraph().add_vertices(self.vertices, 'vid').add_edges(self.edges, 'src_id', 'dst_id') + e["src_id_col"] = e["__src_id"] + e["dst_id_col"] = e["__dst_id"] + g2 = ( + SGraph() + .add_vertices(self.vertices, "vid") + .add_edges(self.edges, "src_id", "dst_id") + ) new_vdata = g2.get_vertices() - new_vdata['id_col'] = new_vdata['__id'] + new_vdata["id_col"] = new_vdata["__id"] new_edata = g2.get_edges() - new_edata['src_id_col'] = new_edata['__src_id'] - new_edata['dst_id_col'] = new_edata['__dst_id'] - g2 = SGraph().add_vertices(new_vdata, '__id').add_edges(new_edata, '__src_id', '__dst_id') - assert_frame_equal(g.get_vertices().to_dataframe().sort_values('__id').reset_index(drop=True), - g2.get_vertices().to_dataframe().sort_values('__id').reset_index(drop=True)) - assert_frame_equal(g.get_edges().to_dataframe().sort_values(['__src_id', '__dst_id']).reset_index(drop=True), - g2.get_edges().to_dataframe().sort_values(['__src_id', '__dst_id']).reset_index(drop=True)) + new_edata["src_id_col"] = new_edata["__src_id"] + new_edata["dst_id_col"] = new_edata["__dst_id"] + g2 = ( + SGraph() + .add_vertices(new_vdata, "__id") + .add_edges(new_edata, "__src_id", "__dst_id") + ) + assert_frame_equal( + g.get_vertices().to_dataframe().sort_values("__id").reset_index(drop=True), + g2.get_vertices().to_dataframe().sort_values("__id").reset_index(drop=True), + ) + assert_frame_equal( + g.get_edges() + .to_dataframe() + .sort_values(["__src_id", "__dst_id"]) + .reset_index(drop=True), + g2.get_edges() + .to_dataframe() + .sort_values(["__src_id", "__dst_id"]) + .reset_index(drop=True), + ) # check delete a column with exception, and edges is still in a valid state - self.assertRaises(KeyError, lambda: remove_edge_column(g.edges, 'badcolumn')) + self.assertRaises(KeyError, lambda: remove_edge_column(g.edges, "badcolumn")) g.edges.head() # test slicing assert_frame_equal(g.edges[:3].to_dataframe(), g.get_edges()[:3].to_dataframe()) - assert_frame_equal(g.vertices[:3].to_dataframe(), g.get_vertices()[:3].to_dataframe()) + assert_frame_equal( + g.vertices[:3].to_dataframe(), g.get_vertices()[:3].to_dataframe() + ) # test add row number e_expected = g.get_edges().to_dataframe() v_expected = g.get_vertices().to_dataframe() - e_expected['id'] = range(len(e_expected)) - v_expected['id'] = range(len(v_expected)) + e_expected["id"] = range(len(e_expected)) + v_expected["id"] = range(len(v_expected)) def test_sframe_le_append_skip_row_bug_is_fixed(self): """ @@ -331,9 +479,14 @@ def test_sframe_le_append_skip_row_bug_is_fixed(self): n = 12 # smallest n to repro the le_append bug # A graph with edge i -> i + 1 - g = SGraph().add_edges(SFrame({'src': range(n), 'dst': range(1, n + 1)}), 'src', 'dst') + g = SGraph().add_edges( + SFrame({"src": range(n), "dst": range(1, n + 1)}), "src", "dst" + ) lazy_sf = g.get_edges() materialized_sf = g.get_edges() materialized_sf.materialize() - assert_frame_equal(lazy_sf[lazy_sf['__dst_id'] == n].to_dataframe(), materialized_sf[materialized_sf['__dst_id'] == n].to_dataframe()) + assert_frame_equal( + lazy_sf[lazy_sf["__dst_id"] == n].to_dataframe(), + materialized_sf[materialized_sf["__dst_id"] == n].to_dataframe(), + ) diff --git a/src/python/turicreate/test/test_graph_analytics.py b/src/python/turicreate/test/test_graph_analytics.py index dad3d298f5..706e13e3aa 100644 --- a/src/python/turicreate/test/test_graph_analytics.py +++ b/src/python/turicreate/test/test_graph_analytics.py @@ -19,16 +19,18 @@ from turicreate.data_structures.sframe import SFrame import sys + if sys.version_info.major == 3: unittest.TestCase.assertItemsEqual = unittest.TestCase.assertCountEqual dataset_server = "http://testdatasets.s3-website-us-west-2.amazonaws.com/" + class GraphAnalyticsTest(unittest.TestCase): @classmethod def setUpClass(cls): url = dataset_server + "p2p-Gnutella04.txt.gz" - cls.graph = tc.load_sgraph(url, format='snap') + cls.graph = tc.load_sgraph(url, format="snap") def __test_model_save_load_helper__(self, model): with util.TempDirectory() as f: @@ -37,8 +39,12 @@ def __test_model_save_load_helper__(self, model): self.assertItemsEqual(model._list_fields(), m2._list_fields()) for key in model._list_fields(): if type(model._get(key)) is SGraph: - self.assertItemsEqual(model._get(key).summary(), m2._get(key).summary()) - self.assertItemsEqual(model._get(key).get_fields(), m2._get(key).get_fields()) + self.assertItemsEqual( + model._get(key).summary(), m2._get(key).summary() + ) + self.assertItemsEqual( + model._get(key).get_fields(), m2._get(key).get_fields() + ) elif type(model._get(key)) is SFrame: sf1 = model._get(key) sf2 = m2._get(key) @@ -52,7 +58,7 @@ def __test_model_save_load_helper__(self, model): df2 = df2.set_index(df2.columns[0]) assert_frame_equal(df1, df2) else: - if (type(model._get(key)) is pd.DataFrame): + if type(model._get(key)) is pd.DataFrame: assert_frame_equal(model._get(key), m2._get(key)) else: self.assertEqual(model._get(key), m2._get(key)) @@ -64,20 +70,30 @@ def test_degree_count(self): self.__test_model_save_load_helper__(m) g = m.graph - expected_out_deg = g.edges.groupby('__src_id', {'expected': tc.aggregate.COUNT}) - expected_out_deg = expected_out_deg.join(g.vertices[['__id']], on={'__src_id': "__id"}, how="right").fillna("expected", 0) - expected_out_deg = expected_out_deg.sort("__src_id")['expected'] - expected_in_deg = g.edges.groupby('__dst_id', {'expected': tc.aggregate.COUNT}) - expected_in_deg = expected_in_deg.join(g.vertices[['__id']], on={'__dst_id': "__id"}, how="right").fillna("expected", 0) - expected_in_deg = expected_in_deg.sort("__dst_id")['expected'] - - sf = g.vertices.sort('__id') - actual_out_deg = sf['out_degree'] - actual_in_deg = sf['in_degree'] - actual_all_deg = sf['total_degree'] + expected_out_deg = g.edges.groupby( + "__src_id", {"expected": tc.aggregate.COUNT} + ) + expected_out_deg = expected_out_deg.join( + g.vertices[["__id"]], on={"__src_id": "__id"}, how="right" + ).fillna("expected", 0) + expected_out_deg = expected_out_deg.sort("__src_id")["expected"] + expected_in_deg = g.edges.groupby( + "__dst_id", {"expected": tc.aggregate.COUNT} + ) + expected_in_deg = expected_in_deg.join( + g.vertices[["__id"]], on={"__dst_id": "__id"}, how="right" + ).fillna("expected", 0) + expected_in_deg = expected_in_deg.sort("__dst_id")["expected"] + + sf = g.vertices.sort("__id") + actual_out_deg = sf["out_degree"] + actual_in_deg = sf["in_degree"] + actual_all_deg = sf["total_degree"] self.assertEqual((expected_in_deg - actual_in_deg).sum(), 0) self.assertEqual((expected_out_deg - actual_out_deg).sum(), 0) - self.assertEqual((actual_all_deg - (actual_out_deg + actual_in_deg)).sum(), 0) + self.assertEqual( + (actual_all_deg - (actual_out_deg + actual_in_deg)).sum(), 0 + ) def test_label_propagation(self): if "label_propagation" in get_unity().list_toolkit_functions(): @@ -92,64 +108,81 @@ def get_label(vid): return 1 else: return None - g.vertices['label'] = g.vertices['__id'].apply(get_label, int) - m = tc.label_propagation.create(g, label_field='label') + + g.vertices["label"] = g.vertices["__id"].apply(get_label, int) + m = tc.label_propagation.create(g, label_field="label") m.summary() self.__test_model_save_load_helper__(m) for row in m.graph.vertices: - predicted_label = row['predicted_label'] + predicted_label = row["predicted_label"] if predicted_label is None: - for k in ['P%d' % i for i in range(num_classes)]: + for k in ["P%d" % i for i in range(num_classes)]: self.assertAlmostEqual(row[k], 1.0 / num_classes) else: sum_of_prob = 0.0 - for k in ['P%d' % i for i in range(num_classes)]: + for k in ["P%d" % i for i in range(num_classes)]: sum_of_prob += row[k] - self.assertGreaterEqual(row['P%d' % predicted_label], row[k]) + self.assertGreaterEqual(row["P%d" % predicted_label], row[k]) self.assertAlmostEqual(sum_of_prob, 1.0) # Add more options: weighted edges, change self weight, and undirected edges def get_edge_weight(vid): return float(vid) * 10 / num_vertices - g.edges['weight'] = g.edges['__src_id'].apply(get_edge_weight, float) - m = tc.label_propagation.create(g, label_field='label', threshold=1e-2, - weight_field='weight', self_weight=0.5, - undirected=True) + + g.edges["weight"] = g.edges["__src_id"].apply(get_edge_weight, float) + m = tc.label_propagation.create( + g, + label_field="label", + threshold=1e-2, + weight_field="weight", + self_weight=0.5, + undirected=True, + ) # Test early termination using max_iteration max_iter = 3 - m = tc.label_propagation.create(g, label_field='label', threshold=1e-10, max_iterations=max_iter) + m = tc.label_propagation.create( + g, label_field="label", threshold=1e-10, max_iterations=max_iter + ) self.assertEqual(m.num_iterations, max_iter) # Test that the predict class should be None if all class probabilities are equal - g = g.add_vertices(tc.SFrame({'__id': [-1]})) - m = tc.label_propagation.create(g, label_field='label', threshold=1e-10, max_iterations=max_iter) + g = g.add_vertices(tc.SFrame({"__id": [-1]})) + m = tc.label_propagation.create( + g, label_field="label", threshold=1e-10, max_iterations=max_iter + ) result = m.graph.vertices - self.assertEqual(result[result['__id'] == -1]['predicted_label'][0], None) + self.assertEqual(result[result["__id"] == -1]["predicted_label"][0], None) def test_pagerank(self): if "pagerank" in get_unity().list_toolkit_functions(): m = tc.pagerank.create(self.graph) print(m) m.summary() - self.assertEqual((m.pagerank.num_rows(), m.pagerank.num_columns()), - (self.graph.summary()['num_vertices'], 3)) - self.assertEqual(int(m.pagerank['pagerank'].sum()), 2727) + self.assertEqual( + (m.pagerank.num_rows(), m.pagerank.num_columns()), + (self.graph.summary()["num_vertices"], 3), + ) + self.assertEqual(int(m.pagerank["pagerank"].sum()), 2727) self.__test_model_save_load_helper__(m) m2 = tc.pagerank.create(self.graph, reset_probability=0.5) print(m2) - self.assertEqual((m2.pagerank.num_rows(), m2.pagerank.num_columns()), - (self.graph.summary()['num_vertices'], 3)) - self.assertAlmostEqual(m2.pagerank['pagerank'].sum(), 7087.08, delta=1e-2) + self.assertEqual( + (m2.pagerank.num_rows(), m2.pagerank.num_columns()), + (self.graph.summary()["num_vertices"], 3), + ) + self.assertAlmostEqual(m2.pagerank["pagerank"].sum(), 7087.08, delta=1e-2) with self.assertRaises(Exception): - assert_frame_equal(m.pagerank.topk('pagerank'), m2.pagerank.topk('pagerank')) + assert_frame_equal( + m.pagerank.topk("pagerank"), m2.pagerank.topk("pagerank") + ) - pr_out = m2['pagerank'] + pr_out = m2["pagerank"] with self.assertRaises(Exception): - assert_frame_equal(m.pagerank.topk('pagerank'), pr_out.topk('pagerank')) + assert_frame_equal(m.pagerank.topk("pagerank"), pr_out.topk("pagerank")) self.__test_model_save_load_helper__(m2) @@ -184,9 +217,11 @@ def test_kcore(self): m = tc.kcore.create(self.graph) print(m) m.summary() - biggest_core = m.core_id.groupby('core_id', tc.aggregate.COUNT).topk('Count').head(1) - self.assertEqual(biggest_core['core_id'][0], 6) - self.assertEqual(biggest_core['Count'][0], 4492) + biggest_core = ( + m.core_id.groupby("core_id", tc.aggregate.COUNT).topk("Count").head(1) + ) + self.assertEqual(biggest_core["core_id"][0], 6) + self.assertEqual(biggest_core["Count"][0], 4492) self.__test_model_save_load_helper__(m) def test_shortest_path(self): @@ -204,7 +239,9 @@ def test_shortest_path(self): chain_graph = tc.SGraph().add_edges([tc.Edge(i, i + 1) for i in range(10)]) m3 = tc.shortest_path.create(chain_graph, source_vid=0) for i in range(10): - self.assertSequenceEqual(m3.get_path(i), [(j, float(j)) for j in range(i + 1)]) + self.assertSequenceEqual( + m3.get_path(i), [(j, float(j)) for j in range(i + 1)] + ) star_graph = tc.SGraph().add_edges([tc.Edge(0, i + 1) for i in range(10)]) m4 = tc.shortest_path.create(star_graph, source_vid=0) @@ -219,26 +256,31 @@ def test_shortest_path(self): # m4.get_path(i, show=True) # Test sssp ignoring the existing distance field - star_graph.vertices['distance'] = 0 + star_graph.vertices["distance"] = 0 m5 = tc.shortest_path.create(star_graph, source_vid=0) for i in range(1, 11): self.assertSequenceEqual(m5.get_path(i), [(0, 0.0), (i, 1.0)]) def test_compute_shortest_path(self): - edge_src_ids = ['src1', 'src2', 'a', 'b', 'c' ] - edge_dst_ids = [ 'a', 'b', 'dst', 'c', 'dst'] - edges = tc.SFrame({'__src_id': edge_src_ids, '__dst_id': edge_dst_ids}) - g=tc.SGraph().add_edges(edges) - res = tc.shortest_path._compute_shortest_path( - g, ["src1","src2"], "dst") - self.assertEqual(res, [["src1", "a", "dst"]]) - res = tc.shortest_path._compute_shortest_path(g, "src2", "dst") - self.assertEqual(res, [["src2", "b", "c", "dst"]]) - - edge_src_ids = [0,1,2,3,4] - edge_dst_ids = [2,3,5,4,5] - edge_weights = [1,0.1,1,0.1,0.1] - g=tc.SFrame({'__src_id':edge_src_ids,'__dst_id':edge_dst_ids, 'weights':edge_weights}) - g=tc.SGraph(edges=g) - t=tc.shortest_path._compute_shortest_path(g,[0,1],[5],"weights") - self.assertEqual(t, [[1,3,4,5]]) + edge_src_ids = ["src1", "src2", "a", "b", "c"] + edge_dst_ids = ["a", "b", "dst", "c", "dst"] + edges = tc.SFrame({"__src_id": edge_src_ids, "__dst_id": edge_dst_ids}) + g = tc.SGraph().add_edges(edges) + res = tc.shortest_path._compute_shortest_path(g, ["src1", "src2"], "dst") + self.assertEqual(res, [["src1", "a", "dst"]]) + res = tc.shortest_path._compute_shortest_path(g, "src2", "dst") + self.assertEqual(res, [["src2", "b", "c", "dst"]]) + + edge_src_ids = [0, 1, 2, 3, 4] + edge_dst_ids = [2, 3, 5, 4, 5] + edge_weights = [1, 0.1, 1, 0.1, 0.1] + g = tc.SFrame( + { + "__src_id": edge_src_ids, + "__dst_id": edge_dst_ids, + "weights": edge_weights, + } + ) + g = tc.SGraph(edges=g) + t = tc.shortest_path._compute_shortest_path(g, [0, 1], [5], "weights") + self.assertEqual(t, [[1, 3, 4, 5]]) diff --git a/src/python/turicreate/test/test_graph_compute.py b/src/python/turicreate/test/test_graph_compute.py index 37cbe31874..7ff2838472 100644 --- a/src/python/turicreate/test/test_graph_compute.py +++ b/src/python/turicreate/test/test_graph_compute.py @@ -11,19 +11,21 @@ import time import sys + if sys.version_info.major > 2: unittest.TestCase.assertItemsEqual = unittest.TestCase.assertCountEqual + def degree_count_fn(source, edge, target, edge_dir, field): if field is None: - target['in_degree'] += 1 - source['out_degree'] += 1 - target['all_degree'] += 1 - source['all_degree'] += 1 + target["in_degree"] += 1 + source["out_degree"] += 1 + target["all_degree"] += 1 + source["all_degree"] += 1 else: - if edge_dir is 'in' or edge_dir is 'all': + if edge_dir is "in" or edge_dir is "all": target[field] = target[field] + 1 - if edge_dir is 'out' or edge_dir is 'all': + if edge_dir is "out" or edge_dir is "all": source[field] = source[field] + 1 return (source, edge, target) @@ -41,80 +43,163 @@ def return_pair_fn(source, edge, target): class GraphTests(unittest.TestCase): - def test_simple_triple_apply(self): def identity_fun(src, edge, dst): return src, edge, dst nverts = 100 ring_graph = SGraph().add_edges([Edge(i, 0) for i in range(1, nverts)]) - ring_graph.vertices['id'] = ring_graph.vertices['__id'] - ring_graph.edges['src'] = ring_graph.edges['__src_id'] - ring_graph2 = ring_graph.triple_apply(identity_fun, ['id', 'src']) - - self.assertSequenceEqual(list(ring_graph2.vertices['id']), list(ring_graph2.vertices['__id'])) - self.assertSequenceEqual(list(ring_graph2.edges['src']), list(ring_graph2.edges['__src_id'])) - for i in ring_graph.edges['__dst_id']: + ring_graph.vertices["id"] = ring_graph.vertices["__id"] + ring_graph.edges["src"] = ring_graph.edges["__src_id"] + ring_graph2 = ring_graph.triple_apply(identity_fun, ["id", "src"]) + + self.assertSequenceEqual( + list(ring_graph2.vertices["id"]), list(ring_graph2.vertices["__id"]) + ) + self.assertSequenceEqual( + list(ring_graph2.edges["src"]), list(ring_graph2.edges["__src_id"]) + ) + for i in ring_graph.edges["__dst_id"]: self.assertEqual(i, 0) def test_triple_apply(self): nverts = 100 ring_graph = SGraph().add_edges([Edge(i, 0) for i in range(1, nverts)]) vdata = ring_graph.get_vertices() - vdata['in_degree'] = 0 - vdata['out_degree'] = 0 - vdata['all_degree'] = 0 - vdata['do_not_touch'] = 0 + vdata["in_degree"] = 0 + vdata["out_degree"] = 0 + vdata["all_degree"] = 0 + vdata["do_not_touch"] = 0 ring_graph = ring_graph.add_vertices(vdata) - ret = ring_graph.triple_apply(lambda source, edge, target: degree_count_fn(source, edge, target, 'in', 'in_degree'), mutated_fields=['in_degree'], input_fields=['in_degree']) - self.assertItemsEqual(ret.get_fields(), ['__id', '__src_id', '__dst_id', 'in_degree']) - ret = ring_graph.triple_apply(lambda source, edge, target: degree_count_fn(source, edge, target, 'out', 'out_degree'), mutated_fields=['out_degree'], input_fields=['out_degree']) - self.assertItemsEqual(ret.get_fields(), ['__id', '__src_id', '__dst_id', 'out_degree']) - ret = ring_graph.triple_apply(lambda source, edge, target: degree_count_fn(source, edge, target, 'all', 'all_degree'), mutated_fields=['all_degree'], input_fields=['all_degree']) - self.assertItemsEqual(ret.get_fields(), ['__id', '__src_id', '__dst_id', 'all_degree']) - - ring_graph = ring_graph.triple_apply(lambda source, edge, target: degree_count_fn(source, edge, target, 'all', None), ['in_degree', 'out_degree', 'all_degree']) - self.assertItemsEqual(ring_graph.get_fields(), ['__id', '__src_id', '__dst_id', 'in_degree', 'out_degree', 'all_degree', 'do_not_touch']) + ret = ring_graph.triple_apply( + lambda source, edge, target: degree_count_fn( + source, edge, target, "in", "in_degree" + ), + mutated_fields=["in_degree"], + input_fields=["in_degree"], + ) + self.assertItemsEqual( + ret.get_fields(), ["__id", "__src_id", "__dst_id", "in_degree"] + ) + ret = ring_graph.triple_apply( + lambda source, edge, target: degree_count_fn( + source, edge, target, "out", "out_degree" + ), + mutated_fields=["out_degree"], + input_fields=["out_degree"], + ) + self.assertItemsEqual( + ret.get_fields(), ["__id", "__src_id", "__dst_id", "out_degree"] + ) + ret = ring_graph.triple_apply( + lambda source, edge, target: degree_count_fn( + source, edge, target, "all", "all_degree" + ), + mutated_fields=["all_degree"], + input_fields=["all_degree"], + ) + self.assertItemsEqual( + ret.get_fields(), ["__id", "__src_id", "__dst_id", "all_degree"] + ) + + ring_graph = ring_graph.triple_apply( + lambda source, edge, target: degree_count_fn( + source, edge, target, "all", None + ), + ["in_degree", "out_degree", "all_degree"], + ) + self.assertItemsEqual( + ring_graph.get_fields(), + [ + "__id", + "__src_id", + "__dst_id", + "in_degree", + "out_degree", + "all_degree", + "do_not_touch", + ], + ) vdata = ring_graph.get_vertices() for v in vdata: - if (v['__id'] == 0): - self.assertEqual(v['in_degree'], nverts - 1) - self.assertEqual(v['out_degree'], 0) + if v["__id"] == 0: + self.assertEqual(v["in_degree"], nverts - 1) + self.assertEqual(v["out_degree"], 0) else: - self.assertEqual(v['in_degree'], 0) - self.assertEqual(v['out_degree'], 1) - self.assertEqual(v['all_degree'], (v['in_degree'] + v['out_degree'])) + self.assertEqual(v["in_degree"], 0) + self.assertEqual(v["out_degree"], 1) + self.assertEqual(v["all_degree"], (v["in_degree"] + v["out_degree"])) # test lambda that changes fields that are not in the mutate_fields - ring_graph = ring_graph.triple_apply(lambda source, edge, target: degree_count_fn(source, edge, target, 'all', 'do_not_touch'), mutated_fields=['in_degree']) + ring_graph = ring_graph.triple_apply( + lambda source, edge, target: degree_count_fn( + source, edge, target, "all", "do_not_touch" + ), + mutated_fields=["in_degree"], + ) vdata = ring_graph.get_vertices() for v in vdata: - self.assertEqual(v['do_not_touch'], 0) - self.assertEqual(v['all_degree'], (v['in_degree'] + v['out_degree'])) + self.assertEqual(v["do_not_touch"], 0) + self.assertEqual(v["all_degree"], (v["in_degree"] + v["out_degree"])) # test change edge data - ring_graph.edges['src_id'] = 0 - ring_graph.edges['dst_id'] = 0 + ring_graph.edges["src_id"] = 0 + ring_graph.edges["dst_id"] = 0 + def edge_update_fn(source, edge, target): - edge['src_id'] = source['__id'] - edge['dst_id'] = target['__id'] + edge["src_id"] = source["__id"] + edge["dst_id"] = target["__id"] return (source, edge, target) - ring_graph = ring_graph.triple_apply(edge_update_fn, mutated_fields=['src_id', 'dst_id']) + + ring_graph = ring_graph.triple_apply( + edge_update_fn, mutated_fields=["src_id", "dst_id"] + ) edata = ring_graph.get_edges() for e in edata: - self.assertEqual(e['__src_id'], e['src_id']) - self.assertEqual(e['__dst_id'], e['dst_id']) + self.assertEqual(e["__src_id"], e["src_id"]) + self.assertEqual(e["__dst_id"], e["dst_id"]) # test exception in lambda - self.assertRaises(RuntimeError, lambda: ring_graph.triple_apply(exception_fn, mutated_fields=['in_degree'])) + self.assertRaises( + RuntimeError, + lambda: ring_graph.triple_apply(exception_fn, mutated_fields=["in_degree"]), + ) # test lambda that does not return a tuple of dicts - self.assertRaises(RuntimeError, lambda: ring_graph.triple_apply(return_none_fn, mutated_fields=['in_degree'])) - self.assertRaises(RuntimeError, lambda: ring_graph.triple_apply(return_pair_fn, mutated_fields=['in_degree'])) + self.assertRaises( + RuntimeError, + lambda: ring_graph.triple_apply( + return_none_fn, mutated_fields=["in_degree"] + ), + ) + self.assertRaises( + RuntimeError, + lambda: ring_graph.triple_apply( + return_pair_fn, mutated_fields=["in_degree"] + ), + ) # test api input validation - self.assertRaises(TypeError, lambda: ring_graph.triple_apply(exception_fn, mutated_fields=None)) - self.assertRaises(TypeError, lambda: ring_graph.triple_apply(exception_fn, mutated_fields=['in_degree'], input_fields={'a': 'b'})) - self.assertRaises(ValueError, lambda: ring_graph.triple_apply(exception_fn, mutated_fields=[])) - self.assertRaises(ValueError, lambda: ring_graph.triple_apply(exception_fn, mutated_fields=['field_not_exist'])) - self.assertRaises(ValueError, lambda: ring_graph.triple_apply(exception_fn, mutated_fields=['__id'])) + self.assertRaises( + TypeError, + lambda: ring_graph.triple_apply(exception_fn, mutated_fields=None), + ) + self.assertRaises( + TypeError, + lambda: ring_graph.triple_apply( + exception_fn, mutated_fields=["in_degree"], input_fields={"a": "b"} + ), + ) + self.assertRaises( + ValueError, lambda: ring_graph.triple_apply(exception_fn, mutated_fields=[]) + ) + self.assertRaises( + ValueError, + lambda: ring_graph.triple_apply( + exception_fn, mutated_fields=["field_not_exist"] + ), + ) + self.assertRaises( + ValueError, + lambda: ring_graph.triple_apply(exception_fn, mutated_fields=["__id"]), + ) diff --git a/src/python/turicreate/test/test_image_classifier.py b/src/python/turicreate/test/test_image_classifier.py index 8f37d276c0..5d06aa0b48 100644 --- a/src/python/turicreate/test/test_image_classifier.py +++ b/src/python/turicreate/test/test_image_classifier.py @@ -14,9 +14,11 @@ import unittest from turicreate.toolkits._main import ToolkitError as _ToolkitError -from turicreate.toolkits._internal_utils import (_mac_ver, - _raise_error_if_not_sframe, - _raise_error_if_not_sarray) +from turicreate.toolkits._internal_utils import ( + _mac_ver, + _raise_error_if_not_sframe, + _raise_error_if_not_sarray, +) from . import util as test_util @@ -27,44 +29,53 @@ def get_test_data(): - ''' + """ Create 5 all white images and 5 all black images. Then add some noise to each image. - ''' + """ from PIL import Image + DIM = 224 # Five all white images data = [] for _ in range(5): - data.append( np.full((DIM, DIM, 3), 255, dtype=np.uint8) ) + data.append(np.full((DIM, DIM, 3), 255, dtype=np.uint8)) # Five all black images for _ in range(5): - data.append( np.full((DIM, DIM, 3), 0, dtype=np.uint8) ) + data.append(np.full((DIM, DIM, 3), 0, dtype=np.uint8)) # Add some random noise to each images random = np.random.RandomState(100) for cur_image in data: for _ in range(1000): x, y = random.randint(DIM), random.randint(DIM) - rand_pixel_value = (random.randint(255), random.randint(255), random.randint(255)) + rand_pixel_value = ( + random.randint(255), + random.randint(255), + random.randint(255), + ) cur_image[x][y] = rand_pixel_value # Convert to an array of tc.Images images = [] for cur_data in data: pil_image = Image.fromarray(cur_data) - image_data = bytearray([z for l in pil_image.getdata() for z in l ]) + image_data = bytearray([z for l in pil_image.getdata() for z in l]) image_data_size = len(image_data) - tc_image = tc.Image(_image_data = image_data, - _width = DIM, _height = DIM, - _channels = 3, _format_enum = 2, - _image_data_size = image_data_size) + tc_image = tc.Image( + _image_data=image_data, + _width=DIM, + _height=DIM, + _channels=3, + _format_enum=2, + _image_data_size=image_data_size, + ) images.append(tc_image) - labels = ['white'] * 5 + ['black'] * 5 - return tc.SFrame({'awesome_image': images, 'awesome_label': labels}) + labels = ["white"] * 5 + ["black"] * 5 + return tc.SFrame({"awesome_image": images, "awesome_label": labels}) data = get_test_data() @@ -72,89 +83,116 @@ def get_test_data(): class ImageClassifierTest(unittest.TestCase): @classmethod - def setUpClass(self, model = 'resnet-50', input_image_shape = (3, 224, 224), tol=0.02, - num_examples = 100, label_type = int): - self.feature = 'awesome_image' - self.target = 'awesome_label' + def setUpClass( + self, + model="resnet-50", + input_image_shape=(3, 224, 224), + tol=0.02, + num_examples=100, + label_type=int, + ): + self.feature = "awesome_image" + self.target = "awesome_label" self.input_image_shape = input_image_shape self.pre_trained_model = model self.tolerance = tol - self.model = tc.image_classifier.create(data, target=self.target, - model=self.pre_trained_model, - seed=42) + self.model = tc.image_classifier.create( + data, target=self.target, model=self.pre_trained_model, seed=42 + ) self.nn_model = self.model.feature_extractor self.lm_model = self.model.classifier self.max_iterations = 10 self.get_ans = { - 'classifier' : lambda x: type(x) == \ - tc.logistic_classifier.LogisticClassifier, - 'feature': lambda x: x == self.feature, - 'classes': lambda x: x == self.lm_model.classes, - 'training_time': lambda x: x > 0, - 'input_image_shape': lambda x: x == self.input_image_shape, - 'target': lambda x: x == self.target, - 'feature_extractor' : lambda x: callable(x.extract_features), - 'training_loss': lambda x: x > 0, - 'max_iterations': lambda x: x == self.max_iterations, - 'num_features': lambda x: x == self.lm_model.num_features, - 'num_examples': lambda x: x == self.lm_model.num_examples, - 'model': lambda x: (x == self.pre_trained_model - or (self.pre_trained_model == "VisionFeaturePrint_Screen" - and x == "VisionFeaturePrint_Scene")), - 'num_classes': lambda x: x == self.lm_model.num_classes, + "classifier": lambda x: type(x) + == tc.logistic_classifier.LogisticClassifier, + "feature": lambda x: x == self.feature, + "classes": lambda x: x == self.lm_model.classes, + "training_time": lambda x: x > 0, + "input_image_shape": lambda x: x == self.input_image_shape, + "target": lambda x: x == self.target, + "feature_extractor": lambda x: callable(x.extract_features), + "training_loss": lambda x: x > 0, + "max_iterations": lambda x: x == self.max_iterations, + "num_features": lambda x: x == self.lm_model.num_features, + "num_examples": lambda x: x == self.lm_model.num_examples, + "model": lambda x: ( + x == self.pre_trained_model + or ( + self.pre_trained_model == "VisionFeaturePrint_Screen" + and x == "VisionFeaturePrint_Scene" + ) + ), + "num_classes": lambda x: x == self.lm_model.num_classes, } self.fields_ans = self.get_ans.keys() def assertListAlmostEquals(self, list1, list2, tol): self.assertEqual(len(list1), len(list2)) for a, b in zip(list1, list2): - self.assertAlmostEqual(a, b, delta = tol) + self.assertAlmostEqual(a, b, delta=tol) def test_create_with_missing_value(self): - data_with_none = data.append(tc.SFrame({self.feature: tc.SArray([None], dtype=tc.Image), self.target: [data[self.target][0]]})) + data_with_none = data.append( + tc.SFrame( + { + self.feature: tc.SArray([None], dtype=tc.Image), + self.target: [data[self.target][0]], + } + ) + ) with self.assertRaises(_ToolkitError): - tc.image_classifier.create(data_with_none, feature=self.feature, target=self.target) + tc.image_classifier.create( + data_with_none, feature=self.feature, target=self.target + ) def test_create_with_missing_feature(self): with self.assertRaises(_ToolkitError): - tc.image_classifier.create(data, feature='wrong_feature', target=self.target) + tc.image_classifier.create( + data, feature="wrong_feature", target=self.target + ) def test_create_with_missing_label(self): with self.assertRaises(RuntimeError): - tc.image_classifier.create(data, feature=self.feature, target='wrong_annotations') + tc.image_classifier.create( + data, feature=self.feature, target="wrong_annotations" + ) def test_create_with_empty_dataset(self): with self.assertRaises(_ToolkitError): - tc.image_classifier.create(data[:0], target = self.target) + tc.image_classifier.create(data[:0], target=self.target) def test_predict(self): model = self.model - for output_type in ['class', 'probability_vector']: + for output_type in ["class", "probability_vector"]: preds = model.predict(data.head(), output_type=output_type) _raise_error_if_not_sarray(preds) self.assertEqual(len(preds), len(data.head())) - if output_type == 'class': - self.assertTrue(all(preds[:5] == 'white')) - self.assertTrue(all(preds[5:] == 'black')) + if output_type == "class": + self.assertTrue(all(preds[:5] == "white")) + self.assertTrue(all(preds[5:] == "black")) def test_single_image(self): model = self.model single_image = data[0][self.feature] prediction = model.predict(single_image) self.assertTrue(isinstance(prediction, (int, str))) - prediction = model.predict_topk(single_image, k = 2) + prediction = model.predict_topk(single_image, k=2) _raise_error_if_not_sframe(prediction) prediction = model.classify(single_image) - self.assertTrue(isinstance(prediction, dict) and 'class' in prediction and 'probability' in prediction) + self.assertTrue( + isinstance(prediction, dict) + and "class" in prediction + and "probability" in prediction + ) def test_sarray(self): model = self.model sa = data[self.feature] predictions = model.predict(sa) _raise_error_if_not_sarray(predictions) - predictions = model.predict_topk(sa, k = 2) + predictions = model.predict_topk(sa, k=2) _raise_error_if_not_sframe(predictions) predictions = model.classify(sa) _raise_error_if_not_sframe(predictions) @@ -170,44 +208,51 @@ def test_junk_input(self): def test_export_coreml(self): if self.model.model == "VisionFeaturePrint_Scene": - pytest.xfail("Expected failure until " - + "https://github.com/apple/turicreate/issues/2744 is fixed") - filename = tempfile.mkstemp('bingo.mlmodel')[1] + pytest.xfail( + "Expected failure until " + + "https://github.com/apple/turicreate/issues/2744 is fixed" + ) + filename = tempfile.mkstemp("bingo.mlmodel")[1] self.model.export_coreml(filename) coreml_model = coremltools.models.MLModel(filename) - self.assertDictEqual({ - 'com.github.apple.turicreate.version': tc.__version__, - 'com.github.apple.os.platform': platform.platform(), - 'type': 'ImageClassifier', - 'coremltoolsVersion': coremltools.__version__, - 'version': '1' - }, dict(coreml_model.user_defined_metadata) + self.assertDictEqual( + { + "com.github.apple.turicreate.version": tc.__version__, + "com.github.apple.os.platform": platform.platform(), + "type": "ImageClassifier", + "coremltoolsVersion": coremltools.__version__, + "version": "1", + }, + dict(coreml_model.user_defined_metadata), + ) + expected_result = ( + "Image classifier (%s) created by Turi Create (version %s)" + % (self.model.model, tc.__version__) ) - expected_result = 'Image classifier (%s) created by Turi Create (version %s)' % ( - self.model.model, tc.__version__) self.assertEquals(expected_result, coreml_model.short_description) - @unittest.skipIf(sys.platform != 'darwin', 'Core ML only supported on Mac') + @unittest.skipIf(sys.platform != "darwin", "Core ML only supported on Mac") def test_export_coreml_predict(self): - filename = tempfile.mkstemp('bingo.mlmodel')[1] + filename = tempfile.mkstemp("bingo.mlmodel")[1] self.model.export_coreml(filename) coreml_model = coremltools.models.MLModel(filename) img = data[0:1][self.feature][0] img_fixed = tc.image_analysis.resize(img, *reversed(self.input_image_shape)) from PIL import Image + pil_img = Image.fromarray(img_fixed.pixel_data) if _mac_ver() >= (10, 13): classes = self.model.classifier.classes ret = coreml_model.predict({self.feature: pil_img}) - coreml_values = [ret[self.target + 'Probability'][l] for l in classes] + coreml_values = [ret[self.target + "Probability"][l] for l in classes] self.assertListAlmostEquals( - coreml_values, - list(self.model.predict(img_fixed, output_type = 'probability_vector')), - self.tolerance + coreml_values, + list(self.model.predict(img_fixed, output_type="probability_vector")), + self.tolerance, ) def test_classify(self): @@ -217,8 +262,8 @@ def test_classify(self): def test_predict_topk(self): model = self.model - for output_type in ['margin', 'probability', 'rank']: - preds = model.predict_topk(data.head(), output_type = output_type, k = 2) + for output_type in ["margin", "probability", "rank"]: + preds = model.predict_topk(data.head(), output_type=output_type, k=2) self.assertEqual(len(preds), 2 * len(data.head())) def test_list_fields(self): @@ -230,8 +275,10 @@ def test_get(self): model = self.model for field in self.fields_ans: ans = model._get(field) - self.assertTrue(self.get_ans[field](ans), - '''Get failed in field {}. Output was {}.'''.format(field, ans)) + self.assertTrue( + self.get_ans[field](ans), + """Get failed in field {}. Output was {}.""".format(field, ans), + ) def test_summary(self): model = self.model @@ -239,16 +286,16 @@ def test_summary(self): def test_summary_str(self): model = self.model - self.assertTrue(isinstance(model.summary('str'), str)) + self.assertTrue(isinstance(model.summary("str"), str)) def test_summary_dict(self): model = self.model - self.assertTrue(isinstance(model.summary('dict'), dict)) + self.assertTrue(isinstance(model.summary("dict"), dict)) def test_summary_invalid_input(self): model = self.model with self.assertRaises(_ToolkitError): - model.summary(model.summary('invalid')) + model.summary(model.summary("invalid")) with self.assertRaises(_ToolkitError): model.summary(model.summary(0)) @@ -284,23 +331,33 @@ def test_save_and_load(self): def test_evaluate_explore(self): # Run the explore method and make sure we don't throw an exception. # This will test the JSON serialization logic. - tc.visualization.set_target('none') + tc.visualization.set_target("none") evaluation = self.model.evaluate(data) evaluation.explore() + class ImageClassifierSqueezeNetTest(ImageClassifierTest): @classmethod def setUpClass(self): - super(ImageClassifierSqueezeNetTest, self).setUpClass(model='squeezenet_v1.1', - input_image_shape=(3, 227, 227), - tol=0.005, num_examples = 200) + super(ImageClassifierSqueezeNetTest, self).setUpClass( + model="squeezenet_v1.1", + input_image_shape=(3, 227, 227), + tol=0.005, + num_examples=200, + ) + # TODO: if on skip OS, test negative case -@unittest.skipIf(_mac_ver() < (10,14), 'VisionFeaturePrint_Scene only supported on macOS 10.14+') +@unittest.skipIf( + _mac_ver() < (10, 14), "VisionFeaturePrint_Scene only supported on macOS 10.14+" +) class VisionFeaturePrintSceneTest(ImageClassifierTest): @classmethod def setUpClass(self): - super(VisionFeaturePrintSceneTest, self).setUpClass(model='VisionFeaturePrint_Scene', - input_image_shape=(3, 299, 299), - tol=0.005, num_examples = 100, - label_type = str) + super(VisionFeaturePrintSceneTest, self).setUpClass( + model="VisionFeaturePrint_Scene", + input_image_shape=(3, 299, 299), + tol=0.005, + num_examples=100, + label_type=str, + ) diff --git a/src/python/turicreate/test/test_image_similarity.py b/src/python/turicreate/test/test_image_similarity.py index 421f44a10e..9a77544242 100644 --- a/src/python/turicreate/test/test_image_similarity.py +++ b/src/python/turicreate/test/test_image_similarity.py @@ -18,102 +18,115 @@ def get_test_data(): - ''' + """ Create 5 all white images and 5 all black images. Then add some noise to each image. - ''' + """ from PIL import Image + DIM = 224 # Five all white images data = [] for _ in range(5): - data.append( np.full((DIM, DIM, 3), 255, dtype=np.uint8) ) + data.append(np.full((DIM, DIM, 3), 255, dtype=np.uint8)) # Five all black images for _ in range(5): - data.append( np.full((DIM, DIM, 3), 0, dtype=np.uint8) ) + data.append(np.full((DIM, DIM, 3), 0, dtype=np.uint8)) # Add some random noise to each image random = np.random.RandomState(100) for cur_image in data: for _ in range(1000): x, y = random.randint(DIM), random.randint(DIM) - rand_pixel_value = (random.randint(255), random.randint(255), random.randint(255)) + rand_pixel_value = ( + random.randint(255), + random.randint(255), + random.randint(255), + ) cur_image[x][y] = rand_pixel_value # Convert to an array of tc.Images images = [] for cur_data in data: pil_image = Image.fromarray(cur_data) - image_data = bytearray([z for l in pil_image.getdata() for z in l ]) + image_data = bytearray([z for l in pil_image.getdata() for z in l]) image_data_size = len(image_data) - tc_image = tc.Image(_image_data = image_data, - _width = DIM, _height = DIM, - _channels = 3, _format_enum = 2, - _image_data_size = image_data_size) + tc_image = tc.Image( + _image_data=image_data, + _width=DIM, + _height=DIM, + _channels=3, + _format_enum=2, + _image_data_size=image_data_size, + ) images.append(tc_image) - return tc.SFrame({'awesome_image': images}) + return tc.SFrame({"awesome_image": images}) data = get_test_data() class ImageSimilarityTest(unittest.TestCase): - @classmethod - def setUpClass(self, input_image_shape = (3,224,224), model = 'resnet-50'): + def setUpClass(self, input_image_shape=(3, 224, 224), model="resnet-50"): """ The setup class method for the basic test case with all default values. """ - self.feature = 'awesome_image' + self.feature = "awesome_image" self.label = None self.input_image_shape = input_image_shape self.pre_trained_model = model # Create the model - self.def_opts= { - 'model': 'resnet-50', - 'verbose': True, + self.def_opts = { + "model": "resnet-50", + "verbose": True, } # Model - self.model = tc.image_similarity.create(data, feature=self.feature, - label=None, model=self.pre_trained_model) + self.model = tc.image_similarity.create( + data, feature=self.feature, label=None, model=self.pre_trained_model + ) self.nn_model = self.model.feature_extractor self.lm_model = self.model.similarity_model self.opts = self.def_opts.copy() # Answers self.get_ans = { - 'similarity_model' : lambda x: type(x) == \ - tc.nearest_neighbors.NearestNeighborsModel, - 'feature': lambda x: x == self.feature, - 'training_time': lambda x: x > 0, - 'input_image_shape': lambda x: x == self.input_image_shape, - 'label': lambda x: x == self.label, - 'feature_extractor' : lambda x: callable(x.extract_features), - 'num_features': lambda x: x == self.lm_model.num_features, - 'num_examples': lambda x: x == self.lm_model.num_examples, - 'model': lambda x: (x == self.pre_trained_model - or (self.pre_trained_model == "VisionFeaturePrint_Screen" - and x == "VisionFeaturePrint_Scene")) + "similarity_model": lambda x: type(x) + == tc.nearest_neighbors.NearestNeighborsModel, + "feature": lambda x: x == self.feature, + "training_time": lambda x: x > 0, + "input_image_shape": lambda x: x == self.input_image_shape, + "label": lambda x: x == self.label, + "feature_extractor": lambda x: callable(x.extract_features), + "num_features": lambda x: x == self.lm_model.num_features, + "num_examples": lambda x: x == self.lm_model.num_examples, + "model": lambda x: ( + x == self.pre_trained_model + or ( + self.pre_trained_model == "VisionFeaturePrint_Screen" + and x == "VisionFeaturePrint_Scene" + ) + ), } self.fields_ans = self.get_ans.keys() def assertListAlmostEquals(self, list1, list2, tol): self.assertEqual(len(list1), len(list2)) for a, b in zip(list1, list2): - self.assertAlmostEqual(a, b, delta = tol) + self.assertAlmostEqual(a, b, delta=tol) def test_create_with_missing_feature(self): with self.assertRaises(_ToolkitError): - tc.image_similarity.create(data, feature='wrong_feature', label=self.label) + tc.image_similarity.create(data, feature="wrong_feature", label=self.label) def test_create_with_missing_label(self): with self.assertRaises(_ToolkitError): - tc.image_similarity.create(data, feature=self.feature, label='wrong_label') + tc.image_similarity.create(data, feature=self.feature, label="wrong_label") def test_create_with_empty_dataset(self): with self.assertRaises(_ToolkitError): @@ -125,11 +138,11 @@ def test_query(self): self.assertEqual(len(preds), len(data) * 5) # Make sure all the white images (first five images) are only similar to the other white images - white_sims = preds.filter_by([0, 1, 2, 3, 4], 'query_label')['reference_label'] + white_sims = preds.filter_by([0, 1, 2, 3, 4], "query_label")["reference_label"] self.assertEqual(sorted(white_sims.unique()), [0, 1, 2, 3, 4]) # Make sure all the black images (last five images) are only similar to the other black images - white_sims = preds.filter_by([5, 6, 7, 8, 9], 'query_label')['reference_label'] + white_sims = preds.filter_by([5, 6, 7, 8, 9], "query_label")["reference_label"] self.assertEqual(sorted(white_sims.unique()), [5, 6, 7, 8, 9]) def test_similarity_graph(self): @@ -137,7 +150,7 @@ def test_similarity_graph(self): preds = model.similarity_graph() self.assertEqual(len(preds.edges), len(data) * 5) - preds = model.similarity_graph(output_type = 'SFrame') + preds = model.similarity_graph(output_type="SFrame") self.assertEqual(len(preds), len(data) * 5) def test_list_fields(self): @@ -153,8 +166,10 @@ def test_get(self): model = self.model for field in self.fields_ans: ans = model._get(field) - self.assertTrue(self.get_ans[field](ans), - "Get failed in field {}. Output was {}.".format(field, ans)) + self.assertTrue( + self.get_ans[field](ans), + "Get failed in field {}. Output was {}.".format(field, ans), + ) def test_query_input(self): model = self.model @@ -176,16 +191,16 @@ def test_summary(self): def test_summary_str(self): model = self.model - self.assertTrue(isinstance(model.summary('str'), str)) + self.assertTrue(isinstance(model.summary("str"), str)) def test_summary_dict(self): model = self.model - self.assertTrue(isinstance(model.summary('dict'), dict)) + self.assertTrue(isinstance(model.summary("dict"), dict)) def test_summary_invalid_input(self): model = self.model with self.assertRaises(_ToolkitError): - model.summary(model.summary('invalid')) + model.summary(model.summary("invalid")) with self.assertRaises(_ToolkitError): model.summary(model.summary(0)) @@ -207,25 +222,32 @@ def test_export_coreml(self): def get_psnr(x, y): # See: https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio # The higher the number the better. - return 20 * np.log10(max(x.max(), y.max())) - 10 * np.log10(np.square(x-y).mean()) + return 20 * np.log10(max(x.max(), y.max())) - 10 * np.log10( + np.square(x - y).mean() + ) # Save the model as a CoreML model file - filename = tempfile.mkstemp('ImageSimilarity.mlmodel')[1] + filename = tempfile.mkstemp("ImageSimilarity.mlmodel")[1] self.model.export_coreml(filename) # Load the model back from the CoreML model file coreml_model = coremltools.models.MLModel(filename) import platform - self.assertDictEqual({ - 'com.github.apple.turicreate.version': tc.__version__, - 'com.github.apple.os.platform': platform.platform(), - 'type': 'ImageSimilarityModel', - 'coremltoolsVersion': coremltools.__version__, - 'version': '1' - }, dict(coreml_model.user_defined_metadata) + + self.assertDictEqual( + { + "com.github.apple.turicreate.version": tc.__version__, + "com.github.apple.os.platform": platform.platform(), + "type": "ImageSimilarityModel", + "coremltoolsVersion": coremltools.__version__, + "version": "1", + }, + dict(coreml_model.user_defined_metadata), + ) + expected_result = ( + "Image similarity (%s) created by Turi Create (version %s)" + % (self.model.model, tc.__version__) ) - expected_result = 'Image similarity (%s) created by Turi Create (version %s)' % ( - self.model.model, tc.__version__) # Get model distances for comparison img = data[0:1][self.feature][0] @@ -234,12 +256,13 @@ def get_psnr(x, y): if _mac_ver() >= (10, 13): from PIL import Image as _PIL_Image + pil_img = _PIL_Image.fromarray(img_fixed.pixel_data) - coreml_ret = coreml_model.predict({'awesome_image': pil_img}) + coreml_ret = coreml_model.predict({"awesome_image": pil_img}) # Compare distances - coreml_distances = np.array(coreml_ret['distance']) - tc_distances = tc_ret.sort('reference_label')['distance'].to_numpy() + coreml_distances = np.array(coreml_ret["distance"]) + tc_distances = tc_ret.sort("reference_label")["distance"].to_numpy() psnr_value = get_psnr(coreml_distances, tc_distances) self.assertTrue(psnr_value > 50) @@ -266,21 +289,29 @@ def test_save_and_load(self): class ImageSimilaritySqueezeNetTest(ImageSimilarityTest): @classmethod def setUpClass(self): - super(ImageSimilaritySqueezeNetTest, self).setUpClass(model='squeezenet_v1.1', - input_image_shape=(3, 227, 227)) + super(ImageSimilaritySqueezeNetTest, self).setUpClass( + model="squeezenet_v1.1", input_image_shape=(3, 227, 227) + ) -@unittest.skipIf(_mac_ver() < (10,14), 'VisionFeaturePrint_Scene only supported on macOS 10.14+') +@unittest.skipIf( + _mac_ver() < (10, 14), "VisionFeaturePrint_Scene only supported on macOS 10.14+" +) class ImageSimilarityVisionFeaturePrintSceneTest(ImageSimilarityTest): @classmethod def setUpClass(self): - super(ImageSimilarityVisionFeaturePrintSceneTest, self).setUpClass(model='VisionFeaturePrint_Scene', - input_image_shape=(3, 299, 299)) + super(ImageSimilarityVisionFeaturePrintSceneTest, self).setUpClass( + model="VisionFeaturePrint_Scene", input_image_shape=(3, 299, 299) + ) + + # A test to gaurantee that old code using the incorrect name still works. -@unittest.skipIf(_mac_ver() < (10,14), 'VisionFeaturePrint_Scene only supported on macOS 10.14+') +@unittest.skipIf( + _mac_ver() < (10, 14), "VisionFeaturePrint_Scene only supported on macOS 10.14+" +) class ImageSimilarityVisionFeaturePrintSceneTest_bad_name(ImageSimilarityTest): @classmethod def setUpClass(self): - super(ImageSimilarityVisionFeaturePrintSceneTest_bad_name, self).setUpClass(model='VisionFeaturePrint_Screen', - input_image_shape=(3, 299, 299)) - + super(ImageSimilarityVisionFeaturePrintSceneTest_bad_name, self).setUpClass( + model="VisionFeaturePrint_Screen", input_image_shape=(3, 299, 299) + ) diff --git a/src/python/turicreate/test/test_image_type.py b/src/python/turicreate/test/test_image_type.py index c8bafd37a8..af7656dd7e 100644 --- a/src/python/turicreate/test/test_image_type.py +++ b/src/python/turicreate/test/test_image_type.py @@ -24,33 +24,36 @@ class image_info: def __init__(self, url): self.url = url - if 'png' in url: - self.format = 'PNG' - elif 'jpg' in url: - self.format = 'JPG' - if 'grey' in url: + if "png" in url: + self.format = "PNG" + elif "jpg" in url: + self.format = "JPG" + if "grey" in url: self.channels = 1 else: self.channels = 3 -urls = [current_file_dir + x for x in [ - '/images/nested/sample_grey.jpg', - '/images/nested/sample_grey.png', - '/images/sample.jpg', - '/images/sample.png' - ]] + +urls = [ + current_file_dir + x + for x in [ + "/images/nested/sample_grey.jpg", + "/images/nested/sample_grey.png", + "/images/sample.jpg", + "/images/sample.png", + ] +] test_image_info = [image_info(u) for u in urls] class ImageClassTest(unittest.TestCase): - def __check_raw_image_equals_pilimage(self, glimage, pilimage): # size equal self.assertEqual((glimage.width, glimage.height), pilimage.size) # decode image glimage_decoded = image_analysis._decode(glimage) - self.assertEqual(glimage_decoded._format_enum, image._format['RAW']) + self.assertEqual(glimage_decoded._format_enum, image._format["RAW"]) # Getting data if glimage.channels == 1: pil_data = bytearray([z for z in pilimage.getdata()]) @@ -60,7 +63,9 @@ def __check_raw_image_equals_pilimage(self, glimage, pilimage): # Size data equal self.assertEqual(glimage_decoded._image_data_size, len(pil_data)) self.assertEqual(len(glimage_decoded._image_data), len(pil_data)) - pixel_diff = [abs(x - y) for (x, y) in zip(glimage_decoded._image_data, pil_data)] + pixel_diff = [ + abs(x - y) for (x, y) in zip(glimage_decoded._image_data, pil_data) + ] self.assertLess(sum(pixel_diff) / float(len(pil_data)), 2) @@ -89,10 +94,14 @@ def test_construct(self): # Test construction with wrong format for meta_info in test_image_info: - if (meta_info.url == 'PNG'): - self.assertRaises(RuntimeError, lambda: image.Image(path=meta_info.url, format='JPG')) - elif (meta_info.url == 'JPG'): - self.assertRaises(RuntimeError, lambda: image.Image(path=meta_info.url, format='PNG')) + if meta_info.url == "PNG": + self.assertRaises( + RuntimeError, lambda: image.Image(path=meta_info.url, format="JPG") + ) + elif meta_info.url == "JPG": + self.assertRaises( + RuntimeError, lambda: image.Image(path=meta_info.url, format="PNG") + ) # Test empty image sarray sa = SArray([image.Image()] * 100) @@ -111,30 +120,35 @@ def test_resize(self): for new_channels in [None, 1, 3, 4]: new_width = int(scale * glimage.width) new_height = int(scale * glimage.height) - glimage_resized = image_analysis.resize(glimage, new_width, new_height, new_channels) - pilimage = PIL_Image.open(meta_info.url).resize((new_width, new_height), PIL_Image.NEAREST) - if (new_channels == 1): - pilimage = pilimage.convert('L') - elif (new_channels == 3): - pilimage = pilimage.convert('RGB') - elif (new_channels == 4): - pilimage = pilimage.convert('RGBA') + glimage_resized = image_analysis.resize( + glimage, new_width, new_height, new_channels + ) + pilimage = PIL_Image.open(meta_info.url).resize( + (new_width, new_height), PIL_Image.NEAREST + ) + if new_channels == 1: + pilimage = pilimage.convert("L") + elif new_channels == 3: + pilimage = pilimage.convert("RGB") + elif new_channels == 4: + pilimage = pilimage.convert("RGBA") self.__check_raw_image_equals_pilimage(glimage_resized, pilimage) def test_cmyk_not_supported(self): for meta_info in test_image_info: input_img = PIL_Image.open(meta_info.url) - input_img = input_img.convert('CMYK') + input_img = input_img.convert("CMYK") import tempfile + with tempfile.NamedTemporaryFile() as t: - input_img.save(t, format='jpeg') + input_img.save(t, format="jpeg") with self.assertRaises(ToolkitError): - cmyk_image = image.Image(path=t.name, format='JPG') + cmyk_image = image.Image(path=t.name, format="JPG") def test_batch_resize(self): - image_url_dir = current_file_dir + '/images' - sa = image_analysis.load_images(image_url_dir, "auto", with_path=False)['image'] + image_url_dir = current_file_dir + "/images" + sa = image_analysis.load_images(image_url_dir, "auto", with_path=False)["image"] for new_channels in [1, 3, 4]: sa_resized = image_analysis.resize(sa, 320, 280, new_channels) for i in sa_resized: @@ -143,24 +157,24 @@ def test_batch_resize(self): self.assertEqual(i.channels, new_channels) def test_load_images(self): - image_url_dir = current_file_dir + '/images' + image_url_dir = current_file_dir + "/images" # Test auto format, with path and recursive sf1 = image_analysis.load_images(image_url_dir, "auto", True, True) self.assertEqual(sf1.num_columns(), 2) self.assertEqual(sf1.num_rows(), 18) - self.assertEqual(sf1['image'].dtype, image.Image) + self.assertEqual(sf1["image"].dtype, image.Image) # Test auto format, with path and non recursive sf2 = image_analysis.load_images(image_url_dir, "auto", True, False) self.assertEqual(sf2.num_columns(), 2) self.assertEqual(sf2.num_rows(), 2) - self.assertEqual(sf2['image'].dtype, image.Image) + self.assertEqual(sf2["image"].dtype, image.Image) # Test auto format, without path and recursive sf3 = image_analysis.load_images(image_url_dir, "auto", False, True) self.assertEqual(sf3.num_columns(), 1) self.assertEqual(sf3.num_rows(), 18) - self.assertEqual(sf3['image'].dtype, image.Image) + self.assertEqual(sf3["image"].dtype, image.Image) # Test auto format, without path and non recursive sf4 = image_analysis.load_images(image_url_dir, "auto", False, False) @@ -168,7 +182,9 @@ def test_load_images(self): self.assertEqual(sf4.num_rows(), 2) # Confirm that load_images works with a single image as well - sf5 = image_analysis.load_images(image_url_dir + '/sample.jpg', "auto", False, False) + sf5 = image_analysis.load_images( + image_url_dir + "/sample.jpg", "auto", False, False + ) self.assertEqual(sf5.num_columns(), 1) self.assertEqual(sf5.num_rows(), 1) @@ -182,12 +198,13 @@ def test_load_images(self): # Setting ignore_failure to True, and we should be able to load images # to our best effort without throwing error - image_analysis.load_images(image_url_dir, 'JPG', ignore_failure=True) - image_analysis.load_images(image_url_dir, 'PNG', ignore_failure=True) + image_analysis.load_images(image_url_dir, "JPG", ignore_failure=True) + image_analysis.load_images(image_url_dir, "PNG", ignore_failure=True) def test_astype_image(self): import glob - imagelist = glob.glob(current_file_dir + '/images/*/**') + + imagelist = glob.glob(current_file_dir + "/images/*/**") imageurls = SArray(imagelist) images = imageurls.astype(image.Image) self.assertEqual(images.dtype, image.Image) @@ -204,13 +221,13 @@ def test_astype_image(self): self.assertEqual(ret[1], None) def test_casting(self): - image_url_dir = current_file_dir + '/images/nested' + image_url_dir = current_file_dir + "/images/nested" sf = image_analysis.load_images(image_url_dir, "auto", True, True) - sa = sf['image'] + sa = sf["image"] sa_vec = sa.astype(array.array) sa_img = sa_vec.pixel_array_to_image(sa[0].width, sa[0].height, sa[0].channels) sa_str = sa.astype(str) - sa_str_expected = 'Height: ' + str(sa[0].height) + ' Width: ' + str(sa[0].width) + sa_str_expected = "Height: " + str(sa[0].height) + " Width: " + str(sa[0].width) decoded_image = image_analysis._decode(sa[0]) self.assertEqual(sa_img[0].height, sa[0].height) self.assertEqual(sa_img[0].width, sa[0].width) @@ -220,9 +237,9 @@ def test_casting(self): self.assertEqual(sa_str[0], sa_str_expected) def test_lambda(self): - image_url_dir = current_file_dir + '/images' + image_url_dir = current_file_dir + "/images" sf = image_analysis.load_images(image_url_dir) - sa = sf['image'] + sa = sf["image"] # Lambda returning self sa_self = sa.apply(lambda x: x) @@ -244,9 +261,10 @@ def test_lambda(self): for i in range(len(sa_channels)): self.assertEqual(sa[i].channels, sa_channels[i]) - # Lambda returning resized self - sa_resized = sa.apply(lambda x: image_analysis.resize(x, int(x.width / 2), int(x.height / 2))) + sa_resized = sa.apply( + lambda x: image_analysis.resize(x, int(x.width / 2), int(x.height / 2)) + ) for i in range(len(sa_resized)): self.assertEqual(sa_resized[i].width, int(sa[i].width / 2)) @@ -254,9 +272,30 @@ def test_generate_mean(self): zeros = bytearray(100) fifties = bytearray([50] * 100) hundreds = bytearray([100] * 100) - img1 = image.Image(_image_data=zeros, _channels=1, _height=1, _width=100, _image_data_size=100, _format_enum=2) # format 2 is RAW - img2 = image.Image(_image_data=hundreds, _channels=1, _height=1, _width=100, _image_data_size=100, _format_enum=2) - img3 = image.Image(_image_data=fifties, _channels=1, _height=1, _width=100, _image_data_size=100, _format_enum=2) + img1 = image.Image( + _image_data=zeros, + _channels=1, + _height=1, + _width=100, + _image_data_size=100, + _format_enum=2, + ) # format 2 is RAW + img2 = image.Image( + _image_data=hundreds, + _channels=1, + _height=1, + _width=100, + _image_data_size=100, + _format_enum=2, + ) + img3 = image.Image( + _image_data=fifties, + _channels=1, + _height=1, + _width=100, + _image_data_size=100, + _format_enum=2, + ) sa = SArray([img1, img2]) average = sa.mean() @@ -268,7 +307,14 @@ def test_generate_mean(self): def test_pixel_data(self): fifties = bytearray([50] * 100) - img = image.Image(_image_data=fifties, _channels=1, _height=1, _width=100, _image_data_size=100, _format_enum=2) + img = image.Image( + _image_data=fifties, + _channels=1, + _height=1, + _width=100, + _image_data_size=100, + _format_enum=2, + ) pixel_data = img.pixel_data.flatten() self.assertEqual(pixel_data.shape, (100,)) @@ -277,35 +323,89 @@ def test_pixel_data(self): self.assertEqual(pixel_data[p], 50) # Load images and make sure shape is right - img_color = image.Image(os.path.join(current_file_dir, 'images', 'sample.png')) + img_color = image.Image(os.path.join(current_file_dir, "images", "sample.png")) self.assertEqual(img_color.pixel_data.shape, (444, 800, 3)) - img_gray = image.Image(os.path.join(current_file_dir, 'images', 'nested', 'sample_grey.png')) + img_gray = image.Image( + os.path.join(current_file_dir, "images", "nested", "sample_grey.png") + ) self.assertEqual(img_gray.pixel_data.shape, (444, 800)) def test_png_bitdepth(self): def path(name): - return os.path.join(current_file_dir, 'images', 'bitdepths', name) + return os.path.join(current_file_dir, "images", "bitdepths", name) # Test images with varying bitdepth and check correctness against 4 reference pixels images_info = [ # path, bitdepth, pixel_data[0, 0], pixel_data[0, 1], pixel_data[0, 200], pixel_data[40, 400] - (path('color_1bit.png'), [0, 0, 0], [0, 0, 0], [ 0, 255, 255], [255, 255, 0]), - (path('color_2bit.png'), [0, 0, 0], [0, 0, 0], [ 85, 255, 170], [170, 170, 85]), - (path('color_4bit.png'), [0, 0, 0], [0, 0, 0], [ 68, 221, 187], [153, 187, 102]), - (path('color_8bit.png'), [0, 0, 0], [0, 1, 2], [ 73, 219, 182], [146, 182, 109]), - (path('color_16bit.png'), [0, 0, 0], [0, 1, 2], [ 73, 219, 182], [146, 182, 109]), - - (path('gray_1bit.png'), 0, 0, 0, 255), - (path('gray_2bit.png'), 0, 0, 85, 170), - (path('gray_4bit.png'), 0, 0, 68, 153), - (path('gray_8bit.png'), 0, 0, 73, 146), - (path('gray_16bit.png'), 0, 0, 73, 146), - - (path('palette_1bit.png'), [127, 0, 255], [127, 0, 255], [127, 0, 255], [255, 0, 0]), - (path('palette_2bit.png'), [127, 0, 255], [127, 0, 255], [ 42, 220, 220], [212, 220, 127]), - (path('palette_4bit.png'), [127, 0, 255], [127, 0, 255], [ 8, 189, 232], [178, 242, 149]), - (path('palette_8bit.png'), [127, 0, 255], [127, 0, 255], [ 18, 199, 229], [164, 248, 158]), + ( + path("color_1bit.png"), + [0, 0, 0], + [0, 0, 0], + [0, 255, 255], + [255, 255, 0], + ), + ( + path("color_2bit.png"), + [0, 0, 0], + [0, 0, 0], + [85, 255, 170], + [170, 170, 85], + ), + ( + path("color_4bit.png"), + [0, 0, 0], + [0, 0, 0], + [68, 221, 187], + [153, 187, 102], + ), + ( + path("color_8bit.png"), + [0, 0, 0], + [0, 1, 2], + [73, 219, 182], + [146, 182, 109], + ), + ( + path("color_16bit.png"), + [0, 0, 0], + [0, 1, 2], + [73, 219, 182], + [146, 182, 109], + ), + (path("gray_1bit.png"), 0, 0, 0, 255), + (path("gray_2bit.png"), 0, 0, 85, 170), + (path("gray_4bit.png"), 0, 0, 68, 153), + (path("gray_8bit.png"), 0, 0, 73, 146), + (path("gray_16bit.png"), 0, 0, 73, 146), + ( + path("palette_1bit.png"), + [127, 0, 255], + [127, 0, 255], + [127, 0, 255], + [255, 0, 0], + ), + ( + path("palette_2bit.png"), + [127, 0, 255], + [127, 0, 255], + [42, 220, 220], + [212, 220, 127], + ), + ( + path("palette_4bit.png"), + [127, 0, 255], + [127, 0, 255], + [8, 189, 232], + [178, 242, 149], + ), + ( + path("palette_8bit.png"), + [127, 0, 255], + [127, 0, 255], + [18, 199, 229], + [164, 248, 158], + ), ] for path, color_0_0, color_0_1, color_0_200, color_40_400 in images_info: diff --git a/src/python/turicreate/test/test_io.py b/src/python/turicreate/test/test_io.py index c16bec9f38..c480fee95f 100644 --- a/src/python/turicreate/test/test_io.py +++ b/src/python/turicreate/test/test_io.py @@ -7,6 +7,7 @@ from __future__ import division as _ from __future__ import absolute_import as _ import sys + if sys.version_info.major >= 3: import subprocess as commands else: @@ -27,19 +28,21 @@ from turicreate.toolkits._model import Model from pandas.util.testing import assert_frame_equal -restricted_place = '/root' -if sys.platform == 'win32': - restricted_place = 'C:/Windows/System32/config/RegBack' -elif sys.platform == 'darwin': - restricted_place = '/System' +restricted_place = "/root" +if sys.platform == "win32": + restricted_place = "C:/Windows/System32/config/RegBack" +elif sys.platform == "darwin": + restricted_place = "/System" if sys.version_info.major >= 3: unichr = chr + def _test_save_load_object_helper(testcase, obj, path): """ Helper function to test save and load a server side object to a given url. """ + def cleanup(url): """ Remove the saved file from temp directory. @@ -72,28 +75,37 @@ def assert_same_elements(x, y): testcase.assertEqual(obj.shape, newobj.shape) testcase.assertEqual(obj.column_names(), newobj.column_names()) testcase.assertEqual(obj.column_types(), newobj.column_types()) - assert_frame_equal(obj.head(obj.num_rows()).to_dataframe(), - newobj.head(newobj.num_rows()).to_dataframe()) + assert_frame_equal( + obj.head(obj.num_rows()).to_dataframe(), + newobj.head(newobj.num_rows()).to_dataframe(), + ) else: raise TypeError cleanup(path) def create_test_objects(): - vertices = pandas.DataFrame({'vid': ['1', '2', '3'], - 'color': ['g', 'r', 'b'], - 'vec': [[.1, .1, .1], [.1, .1, .1], [.1, .1, .1]]}) - edges = pandas.DataFrame({'src_id': ['1', '2', '3'], - 'dst_id': ['2', '3', '4'], - 'weight': [0., 0.1, 1.]}) - - graph = SGraph().add_vertices(vertices, 'vid').add_edges(edges, 'src_id', 'dst_id') + vertices = pandas.DataFrame( + { + "vid": ["1", "2", "3"], + "color": ["g", "r", "b"], + "vec": [[0.1, 0.1, 0.1], [0.1, 0.1, 0.1], [0.1, 0.1, 0.1]], + } + ) + edges = pandas.DataFrame( + { + "src_id": ["1", "2", "3"], + "dst_id": ["2", "3", "4"], + "weight": [0.0, 0.1, 1.0], + } + ) + + graph = SGraph().add_vertices(vertices, "vid").add_edges(edges, "src_id", "dst_id") sframe = SFrame(edges) return (graph, sframe) class LocalFSConnectorTests(unittest.TestCase): - @classmethod def setUpClass(self): self.tempfile = tempfile.NamedTemporaryFile().name @@ -112,15 +124,14 @@ def test_object_save_load(self): _test_save_load_object_helper(self, self.sframe, self.tempfile) def test_basic(self): - self._test_read_write_helper(self.tempfile, 'hello world') + self._test_read_write_helper(self.tempfile, "hello world") def test_gzip(self): - self._test_read_write_helper(self.tempfile + ".gz", 'hello world') - self._test_read_write_helper(self.tempfile + ".csv.gz", 'hello world') + self._test_read_write_helper(self.tempfile + ".gz", "hello world") + self._test_read_write_helper(self.tempfile + ".csv.gz", "hello world") class HttpConnectorTests(unittest.TestCase): - @classmethod def setUpClass(self): self.url = "http://s3-us-west-2.amazonaws.com/testdatasets/a_to_z.txt.gz" @@ -131,12 +142,14 @@ def _test_read_helper(self, url, content_expected): self.assertEqual(content_read, content_expected) def test_read(self): - expected = "\n".join([str(unichr(i + ord('a'))) for i in range(26)]) + expected = "\n".join([str(unichr(i + ord("a"))) for i in range(26)]) expected = expected + "\n" self._test_read_helper(self.url, expected) def test_exception(self): - self.assertRaises(IOError, lambda: glconnect.get_unity().__write__(self.url, '.....')) + self.assertRaises( + IOError, lambda: glconnect.get_unity().__write__(self.url, ".....") + ) @unittest.skip("Disabling HDFS Connector Tests") @@ -155,20 +168,24 @@ def _test_read_write_helper(self, url, content_expected): content_read = glconnect.get_unity().__read__(url) self.assertEqual(content_read, content_expected) # clean up the file we wrote - status, output = commands.getstatusoutput('hadoop fs -test -e ' + url) + status, output = commands.getstatusoutput("hadoop fs -test -e " + url) if status is 0: - commands.getstatusoutput('hadoop fs -rm ' + url) + commands.getstatusoutput("hadoop fs -rm " + url) def test_basic(self): if self.has_hdfs: - self._test_read_write_helper("hdfs://" + self.tempfile, 'hello,world,woof') + self._test_read_write_helper("hdfs://" + self.tempfile, "hello,world,woof") else: logging.getLogger(__name__).info("No hdfs available. Test pass.") def test_gzip(self): if self.has_hdfs: - self._test_read_write_helper("hdfs://" + self.tempfile + ".gz", 'hello,world,woof') - self._test_read_write_helper("hdfs://" + self.tempfile + ".csv.gz", 'hello,world,woof') + self._test_read_write_helper( + "hdfs://" + self.tempfile + ".gz", "hello,world,woof" + ) + self._test_read_write_helper( + "hdfs://" + self.tempfile + ".csv.gz", "hello,world,woof" + ) else: logging.getLogger(__name__).info("No hdfs available. Test pass.") @@ -183,12 +200,26 @@ def test_object_save_load(self): def test_exception(self): bad_url = "hdfs:///root/" if self.has_hdfs: - self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("hdfs:///")) - self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("hdfs:///tmp")) - self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("hdfs://" + self.tempfile)) - self.assertRaises(IOError, lambda: glconnect.get_unity().__write__(bad_url + "/tmp", "somerandomcontent")) + self.assertRaises( + IOError, lambda: glconnect.get_unity().__read__("hdfs:///") + ) + self.assertRaises( + IOError, lambda: glconnect.get_unity().__read__("hdfs:///tmp") + ) + self.assertRaises( + IOError, + lambda: glconnect.get_unity().__read__("hdfs://" + self.tempfile), + ) + self.assertRaises( + IOError, + lambda: glconnect.get_unity().__write__( + bad_url + "/tmp", "somerandomcontent" + ), + ) self.assertRaises(IOError, lambda: self.graph.save(bad_url + "x.graph")) - self.assertRaises(IOError, lambda: self.sframe.save(bad_url + "x.frame_idx")) + self.assertRaises( + IOError, lambda: self.sframe.save(bad_url + "x.frame_idx") + ) self.assertRaises(IOError, lambda: load_sgraph(bad_url + "mygraph")) self.assertRaises(IOError, lambda: load_sframe(bad_url + "x.frame_idx")) self.assertRaises(IOError, lambda: load_model(bad_url + "x.model")) @@ -201,8 +232,8 @@ class S3ConnectorTests(unittest.TestCase): # This test requires aws cli to be installed. If not, the tests will be skipped. @classmethod def setUpClass(self): - status, output = commands.getstatusoutput('aws s3api list-buckets') - self.has_s3 = (status is 0) + status, output = commands.getstatusoutput("aws s3api list-buckets") + self.has_s3 = status is 0 self.standard_bucket = None self.regional_bucket = None # Use aws cli s3api to find a bucket with "gl-testdata" in the name, and use it as out test bucket. @@ -210,15 +241,17 @@ def setUpClass(self): if self.has_s3: try: json_output = json.loads(output) - bucket_list = [b['Name'] for b in json_output['Buckets']] - assert 'gl-testdata' in bucket_list - assert 'gl-testdata-oregon' in bucket_list - self.standard_bucket = 'gl-testdata' - self.regional_bucket = 'gl-testdata-oregon' + bucket_list = [b["Name"] for b in json_output["Buckets"]] + assert "gl-testdata" in bucket_list + assert "gl-testdata-oregon" in bucket_list + self.standard_bucket = "gl-testdata" + self.regional_bucket = "gl-testdata-oregon" self.tempfile = tempfile.NamedTemporaryFile().name (self.graph, self.sframe) = create_test_objects() except: - logging.getLogger(__name__).warning("Fail parsing ioutput of s3api into json. Please check your awscli version.") + logging.getLogger(__name__).warning( + "Fail parsing ioutput of s3api into json. Please check your awscli version." + ) self.has_s3 = False def _test_read_write_helper(self, url, content_expected): @@ -226,20 +259,27 @@ def _test_read_write_helper(self, url, content_expected): glconnect.get_unity().__write__(s3url, content_expected) content_read = glconnect.get_unity().__read__(s3url) self.assertEqual(content_read, content_expected) - (status, output) = commands.getstatusoutput('aws s3 rm --region us-west-2 ' + url) + (status, output) = commands.getstatusoutput( + "aws s3 rm --region us-west-2 " + url + ) if status is not 0: logging.getLogger(__name__).warning("Cannot remove file: " + url) def test_basic(self): if self.has_s3: for bucket in [self.standard_bucket, self.regional_bucket]: - self._test_read_write_helper("s3://" + bucket + self.tempfile, 'hello,world,woof') + self._test_read_write_helper( + "s3://" + bucket + self.tempfile, "hello,world,woof" + ) else: logging.getLogger(__name__).info("No s3 bucket available. Test pass.") def test_gzip(self): if self.has_s3: - self._test_read_write_helper("s3://" + self.standard_bucket + self.tempfile + ".gz", 'hello,world,woof') + self._test_read_write_helper( + "s3://" + self.standard_bucket + self.tempfile + ".gz", + "hello,world,woof", + ) else: logging.getLogger(__name__).info("No s3 bucket available. Test pass.") @@ -256,12 +296,32 @@ def test_exception(self): bad_bucket = "i_am_a_bad_bucket" prefix = "s3://" + bad_bucket self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("s3:///")) - self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("s3://" + self.standard_bucket + "/somerandomfile")) - self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("s3://" + "/somerandomfile")) - self.assertRaises(IOError, lambda: glconnect.get_unity().__write__("s3://" + "/somerandomfile", "somerandomcontent")) - self.assertRaises(IOError, lambda: glconnect.get_unity().__write__("s3://" + self.standard_bucket + "I'amABadUrl/", "somerandomcontent")) + self.assertRaises( + IOError, + lambda: glconnect.get_unity().__read__( + "s3://" + self.standard_bucket + "/somerandomfile" + ), + ) + self.assertRaises( + IOError, + lambda: glconnect.get_unity().__read__("s3://" + "/somerandomfile"), + ) + self.assertRaises( + IOError, + lambda: glconnect.get_unity().__write__( + "s3://" + "/somerandomfile", "somerandomcontent" + ), + ) + self.assertRaises( + IOError, + lambda: glconnect.get_unity().__write__( + "s3://" + self.standard_bucket + "I'amABadUrl/", "somerandomcontent" + ), + ) self.assertRaises(IOError, lambda: self.graph.save(prefix + "/x.graph")) - self.assertRaises(IOError, lambda: self.sframe.save(prefix + "/x.frame_idx")) + self.assertRaises( + IOError, lambda: self.sframe.save(prefix + "/x.frame_idx") + ) self.assertRaises(IOError, lambda: load_sgraph(prefix + "/x.graph")) self.assertRaises(IOError, lambda: load_sframe(prefix + "/x.frame_idx")) self.assertRaises(IOError, lambda: load_model(prefix + "/x.model")) diff --git a/src/python/turicreate/test/test_json.py b/src/python/turicreate/test/test_json.py index fb5bd90e08..4f965b3982 100644 --- a/src/python/turicreate/test/test_json.py +++ b/src/python/turicreate/test/test_json.py @@ -18,7 +18,7 @@ import array import datetime import hypothesis -import json # Python built-in JSON module +import json # Python built-in JSON module import math import os import pandas @@ -31,7 +31,7 @@ import tempfile from . import util -from .. import _json # turicreate._json +from .. import _json # turicreate._json from ..data_structures.sarray import SArray from ..data_structures.sframe import SFrame from ..data_structures.sgraph import SGraph, Vertex, Edge @@ -39,29 +39,35 @@ if sys.version_info.major == 3: long = int + class image_info: def __init__(self, url): self.url = url - if 'png' in url: - self.format = 'PNG' - elif 'jpg' in url: - self.format = 'JPG' - if 'grey' in url: + if "png" in url: + self.format = "PNG" + elif "jpg" in url: + self.format = "JPG" + if "grey" in url: self.channels = 1 else: self.channels = 3 + current_file_dir = os.path.dirname(os.path.realpath(__file__)) -image_urls = [current_file_dir + x for x in [ - '/images/nested/sample_grey.jpg', - '/images/nested/sample_grey.png', - '/images/sample.jpg', - '/images/sample.png' -]] +image_urls = [ + current_file_dir + x + for x in [ + "/images/nested/sample_grey.jpg", + "/images/nested/sample_grey.png", + "/images/sample.jpg", + "/images/sample.png", + ] +] image_info = [image_info(u) for u in image_urls] _SFrameComparer = util.SFrameComparer() + class JSONTest(unittest.TestCase): # Only generate lists of dicts, but allow nearly-arbitrary JSON inside those. # However, limit to length 1 to make sure the keys are the same in all rows. @@ -92,26 +98,32 @@ class JSONTest(unittest.TestCase): values=hypothesis.strategies.recursive( # Known bug #2: [{"": null}] parses as "null" in SFrame, and should be None # Once this is fixed, uncomment the line below. - #hypothesis.strategies.none() | + # hypothesis.strategies.none() | # Known bug #3: [{"": false}] parses as "false" in SFrame, and should be 0 # Once this is fixed, uncomment the line below. - #hypothesis.strategies.booleans() | - hypothesis.strategies.integers(min_value=-(2**53)+1, max_value=(2**53)-1) | - hypothesis.strategies.floats() | - hypothesis.strategies.text(string.ascii_letters + string.digits), - lambda children: hypothesis.strategies.lists(children, 1) | - hypothesis.strategies.dictionaries( - hypothesis.strategies.text(string.ascii_letters + string.digits), children, min_size=1)), + # hypothesis.strategies.booleans() | + hypothesis.strategies.integers( + min_value=-(2 ** 53) + 1, max_value=(2 ** 53) - 1 + ) + | hypothesis.strategies.floats() + | hypothesis.strategies.text(string.ascii_letters + string.digits), + lambda children: hypothesis.strategies.lists(children, 1) + | hypothesis.strategies.dictionaries( + hypothesis.strategies.text(string.ascii_letters + string.digits), + children, + min_size=1, + ), + ), min_size=1, - max_size=1 + max_size=1, ), min_size=1, - max_size=1 + max_size=1, ) def _assertEqual(self, x, y): - if type(x) in [long,int]: - self.assertTrue(type(y) in [long,int]) + if type(x) in [long, int]: + self.assertTrue(type(y) in [long, int]) elif isinstance(x, six.string_types): self.assertTrue(isinstance(y, six.string_types)) else: @@ -125,11 +137,11 @@ def _assertEqual(self, x, y): elif isinstance(x, SGraph): _SFrameComparer._assert_sgraph_equal(x, y) elif isinstance(x, dict): - for (k1,v1),(k2,v2) in zip(sorted(x.items()), sorted(y.items())): + for (k1, v1), (k2, v2) in zip(sorted(x.items()), sorted(y.items())): self._assertEqual(k1, k2) self._assertEqual(v1, v2) elif isinstance(x, list): - for v1,v2 in zip(x, y): + for v1, v2 in zip(x, y): self._assertEqual(v1, v2) else: self.assertEqual(x, y) @@ -143,59 +155,55 @@ def _run_test_case(self, value): data = json.loads(json.dumps(data, allow_nan=False)) schema = json.loads(json.dumps(schema, allow_nan=False)) - #print("----------------------------------") - #print("Value: %s" % value) - #print("Serializable Data: %s" % data) - #print("Serializable Schema: %s" % schema) + # print("----------------------------------") + # print("Value: %s" % value) + # print("Serializable Data: %s" % data) + # print("Serializable Schema: %s" % schema) result = _json.from_serializable(data, schema) - #print("Deserialized Result: %s" % result) - #print("----------------------------------") + # print("Deserialized Result: %s" % result) + # print("----------------------------------") self._assertEqual(result, value) # test that JSON serialization gives expected result serialized = _json.dumps(value) deserialized = _json.loads(serialized) self._assertEqual(deserialized, value) - @unittest.skipIf(sys.platform == 'win32', "Windows long issue") + @unittest.skipIf(sys.platform == "win32", "Windows long issue") def test_int(self): - [self._run_test_case(value) for value in [ - 0, - 1, - -2147483650, - -2147483649, # boundary of accurate representation in JS 64-bit float - 2147483648, # boundary of accurate representation in JS 64-bit float - 2147483649, - ]] + [ + self._run_test_case(value) + for value in [ + 0, + 1, + -2147483650, + -2147483649, # boundary of accurate representation in JS 64-bit float + 2147483648, # boundary of accurate representation in JS 64-bit float + 2147483649, + ] + ] def test_float(self): - [self._run_test_case(value) for value in [ - -1.1, - -1.0, - 0.0, - 1.0, - 1.1, - float('-inf'), - float('inf'), - ]] + [ + self._run_test_case(value) + for value in [-1.1, -1.0, 0.0, 1.0, 1.1, float("-inf"), float("inf"),] + ] self.assertTrue( - math.isnan( - _json.from_serializable(*_json.to_serializable(float('nan'))))) + math.isnan(_json.from_serializable(*_json.to_serializable(float("nan")))) + ) def test_string_to_json(self): - [self._run_test_case(value) for value in [ - "hello", - "a'b", - "a\"b", - "ɖɞɫɷ", - ]] + [self._run_test_case(value) for value in ["hello", "a'b", 'a"b', "ɖɞɫɷ",]] def test_vec_to_json(self): - [self._run_test_case(value) for value in [ - array.array('d'), - array.array('d', [1.5]), - array.array('d', [2.1,2.5,3.1]), - array.array('d', [float('-inf'), float('inf')]), - ]] + [ + self._run_test_case(value) + for value in [ + array.array("d"), + array.array("d", [1.5]), + array.array("d", [2.1, 2.5, 3.1]), + array.array("d", [float("-inf"), float("inf")]), + ] + ] def test_list_to_json(self): # TODO -- we can't test lists of numbers, due to @@ -203,108 +211,115 @@ def test_list_to_json(self): # if `list` of `int` goes into C++, the flexible_type representation # becomes flex_vec (vector). This is a lossy representation. # known issue, can't resolve here. - [self._run_test_case(value) for value in [ - [], - ["hello", "world"], - ["hello", 3, None], - [3.14159, None], - [{}, {'x': 1, 'y': 2}], - ["hello", float('-inf'), float('inf')], - ]] + [ + self._run_test_case(value) + for value in [ + [], + ["hello", "world"], + ["hello", 3, None], + [3.14159, None], + [{}, {"x": 1, "y": 2}], + ["hello", float("-inf"), float("inf")], + ] + ] def test_dict_to_json(self): - [self._run_test_case(value) for value in [ - {}, - { - "x": 1, - "y": 2 - }, - ]] + [self._run_test_case(value) for value in [{}, {"x": 1, "y": 2},]] def test_date_time_to_json(self): d = datetime.datetime(year=2016, month=3, day=5) - [self._run_test_case(value) for value in [ - d, - pytz.utc.localize(d), - pytz.timezone('US/Arizona').localize(d), - ]] + [ + self._run_test_case(value) + for value in [ + d, + pytz.utc.localize(d), + pytz.timezone("US/Arizona").localize(d), + ] + ] def test_image_to_json(self): from .. import Image - [self._run_test_case(value) for value in [ - Image(path=item.url, format=item.format) for item in image_info - ]] + + [ + self._run_test_case(value) + for value in [ + Image(path=item.url, format=item.format) for item in image_info + ] + ] def test_sarray_to_json(self): from .. import Image d = datetime.datetime(year=2016, month=3, day=5) - [self._run_test_case(value) for value in [ - SArray(), - SArray([1,2,3]), - SArray([1.0,2.0,3.0]), - SArray([None, 3, None]), - SArray(["hello", "world"]), - SArray(array.array('d', [2.1,2.5,3.1])), - SArray([ - ["hello", None, "world"], - ["hello", 3, None], - [3.14159, None], - ]), - SArray([ - { - "x": 1, - "y": 2 - }, { - "x": 5, - "z": 3 - }, - ]), - SArray([ - d, - pytz.utc.localize(d), - pytz.timezone('US/Arizona').localize(d), - ]), - SArray([ - Image(path=item.url, format=item.format) for item in image_info - ]), - ]] + [ + self._run_test_case(value) + for value in [ + SArray(), + SArray([1, 2, 3]), + SArray([1.0, 2.0, 3.0]), + SArray([None, 3, None]), + SArray(["hello", "world"]), + SArray(array.array("d", [2.1, 2.5, 3.1])), + SArray( + [["hello", None, "world"], ["hello", 3, None], [3.14159, None],] + ), + SArray([{"x": 1, "y": 2}, {"x": 5, "z": 3},]), + SArray( + [d, pytz.utc.localize(d), pytz.timezone("US/Arizona").localize(d),] + ), + SArray( + [Image(path=item.url, format=item.format) for item in image_info] + ), + ] + ] def test_sframe_to_json(self): - [self._run_test_case(value) for value in [ - SFrame(), - SFrame({'foo': [1,2,3,4], 'bar': [None, "Hello", None, "World"]}), - ]] + [ + self._run_test_case(value) + for value in [ + SFrame(), + SFrame({"foo": [1, 2, 3, 4], "bar": [None, "Hello", None, "World"]}), + ] + ] def test_sgraph_to_json(self): sg = SGraph() self._run_test_case(sg) - sg = sg.add_vertices([Vertex(x) for x in [1,2,3,4]]) - sg = sg.add_edges([Edge(x, x+1) for x in [1,2,3]]) + sg = sg.add_vertices([Vertex(x) for x in [1, 2, 3, 4]]) + sg = sg.add_edges([Edge(x, x + 1) for x in [1, 2, 3]]) self._run_test_case(sg) def test_nested_to_json(self): # not tested in the cases above: nested data, nested schema # (but all flexible_type compatible) - [self._run_test_case(value) for value in [ - {'foo': ['a','b','c'], 'bar': array.array('d', [0.0, float('inf'), float('-inf')])}, - [['a','b','c'], array.array('d', [0.0, float('inf'), float('-inf')])], - { - 'baz': {'foo': ['a','b','c'], 'bar': array.array('d', [0.0, float('inf'), float('-inf')])}, - 'qux': [['a','b','c'], array.array('d', [0.0, float('inf'), float('-inf')])], - } - ]] + [ + self._run_test_case(value) + for value in [ + { + "foo": ["a", "b", "c"], + "bar": array.array("d", [0.0, float("inf"), float("-inf")]), + }, + [["a", "b", "c"], array.array("d", [0.0, float("inf"), float("-inf")])], + { + "baz": { + "foo": ["a", "b", "c"], + "bar": array.array("d", [0.0, float("inf"), float("-inf")]), + }, + "qux": [ + ["a", "b", "c"], + array.array("d", [0.0, float("inf"), float("-inf")]), + ], + }, + ] + ] def test_variant_to_json(self): # not tested in the cases above: variant_type other than SFrame-like # but containing SFrame-like (so cannot be a flexible_type) - sf = SFrame({'col1': [1,2], 'col2': ['hello','world']}) - sa = SArray([5.0,6.0,7.0]) - [self._run_test_case(value) for value in [ - {'foo': sf, 'bar': sa}, - [sf, sa], - ]] + sf = SFrame({"col1": [1, 2], "col2": ["hello", "world"]}) + sa = SArray([5.0, 6.0, 7.0]) + [self._run_test_case(value) for value in [{"foo": sf, "bar": sa}, [sf, sa],]] def test_malformed_json(self): out = """ @@ -331,7 +346,7 @@ def test_malformed_json(self): } ] """ - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile("w") as f: f.write(out) f.flush() @@ -339,25 +354,29 @@ def test_malformed_json(self): self.assertRaises(RuntimeError, SFrame.read_json, f.name) def test_nonexistant_json(self): - self.assertRaises(IOError, SArray.read_json, '/nonexistant.json') - self.assertRaises(IOError, SFrame.read_json, '/nonexistant.json') + self.assertRaises(IOError, SArray.read_json, "/nonexistant.json") + self.assertRaises(IOError, SFrame.read_json, "/nonexistant.json") def test_strange_128_char_corner_case(self): json_text = """ {"foo":[{"bar":"Lorem ipsum dolor sit amet, consectetur adipiscing elit. In eget odio velit. Suspendisse potenti. Vivamus a urna feugiat nullam."}]} """ - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile("w") as f: f.write(json_text) f.flush() df = pandas.read_json(f.name, lines=True) - sf_actual = SFrame.read_json(f.name, orient='lines') + sf_actual = SFrame.read_json(f.name, orient="lines") sf_expected = SFrame(df) _SFrameComparer._assert_sframe_equal(sf_expected, sf_actual) - @pytest.mark.xfail(reason="Non-deterministic test failure tracked in https://github.com/apple/turicreate/issues/2934") + @pytest.mark.xfail( + reason="Non-deterministic test failure tracked in https://github.com/apple/turicreate/issues/2934" + ) # deterministic across runs, and may take a while - @hypothesis.settings(derandomize=True, suppress_health_check=[hypothesis.HealthCheck.too_slow]) + @hypothesis.settings( + derandomize=True, suppress_health_check=[hypothesis.HealthCheck.too_slow] + ) @hypothesis.given(hypothesis_json) def test_arbitrary_json(self, json_obj): # Known bug #1: escaped chars give different behavior in SFrame JSON parsing @@ -377,13 +396,13 @@ def test_arbitrary_json(self, json_obj): return try: - expected = SFrame(json_obj).unpack('X1', column_name_prefix='') + expected = SFrame(json_obj).unpack("X1", column_name_prefix="") except TypeError: # something like TypeError: A common type cannot be infered from types integer, string. # TC enforces all list items have the same type, which # JSON does not necessarily enforce. Let's skip those examples. return - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile("w") as f: f.write(json_text) f.flush() @@ -397,10 +416,9 @@ def test_arbitrary_json(self, json_obj): _SFrameComparer._assert_sframe_equal(expected, actual) - def test_true_false_substitutions(self): - expecteda = [["a", "b", "c"],["a", "b", "c"]] - expectedb = [["d", "false", "e", 0, "true", 1, "a"],["d", "e", "f"]] + expecteda = [["a", "b", "c"], ["a", "b", "c"]] + expectedb = [["d", "false", "e", 0, "true", 1, "a"], ["d", "e", "f"]] records_json_file = """ [{"a" : ["a", "b", "c"], @@ -413,18 +431,17 @@ def test_true_false_substitutions(self): {"a" : ["a", "b", "c"], "b" : ["d", "e", "f"]} """ - - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile("w") as f: f.write(records_json_file) f.flush() - records = SFrame.read_json(f.name, orient='records') + records = SFrame.read_json(f.name, orient="records") self.assertEqual(list(records["a"]), expecteda) self.assertEqual(list(records["b"]), expectedb) - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile("w") as f: f.write(lines_json_file) f.flush() - lines = SFrame.read_json(f.name, orient='lines') + lines = SFrame.read_json(f.name, orient="lines") self.assertEqual(list(lines["a"]), expecteda) self.assertEqual(list(lines["b"]), expectedb) diff --git a/src/python/turicreate/test/test_json_export.py b/src/python/turicreate/test/test_json_export.py index 8e10defebc..56e0c9acca 100644 --- a/src/python/turicreate/test/test_json_export.py +++ b/src/python/turicreate/test/test_json_export.py @@ -26,22 +26,23 @@ _TEST_CASE_SIZE = 1000 + class JSONExporterTest(unittest.TestCase): # tests int/float/str def test_simple_types(self): np.random.seed(42) sf = tc.SFrame() - sf['idx'] = range(_TEST_CASE_SIZE) - sf['ints'] = np.random.randint(-100000, 100000, _TEST_CASE_SIZE) - sf['strings'] = sf['ints'].astype(str) - sf['floats'] = np.random.random(_TEST_CASE_SIZE) + sf["idx"] = range(_TEST_CASE_SIZE) + sf["ints"] = np.random.randint(-100000, 100000, _TEST_CASE_SIZE) + sf["strings"] = sf["ints"].astype(str) + sf["floats"] = np.random.random(_TEST_CASE_SIZE) # TODO: nans and infs will break JSON - what should we do about this? - #sf['nans_and_infs'] = sf['idx'].apply(lambda x: float('nan') if x > 0 else float('inf')) + # sf['nans_and_infs'] = sf['idx'].apply(lambda x: float('nan') if x > 0 else float('inf')) - with tempfile.NamedTemporaryFile(mode='w', suffix = '.json') as json_file: - sf.save(json_file.name, format='json') + with tempfile.NamedTemporaryFile(mode="w", suffix=".json") as json_file: + sf.save(json_file.name, format="json") with open(json_file.name) as json_data: # will throw if JSON export doesn't work loaded = json.load(json_data) @@ -49,9 +50,9 @@ def test_simple_types(self): def test_array_dtype(self): np.random.seed(42) sf = tc.SFrame() - sf['arr'] = np.random.rand(100,3) - with tempfile.NamedTemporaryFile(mode='w', suffix = '.json') as json_file: - sf.save(json_file.name, format='json') + sf["arr"] = np.random.rand(100, 3) + with tempfile.NamedTemporaryFile(mode="w", suffix=".json") as json_file: + sf.save(json_file.name, format="json") with open(json_file.name) as json_data: # will throw if JSON export doesn't work loaded = json.load(json_data) diff --git a/src/python/turicreate/test/test_kmeans.py b/src/python/turicreate/test/test_kmeans.py index 4fd4fe37e2..322feaf6c2 100644 --- a/src/python/turicreate/test/test_kmeans.py +++ b/src/python/turicreate/test/test_kmeans.py @@ -19,6 +19,7 @@ from turicreate.toolkits._main import ToolkitError import sys + if sys.version_info.major == 3: unittest.TestCase.assertItemsEqual = unittest.TestCase.assertCountEqual @@ -37,20 +38,20 @@ def make_clustering_data(n, d, seed=None): # integer and float columns for i in range(d): - sf['int{}'.format(i)] = np.random.randint(low=-10, high=10, size=n) + sf["int{}".format(i)] = np.random.randint(low=-10, high=10, size=n) for i in range(d): v = np.random.rand(n) - sf['float{}'.format(i)] = v * 20 - 10 + sf["float{}".format(i)] = v * 20 - 10 # dict column - string_col = test_util.uniform_string_column(n, word_length=5, - alphabet_size=5, - missingness=0.) + string_col = test_util.uniform_string_column( + n, word_length=5, alphabet_size=5, missingness=0.0 + ) - sf['dict0'] = tc.text_analytics.count_ngrams(string_col, - n=3, method='character', - to_lower=False) + sf["dict0"] = tc.text_analytics.count_ngrams( + string_col, n=3, method="character", to_lower=False + ) return sf @@ -66,30 +67,34 @@ def setUp(self): self.K = 10 self.max_iter = 10 self.sf = make_clustering_data(n=self.n, d=self.dim, seed=8) - self.model = tc.kmeans.create(self.sf, num_clusters=self.K, - max_iterations=self.max_iter, - batch_size=None, verbose=False) + self.model = tc.kmeans.create( + self.sf, + num_clusters=self.K, + max_iterations=self.max_iter, + batch_size=None, + verbose=False, + ) def test__list_fields(self): """ Check the model list fields method. """ correct_fields = [ - 'batch_size', - 'row_label_name', - 'cluster_id', - 'cluster_info', - 'features', - 'max_iterations', - 'method', - 'num_clusters', - 'num_examples', - 'num_features', - 'num_unpacked_features', - 'training_iterations', - 'training_time', - 'unpacked_features', - ] + "batch_size", + "row_label_name", + "cluster_id", + "cluster_info", + "features", + "max_iterations", + "method", + "num_clusters", + "num_examples", + "num_features", + "num_unpacked_features", + "training_iterations", + "training_time", + "unpacked_features", + ] self.assertItemsEqual(self.model._list_fields(), correct_fields) @@ -98,13 +103,13 @@ def test_get(self): Check the various 'get' methods against known answers for each field. """ correct_fields = { - 'max_iterations': self.max_iter, - 'row_label_name': 'row_id', - 'num_clusters': self.K, - 'num_examples': self.n, - 'method': 'elkan', - 'batch_size': self.n, - 'num_features': 2 * self.dim + 1, + "max_iterations": self.max_iter, + "row_label_name": "row_id", + "num_clusters": self.K, + "num_examples": self.n, + "method": "elkan", + "batch_size": self.n, + "num_features": 2 * self.dim + 1, } print(self.model) @@ -112,8 +117,10 @@ def test_get(self): self.assertEqual(self.model._get(field), ans, "{} failed".format(field)) self.assertGreaterEqual(self.model.training_time, 0) self.assertGreater(self.model.num_unpacked_features, self.n) - self.assertItemsEqual(self.model.features, - ['int0', 'int1', 'int2', 'float0', 'float1', 'float2', 'dict0']) + self.assertItemsEqual( + self.model.features, + ["int0", "int1", "int2", "float0", "float1", "float2", "dict0"], + ) def test_summaries(self): """ @@ -165,25 +172,23 @@ def test_predict_params(self): ans = self.model.predict(tc.SFrame(), verbose=False) with self.assertRaises(ToolkitError): - ans = self.model.predict(self.sf['int0'], verbose=False) + ans = self.model.predict(self.sf["int0"], verbose=False) ## Check that mis-matching schemas are trapped correctly. All features # are used when the model is created. with self.assertRaises(ToolkitError): - ans = self.model.predict(self.sf[['int0']], verbose=False) + ans = self.model.predict(self.sf[["int0"]], verbose=False) ## Check that bad inputs for the output type are trapped correctly. with self.assertRaises(TypeError): ans = self.model.predict(self.sf, output_type=1, verbose=False) ## Check that output type correctly changes the type of the outputs. - ans = self.model.predict(self.sf, output_type='cluster_id', - verbose=False) + ans = self.model.predict(self.sf, output_type="cluster_id", verbose=False) self.assertIsInstance(ans, tc.SArray) self.assertTrue(ans.dtype == int) - ans = self.model.predict(self.sf, output_type='distance', - verbose=False) + ans = self.model.predict(self.sf, output_type="distance", verbose=False) self.assertIsInstance(ans, tc.SArray) self.assertTrue(ans.dtype == float) @@ -219,8 +224,13 @@ def test_input_mutations(self): features = copy.copy(self.sf.column_names()) ## Create a model with the copied objects - m = tc.kmeans.create(sf, features=features, num_clusters=K, - max_iterations=max_iter, verbose=verbose) + m = tc.kmeans.create( + sf, + features=features, + num_clusters=K, + max_iterations=max_iter, + verbose=verbose, + ) ## Check that the copies still equal the originals assert_sframe_equal(sf, self.sf) @@ -236,13 +246,21 @@ def test_bad_data(self): """ ## Empty SFrame with self.assertRaises(ValueError): - m = tc.kmeans.create(dataset=tc.SFrame(), num_clusters=self.K, - max_iterations=self.max_iter, verbose=False) + m = tc.kmeans.create( + dataset=tc.SFrame(), + num_clusters=self.K, + max_iterations=self.max_iter, + verbose=False, + ) ## Dataset as an SArray with self.assertRaises(TypeError): - m = tc.kmeans.create(dataset=self.sf['int0'], num_clusters=self.K, - max_iterations=self.max_iter, verbose=False) + m = tc.kmeans.create( + dataset=self.sf["int0"], + num_clusters=self.K, + max_iterations=self.max_iter, + verbose=False, + ) def test_bogus_parameters(self): """ @@ -250,25 +268,34 @@ def test_bogus_parameters(self): and types. """ ## Bad values for the number of clusters - for k in [0, -1, 'fossa', 3.5]: + for k in [0, -1, "fossa", 3.5]: with self.assertRaises(ToolkitError): m = tc.kmeans.create(dataset=self.sf, num_clusters=k, verbose=False) with self.assertRaises(ValueError): - m = tc.kmeans.create(dataset=self.sf, num_clusters=self.n + 1, - verbose=False) + m = tc.kmeans.create( + dataset=self.sf, num_clusters=self.n + 1, verbose=False + ) ## Bad values for max_iterations - for max_iter in [-1, 'fossa', 3.5]: + for max_iter in [-1, "fossa", 3.5]: with self.assertRaises(ToolkitError): - m = tc.kmeans.create(dataset=self.sf, num_clusters=self.K, - max_iterations=max_iter, verbose=False) + m = tc.kmeans.create( + dataset=self.sf, + num_clusters=self.K, + max_iterations=max_iter, + verbose=False, + ) ## Bad values for batch_size - for batch_size in [-1, 0, 'fossa', 3.5]: + for batch_size in [-1, 0, "fossa", 3.5]: with self.assertRaises(ToolkitError): - m = tc.kmeans.create(dataset=self.sf, num_clusters=self.K, - batch_size=batch_size, verbose=False) + m = tc.kmeans.create( + dataset=self.sf, + num_clusters=self.K, + batch_size=batch_size, + verbose=False, + ) def test_default_inputs(self): """ @@ -282,10 +309,10 @@ def test_default_inputs(self): m = tc.kmeans.create(self.sf, self.K) correct_fields = { - 'max_iterations': 10, - 'num_features': 2 * self.dim + 1, - 'method': 'elkan', - 'batch_size': self.n, + "max_iterations": 10, + "num_features": 2 * self.dim + 1, + "method": "elkan", + "batch_size": self.n, } for field, ans in correct_fields.items(): @@ -298,58 +325,72 @@ def test_features_param(self): ## Default is to get all features from the SFrame. m = tc.kmeans.create(self.sf, num_clusters=self.K, verbose=False) - self.assertItemsEqual(m.features, - ['int0', 'int1', 'int2', 'float0', 'float1', 'float2', 'dict0']) + self.assertItemsEqual( + m.features, ["int0", "int1", "int2", "float0", "float1", "float2", "dict0"] + ) ## The features parameter should be reflected correctly in the model. - test_ftrs = ['int0', 'int1', 'int2'] - m = tc.kmeans.create(self.sf, num_clusters=self.K, features=test_ftrs, - verbose=False) + test_ftrs = ["int0", "int1", "int2"] + m = tc.kmeans.create( + self.sf, num_clusters=self.K, features=test_ftrs, verbose=False + ) self.assertItemsEqual(m.features, test_ftrs) ## Missing features are pruned - m = tc.kmeans.create(self.sf, num_clusters=self.K, - features=['int0', 'int1', 'fossa'], verbose=False) - self.assertItemsEqual(m.features, ['int0', 'int1']) + m = tc.kmeans.create( + self.sf, + num_clusters=self.K, + features=["int0", "int1", "fossa"], + verbose=False, + ) + self.assertItemsEqual(m.features, ["int0", "int1"]) ## Duplicate features are pruned - test_ftrs = ['int0', 'int0', 'int1', 'int2'] - m = tc.kmeans.create(self.sf, num_clusters=self.K, features=test_ftrs, - verbose=False) - self.assertItemsEqual(m.features, ['int0', 'int1', 'int2']) + test_ftrs = ["int0", "int0", "int1", "int2"] + m = tc.kmeans.create( + self.sf, num_clusters=self.K, features=test_ftrs, verbose=False + ) + self.assertItemsEqual(m.features, ["int0", "int1", "int2"]) ## Features not specified by a string are pruned. - test_ftrs = [2.71, 'int0', 'int1', 'int2'] - m = tc.kmeans.create(self.sf, num_clusters=self.K, features=test_ftrs, - verbose=False) - self.assertItemsEqual(m.features, ['int0', 'int1', 'int2']) + test_ftrs = [2.71, "int0", "int1", "int2"] + m = tc.kmeans.create( + self.sf, num_clusters=self.K, features=test_ftrs, verbose=False + ) + self.assertItemsEqual(m.features, ["int0", "int1", "int2"]) ## Features with non-valid types are pruned. sf = copy.copy(self.sf) - sf['list0'] = sf['dict0'].dict_keys() + sf["list0"] = sf["dict0"].dict_keys() - test_ftrs = ['list0', 'int0', 'int1', 'int2'] - m = tc.kmeans.create(sf, num_clusters=self.K, features=test_ftrs, - verbose=False) - self.assertItemsEqual(m.features, ['int0', 'int1', 'int2']) + test_ftrs = ["list0", "int0", "int1", "int2"] + m = tc.kmeans.create(sf, num_clusters=self.K, features=test_ftrs, verbose=False) + self.assertItemsEqual(m.features, ["int0", "int1", "int2"]) ## Row label is pruned from the feature list. - sf = sf.add_row_number('row_id') - test_ftrs = ['row_id', 'int0', 'int1', 'int2'] - m = tc.kmeans.create(sf, label='row_id', num_clusters=self.K, - features=test_ftrs, verbose=False) - self.assertItemsEqual(m.features, ['int0', 'int1', 'int2']) + sf = sf.add_row_number("row_id") + test_ftrs = ["row_id", "int0", "int1", "int2"] + m = tc.kmeans.create( + sf, label="row_id", num_clusters=self.K, features=test_ftrs, verbose=False + ) + self.assertItemsEqual(m.features, ["int0", "int1", "int2"]) ## Empty list of features raises an error. with self.assertRaises(ValueError): - m = tc.kmeans.create(self.sf, features=[], num_clusters=self.K, - verbose=False) + m = tc.kmeans.create( + self.sf, features=[], num_clusters=self.K, verbose=False + ) ## All features pruned raises an error. - test_ftrs = ['row_id', 'list0'] + test_ftrs = ["row_id", "list0"] with self.assertRaises(ToolkitError): - m = tc.kmeans.create(sf, features=test_ftrs, label='row_id', - num_clusters=self.K, verbose=False) + m = tc.kmeans.create( + sf, + features=test_ftrs, + label="row_id", + num_clusters=self.K, + verbose=False, + ) def test_label_param(self): """ @@ -357,31 +398,32 @@ def test_label_param(self): """ ## No label leads to integers and 'row_id' m = tc.kmeans.create(self.sf, num_clusters=self.K, verbose=False) - self.assertItemsEqual(m.cluster_id.column_names(), - ['row_id', 'cluster_id', 'distance']) - self.assertEqual(m.cluster_id['row_id'].dtype, int) + self.assertItemsEqual( + m.cluster_id.column_names(), ["row_id", "cluster_id", "distance"] + ) + self.assertEqual(m.cluster_id["row_id"].dtype, int) ## Specified string labels are correctly passed through to the cluster # ID SFrame. - label_name = 'row_labels' + label_name = "row_labels" sf = self.sf.add_row_number(label_name) - sf[label_name] = sf[label_name].astype(str) + 'a' + sf[label_name] = sf[label_name].astype(str) + "a" - m = tc.kmeans.create(sf, label=label_name, num_clusters=self.K, - verbose=False) - self.assertItemsEqual(m.cluster_id.column_names(), - [label_name, 'cluster_id', 'distance']) + m = tc.kmeans.create(sf, label=label_name, num_clusters=self.K, verbose=False) + self.assertItemsEqual( + m.cluster_id.column_names(), [label_name, "cluster_id", "distance"] + ) self.assertEqual(m.cluster_id[label_name].dtype, str) ## Row label 'row_id' that's *not* int row numbers still works ok - label_name = 'row_id' + label_name = "row_id" sf = self.sf.add_row_number(label_name) - sf[label_name] = sf[label_name].astype(str) + 'a' + sf[label_name] = sf[label_name].astype(str) + "a" - m = tc.kmeans.create(sf, label=label_name, num_clusters=self.K, - verbose=False) - self.assertItemsEqual(m.cluster_id.column_names(), - [label_name, 'cluster_id', 'distance']) + m = tc.kmeans.create(sf, label=label_name, num_clusters=self.K, verbose=False) + self.assertItemsEqual( + m.cluster_id.column_names(), [label_name, "cluster_id", "distance"] + ) self.assertEqual(m.cluster_id[label_name].dtype, str) def test_batch_size(self): @@ -392,8 +434,13 @@ def test_batch_size(self): """ ## Typical usage - m = tc.kmeans.create(self.sf, num_clusters=self.K, batch_size=self.n / 5, - max_iterations=10, verbose=False) + m = tc.kmeans.create( + self.sf, + num_clusters=self.K, + batch_size=self.n / 5, + max_iterations=10, + verbose=False, + ) self.assertEqual(m.method, "minibatch") self.assertEqual(m.batch_size, self.n / 5) self.assertEqual(m.max_iterations, 10) @@ -401,8 +448,13 @@ def test_batch_size(self): self.assertEqual(m.cluster_info.num_rows(), self.K) ## Larger batch size than number of examples - m = tc.kmeans.create(self.sf, num_clusters=self.K, batch_size=2 * self.n, - max_iterations=10, verbose=False) + m = tc.kmeans.create( + self.sf, + num_clusters=self.K, + batch_size=2 * self.n, + max_iterations=10, + verbose=False, + ) self.assertEqual(m.method, "elkan") self.assertEqual(m.batch_size, self.n) self.assertEqual(m.max_iterations, 10) @@ -428,27 +480,48 @@ def test_custom_initial_centers(self): ## Empty initial centers with self.assertRaises(ValueError): - m = tc.kmeans.create(dataset=self.sf, initial_centers=tc.SFrame(), - max_iterations=self.max_iter, verbose=False) + m = tc.kmeans.create( + dataset=self.sf, + initial_centers=tc.SFrame(), + max_iterations=self.max_iter, + verbose=False, + ) ## Initial centers as an SArray of indices with self.assertRaises(TypeError): - m = tc.kmeans.create(dataset=self.sf, initial_centers=tc.SArray([1, 2, 3]), - max_iterations=self.max_iter, verbose=False) + m = tc.kmeans.create( + dataset=self.sf, + initial_centers=tc.SArray([1, 2, 3]), + max_iterations=self.max_iter, + verbose=False, + ) ## Initial centers with a schema that doesn't match the data - sf_init = make_clustering_data(n=10, d=self.dim-1, seed=43) + sf_init = make_clustering_data(n=10, d=self.dim - 1, seed=43) with self.assertRaises(ValueError): - m = tc.kmeans.create(dataset=self.sf, initial_centers=sf_init, - max_iterations=self.max_iter, verbose=False) + m = tc.kmeans.create( + dataset=self.sf, + initial_centers=sf_init, + max_iterations=self.max_iter, + verbose=False, + ) ## Good initial centers sf_init = make_clustering_data(n=10, d=self.dim, seed=43) - ftrs = ['float0', 'float1', 'dict0'] # exclude int feature because these *are* changed. - - m = tc.kmeans.create(self.sf, features=ftrs, initial_centers=sf_init, - max_iterations=0, verbose=False) + ftrs = [ + "float0", + "float1", + "dict0", + ] # exclude int feature because these *are* changed. + + m = tc.kmeans.create( + self.sf, + features=ftrs, + initial_centers=sf_init, + max_iterations=0, + verbose=False, + ) model_init_centers = m.cluster_info assert_sframe_equal(sf_init[ftrs], model_init_centers[ftrs]) @@ -458,21 +531,26 @@ def test_random_initial_centers(self): Make sure randomly initialized cluster centers work correctly. """ ## Check the basics - m = tc.kmeans.create(self.sf, num_clusters=self.K, max_iterations=0, - verbose=False) + m = tc.kmeans.create( + self.sf, num_clusters=self.K, max_iterations=0, verbose=False + ) self.assertEqual(m.cluster_id.num_rows(), self.n) - self.assertItemsEqual(m.cluster_id.column_names(), ['row_id', 'cluster_id', 'distance']) - self.assertItemsEqual(m.cluster_id['cluster_id'].unique(), range(10)) + self.assertItemsEqual( + m.cluster_id.column_names(), ["row_id", "cluster_id", "distance"] + ) + self.assertItemsEqual(m.cluster_id["cluster_id"].unique(), range(10)) - self.assertItemsEqual(m.cluster_info['cluster_id'], range(10)) - self.assertTrue((m.cluster_info['size'] > 0).all()) + self.assertItemsEqual(m.cluster_info["cluster_id"], range(10)) + self.assertTrue((m.cluster_info["size"] > 0).all()) self.assertEqual(m.cluster_info.num_rows(), self.K) - self.assertItemsEqual(m.cluster_info.column_names(), - self.sf.column_names() + ['cluster_id', 'size', 'sum_squared_distance']) + self.assertItemsEqual( + m.cluster_info.column_names(), + self.sf.column_names() + ["cluster_id", "size", "sum_squared_distance"], + ) self.assertEqual(m.training_iterations, 0) - self.assertGreaterEqual(m.training_time, 0.) + self.assertGreaterEqual(m.training_time, 0.0) self.assertEqual(m.num_clusters, self.K) @@ -495,19 +573,19 @@ def test_extreme_cluster_numbers(self): m = tc.kmeans.create(self.sf, num_clusters=1, verbose=False) self.assertEqual(m.cluster_info.num_rows(), 1) - self.assertEqual(m.cluster_info['cluster_id'][0], 0) - self.assertEqual(m.cluster_info['size'][0], self.n) - self.assertTrue(all(m.cluster_id['cluster_id'] == 0)) + self.assertEqual(m.cluster_info["cluster_id"][0], 0) + self.assertEqual(m.cluster_info["size"][0], self.n) + self.assertTrue(all(m.cluster_id["cluster_id"] == 0)) ## Check output if there is a cluster for each point. m = tc.kmeans.create(self.sf, num_clusters=self.n, verbose=False) - self.assertItemsEqual(m.cluster_id['cluster_id'], range(self.n)) + self.assertItemsEqual(m.cluster_id["cluster_id"], range(self.n)) - self.assertTrue(all(m.cluster_id['distance'] < 1e-12)) - self.assertItemsEqual(m.cluster_info['cluster_id'], range(self.n)) - self.assertTrue(all(m.cluster_info['size'] == 1)) - self.assertTrue(all(m.cluster_info['sum_squared_distance'] < 1e-12)) + self.assertTrue(all(m.cluster_id["distance"] < 1e-12)) + self.assertItemsEqual(m.cluster_info["cluster_id"], range(self.n)) + self.assertTrue(all(m.cluster_info["size"] == 1)) + self.assertTrue(all(m.cluster_info["sum_squared_distance"] < 1e-12)) def test_distance_accuracy(self): """ @@ -515,18 +593,22 @@ def test_distance_accuracy(self): a problem in early versions of the tool due to integer casting in the cluster centers. """ - ftrs = ['int0', 'int1', 'float0'] + ftrs = ["int0", "int1", "float0"] ## Create models. - kmeans = tc.kmeans.create(self.sf, features=ftrs, num_clusters=3, - verbose=False) - knn = tc.nearest_neighbors.create(kmeans.cluster_info, features=ftrs, - method='ball_tree', - distance='euclidean', verbose=False) + kmeans = tc.kmeans.create(self.sf, features=ftrs, num_clusters=3, verbose=False) + knn = tc.nearest_neighbors.create( + kmeans.cluster_info, + features=ftrs, + method="ball_tree", + distance="euclidean", + verbose=False, + ) ## Transform int features into floats. - coltype_map = {k: v for k, v in zip(self.sf.column_names(), - self.sf.column_types())} + coltype_map = { + k: v for k, v in zip(self.sf.column_names(), self.sf.column_types()) + } sf_float = tc.SFrame() for ftr in ftrs: if coltype_map[ftr] is int: @@ -537,10 +619,11 @@ def test_distance_accuracy(self): knn_dists = knn.query(sf_float, k=1, radius=None, verbose=False) ## Check that the results are equal. - self.assertTrue((kmeans.cluster_id['row_id'] == knn_dists['query_label']).all()) - self.assertTrue((kmeans.cluster_id['cluster_id'] == \ - knn_dists['reference_label']).all()) - assert_allclose(kmeans.cluster_id['distance'], knn_dists['distance']) + self.assertTrue((kmeans.cluster_id["row_id"] == knn_dists["query_label"]).all()) + self.assertTrue( + (kmeans.cluster_id["cluster_id"] == knn_dists["reference_label"]).all() + ) + assert_allclose(kmeans.cluster_id["distance"], knn_dists["distance"]) def test_predictions(self): """ @@ -560,40 +643,42 @@ def test_predictions(self): yhat = kmeans.predict(sf_train) assert_sframe_equal(sf_train, sf_train_copy) - self.assertTrue((yhat == kmeans.cluster_id['cluster_id']).all()) - + self.assertTrue((yhat == kmeans.cluster_id["cluster_id"]).all()) ## Check internal consistency for prediction distances. - yhat_dists = kmeans.predict(sf_train, output_type='distance') - assert_allclose(yhat_dists, kmeans.cluster_id['distance'], rtol=1e-6) - + yhat_dists = kmeans.predict(sf_train, output_type="distance") + assert_allclose(yhat_dists, kmeans.cluster_id["distance"], rtol=1e-6) ## Check consistency with nearest neighbors. # get the predictions from the model and combine into a single SFrame. - ystar_labels = kmeans.predict(sf_predict, output_type='cluster_id') - ystar_dists = kmeans.predict(sf_predict, output_type='distance') + ystar_labels = kmeans.predict(sf_predict, output_type="cluster_id") + ystar_dists = kmeans.predict(sf_predict, output_type="distance") - ystar = tc.SFrame({'cluster_id': ystar_labels, - 'distance': ystar_dists}) - ystar = ystar.add_row_number('row_id') + ystar = tc.SFrame({"cluster_id": ystar_labels, "distance": ystar_dists}) + ystar = ystar.add_row_number("row_id") # convert type of predictions to floats so they match the types of the # centers in the nearest neighbors model. - coltype_map = {k: v for k, v in zip(sf_predict.column_names(), - sf_predict.column_types())} + coltype_map = { + k: v for k, v in zip(sf_predict.column_names(), sf_predict.column_types()) + } for ftr in coltype_map.keys(): if coltype_map[ftr] is int: sf_predict[ftr] = sf_predict[ftr].astype(float) - knn_model = tc.nearest_neighbors.create(kmeans.cluster_info, - features=kmeans.features, - distance='euclidean', - method='ball_tree') + knn_model = tc.nearest_neighbors.create( + kmeans.cluster_info, + features=kmeans.features, + distance="euclidean", + method="ball_tree", + ) knn_dists = knn_model.query(sf_predict, k=1, radius=None) - assert_sframe_equal(ystar[['row_id', 'cluster_id']], - knn_dists[['query_label', 'reference_label']], - check_column_names=False) + assert_sframe_equal( + ystar[["row_id", "cluster_id"]], + knn_dists[["query_label", "reference_label"]], + check_column_names=False, + ) - assert_allclose(ystar['distance'], knn_dists['distance'], rtol=1e-6) + assert_allclose(ystar["distance"], knn_dists["distance"], rtol=1e-6) diff --git a/src/python/turicreate/test/test_knn_classifier.py b/src/python/turicreate/test/test_knn_classifier.py index 0a9e6d54e7..26a9aa694a 100644 --- a/src/python/turicreate/test/test_knn_classifier.py +++ b/src/python/turicreate/test/test_knn_classifier.py @@ -16,6 +16,7 @@ from turicreate.toolkits._main import ToolkitError import sys + if sys.version_info.major == 3: unittest.TestCase.assertItemsEqual = unittest.TestCase.assertCountEqual @@ -34,33 +35,33 @@ def make_classifier_data(n, d, seed=None): # integer and float columns for i in range(d): - sf['int{}'.format(i)] = np.random.randint(low=-10, high=10, size=n) + sf["int{}".format(i)] = np.random.randint(low=-10, high=10, size=n) for i in range(d): v = np.random.rand(n) - sf['float{}'.format(i)] = v * 20 - 10 + sf["float{}".format(i)] = v * 20 - 10 # array column array_feature = [] for i in range(n): - array_feature.append(array.array('f', np.random.rand(d))) - sf['array0'] = array_feature + array_feature.append(array.array("f", np.random.rand(d))) + sf["array0"] = array_feature # string and dict columns for i in range(d + 1): - sf['str{}'.format(i)] = test_util.uniform_string_column(n, word_length=5, - alphabet_size=5, - missingness=0.) + sf["str{}".format(i)] = test_util.uniform_string_column( + n, word_length=5, alphabet_size=5, missingness=0.0 + ) - sf['dict0'] = tc.text_analytics.count_ngrams(sf['str{}'.format(d)], - n=3, method='character', - to_lower=False) - sf.remove_column('str{}'.format(d), inplace=True) + sf["dict0"] = tc.text_analytics.count_ngrams( + sf["str{}".format(d)], n=3, method="character", to_lower=False + ) + sf.remove_column("str{}".format(d), inplace=True) # target column (string) - sf['class'] = test_util.uniform_string_column(n, word_length=1, - alphabet_size=3, - missingness=0.) + sf["class"] = test_util.uniform_string_column( + n, word_length=1, alphabet_size=3, missingness=0.0 + ) return sf @@ -77,10 +78,12 @@ class KnnClassifierCreateTest(unittest.TestCase): def setUpClass(self): self.sf = make_classifier_data(n=100, d=2, seed=19) self.verbose = False - self.distance = [[['int0', 'int1', 'float0', 'float1', 'array0'], 'euclidean', 1], - [['int0', 'int1'], 'manhattan', 1.5], - [['str0'], 'levenshtein', 2], - [['dict0'], 'weighted_jaccard', 1.3]] + self.distance = [ + [["int0", "int1", "float0", "float1", "array0"], "euclidean", 1], + [["int0", "int1"], "manhattan", 1.5], + [["str0"], "levenshtein", 2], + [["dict0"], "weighted_jaccard", 1.3], + ] def test_input_mutations(self): """ @@ -93,9 +96,9 @@ def test_input_mutations(self): verbose = self.verbose ## Create a model with the copied objects - m = tc.nearest_neighbor_classifier.create(sf, target='class', - distance=distance, - verbose=self.verbose) + m = tc.nearest_neighbor_classifier.create( + sf, target="class", distance=distance, verbose=self.verbose + ) ## Check that the copies still equal the originals assert_sframe_equal(sf, self.sf) @@ -109,21 +112,21 @@ def test_bad_data(self): ## Empty SFrame with self.assertRaises(ToolkitError): - m = tc.nearest_neighbor_classifier.create(dataset=tc.SFrame(), - target='class', - verbose=self.verbose) + m = tc.nearest_neighbor_classifier.create( + dataset=tc.SFrame(), target="class", verbose=self.verbose + ) ## SArray for the features with self.assertRaises(ToolkitError): - m = tc.nearest_neighbor_classifier.create(dataset=self.sf['int0'], - target='class', - verbose=self.verbose) + m = tc.nearest_neighbor_classifier.create( + dataset=self.sf["int0"], target="class", verbose=self.verbose + ) ## SArray for the target with self.assertRaises(ToolkitError): - m = tc.nearest_neighbor_classifier.create(dataset=self.sf, - target=self.sf['class'], - verbose=self.verbose) + m = tc.nearest_neighbor_classifier.create( + dataset=self.sf, target=self.sf["class"], verbose=self.verbose + ) def test_distances(self): """ @@ -132,94 +135,125 @@ def test_distances(self): is specified. """ - numeric_features = ['int0', 'int1', 'float0', 'float1'] - array_features = ['array0'] - string_features = ['str0'] - dict_features = ['dict0'] + numeric_features = ["int0", "int1", "float0", "float1"] + array_features = ["array0"] + string_features = ["str0"] + dict_features = ["dict0"] ## Numeric standard distances should work for numeric columns - for d in ['euclidean', 'squared_euclidean', 'manhattan', 'cosine', - 'transformed_dot_product']: + for d in [ + "euclidean", + "squared_euclidean", + "manhattan", + "cosine", + "transformed_dot_product", + ]: try: - m = tc.nearest_neighbor_classifier.create(self.sf, target='class', - features=numeric_features, - distance=d, - verbose=False) + m = tc.nearest_neighbor_classifier.create( + self.sf, + target="class", + features=numeric_features, + distance=d, + verbose=False, + ) except: assert False, "Standard distance {} failed.".format(d) - ## Numeric standard distances should work for array columns - for d in ['euclidean', 'squared_euclidean', 'manhattan', 'cosine', - 'transformed_dot_product']: + for d in [ + "euclidean", + "squared_euclidean", + "manhattan", + "cosine", + "transformed_dot_product", + ]: try: - m = tc.nearest_neighbor_classifier.create(self.sf, target='class', - features=array_features, - distance=d, - verbose=False) + m = tc.nearest_neighbor_classifier.create( + self.sf, + target="class", + features=array_features, + distance=d, + verbose=False, + ) except: assert False, "Standard distance {} failed.".format(d) - ## String standard distances should work. - for d in ['levenshtein']: + for d in ["levenshtein"]: try: - m = tc.nearest_neighbor_classifier.create(self.sf, target='class', - features=string_features, - distance=d, - verbose=False) + m = tc.nearest_neighbor_classifier.create( + self.sf, + target="class", + features=string_features, + distance=d, + verbose=False, + ) except: assert False, "Standard distance {} failed.".format(d) - ## Dictionary standard distances should work. - for d in ['jaccard', 'weighted_jaccard', 'cosine', 'transformed_dot_product']: + for d in ["jaccard", "weighted_jaccard", "cosine", "transformed_dot_product"]: try: - m = tc.nearest_neighbor_classifier.create(self.sf, target='class', - features=dict_features, - distance=d, - verbose=False) + m = tc.nearest_neighbor_classifier.create( + self.sf, + target="class", + features=dict_features, + distance=d, + verbose=False, + ) except: assert False, "Standard distance {} failed.".format(d) - # Bogus distance strings should not work. with self.assertRaises(ValueError): - m = tc.nearest_neighbor_classifier.create(self.sf, target='class', - features=numeric_features, - distance='fossa', - verbose=False) - + m = tc.nearest_neighbor_classifier.create( + self.sf, + target="class", + features=numeric_features, + distance="fossa", + verbose=False, + ) # Nonsensical combinations of feature types and distances should fail. with self.assertRaises(ValueError): - m = tc.nearest_neighbor_classifier.create(self.sf, target='class', - features=numeric_features, - distance='levenshtein', - verbose=False) + m = tc.nearest_neighbor_classifier.create( + self.sf, + target="class", + features=numeric_features, + distance="levenshtein", + verbose=False, + ) with self.assertRaises(ToolkitError): - m = tc.nearest_neighbor_classifier.create(self.sf, target='class', - features=dict_features, - distance='levenshtein', - verbose=False) + m = tc.nearest_neighbor_classifier.create( + self.sf, + target="class", + features=dict_features, + distance="levenshtein", + verbose=False, + ) with self.assertRaises(ToolkitError): - m = tc.nearest_neighbor_classifier.create(self.sf, target='class', - features=string_features, - distance='euclidean', - verbose=False) - + m = tc.nearest_neighbor_classifier.create( + self.sf, + target="class", + features=string_features, + distance="euclidean", + verbose=False, + ) # If no distance is specified, the automatic distance construction # should kick in and be correct. - correct_dist = [[['str0'], 'levenshtein', 1], - [['str1'], 'levenshtein', 1], - [['dict0'], 'weighted_jaccard', 1], - [['int0', 'int1', 'float0', 'float1', 'array0'], 'euclidean', 5]] - - m = tc.nearest_neighbor_classifier.create(self.sf, target='class', - verbose=False) + correct_dist = [ + [["str0"], "levenshtein", 1], + [["str1"], "levenshtein", 1], + [["dict0"], "weighted_jaccard", 1], + [["int0", "int1", "float0", "float1", "array0"], "euclidean", 5], + ] + + m = tc.nearest_neighbor_classifier.create( + self.sf, target="class", verbose=False + ) self.assertEqual(m.distance, correct_dist) def test_features(self): @@ -232,53 +266,62 @@ def test_features(self): ## Default is to get all features from the SFrame if 'features' is not # specified and if the distance parameter is not a composite distance. - numeric_features = ['int0', 'int1', 'float0', 'float1', 'array0'] - m = tc.nearest_neighbor_classifier.create(self.sf[numeric_features + ['class']], - target='class', - distance='euclidean', - verbose=False) + numeric_features = ["int0", "int1", "float0", "float1", "array0"] + m = tc.nearest_neighbor_classifier.create( + self.sf[numeric_features + ["class"]], + target="class", + distance="euclidean", + verbose=False, + ) self.assertEqual(m.num_features, 5) self.assertItemsEqual(m.features, numeric_features) - ## 'features' parameter applies if custom distance not provided. - m = tc.nearest_neighbor_classifier.create(self.sf, target='class', - features=numeric_features, - distance='euclidean', - verbose=False) + m = tc.nearest_neighbor_classifier.create( + self.sf, + target="class", + features=numeric_features, + distance="euclidean", + verbose=False, + ) self.assertEqual(m.num_features, 5) self.assertItemsEqual(m.features, numeric_features) - ## Composite distance features should override the 'features' parameter. - m = tc.nearest_neighbor_classifier.create(self.sf, target='class', - features=['str0', 'dict0'], - distance=self.distance, - verbose=False) + m = tc.nearest_neighbor_classifier.create( + self.sf, + target="class", + features=["str0", "dict0"], + distance=self.distance, + verbose=False, + ) self.assertEqual(m.num_features, 7) - self.assertItemsEqual(m.features, - ['int0', 'int1', 'float0', 'float1', 'array0', 'str0', 'dict0']) - + self.assertItemsEqual( + m.features, ["int0", "int1", "float0", "float1", "array0", "str0", "dict0"] + ) ## Make sure the target is removed from the 'features' parameter. - m = tc.nearest_neighbor_classifier.create(self.sf, target='class', - features=numeric_features + ['class'], - distance='euclidean', - verbose=False) + m = tc.nearest_neighbor_classifier.create( + self.sf, + target="class", + features=numeric_features + ["class"], + distance="euclidean", + verbose=False, + ) self.assertEqual(m.num_features, 5) self.assertItemsEqual(m.features, numeric_features) - ## Make sure the target is removed from the composite distance features. distance = copy.deepcopy(self.distance) - distance[2][0].append('class') - m = tc.nearest_neighbor_classifier.create(self.sf, target='class', - distance=distance, - verbose=False) + distance[2][0].append("class") + m = tc.nearest_neighbor_classifier.create( + self.sf, target="class", distance=distance, verbose=False + ) self.assertEqual(m.num_features, 7) - self.assertItemsEqual(m.features, - ['int0', 'int1', 'float0', 'float1', 'array0', 'str0', 'dict0']) + self.assertItemsEqual( + m.features, ["int0", "int1", "float0", "float1", "array0", "str0", "dict0"] + ) def test_backward_compatibility(self): """ @@ -302,30 +345,34 @@ def setUp(self): sf = make_classifier_data(n=self.n, d=2, seed=19) ## Make the model - self.distance = [[['int0', 'int1', 'float0', 'float1', 'array0'], 'euclidean', 1], - [['int0', 'int1'], 'manhattan', 1.5], - [['str0'], 'levenshtein', 2], - [['dict0'], 'weighted_jaccard', 1.3]] + self.distance = [ + [["int0", "int1", "float0", "float1", "array0"], "euclidean", 1], + [["int0", "int1"], "manhattan", 1.5], + [["str0"], "levenshtein", 2], + [["dict0"], "weighted_jaccard", 1.3], + ] - self.model = tc.nearest_neighbor_classifier.create(sf, target='class', - distance=self.distance, - verbose=False) + self.model = tc.nearest_neighbor_classifier.create( + sf, target="class", distance=self.distance, verbose=False + ) def test__list_fields(self): """ Check the model _list_fields method. """ - correct_fields = ['distance', - 'num_distance_components', - 'verbose', - 'num_features', - 'training_time', - 'num_unpacked_features', - 'num_examples', - 'features', - 'target', - 'num_classes', - '_target_type'] + correct_fields = [ + "distance", + "num_distance_components", + "verbose", + "num_features", + "training_time", + "num_unpacked_features", + "num_examples", + "features", + "target", + "num_classes", + "_target_type", + ] self.assertItemsEqual(self.model._list_fields(), correct_fields) @@ -334,21 +381,25 @@ def test_get(self): Check the get method against known answers for each field. """ - correct_fields = {'distance': self.distance, - 'num_distance_components': 4, - 'verbose': False, - 'num_features': 7, - 'num_examples': self.n, - 'target': 'class', - 'num_classes': 3} + correct_fields = { + "distance": self.distance, + "num_distance_components": 4, + "verbose": False, + "num_features": 7, + "num_examples": self.n, + "target": "class", + "num_classes": 3, + } for field, ans in correct_fields.items(): self.assertEqual(self.model._get(field), ans, "{} failed".format(field)) self.assertGreater(self.model.training_time, 0) self.assertGreater(self.model.num_unpacked_features, self.n) - self.assertItemsEqual(self.model.features, - ['int0', 'int1', 'float0', 'float1', 'array0', 'str0', 'dict0']) + self.assertItemsEqual( + self.model.features, + ["int0", "int1", "float0", "float1", "array0", "str0", "dict0"], + ) def test_summaries(self): """ @@ -392,7 +443,6 @@ def test_save_and_load(self): del self.model - class KnnClassifierPredictTest(unittest.TestCase): """ Unit test class for correctness of model predictions. @@ -402,15 +452,16 @@ class KnnClassifierPredictTest(unittest.TestCase): def setUpClass(self): self.sf = make_classifier_data(n=100, d=2, seed=19) self.sf_test = make_classifier_data(n=10, d=2, seed=92) - self.distance = [[['int0', 'int1', 'float0', 'float1', 'array0'], 'euclidean', 1], - [['int0', 'int1'], 'manhattan', 1.5], - [['str0'], 'levenshtein', 2], - [['dict0'], 'weighted_jaccard', 1.3]] + self.distance = [ + [["int0", "int1", "float0", "float1", "array0"], "euclidean", 1], + [["int0", "int1"], "manhattan", 1.5], + [["str0"], "levenshtein", 2], + [["dict0"], "weighted_jaccard", 1.3], + ] - self.model = tc.nearest_neighbor_classifier.create(self.sf, - target='class', - distance=self.distance, - verbose=False) + self.model = tc.nearest_neighbor_classifier.create( + self.sf, target="class", distance=self.distance, verbose=False + ) def test_bogus_parameters(self): """ @@ -418,7 +469,7 @@ def test_bogus_parameters(self): """ ## Bogus maximum number of neighbors in each query - for k in [-1, 0, 'fossa']: + for k in [-1, 0, "fossa"]: with self.assertRaises(ValueError): ystar = self.model.predict(self.sf, max_neighbors=k) @@ -428,9 +479,8 @@ def test_bogus_parameters(self): with self.assertRaises(ValueError): ystar = self.model.predict_topk(self.sf, max_neighbors=k) - ## Bogus neighborhood radius - for r in [-1, 'cat']: + for r in [-1, "cat"]: with self.assertRaises(ValueError): ystar = self.model.predict(self.sf, radius=r) @@ -440,13 +490,11 @@ def test_bogus_parameters(self): with self.assertRaises(ValueError): ystar = self.model.predict_topk(self.sf, radius=r) - ## Bogus number of results to return (for predict_topk only) - for k in [-1, 0, 'fossa']: + for k in [-1, 0, "fossa"]: with self.assertRaises(TypeError): ystar = self.model.predict_topk(self.sf, k=k) - ## Empty prediction dataset with self.assertRaises(ToolkitError): ystar = self.model.predict(tc.SFrame()) @@ -466,38 +514,44 @@ def test_classify(self): ystar = self.model.classify(self.sf[:1], verbose=False) self.assertIsInstance(ystar, tc.SFrame) - self.assertItemsEqual(ystar.column_names(), ['class', 'probability']) + self.assertItemsEqual(ystar.column_names(), ["class", "probability"]) self.assertItemsEqual(ystar.column_types(), [str, float]) ## Test plausibility of probability output. - self.assertTrue(all(ystar['probability'] >= 0)) - self.assertTrue(all(ystar['probability'] <= 1)) + self.assertTrue(all(ystar["probability"] >= 0)) + self.assertTrue(all(ystar["probability"] <= 1)) ## Check that classify on the training data with a single neighbor # returns the training point. ystar = self.model.classify(self.sf, max_neighbors=1, verbose=False) - self.assertTrue((ystar['class'] == self.sf['class']).all()) - self.assertTrue(all(ystar['probability'] == 1)) + self.assertTrue((ystar["class"] == self.sf["class"]).all()) + self.assertTrue(all(ystar["probability"] == 1)) ## Check that classifying test data with a small radius returns None # values if there are no qualified neighbors. - ystar = self.model.classify(self.sf_test, max_neighbors=None, - radius=1e-6, verbose=False) + ystar = self.model.classify( + self.sf_test, max_neighbors=None, radius=1e-6, verbose=False + ) - self.assertTrue(all(ystar['class'] == None)) - self.assertTrue(all(ystar['probability'] == None)) + self.assertTrue(all(ystar["class"] == None)) + self.assertTrue(all(ystar["probability"] == None)) ## Check that the results are correct if *some*, but not *all* of the # test points have no qualified neighbors in the training set. - ystar = self.model.classify(self.sf_test, max_neighbors=None, radius=15., - verbose=False) + ystar = self.model.classify( + self.sf_test, max_neighbors=None, radius=15.0, verbose=False + ) self.assertItemsEqual(ystar.column_types(), [str, float]) - self.assertTrue(ystar['class'].countna() > 0 and - ystar['class'].countna() < self.sf_test.num_rows()) - self.assertTrue(ystar['probability'].countna() > 0 and - ystar['probability'].countna() < self.sf_test.num_rows()) + self.assertTrue( + ystar["class"].countna() > 0 + and ystar["class"].countna() < self.sf_test.num_rows() + ) + self.assertTrue( + ystar["probability"].countna() > 0 + and ystar["probability"].countna() < self.sf_test.num_rows() + ) def test_predict(self): """ @@ -509,12 +563,12 @@ def test_predict(self): self.assertEqual(type(ystar), tc.SArray) ## Test type of output is correct. - ystar = self.model.predict(self.sf[:1], output_type='class', - verbose=False) + ystar = self.model.predict(self.sf[:1], output_type="class", verbose=False) self.assertIs(ystar.dtype, str) - ystar = self.model.predict(self.sf[:1], output_type='probability', - verbose=False) + ystar = self.model.predict( + self.sf[:1], output_type="probability", verbose=False + ) self.assertIs(ystar.dtype, float) def test_predict_topk(self): @@ -526,12 +580,12 @@ def test_predict_topk(self): topk = self.model.predict_topk(self.sf[:1], verbose=False) self.assertIsInstance(topk, tc.SFrame) - self.assertItemsEqual(topk.column_names(), ['row_id', 'class', 'probability']) + self.assertItemsEqual(topk.column_names(), ["row_id", "class", "probability"]) self.assertItemsEqual(topk.column_types(), [int, str, float]) ## Test plausibility of probability output. - self.assertTrue(all(topk['probability'] >= 0)) - self.assertTrue(all(topk['probability'] <= 1)) + self.assertTrue(all(topk["probability"] >= 0)) + self.assertTrue(all(topk["probability"] <= 1)) ## Test that k = 1 returns the same top probabilities as model.classify. # Note, the existence of ties means it's impossible to check for the @@ -541,20 +595,20 @@ def test_predict_topk(self): # TODO @papayawarrior -- this assertion fails now that SArray # comparison is meaningful. Please fix the test and uncomment. - #self.assertTrue((topk['probability'] == ystar['probability']).all()) + # self.assertTrue((topk['probability'] == ystar['probability']).all()) ## Check that k results are returned for each query. topk = self.model.predict_topk(self.sf, k=2, verbose=False) - counts = topk.groupby('row_id', tc.aggregate.COUNT) + counts = topk.groupby("row_id", tc.aggregate.COUNT) self.assertEqual(counts.num_rows(), self.sf.num_rows()) - self.assertTrue(all(counts['Count'] == 2)) + self.assertTrue(all(counts["Count"] == 2)) ## Check that 3 results are returned for each query even if k is bigger. topk = self.model.predict_topk(self.sf, k=100, verbose=False) - counts = topk.groupby('row_id', tc.aggregate.COUNT) + counts = topk.groupby("row_id", tc.aggregate.COUNT) - num_classes = len(self.sf['class'].unique()) - self.assertTrue(all(counts['Count'] <= num_classes)) + num_classes = len(self.sf["class"].unique()) + self.assertTrue(all(counts["Count"] <= num_classes)) self.assertEqual(counts.num_rows(), self.sf.num_rows()) def test_evaluate(self): @@ -565,38 +619,42 @@ def test_evaluate(self): ans = self.model.evaluate(self.sf) ## Check that the right keys are present in the results dict. - self.assertItemsEqual(ans.keys(), ['accuracy', 'confusion_matrix']) + self.assertItemsEqual(ans.keys(), ["accuracy", "confusion_matrix"]) ## Check plausibility of the accuracy. - self.assertIsInstance(ans['accuracy'], float) - self.assertTrue(ans['accuracy'] >= 0 and ans['accuracy'] <= 1) + self.assertIsInstance(ans["accuracy"], float) + self.assertTrue(ans["accuracy"] >= 0 and ans["accuracy"] <= 1) ## Check plausibility of the confusion matrix. - self.assertIsInstance(ans['confusion_matrix'], tc.SFrame) - self.assertEqual(ans['confusion_matrix'].num_columns(), 3) - self.assertEqual(len(ans['confusion_matrix']['target_label'].unique()), 3) - self.assertEqual(ans['confusion_matrix']['count'].sum(), 100) + self.assertIsInstance(ans["confusion_matrix"], tc.SFrame) + self.assertEqual(ans["confusion_matrix"].num_columns(), 3) + self.assertEqual(len(ans["confusion_matrix"]["target_label"].unique()), 3) + self.assertEqual(ans["confusion_matrix"]["count"].sum(), 100) ## Check the values and types of the confusion matrix of *none* of the # test points have *any* qualified neighbors in the training set. evals = self.model.evaluate(self.sf_test, max_neighbors=None, radius=1e-6) - acc = evals['accuracy'] - cf_mat = evals['confusion_matrix'] + acc = evals["accuracy"] + cf_mat = evals["confusion_matrix"] - self.assertTrue(acc == 0.) - self.assertTrue(all(cf_mat['target_label'] != None)) - self.assertTrue(all(cf_mat['predicted_label'] == None)) + self.assertTrue(acc == 0.0) + self.assertTrue(all(cf_mat["target_label"] != None)) + self.assertTrue(all(cf_mat["predicted_label"] == None)) self.assertItemsEqual(cf_mat.column_types(), [str, float, int]) - self.assertEqual(cf_mat['count'].sum(), self.sf_test.num_rows()) + self.assertEqual(cf_mat["count"].sum(), self.sf_test.num_rows()) ## Check that the results are correct if *some*, but not *all* of the # test points have no qualified neighbors in the training set. - evals = self.model.evaluate(self.sf_test, max_neighbors=None, radius=15.) - acc = evals['accuracy'] - cf_mat = evals['confusion_matrix'] + evals = self.model.evaluate(self.sf_test, max_neighbors=None, radius=15.0) + acc = evals["accuracy"] + cf_mat = evals["confusion_matrix"] - self.assertTrue(acc >= 0. and acc <= 1.) + self.assertTrue(acc >= 0.0 and acc <= 1.0) self.assertItemsEqual(cf_mat.column_types(), [str, str, int]) - self.assertItemsEqual(cf_mat['target_label'].unique(), self.sf_test['class'].unique()) - self.assertTrue(cf_mat['predicted_label'].countna() > 0 and - cf_mat['predicted_label'].countna() < self.sf_test.num_rows()) + self.assertItemsEqual( + cf_mat["target_label"].unique(), self.sf_test["class"].unique() + ) + self.assertTrue( + cf_mat["predicted_label"].countna() > 0 + and cf_mat["predicted_label"].countna() < self.sf_test.num_rows() + ) diff --git a/src/python/turicreate/test/test_lambda.py b/src/python/turicreate/test/test_lambda.py index 3843729f9a..68a54b1862 100644 --- a/src/python/turicreate/test/test_lambda.py +++ b/src/python/turicreate/test/test_lambda.py @@ -23,39 +23,54 @@ def fib(i): class LambdaTests(unittest.TestCase): - def test_simple_evaluation(self): x = 3 self.assertEqual(glconnect.get_unity().eval_lambda(lambda y: y + x, 0), 3) self.assertEqual(glconnect.get_unity().eval_lambda(lambda y: y + x, 1), 4) - self.assertEqual(glconnect.get_unity().eval_lambda(lambda x: x.upper(), 'abc'), 'ABC') - self.assertEqual(glconnect.get_unity().eval_lambda(lambda x: x.lower(), 'ABC'), 'abc') + self.assertEqual( + glconnect.get_unity().eval_lambda(lambda x: x.upper(), "abc"), "ABC" + ) + self.assertEqual( + glconnect.get_unity().eval_lambda(lambda x: x.lower(), "ABC"), "abc" + ) self.assertEqual(glconnect.get_unity().eval_lambda(fib, 1), 1) def test_exception(self): x = 3 - self.assertRaises(RuntimeError, glconnect.get_unity().eval_lambda, lambda y: x / y, 0) - self.assertRaises(RuntimeError, glconnect.get_unity().parallel_eval_lambda, lambda y: x / y, [0 for i in range(10)]) + self.assertRaises( + RuntimeError, glconnect.get_unity().eval_lambda, lambda y: x / y, 0 + ) + self.assertRaises( + RuntimeError, + glconnect.get_unity().parallel_eval_lambda, + lambda y: x / y, + [0 for i in range(10)], + ) def test_parallel_evaluation(self): xin = 33 repeat = 8 # execute the task bulk using one process to get a baseline start_time = time.time() - glconnect.get_unity().eval_lambda(lambda x: [fib(i) for i in x], [xin]*repeat) + glconnect.get_unity().eval_lambda(lambda x: [fib(i) for i in x], [xin] * repeat) single_thread_time = time.time() - start_time logging.info("Single thread lambda eval takes %s secs" % single_thread_time) # execute the task in parallel start_time = time.time() - ans_list = glconnect.get_unity().parallel_eval_lambda(lambda x: fib(x), [xin]*repeat) + ans_list = glconnect.get_unity().parallel_eval_lambda( + lambda x: fib(x), [xin] * repeat + ) multi_thread_time = time.time() - start_time logging.info("Multi thread lambda eval takes %s secs" % multi_thread_time) # test the speed up by running in parallel nproc = multiprocessing.cpu_count() - if (nproc > 1 and multi_thread_time > (single_thread_time / 1.5)): - logging.warning("Slow parallel processing: single thread takes %s secs, multithread on %s procs takes %s secs" % (single_thread_time, nproc, multi_thread_time)) + if nproc > 1 and multi_thread_time > (single_thread_time / 1.5): + logging.warning( + "Slow parallel processing: single thread takes %s secs, multithread on %s procs takes %s secs" + % (single_thread_time, nproc, multi_thread_time) + ) # test accuracy ans = fib(xin) @@ -63,7 +78,6 @@ def test_parallel_evaluation(self): self.assertEqual(a, ans) def test_environments(self): - def test_env(i): if i > 500: @@ -75,31 +89,38 @@ def test_env(i): return i - import turicreate as tc + x = tc.SArray(range(10000)) y = x.apply(test_env) - self.assertTrue( (x == y).all() ) - + self.assertTrue((x == y).all()) @unittest.skip("Disabling crash recovery test") def test_crash_recovery(self): import time, sys + ls = range(1000) def good_fun(x): return x def bad_fun(x): - if (x+1) % 251 == 0: + if (x + 1) % 251 == 0: cy_test_utils.force_exit_fun() # this will force the worker process to exit return x - self.assertRaises(RuntimeError, lambda: glconnect.get_unity().parallel_eval_lambda(lambda x: bad_fun(x), ls)) - glconnect.get_unity().parallel_eval_lambda(lambda x: good_fun(x), ls) + self.assertRaises( + RuntimeError, + lambda: glconnect.get_unity().parallel_eval_lambda( + lambda x: bad_fun(x), ls + ), + ) + glconnect.get_unity().parallel_eval_lambda(lambda x: good_fun(x), ls) - @unittest.skip("Disabling test as previous runs of this can mess up import. Reenamble when lambda workers can be reliably restarted.") + @unittest.skip( + "Disabling test as previous runs of this can mess up import. Reenamble when lambda workers can be reliably restarted." + ) def test_expensive_packages_not_imported_in_lambda(self): import turicreate as tc @@ -113,10 +134,10 @@ def lambda_func(x): if x >= 1000: for p in expensive_packages: - assert p not in sys.modules + assert p not in sys.modules return x + 1 x = tc.SArray(range(2000)).apply(lambda_func) - self.assertTrue( (x == tc.SArray(range(1, 2001))).all() ) + self.assertTrue((x == tc.SArray(range(1, 2001))).all()) diff --git a/src/python/turicreate/test/test_linear_regression.py b/src/python/turicreate/test/test_linear_regression.py index 2904b2deeb..bf85c7d993 100644 --- a/src/python/turicreate/test/test_linear_regression.py +++ b/src/python/turicreate/test/test_linear_regression.py @@ -47,12 +47,10 @@ def setUpClass(self): self.sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) target = np.random.rand(n) - self.sf['target'] = target - + self.sf["target"] = target ## Compute the correct answers with statsmodels - formula = 'target ~ ' + \ - ' + '.join(['X{}'.format(i) for i in range(1, d+1)]) + formula = "target ~ " + " + ".join(["X{}".format(i) for i in range(1, d + 1)]) df = self.sf.to_dataframe() sm_model = sm.ols(formula, data=df).fit() @@ -66,65 +64,73 @@ def setUpClass(self): ## Create the turicreate model self.def_kwargs = _DEFAULT_SOLVER_OPTIONS - self.solver = 'auto' - self.unpacked_features = ['X{}'.format(i) for i in range(1, d+1)] - self.features = ['X{}'.format(i) for i in range(1, d+1)] - self.target = 'target' - - self.def_opts = dict(list(self.def_kwargs.items()) + list({'solver' : 'auto', - 'feature_rescaling' : True, - 'l1_penalty' : 0, - 'l2_penalty': 1e-2}.items())) + self.solver = "auto" + self.unpacked_features = ["X{}".format(i) for i in range(1, d + 1)] + self.features = ["X{}".format(i) for i in range(1, d + 1)] + self.target = "target" + + self.def_opts = dict( + list(self.def_kwargs.items()) + + list( + { + "solver": "auto", + "feature_rescaling": True, + "l1_penalty": 0, + "l2_penalty": 1e-2, + }.items() + ) + ) self.opts = self.def_opts.copy() - self.opts['l2_penalty'] = 0.0 - self.opts['solver'] = 'newton' - self.model = tc.linear_regression.create(self.sf, - target=self.target, - features=None, - l2_penalty = 0.0, - l1_penalty = 0.0, - feature_rescaling = True, - validation_set = None, - solver = self.solver) - + self.opts["l2_penalty"] = 0.0 + self.opts["solver"] = "newton" + self.model = tc.linear_regression.create( + self.sf, + target=self.target, + features=None, + l2_penalty=0.0, + l1_penalty=0.0, + feature_rescaling=True, + validation_set=None, + solver=self.solver, + ) # self.maxerr = abs(np.array(y) - self.predict_ans).max() self.evaluate_ans = { - 'max_error': lambda x: abs(x - self.maxerr) < 1e-3, - 'evaluate_time': lambda x: x > 0, - 'rmse': lambda x: abs(x - self.rmse) < 1e-3, - } + "max_error": lambda x: abs(x - self.maxerr) < 1e-3, + "evaluate_time": lambda x: x > 0, + "rmse": lambda x: abs(x - self.rmse) < 1e-3, + } # Answers # ------------------------------------------------------------------------ self.get_ans = { - 'coefficients': lambda x: isinstance(x, tc.SFrame), - 'convergence_threshold': lambda x: x == self.opts['convergence_threshold'], - 'features': lambda x: x == self.features, - 'unpacked_features': lambda x: x == self.unpacked_features, - 'feature_rescaling': lambda x: x == self.opts['feature_rescaling'], - 'l1_penalty': lambda x: x == 0.0 , - 'l2_penalty': lambda x: x == 0.0 , - 'lbfgs_memory_level': lambda x: x == self.opts['lbfgs_memory_level'], - 'max_iterations': lambda x: x == self.opts['max_iterations'], - 'num_coefficients': lambda x: x == 11 , - 'num_examples': lambda x: x == 100, - 'num_features': lambda x: x == 10 , - 'num_unpacked_features': lambda x: x == 10 , - 'progress': lambda x: isinstance(x, tc.SFrame), - 'solver': lambda x: x == self.opts['solver'], - 'training_solver_status': lambda x: x == "SUCCESS: Optimal solution found.", - 'step_size': lambda x: x == self.opts['step_size'], - 'target': lambda x: x == self.target, - 'training_iterations': lambda x: x > 0, - 'training_loss': lambda x: abs(x - self.loss) < 1e-5, - 'training_rmse': lambda x: abs(x - self.rmse) < 1e-5, - 'training_time': lambda x: x >= 0, - 'training_max_error': lambda x: x > 0, - 'validation_data': lambda x: isinstance(x, tc.SFrame) and len(x) == 0, - 'disable_posttrain_evaluation' : lambda x: x == False, - } + "coefficients": lambda x: isinstance(x, tc.SFrame), + "convergence_threshold": lambda x: x == self.opts["convergence_threshold"], + "features": lambda x: x == self.features, + "unpacked_features": lambda x: x == self.unpacked_features, + "feature_rescaling": lambda x: x == self.opts["feature_rescaling"], + "l1_penalty": lambda x: x == 0.0, + "l2_penalty": lambda x: x == 0.0, + "lbfgs_memory_level": lambda x: x == self.opts["lbfgs_memory_level"], + "max_iterations": lambda x: x == self.opts["max_iterations"], + "num_coefficients": lambda x: x == 11, + "num_examples": lambda x: x == 100, + "num_features": lambda x: x == 10, + "num_unpacked_features": lambda x: x == 10, + "progress": lambda x: isinstance(x, tc.SFrame), + "solver": lambda x: x == self.opts["solver"], + "training_solver_status": lambda x: x == "SUCCESS: Optimal solution found.", + "step_size": lambda x: x == self.opts["step_size"], + "target": lambda x: x == self.target, + "training_iterations": lambda x: x > 0, + "training_loss": lambda x: abs(x - self.loss) < 1e-5, + "training_rmse": lambda x: abs(x - self.rmse) < 1e-5, + "training_time": lambda x: x >= 0, + "training_max_error": lambda x: x > 0, + "validation_data": lambda x: isinstance(x, tc.SFrame) and len(x) == 0, + "disable_posttrain_evaluation": lambda x: x == False, + } self.fields_ans = self.get_ans.keys() def test__list_fields(self): @@ -132,7 +138,7 @@ def test__list_fields(self): Check the _list_fields function. Compare with the answer. """ model = self.model - fields = model._list_fields() + fields = model._list_fields() self.assertEqual(set(fields), set(self.fields_ans)) def test_get(self): @@ -143,23 +149,27 @@ def test_get(self): model = self.model for field in self.fields_ans: ans = model._get(field) - self.assertTrue(self.get_ans[field](ans), \ - '''Get failed in field {}. Output was {}.'''.format(field, ans)) + self.assertTrue( + self.get_ans[field](ans), + """Get failed in field {}. Output was {}.""".format(field, ans), + ) - def test_coefficients(self, test_stderr = True): + def test_coefficients(self, test_stderr=True): """ Check that the coefficient values are very close to the correct values. """ model = self.model coefs = model.coefficients - coef_list = list(coefs['value']) + coef_list = list(coefs["value"]) self.assertTrue(np.allclose(coef_list, self.coef, rtol=1e-03, atol=1e-03)) if test_stderr: - stderr_list = list(coefs['stderr']) - self.assertTrue(np.allclose(stderr_list, self.stderr, rtol=1e-03, atol=1e-03)) + stderr_list = list(coefs["stderr"]) + self.assertTrue( + np.allclose(stderr_list, self.stderr, rtol=1e-03, atol=1e-03) + ) else: - self.assertTrue('stderr' in coefs.column_names()) - self.assertEqual(list(coefs['stderr']), [None for v in coef_list]) + self.assertTrue("stderr" in coefs.column_names()) + self.assertEqual(list(coefs["stderr"]), [None for v in coef_list]) def test_summary(self): """ @@ -174,24 +184,23 @@ def test_repr(self): Check the repr function. """ model = self.model - ans = str(model) + ans = str(model) self.assertEqual(type(ans), str) - def test_predict(self): """ Check the prediction function with precomputed answers. Check that all predictions are atmost 1e-5 away from the true answers. """ model = self.model - ans = model.predict(self.sf) - reduce(op.and_, map(lambda x,y: abs(x-y) < 1e-5, ans, self.yhat)) + ans = model.predict(self.sf) + reduce(op.and_, map(lambda x, y: abs(x - y) < 1e-5, ans, self.yhat)) # Test extra col - self.sf['extra_col'] = 1 - ans = model.predict(self.sf) - reduce(op.and_, map(lambda x,y: abs(x-y) < 1e-5, ans, self.yhat)) - del self.sf['extra_col'] + self.sf["extra_col"] = 1 + ans = model.predict(self.sf) + reduce(op.and_, map(lambda x, y: abs(x - y) < 1e-5, ans, self.yhat)) + del self.sf["extra_col"] def test_evaluate(self): """ @@ -199,24 +208,29 @@ def test_evaluate(self): """ model = self.model ans = model.evaluate(self.sf) + def check_ans(): - for field in ans: - self.assertTrue(self.evaluate_ans[field](ans[field]), \ - '''Evaluation failed in field {}. Output was {}'''.format(\ - field, ans[field])) + for field in ans: + self.assertTrue( + self.evaluate_ans[field](ans[field]), + """Evaluation failed in field {}. Output was {}""".format( + field, ans[field] + ), + ) + check_ans() - rmse = model.evaluate(self.sf, metric = 'rmse') + rmse = model.evaluate(self.sf, metric="rmse") check_ans() - max_error = model.evaluate(self.sf, metric = 'max_error') + max_error = model.evaluate(self.sf, metric="max_error") check_ans() def test_save_and_load(self): """ Make sure saving and loading retains things. """ - filename = 'save_file%s' % (str(uuid.uuid4())) + filename = "save_file%s" % (str(uuid.uuid4())) self.model.save(filename) self.model = tc.load_model(filename) @@ -241,743 +255,827 @@ def test_save_and_load(self): class LinearRegressionCreateTest(unittest.TestCase): - """ + """ Unit test class for testing a Linear Regression create function. """ - @classmethod - def setUpClass(self): - """ + @classmethod + def setUpClass(self): + """ Set up (Run only once) """ - # Simulate test data - np.random.seed(42) - n, d = 100, 10 - self.sf = tc.SFrame() - - for i in range(d): - self.sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) + # Simulate test data + np.random.seed(42) + n, d = 100, 10 + self.sf = tc.SFrame() - target = np.random.rand(n) - self.sf['target'] = target + for i in range(d): + self.sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) + target = np.random.rand(n) + self.sf["target"] = target - ## Compute the correct answers with statsmodels - formula = 'target ~ ' + \ - ' + '.join(['X{}'.format(i) for i in range(1, d+1)]) - df = self.sf.to_dataframe() + ## Compute the correct answers with statsmodels + formula = "target ~ " + " + ".join(["X{}".format(i) for i in range(1, d + 1)]) + df = self.sf.to_dataframe() - sm_model = sm.ols(formula, data=df).fit() + sm_model = sm.ols(formula, data=df).fit() - self.loss = sm_model.ssr # sum of squared residuals - self.coef = list(sm_model.params) - self.stderr = list(sm_model.bse) - self.yhat = list(sm_model.fittedvalues) - self.rmse = np.sqrt(sm_model.ssr / float(n)) - self.maxerr = abs(target - np.array(self.yhat)).max() + self.loss = sm_model.ssr # sum of squared residuals + self.coef = list(sm_model.params) + self.stderr = list(sm_model.bse) + self.yhat = list(sm_model.fittedvalues) + self.rmse = np.sqrt(sm_model.ssr / float(n)) + self.maxerr = abs(target - np.array(self.yhat)).max() - ## Create the turicreate model - self.def_kwargs = _DEFAULT_SOLVER_OPTIONS - self.solver = 'newton' - self.features = ', '.join(['X{}'.format(i) for i in range(1, d+1)]) - self.target = 'target' + ## Create the turicreate model + self.def_kwargs = _DEFAULT_SOLVER_OPTIONS + self.solver = "newton" + self.features = ", ".join(["X{}".format(i) for i in range(1, d + 1)]) + self.target = "target" - def _test_coefficients(self, model, test_case, test_stderr): - """ + def _test_coefficients(self, model, test_case, test_stderr): + """ Check that the coefficient values are very close to the correct values. """ - coefs = model.coefficients - coef_list = list(coefs['value']) - self.assertTrue(np.allclose(coef_list, self.coef, rtol=1e-01, atol=1e-01)) - if test_stderr: - stderr_list = list(coefs['stderr']) - self.assertTrue(np.allclose(stderr_list, self.stderr, rtol=1e-03, atol=1e-03)) - else: - self.assertTrue('stderr' in coefs.column_names()) - self.assertEqual(list(coefs['stderr']), [None for v in coef_list]) + coefs = model.coefficients + coef_list = list(coefs["value"]) + self.assertTrue(np.allclose(coef_list, self.coef, rtol=1e-01, atol=1e-01)) + if test_stderr: + stderr_list = list(coefs["stderr"]) + self.assertTrue( + np.allclose(stderr_list, self.stderr, rtol=1e-03, atol=1e-03) + ) + else: + self.assertTrue("stderr" in coefs.column_names()) + self.assertEqual(list(coefs["stderr"]), [None for v in coef_list]) - """ + """ test linear regression create. """ - def _test_create_no_rescaling(self, sf, target, solver, kwargs): - - model = tc.linear_regression.create(self.sf, - target=self.target, - features=None, - l2_penalty = 0.0, - l1_penalty = 0.0, - solver = solver, - feature_rescaling = False, - validation_set = None, - **kwargs) - - - test_case = 'solver = {solver}, kwargs = {kwargs}'.format(solver = solver, - kwargs = kwargs) - self.assertTrue(model is not None) - self.assertTrue(abs(model.training_rmse - self.rmse) < 0.1, - 'rmse failed: %s' % test_case) - self._test_coefficients(model, test_case, solver == 'newton') + def _test_create_no_rescaling(self, sf, target, solver, kwargs): + + model = tc.linear_regression.create( + self.sf, + target=self.target, + features=None, + l2_penalty=0.0, + l1_penalty=0.0, + solver=solver, + feature_rescaling=False, + validation_set=None, + **kwargs + ) + + test_case = "solver = {solver}, kwargs = {kwargs}".format( + solver=solver, kwargs=kwargs + ) + self.assertTrue(model is not None) + self.assertTrue( + abs(model.training_rmse - self.rmse) < 0.1, "rmse failed: %s" % test_case + ) + self._test_coefficients(model, test_case, solver == "newton") - """ + """ test linear regression create. """ - def _test_create(self, sf, target, solver, kwargs): - - model = tc.linear_regression.create(self.sf, - target=self.target, - features=None, - l2_penalty = 0.0, - l1_penalty = 0.0, - solver = solver, - feature_rescaling = True, - validation_set = None, - **kwargs) - - test_case = 'solver = {solver}, kwargs= {kwargs}'.format(solver = solver, - kwargs = kwargs) - self.assertTrue(model is not None) - self.assertTrue(abs(model.training_rmse - self.rmse) < 0.1, - 'rmse failed: %s' % test_case) - self.assertTrue(abs(model.training_loss - self.loss) < 0.1, - 'loss failed: %s' % test_case) - self._test_coefficients(model, test_case, solver == 'newton') + def _test_create(self, sf, target, solver, kwargs): + + model = tc.linear_regression.create( + self.sf, + target=self.target, + features=None, + l2_penalty=0.0, + l1_penalty=0.0, + solver=solver, + feature_rescaling=True, + validation_set=None, + **kwargs + ) + + test_case = "solver = {solver}, kwargs= {kwargs}".format( + solver=solver, kwargs=kwargs + ) + self.assertTrue(model is not None) + self.assertTrue( + abs(model.training_rmse - self.rmse) < 0.1, "rmse failed: %s" % test_case + ) + self.assertTrue( + abs(model.training_loss - self.loss) < 0.1, "loss failed: %s" % test_case + ) + self._test_coefficients(model, test_case, solver == "newton") - """ + """ Test linear regression create. """ - def test_create(self): - kwargs = self.def_kwargs.copy() - kwargs['convergence_threshold'] = 1e-6 - kwargs['max_iterations'] = 100 - for solver in ['newton', 'fista', 'lbfgs']: - args = (self.sf, self.target, solver, kwargs) - self._test_create(*args) - self._test_create_no_rescaling(*args) + def test_create(self): - """ + kwargs = self.def_kwargs.copy() + kwargs["convergence_threshold"] = 1e-6 + kwargs["max_iterations"] = 100 + for solver in ["newton", "fista", "lbfgs"]: + args = (self.sf, self.target, solver, kwargs) + self._test_create(*args) + self._test_create_no_rescaling(*args) + + """ Test linear regression create. """ - def test_lbfgs(self): - for m in [5,21]: - kwargs = self.def_kwargs.copy() - kwargs.update({'lbfgs_memory_level': m}) - kwargs['max_iterations'] = 100 - args = (self.sf, self.target, 'lbfgs', kwargs) - self._test_create(*args) - self._test_create_no_rescaling(*args) + def test_lbfgs(self): + for m in [5, 21]: + kwargs = self.def_kwargs.copy() + kwargs.update({"lbfgs_memory_level": m}) + kwargs["max_iterations"] = 100 + args = (self.sf, self.target, "lbfgs", kwargs) + self._test_create(*args) + self._test_create_no_rescaling(*args) - """ + """ Test detection of columns that are almost the same. """ - def test_zero_variance_detection(self): - sf = self.sf - try: - sf['error-column'] = 1 - model = tc.linear_regression.create(sf, self.target) - except ToolkitError: - pass - try: - sf['error-column'] = '1' - model = tc.linear_regression.create(sf, self.target) - except ToolkitError: - pass - try: - sf['error-column'] = [[1] for i in sf] - model = tc.linear_regression.create(sf, self.target) - except ToolkitError: - pass - try: - sf['error-column'] = [{1:1} for i in sf] - model = tc.linear_regression.create(sf, self.target) - except ToolkitError: - pass - del sf['error-column'] - """ + def test_zero_variance_detection(self): + sf = self.sf + try: + sf["error-column"] = 1 + model = tc.linear_regression.create(sf, self.target) + except ToolkitError: + pass + try: + sf["error-column"] = "1" + model = tc.linear_regression.create(sf, self.target) + except ToolkitError: + pass + try: + sf["error-column"] = [[1] for i in sf] + model = tc.linear_regression.create(sf, self.target) + except ToolkitError: + pass + try: + sf["error-column"] = [{1: 1} for i in sf] + model = tc.linear_regression.create(sf, self.target) + except ToolkitError: + pass + del sf["error-column"] + + """ Test detection of columns with nan values """ - def test_nan_detection(self): - sf = self.sf - try: - sf['error-column'] = np.nan - model = tc.linear_regression.create(sf, self.target) - except ToolkitError: - pass - try: - sf['error-column'] = [[np.nan] for i in sf] - model = tc.linear_regression.create(sf, self.target) - except ToolkitError: - pass - try: - sf['error-column'] = [{1:np.nan} for i in sf] - model = tc.linear_regression.create(sf, self.target) - except ToolkitError: - pass - del sf['error-column'] + + def test_nan_detection(self): + sf = self.sf + try: + sf["error-column"] = np.nan + model = tc.linear_regression.create(sf, self.target) + except ToolkitError: + pass + try: + sf["error-column"] = [[np.nan] for i in sf] + model = tc.linear_regression.create(sf, self.target) + except ToolkitError: + pass + try: + sf["error-column"] = [{1: np.nan} for i in sf] + model = tc.linear_regression.create(sf, self.target) + except ToolkitError: + pass + del sf["error-column"] class VectorLinearRegressionTest(unittest.TestCase): - """ + """ Unit test class for testing a Linear Regression create function. """ - @classmethod - def setUpClass(self): - """ + @classmethod + def setUpClass(self): + """ Set up (Run only once) """ - np.random.seed(15) - n, d = 100, 3 - self.sf = tc.SFrame() + np.random.seed(15) + n, d = 100, 3 + self.sf = tc.SFrame() - # float columns - for i in range(d): - self.sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) + # float columns + for i in range(d): + self.sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) - # target column - self.sf['target'] = np.random.randint(2, size=n) + # target column + self.sf["target"] = np.random.randint(2, size=n) - ## Get the right answer with statsmodels - df = self.sf.to_dataframe() - formula = 'target ~ ' + \ - ' + '.join(['X{}'.format(i+1) for i in range(d)]) - sm_model = sm.ols(formula, data=df).fit() + ## Get the right answer with statsmodels + df = self.sf.to_dataframe() + formula = "target ~ " + " + ".join(["X{}".format(i + 1) for i in range(d)]) + sm_model = sm.ols(formula, data=df).fit() - self.loss = sm_model.ssr # sum of squared residuals - self.coef = list(sm_model.params) - self.stderr = list(sm_model.bse) - self.yhat = list(sm_model.fittedvalues) - self.rmse = np.sqrt(sm_model.ssr / float(n)) + self.loss = sm_model.ssr # sum of squared residuals + self.coef = list(sm_model.params) + self.stderr = list(sm_model.bse) + self.yhat = list(sm_model.fittedvalues) + self.rmse = np.sqrt(sm_model.ssr / float(n)) - ## Set the turicreate model params - self.target = 'target' - self.sf['vec'] = self.sf.apply(lambda row: [row['X{}'.format(i+1)] for i in - range(d)]) - self.sf['vec'] = self.sf['vec'].apply(lambda x:x, array.array) + ## Set the turicreate model params + self.target = "target" + self.sf["vec"] = self.sf.apply( + lambda row: [row["X{}".format(i + 1)] for i in range(d)] + ) + self.sf["vec"] = self.sf["vec"].apply(lambda x: x, array.array) - self.features = ['vec'] - self.unpacked_features = ['vec[%s]' % (i) for i in range(d)] + self.features = ["vec"] + self.unpacked_features = ["vec[%s]" % (i) for i in range(d)] - self.def_kwargs= _DEFAULT_SOLVER_OPTIONS + self.def_kwargs = _DEFAULT_SOLVER_OPTIONS - def _test_coefficients(self, model): - """ + def _test_coefficients(self, model): + """ Check that the coefficient values are very close to the correct values. """ - coefs = model.coefficients - coef_list = list(coefs['value']) - stderr_list = list(coefs['stderr']) - self.assertTrue(np.allclose(coef_list, self.coef, rtol=1e-01, atol=1e-01)) - self.assertTrue(np.allclose(stderr_list, self.stderr, rtol=1e-03, atol=1e-03)) - - def _test_create(self, sf, target, features, solver, - opts, rescaling): - - model = tc.linear_regression.create(sf, target, features, solver = solver, - l2_penalty = 0.0, feature_rescaling = rescaling, - validation_set = None, - **opts) - test_case = 'solver = {solver}, opts = {opts}'.format(solver = solver, - opts = opts) - self.assertTrue(model is not None) - self.assertTrue(abs(model.training_rmse - self.rmse) < 0.1, - 'rmse failed: %s' % test_case) - self.assertTrue(abs(model.training_loss - self.loss) < 0.1, - 'loss failed: %s' % test_case) - self._test_coefficients(model) + coefs = model.coefficients + coef_list = list(coefs["value"]) + stderr_list = list(coefs["stderr"]) + self.assertTrue(np.allclose(coef_list, self.coef, rtol=1e-01, atol=1e-01)) + self.assertTrue(np.allclose(stderr_list, self.stderr, rtol=1e-03, atol=1e-03)) - """ + def _test_create(self, sf, target, features, solver, opts, rescaling): + + model = tc.linear_regression.create( + sf, + target, + features, + solver=solver, + l2_penalty=0.0, + feature_rescaling=rescaling, + validation_set=None, + **opts + ) + test_case = "solver = {solver}, opts = {opts}".format(solver=solver, opts=opts) + self.assertTrue(model is not None) + self.assertTrue( + abs(model.training_rmse - self.rmse) < 0.1, "rmse failed: %s" % test_case + ) + self.assertTrue( + abs(model.training_loss - self.loss) < 0.1, "loss failed: %s" % test_case + ) + self._test_coefficients(model) + + """ Test linear regression create. """ - def test_create(self): - for solver in ['newton']: - args = (self.sf, self.target, self.features, - solver, self.def_kwargs, True) - self._test_create(*args) - args = (self.sf, self.target, self.features, - solver, self.def_kwargs, False) - self._test_create(*args) + def test_create(self): + + for solver in ["newton"]: + args = (self.sf, self.target, self.features, solver, self.def_kwargs, True) + self._test_create(*args) + args = (self.sf, self.target, self.features, solver, self.def_kwargs, False) + self._test_create(*args) - def test_features(self): + def test_features(self): + + model = tc.linear_regression.create( + self.sf, + self.target, + self.features, + feature_rescaling=False, + validation_set=None, + ) + self.assertEqual(model.num_features, len(self.features)) + self.assertEqual(model.features, self.features) + self.assertEqual(model.num_unpacked_features, len(self.unpacked_features)) + self.assertEqual(model.unpacked_features, self.unpacked_features) - model = tc.linear_regression.create(self.sf, self.target, self.features, - feature_rescaling = False, validation_set = None) - self.assertEqual(model.num_features, len(self.features)) - self.assertEqual(model.features, self.features) - self.assertEqual(model.num_unpacked_features, len(self.unpacked_features)) - self.assertEqual(model.unpacked_features, self.unpacked_features) class NDArrayLinearRegressionTest(unittest.TestCase): - """ + """ Unit test class for testing a Linear Regression create function. """ - @classmethod - def setUpClass(self): - """ + @classmethod + def setUpClass(self): + """ Set up (Run only once) """ - np.random.seed(15) - n, d = 100, 6 - self.sf = tc.SFrame() + np.random.seed(15) + n, d = 100, 6 + self.sf = tc.SFrame() - # float columns - for i in range(d): - self.sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) + # float columns + for i in range(d): + self.sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) - # target column - self.sf['target'] = np.random.randint(2, size=n) + # target column + self.sf["target"] = np.random.randint(2, size=n) - ## Get the right answer with statsmodels - df = self.sf.to_dataframe() - print(df) - formula = 'target ~ ' + \ - ' + '.join(['X{}'.format(i+1) for i in range(d)]) - sm_model = sm.ols(formula, data=df).fit() + ## Get the right answer with statsmodels + df = self.sf.to_dataframe() + print(df) + formula = "target ~ " + " + ".join(["X{}".format(i + 1) for i in range(d)]) + sm_model = sm.ols(formula, data=df).fit() - self.loss = sm_model.ssr # sum of squared residuals - self.coef = list(sm_model.params) - self.stderr = list(sm_model.bse) - self.yhat = list(sm_model.fittedvalues) - self.rmse = np.sqrt(sm_model.ssr / float(n)) + self.loss = sm_model.ssr # sum of squared residuals + self.coef = list(sm_model.params) + self.stderr = list(sm_model.bse) + self.yhat = list(sm_model.fittedvalues) + self.rmse = np.sqrt(sm_model.ssr / float(n)) - ## Set the turicreate model params - self.target = 'target' - self.sf['nd_vec'] = self.sf.apply(lambda row: np.array([row['X{}'.format(i+1)] for i in - range(d)]).reshape( (3, 2) )) + ## Set the turicreate model params + self.target = "target" + self.sf["nd_vec"] = self.sf.apply( + lambda row: np.array([row["X{}".format(i + 1)] for i in range(d)]).reshape( + (3, 2) + ) + ) - self.features = ['nd_vec'] - self.unpacked_features = ['nd_vec[%d,%d]' % (i, j) for i in range(3) for j in range(2)] + self.features = ["nd_vec"] + self.unpacked_features = [ + "nd_vec[%d,%d]" % (i, j) for i in range(3) for j in range(2) + ] - self.def_kwargs= _DEFAULT_SOLVER_OPTIONS + self.def_kwargs = _DEFAULT_SOLVER_OPTIONS - def _test_coefficients(self, model): - """ + def _test_coefficients(self, model): + """ Check that the coefficient values are very close to the correct values. """ - coefs = model.coefficients - coef_list = list(coefs['value']) - stderr_list = list(coefs['stderr']) - self.assertTrue(np.allclose(coef_list, self.coef, rtol=1e-01, atol=1e-01)) - self.assertTrue(np.allclose(stderr_list, self.stderr, rtol=1e-03, atol=1e-03)) - - def _test_create(self, sf, target, features, solver, - opts, rescaling): - - model = tc.linear_regression.create(sf, target, features, solver = solver, - l2_penalty = 0.0, feature_rescaling = rescaling, - validation_set = None, - **opts) - test_case = 'solver = {solver}, opts = {opts}'.format(solver = solver, - opts = opts) - self.assertTrue(model is not None) - self.assertTrue(abs(model.training_rmse - self.rmse) < 0.1, - 'rmse failed: %s' % test_case) - self.assertTrue(abs(model.training_loss - self.loss) < 0.1, - 'loss failed: %s' % test_case) - self._test_coefficients(model) + coefs = model.coefficients + coef_list = list(coefs["value"]) + stderr_list = list(coefs["stderr"]) + self.assertTrue(np.allclose(coef_list, self.coef, rtol=1e-01, atol=1e-01)) + self.assertTrue(np.allclose(stderr_list, self.stderr, rtol=1e-03, atol=1e-03)) - """ + def _test_create(self, sf, target, features, solver, opts, rescaling): + + model = tc.linear_regression.create( + sf, + target, + features, + solver=solver, + l2_penalty=0.0, + feature_rescaling=rescaling, + validation_set=None, + **opts + ) + test_case = "solver = {solver}, opts = {opts}".format(solver=solver, opts=opts) + self.assertTrue(model is not None) + self.assertTrue( + abs(model.training_rmse - self.rmse) < 0.1, "rmse failed: %s" % test_case + ) + self.assertTrue( + abs(model.training_loss - self.loss) < 0.1, "loss failed: %s" % test_case + ) + self._test_coefficients(model) + + """ Test linear regression create. """ - def test_create(self): - for solver in ['newton']: - args = (self.sf, self.target, self.features, - solver, self.def_kwargs, True) - self._test_create(*args) - args = (self.sf, self.target, self.features, - solver, self.def_kwargs, False) - self._test_create(*args) + def test_create(self): + + for solver in ["newton"]: + args = (self.sf, self.target, self.features, solver, self.def_kwargs, True) + self._test_create(*args) + args = (self.sf, self.target, self.features, solver, self.def_kwargs, False) + self._test_create(*args) - def test_features(self): + def test_features(self): + + model = tc.linear_regression.create( + self.sf, + self.target, + self.features, + feature_rescaling=False, + validation_set=None, + ) + self.assertEqual(model.num_features, len(self.features)) + self.assertEqual(model.features, self.features) + self.assertEqual(model.num_unpacked_features, len(self.unpacked_features)) + self.assertEqual(model.unpacked_features, self.unpacked_features) - model = tc.linear_regression.create(self.sf, self.target, self.features, - feature_rescaling = False, validation_set = None) - self.assertEqual(model.num_features, len(self.features)) - self.assertEqual(model.features, self.features) - self.assertEqual(model.num_unpacked_features, len(self.unpacked_features)) - self.assertEqual(model.unpacked_features, self.unpacked_features) class DictLinearRegressionTest(unittest.TestCase): - """ + """ Unit test class for testing a Linear Regression create function. """ - @classmethod - def setUpClass(self): - """ + @classmethod + def setUpClass(self): + """ Set up (Run only once) """ - np.random.seed(15) - n, d = 100, 3 - self.d = d - self.sf = tc.SFrame() - - # float columns - for i in range(d): - self.sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) - - # target column - self.sf['target'] = np.random.randint(2, size=n) - - ## Get the right answer with statsmodels - df = self.sf.to_dataframe() - formula = 'target ~ ' + \ - ' + '.join(['X{}'.format(i+1) for i in range(d)]) - sm_model = sm.ols(formula, data=df).fit() - - self.loss = sm_model.ssr # sum of squared residuals - self.coef = list(sm_model.params) - self.stderr = list(sm_model.bse) - self.yhat = list(sm_model.fittedvalues) - self.rmse = np.sqrt(sm_model.ssr / float(n)) - - ## Set the turicreate model params - self.target = 'target' - self.sf['dict'] = self.sf.apply(lambda row: {i: row['X{}'.format(i+1)] for i in - range(d)}) - - self.features = ['dict'] - self.unpacked_features = ['dict[%s]' % i for i in range(d)] - - self.def_kwargs = { - 'convergence_threshold': 1e-5, - 'step_size': 1.0, - 'max_iterations': 100, + np.random.seed(15) + n, d = 100, 3 + self.d = d + self.sf = tc.SFrame() + + # float columns + for i in range(d): + self.sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) + + # target column + self.sf["target"] = np.random.randint(2, size=n) + + ## Get the right answer with statsmodels + df = self.sf.to_dataframe() + formula = "target ~ " + " + ".join(["X{}".format(i + 1) for i in range(d)]) + sm_model = sm.ols(formula, data=df).fit() + + self.loss = sm_model.ssr # sum of squared residuals + self.coef = list(sm_model.params) + self.stderr = list(sm_model.bse) + self.yhat = list(sm_model.fittedvalues) + self.rmse = np.sqrt(sm_model.ssr / float(n)) + + ## Set the turicreate model params + self.target = "target" + self.sf["dict"] = self.sf.apply( + lambda row: {i: row["X{}".format(i + 1)] for i in range(d)} + ) + + self.features = ["dict"] + self.unpacked_features = ["dict[%s]" % i for i in range(d)] + + self.def_kwargs = { + "convergence_threshold": 1e-5, + "step_size": 1.0, + "max_iterations": 100, } - def _test_coefficients(self, model): - """ + def _test_coefficients(self, model): + """ Check that the coefficient values are very close to the correct values. """ - coefs = model.coefficients - coef_list = list(coefs['value']) - stderr_list = list(coefs['stderr']) - self.assertTrue(np.allclose(coef_list, self.coef, rtol=1e-01, atol=1e-01)) - self.assertTrue(np.allclose(stderr_list, self.stderr, rtol=1e-03, atol=1e-03)) - - def _test_create(self, sf, target, features, solver, - opts, rescaling): - - model = tc.linear_regression.create(sf, target, features, solver = solver, - l2_penalty = 0.0, feature_rescaling = rescaling, - validation_set = None, **opts) - - test_case = 'solver = {solver}, opts = {opts}'.format(solver = solver, - opts = opts) - self.assertTrue(model is not None) - self.assertTrue(abs(model.training_rmse - self.rmse) < 0.1, - 'rmse failed: %s' % test_case) - self.assertTrue(abs(model.training_loss - self.loss) < 0.1, - 'loss failed: %s' % test_case) - self._test_coefficients(model) + coefs = model.coefficients + coef_list = list(coefs["value"]) + stderr_list = list(coefs["stderr"]) + self.assertTrue(np.allclose(coef_list, self.coef, rtol=1e-01, atol=1e-01)) + self.assertTrue(np.allclose(stderr_list, self.stderr, rtol=1e-03, atol=1e-03)) - """ + def _test_create(self, sf, target, features, solver, opts, rescaling): + + model = tc.linear_regression.create( + sf, + target, + features, + solver=solver, + l2_penalty=0.0, + feature_rescaling=rescaling, + validation_set=None, + **opts + ) + + test_case = "solver = {solver}, opts = {opts}".format(solver=solver, opts=opts) + self.assertTrue(model is not None) + self.assertTrue( + abs(model.training_rmse - self.rmse) < 0.1, "rmse failed: %s" % test_case + ) + self.assertTrue( + abs(model.training_loss - self.loss) < 0.1, "loss failed: %s" % test_case + ) + self._test_coefficients(model) + + """ Test linear regression create. """ - def test_create(self): - - for solver in ['newton']: - args = (self.sf, self.target, self.features, - solver, self.def_kwargs, True) - self._test_create(*args) - args = (self.sf, self.target, self.features, - solver, self.def_kwargs, False) - self._test_create(*args) - - def test_features(self): - - d = self.d - self.sf['dict'] = self.sf.apply(lambda row: {i: row['X{}'.format(i+1)] for i in - range(d)}) - model = tc.linear_regression.create(self.sf, self.target, self.features, - feature_rescaling = False, validation_set = None) - self.assertEqual(model.num_features, len(self.features)) - self.assertEqual(model.features, self.features) - self.assertEqual(model.num_unpacked_features, len(self.unpacked_features)) - self.assertEqual(model.unpacked_features, self.unpacked_features) - - def test_predict_extra_cols(self): - - sf = self.sf[:] - model = tc.linear_regression.create(sf, self.target, self.features, - feature_rescaling = False, validation_set = None) - pred = model.predict(sf) - sf['dict'] = sf['dict'].apply(lambda x: dict(list(x.items()) - + list({'extra_col': 0, 'extra_col_2': 1}.items()))) - pred2 = model.predict(sf) - self.assertEqual(list(pred), list(pred2)) - - def test_evaluate_extra_cols(self): - - sf = self.sf[:] - model = tc.linear_regression.create(sf, self.target, self.features, - feature_rescaling = False, validation_set = None) - eval1 = model.evaluate(sf) - sf['dict'] = sf['dict'].apply(lambda x: dict(list(x.items()) - + list({'extra_col': 0, 'extra_col_2': 1}.items()))) - eval2 = model.evaluate(sf) - self.assertEqual(eval1, eval2) + + def test_create(self): + + for solver in ["newton"]: + args = (self.sf, self.target, self.features, solver, self.def_kwargs, True) + self._test_create(*args) + args = (self.sf, self.target, self.features, solver, self.def_kwargs, False) + self._test_create(*args) + + def test_features(self): + + d = self.d + self.sf["dict"] = self.sf.apply( + lambda row: {i: row["X{}".format(i + 1)] for i in range(d)} + ) + model = tc.linear_regression.create( + self.sf, + self.target, + self.features, + feature_rescaling=False, + validation_set=None, + ) + self.assertEqual(model.num_features, len(self.features)) + self.assertEqual(model.features, self.features) + self.assertEqual(model.num_unpacked_features, len(self.unpacked_features)) + self.assertEqual(model.unpacked_features, self.unpacked_features) + + def test_predict_extra_cols(self): + + sf = self.sf[:] + model = tc.linear_regression.create( + sf, self.target, self.features, feature_rescaling=False, validation_set=None + ) + pred = model.predict(sf) + sf["dict"] = sf["dict"].apply( + lambda x: dict( + list(x.items()) + list({"extra_col": 0, "extra_col_2": 1}.items()) + ) + ) + pred2 = model.predict(sf) + self.assertEqual(list(pred), list(pred2)) + + def test_evaluate_extra_cols(self): + + sf = self.sf[:] + model = tc.linear_regression.create( + sf, self.target, self.features, feature_rescaling=False, validation_set=None + ) + eval1 = model.evaluate(sf) + sf["dict"] = sf["dict"].apply( + lambda x: dict( + list(x.items()) + list({"extra_col": 0, "extra_col_2": 1}.items()) + ) + ) + eval2 = model.evaluate(sf) + self.assertEqual(eval1, eval2) class ListCategoricalLinearRegressionTest(unittest.TestCase): - """ + """ Unit test class for testing a Linear Regression create function. """ - @classmethod - def setUpClass(self): - """ + @classmethod + def setUpClass(self): + """ Set up (Run only once) """ - ## Create fake data with a categorical variable - np.random.seed(15) - n, d = 100, 3 - self.sf = tc.SFrame() - - # float columns - for i in range(d): - self.sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) - - - # categorical column - species = np.array(['cat', 'dog', 'foosa']) - idx = np.random.randint(3, size=n) - # Stats models maps categorical in alphabetical order of categories. - # We do it in the order of appearance. These three lines of code - # ensures that the two are the same. - idx[0] = 0 - idx[1] = 1 - idx[2] = 2 - self.sf['species'] = list(species[idx]) - - # target column - self.sf['target'] = np.random.randint(2, size=n) - - - ## Get the right answer with statsmodels - df = self.sf.to_dataframe() - formula = 'target ~ species + ' + \ - ' + '.join(['X{}'.format(i+1) for i in range(d)]) - sm_model = sm.ols(formula, data=df).fit() - - self.loss = sm_model.ssr # sum of squared residuals - self.stderr = list(sm_model.bse) - self.coef = list(sm_model.params) - self.yhat = list(sm_model.fittedvalues) - self.rmse = np.sqrt(sm_model.ssr / float(n)) - - ## Set the turicreate model params - self.target = 'target' - self.features = ['species', 'X1', 'X2', 'X3'] - self.unpacked_features = ['species[dog]', 'species[foosa]', 'X1', 'X2', 'X3'] - self.sf['species'] = self.sf["species"].apply(lambda x: [x]) - - self.def_kwargs = { - 'convergence_threshold': 1e-5, - 'step_size': 1.0, - 'max_iterations': 100, - } + ## Create fake data with a categorical variable + np.random.seed(15) + n, d = 100, 3 + self.sf = tc.SFrame() - def _test_coefficients(self, model, test_stderr): - """ - Check that the coefficient values are very close to the correct values. - """ - coefs = model.coefficients - coef_list = list(coefs['value']) - self.assertTrue(np.allclose(coef_list, self.coef, rtol=1e-01, atol=1e-01)) - if test_stderr: - stderr_list = list(coefs['stderr']) - self.assertTrue(np.allclose(stderr_list, self.stderr, rtol=1e-03, atol=1e-03)) - else: - self.assertTrue('stderr' in coefs.column_names()) - self.assertEqual(list(coefs['stderr']), [None for v in coef_list]) + # float columns + for i in range(d): + self.sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) - def _test_create(self, sf, target, features, solver, opts, rescaling): + # categorical column + species = np.array(["cat", "dog", "foosa"]) + idx = np.random.randint(3, size=n) + # Stats models maps categorical in alphabetical order of categories. + # We do it in the order of appearance. These three lines of code + # ensures that the two are the same. + idx[0] = 0 + idx[1] = 1 + idx[2] = 2 + self.sf["species"] = list(species[idx]) + + # target column + self.sf["target"] = np.random.randint(2, size=n) + + ## Get the right answer with statsmodels + df = self.sf.to_dataframe() + formula = "target ~ species + " + " + ".join( + ["X{}".format(i + 1) for i in range(d)] + ) + sm_model = sm.ols(formula, data=df).fit() - model = tc.linear_regression.create(sf, target, features, solver = solver, - l2_penalty = 0.0, feature_rescaling = rescaling, - validation_set = None, **opts) + self.loss = sm_model.ssr # sum of squared residuals + self.stderr = list(sm_model.bse) + self.coef = list(sm_model.params) + self.yhat = list(sm_model.fittedvalues) + self.rmse = np.sqrt(sm_model.ssr / float(n)) - test_case = 'solver = {solver}, opts = {opts}'.format(solver = solver, - opts = opts) - self.assertTrue(model is not None) - self.assertTrue(abs(model.training_loss - self.loss) < 0.1, - 'loss failed: %s' % test_case) - self.assertTrue(abs(model.training_rmse - self.rmse) < 0.1, - 'rmse failed: %s' % test_case) - self._test_coefficients(model, solver == 'newton') + ## Set the turicreate model params + self.target = "target" + self.features = ["species", "X1", "X2", "X3"] + self.unpacked_features = ["species[dog]", "species[foosa]", "X1", "X2", "X3"] + self.sf["species"] = self.sf["species"].apply(lambda x: [x]) + self.def_kwargs = { + "convergence_threshold": 1e-5, + "step_size": 1.0, + "max_iterations": 100, + } - """ + def _test_coefficients(self, model, test_stderr): + """ + Check that the coefficient values are very close to the correct values. + """ + coefs = model.coefficients + coef_list = list(coefs["value"]) + self.assertTrue(np.allclose(coef_list, self.coef, rtol=1e-01, atol=1e-01)) + if test_stderr: + stderr_list = list(coefs["stderr"]) + self.assertTrue( + np.allclose(stderr_list, self.stderr, rtol=1e-03, atol=1e-03) + ) + else: + self.assertTrue("stderr" in coefs.column_names()) + self.assertEqual(list(coefs["stderr"]), [None for v in coef_list]) + + def _test_create(self, sf, target, features, solver, opts, rescaling): + + model = tc.linear_regression.create( + sf, + target, + features, + solver=solver, + l2_penalty=0.0, + feature_rescaling=rescaling, + validation_set=None, + **opts + ) + + test_case = "solver = {solver}, opts = {opts}".format(solver=solver, opts=opts) + self.assertTrue(model is not None) + self.assertTrue( + abs(model.training_loss - self.loss) < 0.1, "loss failed: %s" % test_case + ) + self.assertTrue( + abs(model.training_rmse - self.rmse) < 0.1, "rmse failed: %s" % test_case + ) + self._test_coefficients(model, solver == "newton") + + """ Test linear regression create. """ - def test_create(self): - for solver in ['newton', 'lbfgs', 'fista']: - args = (self.sf, self.target, self.features, - solver, self.def_kwargs, True) - self._test_create(*args) - args = (self.sf, self.target, self.features, - solver, self.def_kwargs, False) - self._test_create(*args) + def test_create(self): + + for solver in ["newton", "lbfgs", "fista"]: + args = (self.sf, self.target, self.features, solver, self.def_kwargs, True) + self._test_create(*args) + args = (self.sf, self.target, self.features, solver, self.def_kwargs, False) + self._test_create(*args) class CategoricalLinearRegressionTest(unittest.TestCase): - """ + """ Unit test class for testing a Linear Regression create function. """ - @classmethod - def setUpClass(self): - """ + @classmethod + def setUpClass(self): + """ Set up (Run only once) """ - ## Create fake data with a categorical variable - np.random.seed(15) - n, d = 100, 3 - self.sf = tc.SFrame() - - # float columns - for i in range(d): - self.sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) - - - # categorical column - species = np.array(['cat', 'dog', 'foosa']) - idx = np.random.randint(3, size=n) - # Stats models maps categorical in alphabetical order of categories. - # We do it in the order of appearance. These three lines of code - # ensures that the two are the same. - idx[0] = 0 - idx[1] = 1 - idx[2] = 2 - self.sf['species'] = list(species[idx]) - - # target column - self.sf['target'] = np.random.randint(2, size=n) - - - ## Get the right answer with statsmodels - df = self.sf.to_dataframe() - formula = 'target ~ species + ' + \ - ' + '.join(['X{}'.format(i+1) for i in range(d)]) - sm_model = sm.ols(formula, data=df).fit() - - self.loss = sm_model.ssr # sum of squared residuals - self.stderr = list(sm_model.bse) - self.coef = list(sm_model.params) - self.yhat = list(sm_model.fittedvalues) - self.rmse = np.sqrt(sm_model.ssr / float(n)) - - ## Set the turicreate model params - self.target = 'target' - self.features = ['species', 'X1', 'X2', 'X3'] - self.unpacked_features = ['species', 'X1', 'X2', 'X3'] - - self.def_kwargs = { - 'convergence_threshold': 1e-5, - 'step_size': 1.0, - 'max_iterations': 100, - } + ## Create fake data with a categorical variable + np.random.seed(15) + n, d = 100, 3 + self.sf = tc.SFrame() - def _test_coefficients(self, model, test_stderr): - """ - Check that the coefficient values are very close to the correct values. - """ - coefs = model.coefficients - coef_list = list(coefs['value']) - self.assertTrue(np.allclose(coef_list, self.coef, rtol=1e-01, atol=1e-01)) - if test_stderr: - stderr_list = list(coefs['stderr']) - self.assertTrue(np.allclose(stderr_list, self.stderr, rtol=1e-03, atol=1e-03)) - else: - self.assertTrue('stderr' in coefs.column_names()) - self.assertEqual(list(coefs['stderr']), [None for v in coef_list]) + # float columns + for i in range(d): + self.sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) - def _test_create(self, sf, target, features, solver, opts, rescaling): + # categorical column + species = np.array(["cat", "dog", "foosa"]) + idx = np.random.randint(3, size=n) + # Stats models maps categorical in alphabetical order of categories. + # We do it in the order of appearance. These three lines of code + # ensures that the two are the same. + idx[0] = 0 + idx[1] = 1 + idx[2] = 2 + self.sf["species"] = list(species[idx]) + + # target column + self.sf["target"] = np.random.randint(2, size=n) + + ## Get the right answer with statsmodels + df = self.sf.to_dataframe() + formula = "target ~ species + " + " + ".join( + ["X{}".format(i + 1) for i in range(d)] + ) + sm_model = sm.ols(formula, data=df).fit() - model = tc.linear_regression.create(sf, target, features, solver = solver, - l2_penalty = 0.0, feature_rescaling = rescaling, - validation_set = None, **opts) + self.loss = sm_model.ssr # sum of squared residuals + self.stderr = list(sm_model.bse) + self.coef = list(sm_model.params) + self.yhat = list(sm_model.fittedvalues) + self.rmse = np.sqrt(sm_model.ssr / float(n)) - test_case = 'solver = {solver}, opts = {opts}'.format(solver = solver, - opts = opts) - self.assertTrue(model is not None) - self.assertTrue(abs(model.training_loss - self.loss) < 0.1, - 'loss failed: %s' % test_case) - self.assertTrue(abs(model.training_rmse - self.rmse) < 0.1, - 'rmse failed: %s' % test_case) - self._test_coefficients(model, solver == 'newton') + ## Set the turicreate model params + self.target = "target" + self.features = ["species", "X1", "X2", "X3"] + self.unpacked_features = ["species", "X1", "X2", "X3"] + self.def_kwargs = { + "convergence_threshold": 1e-5, + "step_size": 1.0, + "max_iterations": 100, + } - """ + def _test_coefficients(self, model, test_stderr): + """ + Check that the coefficient values are very close to the correct values. + """ + coefs = model.coefficients + coef_list = list(coefs["value"]) + self.assertTrue(np.allclose(coef_list, self.coef, rtol=1e-01, atol=1e-01)) + if test_stderr: + stderr_list = list(coefs["stderr"]) + self.assertTrue( + np.allclose(stderr_list, self.stderr, rtol=1e-03, atol=1e-03) + ) + else: + self.assertTrue("stderr" in coefs.column_names()) + self.assertEqual(list(coefs["stderr"]), [None for v in coef_list]) + + def _test_create(self, sf, target, features, solver, opts, rescaling): + + model = tc.linear_regression.create( + sf, + target, + features, + solver=solver, + l2_penalty=0.0, + feature_rescaling=rescaling, + validation_set=None, + **opts + ) + + test_case = "solver = {solver}, opts = {opts}".format(solver=solver, opts=opts) + self.assertTrue(model is not None) + self.assertTrue( + abs(model.training_loss - self.loss) < 0.1, "loss failed: %s" % test_case + ) + self.assertTrue( + abs(model.training_rmse - self.rmse) < 0.1, "rmse failed: %s" % test_case + ) + self._test_coefficients(model, solver == "newton") + + """ Test linear regression create. """ - def test_create(self): - - for solver in ['newton', 'lbfgs', 'fista']: - args = (self.sf, self.target, self.features, - solver, self.def_kwargs, True) - self._test_create(*args) - args = (self.sf, self.target, self.features, - solver, self.def_kwargs, False) - self._test_create(*args) - - def test_predict_extra_cols(self): - - model = tc.linear_regression.create(self.sf, self.target, self.features, - feature_rescaling = False, validation_set = None) - # Create a copy so we don't alter the original SFrame - X_sf = self.sf.copy() - X_sf['species'] = X_sf['species'].apply(lambda x: x if x != 'foosa' - else 'rat') - pred = model.predict(X_sf) - - def test_evaluate_extra_cols(self): - - model = tc.linear_regression.create(self.sf, self.target, self.features, - feature_rescaling = False, validation_set = None) - # Create a copy so we don't alter the original SFrame - X_sf = self.sf.copy() - X_sf['species'] = X_sf['species'].apply(lambda x: x if x != 'foosa' - else 'rat') - pred = model.evaluate(X_sf) - - def test_features(self): - - model = tc.linear_regression.create(self.sf, self.target, self.features, - feature_rescaling = False, validation_set = None) - self.assertEqual(model.num_features, len(self.features)) - self.assertEqual(model.features, self.features) - self.assertEqual(model.num_unpacked_features, len(self.unpacked_features)) - self.assertEqual(model.unpacked_features, self.unpacked_features) + + def test_create(self): + + for solver in ["newton", "lbfgs", "fista"]: + args = (self.sf, self.target, self.features, solver, self.def_kwargs, True) + self._test_create(*args) + args = (self.sf, self.target, self.features, solver, self.def_kwargs, False) + self._test_create(*args) + + def test_predict_extra_cols(self): + + model = tc.linear_regression.create( + self.sf, + self.target, + self.features, + feature_rescaling=False, + validation_set=None, + ) + # Create a copy so we don't alter the original SFrame + X_sf = self.sf.copy() + X_sf["species"] = X_sf["species"].apply(lambda x: x if x != "foosa" else "rat") + pred = model.predict(X_sf) + + def test_evaluate_extra_cols(self): + + model = tc.linear_regression.create( + self.sf, + self.target, + self.features, + feature_rescaling=False, + validation_set=None, + ) + # Create a copy so we don't alter the original SFrame + X_sf = self.sf.copy() + X_sf["species"] = X_sf["species"].apply(lambda x: x if x != "foosa" else "rat") + pred = model.evaluate(X_sf) + + def test_features(self): + + model = tc.linear_regression.create( + self.sf, + self.target, + self.features, + feature_rescaling=False, + validation_set=None, + ) + self.assertEqual(model.num_features, len(self.features)) + self.assertEqual(model.features, self.features) + self.assertEqual(model.num_unpacked_features, len(self.unpacked_features)) + self.assertEqual(model.unpacked_features, self.unpacked_features) class L1LinearRegressionTest(unittest.TestCase): - """ + """ Unit test class for testing a Linear Regression create function. """ - @classmethod - def setUpClass(self): - """ + @classmethod + def setUpClass(self): + """ Set up (Run only once) """ - test_data = '''y,0,1,2,3,4 + test_data = """y,0,1,2,3,4 38,0,3,1,0,1.47 58,1,2,2,8,4.38 30,1,1,1,0,1.64 @@ -1006,90 +1104,97 @@ def setUpClass(self): 59,0,3,3,0,2.88 65,1,2,3,5,3.37 49,0,1,3,0,2.84 - 37,1,1,1,9,5.12''' - - dataset = 'data_file%s.csv' % (str(uuid.uuid4())) - self.dataset = dataset - f = open(dataset, 'w') - f.write(test_data) - f.close() - self.def_kwargs = {'convergence_threshold': 1e-5, - 'max_iterations': 1000} - - self.features = ['0', '1', '2', '3', '4'] - self.target = 'y' - type_dict = {n: float for n in self.features - + [self.target]} - - self.sf = tc.SFrame.read_csv(dataset, header=True, delimiter=',', - column_type_hints = type_dict) - - # Check answers with Numpy calculations - # ------------------------------------------------------------------------ - feature_matrix = np.genfromtxt(dataset, delimiter=',', skip_header=1) - X = feature_matrix[:, 1:] - y = feature_matrix[:, 0] - self.examples = X.shape[0] - - # Fit the model - self.l1_penalty = 10.0 - clf = linear_model.ElasticNet(alpha=self.l1_penalty/(2*self.examples), - l1_ratio=1) - clf.fit(X, y) - - self.coef = np.append(clf.intercept_, clf.coef_) - self.predictions = clf.predict(X) - self.loss = np.dot(self.predictions - y, self.predictions - y) - self.rmse = np.sqrt(self.loss/self.examples) - - @classmethod - def tearDownClass(self): - os.remove(self.dataset) - - def _test_coefficients(self, model): - """ - Check that the coefficient values are very close to the correct values. - """ - coefs = model.coefficients - coef_list = list(coefs['value']) - self.assertTrue(np.allclose(coef_list, self.coef, rtol=1e-02, atol=1e-02), - "%s vs %s" % (coef_list, self.coef)) + 37,1,1,1,9,5.12""" - def _test_create(self, sf, target, features, solver, - opts): + dataset = "data_file%s.csv" % (str(uuid.uuid4())) + self.dataset = dataset + f = open(dataset, "w") + f.write(test_data) + f.close() + self.def_kwargs = {"convergence_threshold": 1e-5, "max_iterations": 1000} - model = tc.linear_regression.create(sf, target, features, solver = solver, - l1_penalty=self.l1_penalty, l2_penalty = 0.0, feature_rescaling=False, - validation_set=None, **opts) + self.features = ["0", "1", "2", "3", "4"] + self.target = "y" + type_dict = {n: float for n in self.features + [self.target]} - test_case = 'solver = {solver}, opts = {opts}'.format(solver = solver, - opts = opts) - self.assertTrue(model is not None) - self._test_coefficients(model) - self.assertTrue(abs(model.training_rmse - self.rmse) < 0.1, - 'rmse failed: %s' % test_case) + self.sf = tc.SFrame.read_csv( + dataset, header=True, delimiter=",", column_type_hints=type_dict + ) - def test_create(self): + # Check answers with Numpy calculations + # ------------------------------------------------------------------------ + feature_matrix = np.genfromtxt(dataset, delimiter=",", skip_header=1) + X = feature_matrix[:, 1:] + y = feature_matrix[:, 0] + self.examples = X.shape[0] + + # Fit the model + self.l1_penalty = 10.0 + clf = linear_model.ElasticNet( + alpha=self.l1_penalty / (2 * self.examples), l1_ratio=1 + ) + clf.fit(X, y) + + self.coef = np.append(clf.intercept_, clf.coef_) + self.predictions = clf.predict(X) + self.loss = np.dot(self.predictions - y, self.predictions - y) + self.rmse = np.sqrt(self.loss / self.examples) - for solver in ['fista']: - args = (self.sf, self.target, self.features, - solver, self.def_kwargs) - self._test_create(*args) + @classmethod + def tearDownClass(self): + os.remove(self.dataset) + def _test_coefficients(self, model): + """ + Check that the coefficient values are very close to the correct values. + """ + coefs = model.coefficients + coef_list = list(coefs["value"]) + self.assertTrue( + np.allclose(coef_list, self.coef, rtol=1e-02, atol=1e-02), + "%s vs %s" % (coef_list, self.coef), + ) + + def _test_create(self, sf, target, features, solver, opts): + + model = tc.linear_regression.create( + sf, + target, + features, + solver=solver, + l1_penalty=self.l1_penalty, + l2_penalty=0.0, + feature_rescaling=False, + validation_set=None, + **opts + ) + + test_case = "solver = {solver}, opts = {opts}".format(solver=solver, opts=opts) + self.assertTrue(model is not None) + self._test_coefficients(model) + self.assertTrue( + abs(model.training_rmse - self.rmse) < 0.1, "rmse failed: %s" % test_case + ) + + def test_create(self): + + for solver in ["fista"]: + args = (self.sf, self.target, self.features, solver, self.def_kwargs) + self._test_create(*args) class L2LinearRegressionTest(unittest.TestCase): - """ + """ Unit test class for testing a Linear Regression create function. """ - @classmethod - def setUpClass(self): - """ + @classmethod + def setUpClass(self): + """ Set up (Run only once) """ - test_data = '''y,0,1,2,3,4 + test_data = """y,0,1,2,3,4 38,0,3,1,0,1.47 58,1,2,2,8,4.38 30,1,1,1,0,1.64 @@ -1118,96 +1223,104 @@ def setUpClass(self): 59,0,3,3,0,2.88 65,1,2,3,5,3.37 49,0,1,3,0,2.84 - 37,1,1,1,9,5.12''' - - dataset = 'data_file%s.csv' % (str(uuid.uuid4())) - self.dataset = dataset - f = open(dataset, 'w') - f.write(test_data) - f.close() - self.def_kwargs = {'convergence_threshold': 1e-5, - 'step_size': 1.0, - 'lbfgs_memory_level': 11, - 'max_iterations': 1000} - - self.features = ['0', '1', '2', '3', '4'] - self.target = 'y' - type_dict = {n: float for n in self.features - + [self.target]} - - self.sf = tc.SFrame.read_csv(dataset, header=True, delimiter=',', - column_type_hints = type_dict) - - - # Check answers with Numpy calculations - # ------------------------------------------------------------------------ - feature_matrix = np.genfromtxt(dataset, delimiter=',', skip_header=1) - X = feature_matrix[:, 1:] - y = feature_matrix[:, 0] - self.examples = X.shape[0] - self.variables = X.shape[1] + 1 - - # Fit the model - self.l2_penalty = 10.0 - clf = linear_model.ElasticNet(alpha=self.l2_penalty/(self.examples), - l1_ratio=0) - clf.fit(X, y) - - self.coef = np.append(clf.intercept_, clf.coef_) - self.predictions = clf.predict(X) - self.loss = np.dot(self.predictions - y, self.predictions - y) - self.rmse = np.sqrt(self.loss/self.examples) - - @classmethod - def tearDownClass(self): - os.remove(self.dataset) - - def _test_coefficients(self, model): - """ - Check that the coefficient values are very close to the correct values. - """ - coefs = model.coefficients - coef_list = list(coefs['value']) - self.assertTrue(np.allclose(coef_list, self.coef, rtol=1e-01, atol=1e-01)) + 37,1,1,1,9,5.12""" + + dataset = "data_file%s.csv" % (str(uuid.uuid4())) + self.dataset = dataset + f = open(dataset, "w") + f.write(test_data) + f.close() + self.def_kwargs = { + "convergence_threshold": 1e-5, + "step_size": 1.0, + "lbfgs_memory_level": 11, + "max_iterations": 1000, + } - def _test_create(self, sf, target, features, solver, - opts): + self.features = ["0", "1", "2", "3", "4"] + self.target = "y" + type_dict = {n: float for n in self.features + [self.target]} - model = tc.linear_regression.create(sf, target, features, solver = solver, - l2_penalty=self.l2_penalty, feature_rescaling = False, - validation_set=None, **opts) + self.sf = tc.SFrame.read_csv( + dataset, header=True, delimiter=",", column_type_hints=type_dict + ) - test_case = 'solver = {solver}, opts = {opts}'.format(solver = solver, - opts = opts) + # Check answers with Numpy calculations + # ------------------------------------------------------------------------ + feature_matrix = np.genfromtxt(dataset, delimiter=",", skip_header=1) + X = feature_matrix[:, 1:] + y = feature_matrix[:, 0] + self.examples = X.shape[0] + self.variables = X.shape[1] + 1 + + # Fit the model + self.l2_penalty = 10.0 + clf = linear_model.ElasticNet( + alpha=self.l2_penalty / (self.examples), l1_ratio=0 + ) + clf.fit(X, y) + + self.coef = np.append(clf.intercept_, clf.coef_) + self.predictions = clf.predict(X) + self.loss = np.dot(self.predictions - y, self.predictions - y) + self.rmse = np.sqrt(self.loss / self.examples) - self.assertTrue(model is not None) - self.assertTrue(abs(model.training_rmse - self.rmse) < 0.1, - 'rmse failed: %s' % test_case) - self._test_coefficients(model) + @classmethod + def tearDownClass(self): + os.remove(self.dataset) - """ + def _test_coefficients(self, model): + """ + Check that the coefficient values are very close to the correct values. + """ + coefs = model.coefficients + coef_list = list(coefs["value"]) + self.assertTrue(np.allclose(coef_list, self.coef, rtol=1e-01, atol=1e-01)) + + def _test_create(self, sf, target, features, solver, opts): + + model = tc.linear_regression.create( + sf, + target, + features, + solver=solver, + l2_penalty=self.l2_penalty, + feature_rescaling=False, + validation_set=None, + **opts + ) + + test_case = "solver = {solver}, opts = {opts}".format(solver=solver, opts=opts) + + self.assertTrue(model is not None) + self.assertTrue( + abs(model.training_rmse - self.rmse) < 0.1, "rmse failed: %s" % test_case + ) + self._test_coefficients(model) + + """ Test linear regression create. """ - def test_create(self): - for solver in ['newton', 'lbfgs', 'fista']: - args = (self.sf, self.target, self.features, - solver, self.def_kwargs) - self._test_create(*args) + def test_create(self): + + for solver in ["newton", "lbfgs", "fista"]: + args = (self.sf, self.target, self.features, solver, self.def_kwargs) + self._test_create(*args) class ElasticNetLinearRegressionTest(unittest.TestCase): - """ + """ Unit test class for testing a Linear Regression create function. """ - @classmethod - def setUpClass(self): - """ + @classmethod + def setUpClass(self): + """ Set up (Run only once) """ - test_data = '''y,0,1,2,3,4 + test_data = """y,0,1,2,3,4 38,0,3,1,0,1.47 58,1,2,2,8,4.38 30,1,1,1,0,1.64 @@ -1236,96 +1349,106 @@ def setUpClass(self): 59,0,3,3,0,2.88 65,1,2,3,5,3.37 49,0,1,3,0,2.84 - 37,1,1,1,9,5.12''' - - dataset = 'data_file%s.csv' % (str(uuid.uuid4())) - self.dataset = dataset - - f = open(dataset, 'w') - f.write(test_data) - f.close() - self.def_kwargs = {'convergence_threshold': 1e-5, - 'step_size': 1.0, - 'lbfgs_memory_level': 3, - 'max_iterations': 1000} - - self.features = ['0', '1', '2', '3', '4'] - self.target = 'y' - type_dict = {n: float for n in self.features - + [self.target]} - - self.sf = tc.SFrame.read_csv(dataset, header=True, delimiter=',', - column_type_hints = type_dict) - - # Check answers with Numpy calculations - # ------------------------------------------------------------------------ - feature_matrix = np.genfromtxt(dataset, delimiter=',', skip_header=1) - X = feature_matrix[:, 1:] - y = feature_matrix[:, 0] - self.examples = X.shape[0] - - # Fit the model - self.penalty = 10.0 - self.ratio = 0.5 - clf = linear_model.ElasticNet(alpha=self.penalty/self.examples, - l1_ratio=0.5) - clf.fit(X, y) - - self.coef = np.append(clf.intercept_, clf.coef_) - self.predictions = clf.predict(X) - self.loss = np.dot(self.predictions - y, self.predictions - y) - self.rmse = np.sqrt(self.loss/self.examples) - - @classmethod - def tearDownClass(self): - os.remove(self.dataset) - - def _test_coefficients(self, model): - """ - Check that the coefficient values are very close to the correct values. - """ - coefs = model.coefficients - coef_list = list(coefs['value']) - self.assertTrue(np.allclose(coef_list, self.coef, rtol=1e-01, atol=1e-01), - "%s vs %s" % (coef_list, self.coef)) + 37,1,1,1,9,5.12""" + + dataset = "data_file%s.csv" % (str(uuid.uuid4())) + self.dataset = dataset + + f = open(dataset, "w") + f.write(test_data) + f.close() + self.def_kwargs = { + "convergence_threshold": 1e-5, + "step_size": 1.0, + "lbfgs_memory_level": 3, + "max_iterations": 1000, + } - def _test_create(self, sf, target, features, solver, - opts): + self.features = ["0", "1", "2", "3", "4"] + self.target = "y" + type_dict = {n: float for n in self.features + [self.target]} - model = tc.linear_regression.create(sf, target, features, solver = solver, - l1_penalty = self.penalty, l2_penalty = 0.5 * self.penalty, - feature_rescaling = False, validation_set=None, **opts) + self.sf = tc.SFrame.read_csv( + dataset, header=True, delimiter=",", column_type_hints=type_dict + ) - test_case = 'solver = {solver}, opts = {opts}'.format(solver = solver, - opts = opts) - self.assertTrue(model is not None) - self.assertTrue(abs(model.training_rmse - self.rmse) < 0.1, - 'rmse failed: %s' % test_case) - self._test_coefficients(model) + # Check answers with Numpy calculations + # ------------------------------------------------------------------------ + feature_matrix = np.genfromtxt(dataset, delimiter=",", skip_header=1) + X = feature_matrix[:, 1:] + y = feature_matrix[:, 0] + self.examples = X.shape[0] + + # Fit the model + self.penalty = 10.0 + self.ratio = 0.5 + clf = linear_model.ElasticNet(alpha=self.penalty / self.examples, l1_ratio=0.5) + clf.fit(X, y) + + self.coef = np.append(clf.intercept_, clf.coef_) + self.predictions = clf.predict(X) + self.loss = np.dot(self.predictions - y, self.predictions - y) + self.rmse = np.sqrt(self.loss / self.examples) - """ + @classmethod + def tearDownClass(self): + os.remove(self.dataset) + + def _test_coefficients(self, model): + """ + Check that the coefficient values are very close to the correct values. + """ + coefs = model.coefficients + coef_list = list(coefs["value"]) + self.assertTrue( + np.allclose(coef_list, self.coef, rtol=1e-01, atol=1e-01), + "%s vs %s" % (coef_list, self.coef), + ) + + def _test_create(self, sf, target, features, solver, opts): + + model = tc.linear_regression.create( + sf, + target, + features, + solver=solver, + l1_penalty=self.penalty, + l2_penalty=0.5 * self.penalty, + feature_rescaling=False, + validation_set=None, + **opts + ) + + test_case = "solver = {solver}, opts = {opts}".format(solver=solver, opts=opts) + self.assertTrue(model is not None) + self.assertTrue( + abs(model.training_rmse - self.rmse) < 0.1, "rmse failed: %s" % test_case + ) + self._test_coefficients(model) + + """ Test linear regression create. """ - def test_create(self): - for solver in ['fista']: - args = (self.sf, self.target, self.features, - solver, self.def_kwargs) - self._test_create(*args) + def test_create(self): + + for solver in ["fista"]: + args = (self.sf, self.target, self.features, solver, self.def_kwargs) + self._test_create(*args) class ValidationDataLinearRegressionTest(unittest.TestCase): - """ + """ Unit test class for testing create with validation data. """ - @classmethod - def setUpClass(self): - """ + @classmethod + def setUpClass(self): + """ Set up (Run only once) """ - test_data = '''y,0,1,2,3,4 + test_data = """y,0,1,2,3,4 38,0,3,1,0,1.47 58,1,2,2,8,4.38 30,1,1,1,0,1.64 @@ -1354,41 +1477,47 @@ def setUpClass(self): 59,0,3,3,0,2.88 65,1,2,3,5,3.37 49,0,1,3,0,2.84 - 37,1,1,1,9,5.12''' - - dataset = 'data_file%s.csv' % (str(uuid.uuid4())) - self.dataset = dataset - f = open(dataset, 'w') - f.write(test_data) - f.close() - self.def_kwargs = {'convergence_threshold': 1e-4, - 'step_size': 1.0, - 'lbfgs_memory_level': 11, - 'max_iterations': 200} - - self.features = ['0', '1', '2', '3', '4'] - self.target = 'y' - type_dict = {n: float for n in self.features + [self.target]} - - self.sf = tc.SFrame.read_csv(dataset, header=True, delimiter=',', - column_type_hints = type_dict) - - @classmethod - def tearDownClass(self): - os.remove(self.dataset) - - def test_valid_set(self): - m = tc.linear_regression.create(self.sf, target=self.target, - validation_set=self.sf) - self.assertTrue(m is not None) - self.assertTrue(isinstance(m.progress, tc.SFrame)) - - m = tc.linear_regression.create(self.sf, target=self.target, - validation_set='auto') - self.assertTrue(m is not None) - self.assertTrue(isinstance(m.progress, tc.SFrame)) - - m = tc.linear_regression.create(self.sf, target=self.target, - validation_set=None) - self.assertTrue(m is not None) - self.assertTrue(isinstance(m.progress, tc.SFrame)) + 37,1,1,1,9,5.12""" + + dataset = "data_file%s.csv" % (str(uuid.uuid4())) + self.dataset = dataset + f = open(dataset, "w") + f.write(test_data) + f.close() + self.def_kwargs = { + "convergence_threshold": 1e-4, + "step_size": 1.0, + "lbfgs_memory_level": 11, + "max_iterations": 200, + } + + self.features = ["0", "1", "2", "3", "4"] + self.target = "y" + type_dict = {n: float for n in self.features + [self.target]} + + self.sf = tc.SFrame.read_csv( + dataset, header=True, delimiter=",", column_type_hints=type_dict + ) + + @classmethod + def tearDownClass(self): + os.remove(self.dataset) + + def test_valid_set(self): + m = tc.linear_regression.create( + self.sf, target=self.target, validation_set=self.sf + ) + self.assertTrue(m is not None) + self.assertTrue(isinstance(m.progress, tc.SFrame)) + + m = tc.linear_regression.create( + self.sf, target=self.target, validation_set="auto" + ) + self.assertTrue(m is not None) + self.assertTrue(isinstance(m.progress, tc.SFrame)) + + m = tc.linear_regression.create( + self.sf, target=self.target, validation_set=None + ) + self.assertTrue(m is not None) + self.assertTrue(isinstance(m.progress, tc.SFrame)) diff --git a/src/python/turicreate/test/test_logger.py b/src/python/turicreate/test/test_logger.py index 36c946971d..7490101119 100644 --- a/src/python/turicreate/test/test_logger.py +++ b/src/python/turicreate/test/test_logger.py @@ -13,7 +13,6 @@ class LoggingConfigurationTests(TestCase): - def setUp(self): """ Cleanup the existing log configuration. diff --git a/src/python/turicreate/test/test_logistic_classifier.py b/src/python/turicreate/test/test_logistic_classifier.py index e128946e5b..5700aea618 100644 --- a/src/python/turicreate/test/test_logistic_classifier.py +++ b/src/python/turicreate/test/test_logistic_classifier.py @@ -43,16 +43,19 @@ def binary_classification_integer_target(cls): target[1] = 1 ## Compute the correct answers with statsmodels - sm_model = sm.GLM(target, sm.add_constant(cls.sf.to_dataframe()), - family=sm.families.Binomial()).fit() + sm_model = sm.GLM( + target, sm.add_constant(cls.sf.to_dataframe()), family=sm.families.Binomial() + ).fit() cls.loss = -sm_model.llf cls.coef = list(sm_model.params) cls.stderr = list(sm_model.bse) - cls.yhat_margin = tc.SArray(list(np.log(sm_model.fittedvalues) - \ - np.log(1 - sm_model.fittedvalues))) + cls.yhat_margin = tc.SArray( + list(np.log(sm_model.fittedvalues) - np.log(1 - sm_model.fittedvalues)) + ) cls.yhat_prob = tc.SArray(list(sm_model.fittedvalues)) - cls.yhat_max_prob = tc.SArray(list(sm_model.fittedvalues)).apply(\ - lambda x: max(x, 1.0 - x)) + cls.yhat_max_prob = tc.SArray(list(sm_model.fittedvalues)).apply( + lambda x: max(x, 1.0 - x) + ) cls.yhat_class = tc.SArray(list((sm_model.fittedvalues >= 0.5).astype(int))) cls.type = int cls.test_stderr = True @@ -67,99 +70,116 @@ def binary_classification_integer_target(cls): cls.topk_yhat_prob += [1 - prob, prob] cls.topk_yhat_margin += [0, margin] if prob <= 0.5: - cls.topk_yhat_rank += [0,1] + cls.topk_yhat_rank += [0, 1] else: - cls.topk_yhat_rank += [1,0] - + cls.topk_yhat_rank += [1, 0] # Compute the answers you get from Stats Models. cls.sm_cf_matrix = table = np.histogram2d(target, cls.yhat_class, bins=2)[0] ## Create the model - cls.sf['target'] = target + cls.sf["target"] = target cls.def_kwargs = copy.deepcopy(_DEFAULT_SOLVER_OPTIONS) - cls.def_opts= dict(list(cls.def_kwargs.items()) + list({'solver' : 'auto', - 'feature_rescaling' : True, - 'class_weights' : None, - 'l1_penalty' : 0, - 'l2_penalty': 1e-2}.items())) - - cls.solver = 'auto' - cls.features = ['X{}'.format(i) for i in range(1, d+1)] - cls.unpacked_features = ['X{}'.format(i) for i in range(1, d+1)] - cls.target = 'target' - - cls.model = tc.logistic_classifier.create(cls.sf, target='target', - features=None, - l2_penalty=0., - feature_rescaling = True, - validation_set=None, - solver=cls.solver) + cls.def_opts = dict( + list(cls.def_kwargs.items()) + + list( + { + "solver": "auto", + "feature_rescaling": True, + "class_weights": None, + "l1_penalty": 0, + "l2_penalty": 1e-2, + }.items() + ) + ) + + cls.solver = "auto" + cls.features = ["X{}".format(i) for i in range(1, d + 1)] + cls.unpacked_features = ["X{}".format(i) for i in range(1, d + 1)] + cls.target = "target" + + cls.model = tc.logistic_classifier.create( + cls.sf, + target="target", + features=None, + l2_penalty=0.0, + feature_rescaling=True, + validation_set=None, + solver=cls.solver, + ) # Metrics! - cls.metrics = ["accuracy", "auc", "confusion_matrix", "f1_score", - "log_loss", "precision", "recall", "roc_curve"] + cls.metrics = [ + "accuracy", + "auc", + "confusion_matrix", + "f1_score", + "log_loss", + "precision", + "recall", + "roc_curve", + ] cls.sm_metrics = { - "accuracy" : accuracy_score(target, list(cls.yhat_class)), - "auc" : roc_auc_score(target, list(cls.yhat_prob)), - "confusion_matrix" : cls.sm_cf_matrix.flatten(), - "f1_score" : f1_score(target, list(cls.yhat_class)), - "log_loss" : log_loss(target, list(cls.yhat_prob)), - "precision" : precision_score(target, list(cls.yhat_class)), - "recall" : recall_score(target, list(cls.yhat_class)), - "roc_curve" : tc.toolkits.evaluation.roc_curve( - cls.sf['target'], cls.yhat_prob) - } - + "accuracy": accuracy_score(target, list(cls.yhat_class)), + "auc": roc_auc_score(target, list(cls.yhat_prob)), + "confusion_matrix": cls.sm_cf_matrix.flatten(), + "f1_score": f1_score(target, list(cls.yhat_class)), + "log_loss": log_loss(target, list(cls.yhat_prob)), + "precision": precision_score(target, list(cls.yhat_class)), + "recall": recall_score(target, list(cls.yhat_class)), + "roc_curve": tc.toolkits.evaluation.roc_curve(cls.sf["target"], cls.yhat_prob), + } ## Answers cls.opts = cls.def_opts.copy() - cls.opts['l2_penalty'] = 0. - cls.opts['solver'] = "newton" + cls.opts["l2_penalty"] = 0.0 + cls.opts["solver"] = "newton" cls.get_ans = { - 'coefficients': lambda x: isinstance(x, tc.SFrame), - 'convergence_threshold': lambda x: x == cls.opts['convergence_threshold'], - 'unpacked_features': lambda x: x == cls.unpacked_features, - 'feature_rescaling': lambda x: x == cls.opts['feature_rescaling'], - 'features': lambda x: x == cls.features, - 'l1_penalty': lambda x: x == cls.opts['l1_penalty'], - 'l2_penalty': lambda x: x == cls.opts['l2_penalty'], - 'lbfgs_memory_level': lambda x: x == cls.opts['lbfgs_memory_level'], - 'max_iterations': lambda x: x == cls.opts['max_iterations'], - 'num_classes': lambda x: x == 2, - 'classes': lambda x: list(x) == [0,1], - 'num_coefficients': lambda x: x == 11, - 'num_examples': lambda x: x == 100, - 'class_weights': lambda x: x == {0:1, 1:1}, - 'num_examples_per_class': \ - lambda x: {0: cls.sf.num_rows() - cls.sf['target'].sum(), - 1: cls.sf['target'].sum()}, - 'num_unpacked_features': lambda x: x == 10, - 'num_features': lambda x: x == 10, - 'progress': lambda x: isinstance(x, tc.SFrame) or (x is None), - 'solver': lambda x: x == cls.opts['solver'], - 'step_size': lambda x: lambda x: x == cls.opts['step_size'], - 'target': lambda x: x == cls.target, - 'training_accuracy': lambda x: x >= 0 and x <= 1, - 'training_iterations': lambda x: x > 0, - 'training_loss': lambda x: abs(x - cls.loss) < 1e-5, - 'training_solver_status': lambda x: x == "SUCCESS: Optimal solution found.", - 'training_time': lambda x: x >= 0, - 'simple_mode': lambda x: not x, - 'training_auc': lambda x: x > 0, - 'training_confusion_matrix': lambda x: len(x) > 0, - 'training_f1_score': lambda x: x > 0, - 'training_log_loss': lambda x: x > 0, - 'training_precision': lambda x: x > 0, - 'training_recall': lambda x: x > 0, - 'training_report_by_class': lambda x: len(x) > 0, - 'training_roc_curve': lambda x: len(x) > 0, - 'validation_data': lambda x: isinstance(x, tc.SFrame) and len(x) == 0, - 'disable_posttrain_evaluation' : lambda x: x == False, - } + "coefficients": lambda x: isinstance(x, tc.SFrame), + "convergence_threshold": lambda x: x == cls.opts["convergence_threshold"], + "unpacked_features": lambda x: x == cls.unpacked_features, + "feature_rescaling": lambda x: x == cls.opts["feature_rescaling"], + "features": lambda x: x == cls.features, + "l1_penalty": lambda x: x == cls.opts["l1_penalty"], + "l2_penalty": lambda x: x == cls.opts["l2_penalty"], + "lbfgs_memory_level": lambda x: x == cls.opts["lbfgs_memory_level"], + "max_iterations": lambda x: x == cls.opts["max_iterations"], + "num_classes": lambda x: x == 2, + "classes": lambda x: list(x) == [0, 1], + "num_coefficients": lambda x: x == 11, + "num_examples": lambda x: x == 100, + "class_weights": lambda x: x == {0: 1, 1: 1}, + "num_examples_per_class": lambda x: { + 0: cls.sf.num_rows() - cls.sf["target"].sum(), + 1: cls.sf["target"].sum(), + }, + "num_unpacked_features": lambda x: x == 10, + "num_features": lambda x: x == 10, + "progress": lambda x: isinstance(x, tc.SFrame) or (x is None), + "solver": lambda x: x == cls.opts["solver"], + "step_size": lambda x: lambda x: x == cls.opts["step_size"], + "target": lambda x: x == cls.target, + "training_accuracy": lambda x: x >= 0 and x <= 1, + "training_iterations": lambda x: x > 0, + "training_loss": lambda x: abs(x - cls.loss) < 1e-5, + "training_solver_status": lambda x: x == "SUCCESS: Optimal solution found.", + "training_time": lambda x: x >= 0, + "simple_mode": lambda x: not x, + "training_auc": lambda x: x > 0, + "training_confusion_matrix": lambda x: len(x) > 0, + "training_f1_score": lambda x: x > 0, + "training_log_loss": lambda x: x > 0, + "training_precision": lambda x: x > 0, + "training_recall": lambda x: x > 0, + "training_report_by_class": lambda x: len(x) > 0, + "training_roc_curve": lambda x: len(x) > 0, + "validation_data": lambda x: isinstance(x, tc.SFrame) and len(x) == 0, + "disable_posttrain_evaluation": lambda x: x == False, + } cls.fields_ans = cls.get_ans.keys() + def multiclass_integer_target(cls): """ The setup class method for multi-class classification problem with the @@ -180,11 +200,11 @@ def multiclass_integer_target(cls): sm_model = sm.MNLogit(target, sm.add_constant(cls.sf.to_dataframe())).fit() coef = np.empty([0]) for i in range(sm_model.params.ndim): - coef = np.append(coef, sm_model.params[i].values) + coef = np.append(coef, sm_model.params[i].values) cls.coef = list(coef) stderr = np.empty([0]) for i in range(sm_model.params.ndim): - stderr = np.append(stderr, sm_model.bse[i].values) + stderr = np.append(stderr, sm_model.bse[i].values) cls.stderr = list(stderr) # Predict @@ -195,41 +215,43 @@ def multiclass_integer_target(cls): cls.sm_cf_matrix = sm_model.pred_table().flatten() cls.sm_metrics = { - "accuracy" : accuracy_score(target, list(cls.yhat_class)), - "auc" : tc.toolkits.evaluation.auc( - tc.SArray(target), tc.SArray(raw_predictions)), - "confusion_matrix": cls.sm_cf_matrix.flatten(), - "f1_score" : f1_score(target, list(cls.yhat_class), - average = 'macro'), - "log_loss" : log_loss(target, list(raw_predictions)), - "precision" : precision_score(target, list(cls.yhat_class), - average = 'macro'), - "recall" : recall_score(target, list(cls.yhat_class), - average = 'macro'), - "roc_curve" : tc.toolkits.evaluation.roc_curve( - tc.SArray(target), tc.SArray(raw_predictions)) - } - + "accuracy": accuracy_score(target, list(cls.yhat_class)), + "auc": tc.toolkits.evaluation.auc( + tc.SArray(target), tc.SArray(raw_predictions) + ), + "confusion_matrix": cls.sm_cf_matrix.flatten(), + "f1_score": f1_score(target, list(cls.yhat_class), average="macro"), + "log_loss": log_loss(target, list(raw_predictions)), + "precision": precision_score(target, list(cls.yhat_class), average="macro"), + "recall": recall_score(target, list(cls.yhat_class), average="macro"), + "roc_curve": tc.toolkits.evaluation.roc_curve( + tc.SArray(target), tc.SArray(raw_predictions) + ), + } # Predict topk preds_sf = tc.SFrame(pd.DataFrame(raw_predictions)) - cls.topk_yhat_prob = preds_sf\ - .pack_columns(preds_sf.column_names(), dtype = dict)\ - .add_row_number()\ - .stack('X1', ['class', 'prediction'])\ - .sort(['id', 'class'])['prediction'] - cls.yhat_prob_vec = preds_sf\ - .pack_columns(preds_sf.column_names(), dtype = dict)\ - .add_row_number()\ - .stack('X1', ['class', 'prediction'])\ - .sort(['id', 'class'])['prediction'] + cls.topk_yhat_prob = ( + preds_sf.pack_columns(preds_sf.column_names(), dtype=dict) + .add_row_number() + .stack("X1", ["class", "prediction"]) + .sort(["id", "class"])["prediction"] + ) + cls.yhat_prob_vec = ( + preds_sf.pack_columns(preds_sf.column_names(), dtype=dict) + .add_row_number() + .stack("X1", ["class", "prediction"]) + .sort(["id", "class"])["prediction"] + ) # Function to rank all items in a list rank = lambda x: list(len(x) - ss.rankdata(x)) - rank_sa = preds_sf.pack_columns(preds_sf.column_names())['X1'].apply(rank) - topk_yhat_rank = tc.SFrame({'X1': rank_sa}).add_row_number() - topk_yhat_rank['X1'] = topk_yhat_rank['X1'].apply(lambda x: {i:v for i,v in enumerate(x)}) - topk_yhat_rank = topk_yhat_rank.stack('X1').sort(['id', 'X2'])['X3'].astype(int) + rank_sa = preds_sf.pack_columns(preds_sf.column_names())["X1"].apply(rank) + topk_yhat_rank = tc.SFrame({"X1": rank_sa}).add_row_number() + topk_yhat_rank["X1"] = topk_yhat_rank["X1"].apply( + lambda x: {i: v for i, v in enumerate(x)} + ) + topk_yhat_rank = topk_yhat_rank.stack("X1").sort(["id", "X2"])["X3"].astype(int) cls.topk_yhat_rank = topk_yhat_rank # Compute the margins @@ -237,36 +259,40 @@ def multiclass_integer_target(cls): sf_margin = sm.add_constant(np.dot(df.values, sm_model.params)) sf_margin[:, 0] = 0 sf_margin = tc.SFrame(pd.DataFrame(sf_margin)) - cls.topk_yhat_margin = sf_margin\ - .pack_columns(sf_margin.column_names(), dtype = dict)\ - .add_row_number()\ - .stack('X1', ['class', 'prediction'])\ - .sort(['id', 'class'])['prediction'] - + cls.topk_yhat_margin = ( + sf_margin.pack_columns(sf_margin.column_names(), dtype=dict) + .add_row_number() + .stack("X1", ["class", "prediction"]) + .sort(["id", "class"])["prediction"] + ) ## Create the model - cls.sf['target'] = target + cls.sf["target"] = target cls.loss = -sm_model.llf - cls.model = tc.logistic_classifier.create(cls.sf, target='target', - features=None, - l2_penalty=0., - feature_rescaling = True, - validation_set=None, - solver=cls.solver) - - - cls.get_ans['num_classes'] = lambda x: x == 3 - cls.get_ans['classes'] = lambda x: x == [0,1,2] - cls.get_ans['num_coefficients'] = lambda x: x == 22 - cls.get_ans['class_weights'] = lambda x: x == {0:1, 1:1, 2:1} - cls.get_ans['num_examples_per_class'] = lambda x: { - 0: (cls.sf['target'] == 0).sum(), - 1: (cls.sf['target'] == 1).sum(), - 2: (cls.sf['target'] == 2).sum()} + cls.model = tc.logistic_classifier.create( + cls.sf, + target="target", + features=None, + l2_penalty=0.0, + feature_rescaling=True, + validation_set=None, + solver=cls.solver, + ) + + cls.get_ans["num_classes"] = lambda x: x == 3 + cls.get_ans["classes"] = lambda x: x == [0, 1, 2] + cls.get_ans["num_coefficients"] = lambda x: x == 22 + cls.get_ans["class_weights"] = lambda x: x == {0: 1, 1: 1, 2: 1} + cls.get_ans["num_examples_per_class"] = lambda x: { + 0: (cls.sf["target"] == 0).sum(), + 1: (cls.sf["target"] == 1).sum(), + 2: (cls.sf["target"] == 2).sum(), + } cls.fields_ans = cls.get_ans.keys() + def binary_classification_string_target(cls): """ The setup class method for a binary classification problem with the @@ -275,19 +301,23 @@ def binary_classification_string_target(cls): binary_classification_integer_target(cls) - cls.sf['target'] = cls.sf['target'].astype(str) - cls.model = tc.logistic_classifier.create(cls.sf, target='target', - features=None, - l2_penalty=0., - feature_rescaling = True, - validation_set=None, - solver=cls.solver) - - cls.get_ans["classes"] = lambda x: x == ["0","1"] - cls.get_ans["class_weights"] = lambda x: x == {"0":1, "1":1} - cls.get_ans["num_examples_per_class"] = lambda x: x == { - "0": (cls.sf["target"] == "0").sum(), - "1": (cls.sf["target"] == "1").sum()} + cls.sf["target"] = cls.sf["target"].astype(str) + cls.model = tc.logistic_classifier.create( + cls.sf, + target="target", + features=None, + l2_penalty=0.0, + feature_rescaling=True, + validation_set=None, + solver=cls.solver, + ) + + cls.get_ans["classes"] = lambda x: x == ["0", "1"] + cls.get_ans["class_weights"] = lambda x: x == {"0": 1, "1": 1} + cls.get_ans["num_examples_per_class"] = lambda x: x == { + "0": (cls.sf["target"] == "0").sum(), + "1": (cls.sf["target"] == "1").sum(), + } cls.type = str @@ -299,44 +329,50 @@ def multiclass_string_target(cls): multiclass_integer_target(cls) - cls.sf['target'] = cls.sf['target'].astype(str) - cls.model = tc.logistic_classifier.create(cls.sf, target='target', - features=None, - l2_penalty=0., - feature_rescaling = True, - validation_set=None, - solver=cls.solver) - - cls.get_ans["classes"] = lambda x: x == ["0","1","2"] - cls.get_ans["class_weights"] = lambda x: x == {"0":1, "1":1, "2":1} - cls.get_ans["num_examples_per_class"] = lambda x: x == { - "0": (cls.sf["target"] == "0").sum(), - "1": (cls.sf["target"] == "1").sum(), - "2": (cls.sf["target"] == "2").sum(), - } + cls.sf["target"] = cls.sf["target"].astype(str) + cls.model = tc.logistic_classifier.create( + cls.sf, + target="target", + features=None, + l2_penalty=0.0, + feature_rescaling=True, + validation_set=None, + solver=cls.solver, + ) + + cls.get_ans["classes"] = lambda x: x == ["0", "1", "2"] + cls.get_ans["class_weights"] = lambda x: x == {"0": 1, "1": 1, "2": 1} + cls.get_ans["num_examples_per_class"] = lambda x: x == { + "0": (cls.sf["target"] == "0").sum(), + "1": (cls.sf["target"] == "1").sum(), + "2": (cls.sf["target"] == "2").sum(), + } cls.type = str + def test_suite(): """ Create a test suite for each test case in LogisticRegressionClassifierModelTest """ - testCases = [binary_classification_integer_target, - binary_classification_string_target, - multiclass_integer_target] - #multiclass_string_target] + testCases = [ + binary_classification_integer_target, + binary_classification_string_target, + multiclass_integer_target, + ] + # multiclass_string_target] for t in testCases: testcase_members = {} testcase_members[t.__name__] = classmethod(t) testcase_class = type( - 'LogisticRegressionClassifierModelTest_%s' % t.__name__, + "LogisticRegressionClassifierModelTest_%s" % t.__name__, (LogisticRegressionClassifierModelTest,), - testcase_members + testcase_members, ) getattr(testcase_class, t.__name__)() testcase_class.__test__ = True for method in dir(testcase_class): - if method.startswith('test_'): + if method.startswith("test_"): testcase_instance = testcase_class(method) method_instance = getattr(testcase_instance, method) # needs callable() since some class- or instance-level @@ -344,6 +380,7 @@ def test_suite(): if callable(method_instance): method_instance() + class LogisticRegressionClassifierModelTest(unittest.TestCase): __test__ = False """ @@ -374,14 +411,16 @@ def test_coefficients(self): """ model = self.model coefs = model.coefficients - coef_list = list(coefs['value']) + coef_list = list(coefs["value"]) self.assertTrue(np.allclose(coef_list, self.coef, rtol=1e-03, atol=1e-03)) if self.test_stderr: - stderr_list = list(coefs['stderr']) - self.assertTrue(np.allclose(stderr_list, self.stderr, rtol=1e-03, atol=1e-03)) + stderr_list = list(coefs["stderr"]) + self.assertTrue( + np.allclose(stderr_list, self.stderr, rtol=1e-03, atol=1e-03) + ) else: - self.assertTrue('stderr' in coefs.column_names()) - self.assertEqual(list(coefs['stderr']), [None for v in coef_list]) + self.assertTrue("stderr" in coefs.column_names()) + self.assertEqual(list(coefs["stderr"]), [None for v in coef_list]) def test_summary(self): """ @@ -395,7 +434,7 @@ def test_repr(self): Check the repr function. """ model = self.model - ans = str(model) + ans = str(model) self.assertEqual(type(ans), str) def test_predict_topk(self): @@ -407,25 +446,31 @@ def test_predict_topk(self): tol = 1e-3 k = model.num_classes - ans = model.predict_topk(self.sf, output_type='margin', k=k) - ans = ans.sort(['id', 'class'])['margin'] - self.assertTrue(np.allclose(ans, self.topk_yhat_margin, tol, tol), - "{%s} - {%s}" % (ans, self.topk_yhat_margin)) + ans = model.predict_topk(self.sf, output_type="margin", k=k) + ans = ans.sort(["id", "class"])["margin"] + self.assertTrue( + np.allclose(ans, self.topk_yhat_margin, tol, tol), + "{%s} - {%s}" % (ans, self.topk_yhat_margin), + ) - ans = model.predict_topk(self.sf, output_type='probability', k=k) - ans = ans.sort(['id', 'class'])['probability'] - self.assertTrue(np.allclose(ans, self.topk_yhat_prob, tol, tol), - "{%s} - {%s}" % (ans, self.topk_yhat_prob)) + ans = model.predict_topk(self.sf, output_type="probability", k=k) + ans = ans.sort(["id", "class"])["probability"] + self.assertTrue( + np.allclose(ans, self.topk_yhat_prob, tol, tol), + "{%s} - {%s}" % (ans, self.topk_yhat_prob), + ) - ans = model.predict_topk(self.sf, output_type='rank', k = k) - self.assertEqual(ans['class'].dtype, self.type) - ans = ans.sort(['id', 'class'])['rank'] + ans = model.predict_topk(self.sf, output_type="rank", k=k) + self.assertEqual(ans["class"].dtype, self.type) + ans = ans.sort(["id", "class"])["rank"] self.assertEqual(list(ans), list(self.topk_yhat_rank)) - ans = model.predict_topk(self.sf, k=k) - ans = ans.sort(['id', 'class'])['probability'] - self.assertTrue(np.allclose(ans, self.topk_yhat_prob, tol, tol), - "{%s} - {%s}" % (ans, self.topk_yhat_prob)) + ans = model.predict_topk(self.sf, k=k) + ans = ans.sort(["id", "class"])["probability"] + self.assertTrue( + np.allclose(ans, self.topk_yhat_prob, tol, tol), + "{%s} - {%s}" % (ans, self.topk_yhat_prob), + ) def test_predict(self): """ @@ -436,33 +481,34 @@ def test_predict(self): tol = 1e-3 if model.num_classes == 2: - ans = model.predict(self.sf, output_type='margin') + ans = model.predict(self.sf, output_type="margin") self.assertTrue(np.allclose(ans, self.yhat_margin, tol, tol)) - ans = model.predict(self.sf, output_type='probability') + ans = model.predict(self.sf, output_type="probability") self.assertTrue(np.allclose(ans, self.yhat_prob, tol, tol)) else: try: - ans = model.predict(self.sf, output_type='margin') + ans = model.predict(self.sf, output_type="margin") except ToolkitError: pass try: - ans = model.predict(self.sf, output_type='probability') + ans = model.predict(self.sf, output_type="probability") except ToolkitError: pass # Prob vector. - ans = model.predict(self.sf, output_type='probability_vector') + ans = model.predict(self.sf, output_type="probability_vector") import itertools + merged_ans = list(itertools.chain(*ans)) self.assertTrue(np.allclose(merged_ans, self.yhat_prob_vec, tol, tol)) # class - ans = model.predict(self.sf, output_type='class') + ans = model.predict(self.sf, output_type="class") self.assertEqual(ans.dtype, self.type) self.assertTrue((ans == tc.SArray(list(map(self.type, self.yhat_class)))).all()) # Default is class - ans = model.predict(self.sf) + ans = model.predict(self.sf) self.assertEqual(ans.dtype, self.type) self.assertTrue((ans == tc.SArray(list(map(self.type, self.yhat_class)))).all()) @@ -472,61 +518,65 @@ def test_classify(self): all predictions are at most 1e-5 away from the true answers. """ model = self.model - ans = model.classify(self.sf) + ans = model.classify(self.sf) tol = 1e-3 - self.assertEqual(ans['class'].dtype, self.type) - self.assertTrue((ans['class'] == tc.SArray(list(map(self.type, self.yhat_class)))).all()) - self.assertTrue(np.allclose(ans['probability'], self.yhat_max_prob, tol, tol)) + self.assertEqual(ans["class"].dtype, self.type) + self.assertTrue( + (ans["class"] == tc.SArray(list(map(self.type, self.yhat_class)))).all() + ) + self.assertTrue(np.allclose(ans["probability"], self.yhat_max_prob, tol, tol)) def test_evaluate(self): """ Make sure that evaluate works. """ model = self.model + def check_cf_matrix(ans): self.assertTrue(ans is not None) - self.assertTrue('confusion_matrix' in ans) - cf = ans['confusion_matrix'].sort(['target_label', 'predicted_label']) + self.assertTrue("confusion_matrix" in ans) + cf = ans["confusion_matrix"].sort(["target_label", "predicted_label"]) self.assertTrue( - np.allclose(cf['count'], - self.sm_metrics['confusion_matrix'])) + np.allclose(cf["count"], self.sm_metrics["confusion_matrix"]) + ) def check_roc_curve(ans): self.assertTrue(ans is not None) - self.assertTrue('roc_curve' in ans) - roc = ans['roc_curve'] + self.assertTrue("roc_curve" in ans) + roc = ans["roc_curve"] self.assertEqual(type(roc), tc.SFrame) def check_metric(ans, metric): - if metric == 'confusion_matrix': + if metric == "confusion_matrix": check_cf_matrix(ans) - elif metric == 'roc_curve': + elif metric == "roc_curve": check_roc_curve(ans) else: self.assertTrue(ans is not None) self.assertTrue(metric in ans) - self.assertAlmostEqual(ans[metric], - self.sm_metrics[metric], - places = 4, - msg = "%s = (%s,%s)" % \ - (metric, ans[metric], self.sm_metrics[metric])) + self.assertAlmostEqual( + ans[metric], + self.sm_metrics[metric], + places=4, + msg="%s = (%s,%s)" % (metric, ans[metric], self.sm_metrics[metric]), + ) # Default ans = model.evaluate(self.sf) self.assertEqual(sorted(ans.keys()), sorted(self.metrics)) for m in self.metrics: - check_metric(ans, m) + check_metric(ans, m) # Individual for m in self.metrics: - ans = model.evaluate(self.sf, metric = m) + ans = model.evaluate(self.sf, metric=m) check_metric(ans, m) def test_save_and_load(self): """ Make sure saving and loading retains everything. """ - filename = 'save_file{}'.format(uuid.uuid4()) + filename = "save_file{}".format(uuid.uuid4()) self.model.save(filename) self.model = tc.load_model(filename) @@ -574,56 +624,64 @@ def setUpClass(self): target[1] = 1 # the correct model - sm_model = sm.GLM(target, sm.add_constant(self.sf.to_dataframe()), - family=sm.families.Binomial()).fit() + sm_model = sm.GLM( + target, + sm.add_constant(self.sf.to_dataframe()), + family=sm.families.Binomial(), + ).fit() self.loss = -sm_model.llf self.coef = list(sm_model.params) self.stderr = list(sm_model.bse) - # turicreate model parameters self.def_kwargs = copy.deepcopy(_DEFAULT_SOLVER_OPTIONS) - self.def_kwargs['max_iterations'] = 100 - self.def_kwargs['convergence_threshold'] = 1e-5 - self.sf['target'] = target - self.solver = 'newton' - self.features = ['X{}'.format(i) for i in range(1, d+1)] - self.target = 'target' - + self.def_kwargs["max_iterations"] = 100 + self.def_kwargs["convergence_threshold"] = 1e-5 + self.sf["target"] = target + self.solver = "newton" + self.features = ["X{}".format(i) for i in range(1, d + 1)] + self.target = "target" def _test_create(self, sf, target, features, solver, kwargs, rescaling): """ Test logistic regression create. """ - model = tc.logistic_classifier.create(sf, target, features, - solver=solver, - l2_penalty = 0.0, - verbose = True, - validation_set=None, - **kwargs) + model = tc.logistic_classifier.create( + sf, + target, + features, + solver=solver, + l2_penalty=0.0, + verbose=True, + validation_set=None, + **kwargs + ) - test_case = 'solver = {}, kwargs = {}'.format(solver, kwargs) + test_case = "solver = {}, kwargs = {}".format(solver, kwargs) self.assertTrue(model is not None) - self.assertTrue(abs(model.training_loss - self.loss) < \ - 0.01 * abs(self.loss), 'Loss failed: {}'.format(test_case)) + self.assertTrue( + abs(model.training_loss - self.loss) < 0.01 * abs(self.loss), + "Loss failed: {}".format(test_case), + ) coefs = model.coefficients - coefs_list = list(coefs['value']) + coefs_list = list(coefs["value"]) self.assertTrue(np.allclose(coefs_list, self.coef, rtol=2e-01, atol=2e-01)) - if solver == 'newton': - stderr_list = list(model.coefficients['stderr']) - self.assertTrue(np.allclose(stderr_list, self.stderr, rtol=2e-01, atol=2e-01)) + if solver == "newton": + stderr_list = list(model.coefficients["stderr"]) + self.assertTrue( + np.allclose(stderr_list, self.stderr, rtol=2e-01, atol=2e-01) + ) else: - self.assertTrue('stderr' in coefs.column_names()) - self.assertEqual(list(coefs['stderr']), [None for v in coefs_list]) - + self.assertTrue("stderr" in coefs.column_names()) + self.assertEqual(list(coefs["stderr"]), [None for v in coefs_list]) def test_create_default_features(self): """ Test logistic regression create. """ - for solver in ['newton', 'fista', 'lbfgs']: + for solver in ["newton", "fista", "lbfgs"]: args = (self.sf, self.target, None, solver, self.def_kwargs, True) self._test_create(*args) args = (self.sf, self.target, None, solver, self.def_kwargs, False) @@ -633,12 +691,10 @@ def test_create(self): """ Test logistic regression create. """ - for solver in ['newton', 'fista', 'lbfgs']: - args = (self.sf, self.target, self.features, solver, self.def_kwargs, - True) + for solver in ["newton", "fista", "lbfgs"]: + args = (self.sf, self.target, self.features, solver, self.def_kwargs, True) self._test_create(*args) - args = (self.sf, self.target, self.features, solver, self.def_kwargs, - False) + args = (self.sf, self.target, self.features, solver, self.def_kwargs, False) self._test_create(*args) def test_class_weights(self): @@ -647,46 +703,49 @@ def test_class_weights(self): """ # Should train correctly - model = tc.logistic_classifier.create(self.sf, self.target, self.features, - class_weights = 'auto') - model = tc.logistic_classifier.create(self.sf, self.target, self.features, - class_weights = {0:1, 1:2}) + model = tc.logistic_classifier.create( + self.sf, self.target, self.features, class_weights="auto" + ) + model = tc.logistic_classifier.create( + self.sf, self.target, self.features, class_weights={0: 1, 1: 2} + ) # Should fail try: - model = tc.logistic_classifier.create(self.sf, self.target, self.features, - class_weights = 1.0) + model = tc.logistic_classifier.create( + self.sf, self.target, self.features, class_weights=1.0 + ) except ToolkitError: - pass + pass try: - model = tc.logistic_classifier.create(self.sf, self.target, self.features, - class_weights = {2: 10}) + model = tc.logistic_classifier.create( + self.sf, self.target, self.features, class_weights={2: 10} + ) except ToolkitError: - pass + pass try: - model = tc.logistic_classifier.create(self.sf, self.target, self.features, - class_weights = [1,1]) + model = tc.logistic_classifier.create( + self.sf, self.target, self.features, class_weights=[1, 1] + ) except ToolkitError: - pass + pass def test_lbfgs(self): - solver = 'lbfgs' + solver = "lbfgs" kwargs = self.def_kwargs.copy() for m in [3, 5, 9, 21]: - kwargs['lbfgs_memory_level'] = m - args = (self.sf, self.target, self.features, solver, kwargs, - True) + kwargs["lbfgs_memory_level"] = m + args = (self.sf, self.target, self.features, solver, kwargs, True) self._test_create(*args) - args = (self.sf, self.target, self.features, solver, kwargs, - False) + args = (self.sf, self.target, self.features, solver, kwargs, False) self._test_create(*args) def test_init_residual_of_zero(self): - X = tc.SFrame({'col1': [2., 1., 2., 1.], 'target': [1, 1, 2, 2]}) + X = tc.SFrame({"col1": [2.0, 1.0, 2.0, 1.0], "target": [1, 1, 2, 2]}) # Try all three solvers - tc.logistic_classifier.create(X, target = 'target', solver = 'newton') - tc.logistic_classifier.create(X, target = 'target', solver = 'lbfgs') - tc.logistic_classifier.create(X, target = 'target', solver = 'fista') + tc.logistic_classifier.create(X, target="target", solver="newton") + tc.logistic_classifier.create(X, target="target", solver="lbfgs") + tc.logistic_classifier.create(X, target="target", solver="fista") class ListCategoricalLogisticRegressionTest(unittest.TestCase): @@ -710,45 +769,47 @@ def setUpClass(self): self.sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) # categorical column - species = np.array(['cat', 'dog', 'foosa']) + species = np.array(["cat", "dog", "foosa"]) idx = np.random.randint(3, size=n) # Stats models maps categorical in alphabetical order of categories. # We do it in the order of appearance. idx[0] = 0 idx[1] = 1 idx[2] = 2 - self.sf['species'] = list(species[idx]) + self.sf["species"] = list(species[idx]) # target column target = np.random.randint(2, size=n) target[0] = 0 target[1] = 1 - self.sf['target'] = target - + self.sf["target"] = target ## Get the right answer with statsmodels df = self.sf.to_dataframe() - formula = 'target ~ species + ' + \ - ' + '.join(['X{}'.format(i+1) for i in range(d)]) + formula = "target ~ species + " + " + ".join( + ["X{}".format(i + 1) for i in range(d)] + ) sm_model = smf.glm(formula, data=df, family=sm.families.Binomial()).fit() self.loss = -sm_model.llf self.coef = list(sm_model.params) self.yhat = np.array([1 if x >= 0.5 else 0 for x in sm_model.fittedvalues]) ## Set the turicreate model params - self.target = 'target' - self.features = ['species', 'X1', 'X2', 'X3'] + self.target = "target" + self.features = ["species", "X1", "X2", "X3"] self.def_kwargs = copy.deepcopy(_DEFAULT_SOLVER_OPTIONS) - self.sf['species'] = self.sf["species"].apply(lambda x: [x]) + self.sf["species"] = self.sf["species"].apply(lambda x: [x]) def _test_coefficients(self, model): """ Check that the coefficient values are very close to the correct values. """ coefs = model.coefficients - coef_list = list(coefs['value']) - self.assertTrue(np.allclose(coef_list, self.coef, rtol=1e-02, atol=1e-02), \ - "value values are incorrect. {} vs {}".format(self.coef, coef_list)) + coef_list = list(coefs["value"]) + self.assertTrue( + np.allclose(coef_list, self.coef, rtol=1e-02, atol=1e-02), + "value values are incorrect. {} vs {}".format(self.coef, coef_list), + ) def _test_create(self, sf, target, features, solver, kwargs, rescaling): """ @@ -756,17 +817,23 @@ def _test_create(self, sf, target, features, solver, kwargs, rescaling): """ test_label = "solver: {}\tkwargs: {}".format(solver, kwargs) - model = tc.logistic_classifier.create(sf, target, features, - l2_penalty=0., - solver=solver, - feature_rescaling = rescaling, - validation_set=None, - **kwargs) + model = tc.logistic_classifier.create( + sf, + target, + features, + l2_penalty=0.0, + solver=solver, + feature_rescaling=rescaling, + validation_set=None, + **kwargs + ) self.assertTrue(model is not None) loss_diff = abs(model.training_loss - self.loss) - self.assertTrue(loss_diff < self.def_kwargs['convergence_threshold'], \ - 'Loss failed: {}'.format(test_label)) + self.assertTrue( + loss_diff < self.def_kwargs["convergence_threshold"], + "Loss failed: {}".format(test_label), + ) self._test_coefficients(model) def test_create(self): @@ -774,11 +841,13 @@ def test_create(self): Driver for testing create function under various inputs. """ - for solver in ['newton']: - self._test_create(self.sf, self.target, self.features, solver, - self.def_kwargs, True) - self._test_create(self.sf, self.target, self.features, solver, - self.def_kwargs, False) + for solver in ["newton"]: + self._test_create( + self.sf, self.target, self.features, solver, self.def_kwargs, True + ) + self._test_create( + self.sf, self.target, self.features, solver, self.def_kwargs, False + ) class CategoricalLogisticRegressionTest(unittest.TestCase): @@ -802,34 +871,34 @@ def setUpClass(self): self.sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) # categorical column - species = np.array(['cat', 'dog', 'foosa']) + species = np.array(["cat", "dog", "foosa"]) idx = np.random.randint(3, size=n) # Stats models maps categorical in alphabetical order of categories. # We do it in the order of appearance. idx[0] = 0 idx[1] = 1 idx[2] = 2 - self.sf['species'] = list(species[idx]) + self.sf["species"] = list(species[idx]) # target column target = np.random.randint(2, size=n) target[0] = 0 target[1] = 1 - self.sf['target'] = target - + self.sf["target"] = target ## Get the right answer with statsmodels df = self.sf.to_dataframe() - formula = 'target ~ species + ' + \ - ' + '.join(['X{}'.format(i+1) for i in range(d)]) + formula = "target ~ species + " + " + ".join( + ["X{}".format(i + 1) for i in range(d)] + ) sm_model = smf.glm(formula, data=df, family=sm.families.Binomial()).fit() self.loss = -sm_model.llf self.coef = list(sm_model.params) self.yhat = np.array([1 if x >= 0.5 else 0 for x in sm_model.fittedvalues]) ## Set the turicreate model params - self.target = 'target' - self.features = ['species', 'X1', 'X2', 'X3'] + self.target = "target" + self.features = ["species", "X1", "X2", "X3"] self.def_kwargs = copy.deepcopy(_DEFAULT_SOLVER_OPTIONS) def _test_coefficients(self, model): @@ -837,9 +906,11 @@ def _test_coefficients(self, model): Check that the coefficient values are very close to the correct values. """ coefs = model.coefficients - coef_list = list(coefs['value']) - self.assertTrue(np.allclose(coef_list, self.coef, rtol=1e-02, atol=1e-02), \ - "value values are incorrect. {} vs {}".format(self.coef, coef_list)) + coef_list = list(coefs["value"]) + self.assertTrue( + np.allclose(coef_list, self.coef, rtol=1e-02, atol=1e-02), + "value values are incorrect. {} vs {}".format(self.coef, coef_list), + ) def _test_create(self, sf, target, features, solver, kwargs, rescaling): """ @@ -847,17 +918,23 @@ def _test_create(self, sf, target, features, solver, kwargs, rescaling): """ test_label = "solver: {}\tkwargs: {}".format(solver, kwargs) - model = tc.logistic_classifier.create(sf, target, features, - l2_penalty=0., - solver=solver, - feature_rescaling = rescaling, - validation_set=None, - **kwargs) + model = tc.logistic_classifier.create( + sf, + target, + features, + l2_penalty=0.0, + solver=solver, + feature_rescaling=rescaling, + validation_set=None, + **kwargs + ) self.assertTrue(model is not None) loss_diff = abs(model.training_loss - self.loss) - self.assertTrue(loss_diff < self.def_kwargs['convergence_threshold'], \ - 'Loss failed: {}'.format(test_label)) + self.assertTrue( + loss_diff < self.def_kwargs["convergence_threshold"], + "Loss failed: {}".format(test_label), + ) self._test_coefficients(model) def test_create(self): @@ -865,260 +942,292 @@ def test_create(self): Driver for testing create function under various inputs. """ - for solver in ['newton']: - self._test_create(self.sf, self.target, self.features, solver, - self.def_kwargs, True) - self._test_create(self.sf, self.target, self.features, solver, - self.def_kwargs, False) + for solver in ["newton"]: + self._test_create( + self.sf, self.target, self.features, solver, self.def_kwargs, True + ) + self._test_create( + self.sf, self.target, self.features, solver, self.def_kwargs, False + ) def test_predict_extra_cols(self): - sf = self.sf[:] - model = tc.logistic_classifier.create(sf, self.target, - self.features, feature_rescaling=False) - pred = model.predict(sf) - sf['species'] = sf['species'].apply(lambda x: 'rat' if x == 'foosa' - else x) - pred = model.predict(sf) + sf = self.sf[:] + model = tc.logistic_classifier.create( + sf, self.target, self.features, feature_rescaling=False + ) + pred = model.predict(sf) + sf["species"] = sf["species"].apply(lambda x: "rat" if x == "foosa" else x) + pred = model.predict(sf) def test_evaluate_extra_cols(self): - sf = self.sf[:] - model = tc.logistic_classifier.create(sf, self.target, - self.features, feature_rescaling = False) - eval1 = model.evaluate(sf) - sf['species'] = sf['species'].apply(lambda x: 'rat' if x == 'foosa' - else x) - eval2 = model.evaluate(sf) + sf = self.sf[:] + model = tc.logistic_classifier.create( + sf, self.target, self.features, feature_rescaling=False + ) + eval1 = model.evaluate(sf) + sf["species"] = sf["species"].apply(lambda x: "rat" if x == "foosa" else x) + eval2 = model.evaluate(sf) """ Test detection of columns that are almost the same. """ + def test_zero_variance_detection(self): sf = self.sf[:] - sf['error-column'] = '1' + sf["error-column"] = "1" model = tc.logistic_classifier.create(sf, self.target) - sf['error-column'] = [[1] for i in sf] + sf["error-column"] = [[1] for i in sf] model = tc.logistic_classifier.create(sf, self.target) - sf['error-column'] = [{1:1} for i in sf] + sf["error-column"] = [{1: 1} for i in sf] model = tc.logistic_classifier.create(sf, self.target) """ Test detection of columns have nan values """ + def test_nan_detection(self): sf = self.sf[:] try: - sf['error-column'] = np.nan + sf["error-column"] = np.nan model = tc.logistic_classifier.create(sf, self.target) except ToolkitError: pass try: - sf['error-column'] = [[np.nan] for i in sf] + sf["error-column"] = [[np.nan] for i in sf] model = tc.logistic_classifier.create(sf, self.target) except ToolkitError: pass try: - sf['error-column'] = [{1:np.nan} for i in sf] + sf["error-column"] = [{1: np.nan} for i in sf] model = tc.logistic_classifier.create(sf, self.target) except ToolkitError: pass + class VectorLogisticRegressionTest(unittest.TestCase): - """ + """ Unit test class for testing a Logistic Regression create function. """ - @classmethod - def setUpClass(self): - """ + @classmethod + def setUpClass(self): + """ Set up (Run only once) """ - np.random.seed(15) - n, d = 100, 3 - self.sf = tc.SFrame() - for i in range(d): - self.sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) - target = np.random.randint(2, size=n) - target[0] = 0 - target[1] = 1 - self.sf['target'] = target - - ## Get the right answer with statsmodels - df = self.sf.to_dataframe() - formula = 'target ~ ' + \ - ' + '.join(['X{}'.format(i+1) for i in range(d)]) - sm_model = smf.glm(formula, data=df, family=sm.families.Binomial()).fit() - - self.loss = -sm_model.llf # sum of squared residuals - self.coef = list(sm_model.params) - self.stderr = list(sm_model.bse) - self.yhat = list(sm_model.fittedvalues) - - ## Set the turicreate model params - self.target = 'target' - self.sf['vec'] = self.sf.apply(lambda row: [row['X{}'.format(i+1)] for i in - range(d)]) - self.sf['vec'] = self.sf['vec'].apply(lambda x:x, array.array) - - self.features = ['vec'] - self.unpacked_features = ['vec[%s]' % (i) for i in range(d)] - self.def_kwargs = copy.deepcopy(_DEFAULT_SOLVER_OPTIONS) - - def _test_coefficients(self, model): - """ + np.random.seed(15) + n, d = 100, 3 + self.sf = tc.SFrame() + for i in range(d): + self.sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) + target = np.random.randint(2, size=n) + target[0] = 0 + target[1] = 1 + self.sf["target"] = target + + ## Get the right answer with statsmodels + df = self.sf.to_dataframe() + formula = "target ~ " + " + ".join(["X{}".format(i + 1) for i in range(d)]) + sm_model = smf.glm(formula, data=df, family=sm.families.Binomial()).fit() + + self.loss = -sm_model.llf # sum of squared residuals + self.coef = list(sm_model.params) + self.stderr = list(sm_model.bse) + self.yhat = list(sm_model.fittedvalues) + + ## Set the turicreate model params + self.target = "target" + self.sf["vec"] = self.sf.apply( + lambda row: [row["X{}".format(i + 1)] for i in range(d)] + ) + self.sf["vec"] = self.sf["vec"].apply(lambda x: x, array.array) + + self.features = ["vec"] + self.unpacked_features = ["vec[%s]" % (i) for i in range(d)] + self.def_kwargs = copy.deepcopy(_DEFAULT_SOLVER_OPTIONS) + + def _test_coefficients(self, model): + """ Check that the coefficient values are very close to the correct values. """ - coefs = model.coefficients - coef_list = list(coefs['value']) - stderr_list = list(coefs['stderr']) - self.assertTrue(np.allclose(coef_list, self.coef, rtol=1e-02, atol=1e-02)) - self.assertTrue(np.allclose(stderr_list, self.stderr, rtol=1e-02, atol=1e-02)) + coefs = model.coefficients + coef_list = list(coefs["value"]) + stderr_list = list(coefs["stderr"]) + self.assertTrue(np.allclose(coef_list, self.coef, rtol=1e-02, atol=1e-02)) + self.assertTrue(np.allclose(stderr_list, self.stderr, rtol=1e-02, atol=1e-02)) - def _test_create(self, sf, target, features, solver, - kwargs, rescaling): + def _test_create(self, sf, target, features, solver, kwargs, rescaling): - model = tc.logistic_classifier.create(sf, target, features, solver - = solver, l2_penalty = 0.0, feature_rescaling = rescaling, + model = tc.logistic_classifier.create( + sf, + target, + features, + solver=solver, + l2_penalty=0.0, + feature_rescaling=rescaling, validation_set=None, - **kwargs) + **kwargs + ) - test_case = 'solver = {solver}, kwargs = {kwargs}'.format(solver = solver, - kwargs = kwargs) - self.assertTrue(model is not None) - self.assertTrue(abs(model.training_loss - self.loss) < \ - 0.01 * abs(self.loss),\ - 'Loss failed: %s. Expected %s' % (test_case, self.loss)) - self._test_coefficients(model) + test_case = "solver = {solver}, kwargs = {kwargs}".format( + solver=solver, kwargs=kwargs + ) + self.assertTrue(model is not None) + self.assertTrue( + abs(model.training_loss - self.loss) < 0.01 * abs(self.loss), + "Loss failed: %s. Expected %s" % (test_case, self.loss), + ) + self._test_coefficients(model) - def test_create(self): + def test_create(self): - for solver in ['newton']: - args = (self.sf, self.target, self.features, - solver, self.def_kwargs, True) - self._test_create(*args) - args = (self.sf, self.target, self.features, - solver, self.def_kwargs, False) - self._test_create(*args) + for solver in ["newton"]: + args = (self.sf, self.target, self.features, solver, self.def_kwargs, True) + self._test_create(*args) + args = (self.sf, self.target, self.features, solver, self.def_kwargs, False) + self._test_create(*args) - def test_features(self): + def test_features(self): - model = tc.logistic_classifier.create(self.sf, self.target, self.features, - feature_rescaling = False) - self.assertEqual(model.num_features, len(self.features)) - self.assertEqual(model.features, self.features) - self.assertEqual(model.num_unpacked_features, len(self.unpacked_features)) - self.assertEqual(model.unpacked_features, self.unpacked_features) + model = tc.logistic_classifier.create( + self.sf, self.target, self.features, feature_rescaling=False + ) + self.assertEqual(model.num_features, len(self.features)) + self.assertEqual(model.features, self.features) + self.assertEqual(model.num_unpacked_features, len(self.unpacked_features)) + self.assertEqual(model.unpacked_features, self.unpacked_features) class DictLogisticRegressionTest(unittest.TestCase): - """ + """ Unit test class for testing a Logistic Regression create function. """ - @classmethod - def setUpClass(self): - """ + @classmethod + def setUpClass(self): + """ Set up (Run only once) """ - np.random.seed(15) - n, d = 100, 3 - self.sf = tc.SFrame() + np.random.seed(15) + n, d = 100, 3 + self.sf = tc.SFrame() - # float columns - for i in range(d): - self.sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) + # float columns + for i in range(d): + self.sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) + + # target column + target = np.random.randint(2, size=n) + target[0] = 0 + target[1] = 1 + self.sf["target"] = target + + ## Get the right answer with statsmodels + df = self.sf.to_dataframe() + formula = "target ~ " + " + ".join(["X{}".format(i + 1) for i in range(d)]) + sm_model = smf.glm(formula, data=df, family=sm.families.Binomial()).fit() + + self.loss = -sm_model.llf # sum of squared residuals + self.coef = list(sm_model.params) + self.stderr = list(sm_model.bse) + self.yhat = list(sm_model.fittedvalues) + + ## Set the turicreate model params + self.target = "target" + self.sf["dict"] = self.sf.apply( + lambda row: {i: row["X{}".format(i + 1)] for i in range(d)} + ) + self.features = ["dict"] + self.unpacked_features = ["dict[%s]" % i for i in range(d)] + self.def_kwargs = copy.deepcopy(_DEFAULT_SOLVER_OPTIONS) + + def _test_coefficients(self, model): + coefs = model.coefficients + coef_list = list(coefs["value"]) + stderr_list = list(coefs["stderr"]) + self.assertTrue(np.allclose(coef_list, self.coef, rtol=1e-02, atol=1e-02)) + self.assertTrue(np.allclose(stderr_list, self.stderr, rtol=1e-02, atol=1e-02)) + + def test_features(self): + + model = tc.logistic_classifier.create( + self.sf, self.target, self.features, feature_rescaling=False + ) + self.assertEqual(model.num_features, len(self.features)) + self.assertEqual(model.features, self.features) + self.assertEqual(model.num_unpacked_features, len(self.unpacked_features)) + self.assertEqual(model.unpacked_features, self.unpacked_features) + + def _test_create(self, sf, target, features, solver, opts, rescaling): + + model = tc.logistic_classifier.create( + sf, + target, + features, + solver=solver, + l2_penalty=0.0, + feature_rescaling=rescaling, + validation_set=None, + **opts + ) + test_case = "solver = {solver}, opts = {opts}".format(solver=solver, opts=opts) + self.assertTrue(model is not None) + self.assertTrue( + abs(model.training_loss - self.loss) < 0.01 * abs(self.loss), + "Loss failed: %s. Expected %s" % (test_case, self.loss), + ) + self._test_coefficients(model) + + def test_create(self): + + for solver in ["newton"]: + args = (self.sf, self.target, self.features, solver, self.def_kwargs, True) + self._test_create(*args) + args = (self.sf, self.target, self.features, solver, self.def_kwargs, False) + self._test_create(*args) + + def test_predict_extra_cols(self): + + model = tc.logistic_classifier.create( + self.sf, self.target, self.features, feature_rescaling=False + ) + pred = model.predict(self.sf) + self.sf["dict"] = self.sf["dict"].apply( + lambda x: dict( + list(x.items()) + list({"extra_col": 0, "extra_col_2": 1}.items()) + ) + ) + pred2 = model.predict(self.sf) + self.assertEqual(sum(pred - pred2), 0) + self.sf["dict"] = self.sf["dict"].apply( + lambda x: { + k: v for k, v in x.items() if k not in ["extra_col", "extra_col_2"] + } + ) + + def test_evaluate_extra_cols(self): + + model = tc.logistic_classifier.create( + self.sf, self.target, self.features, feature_rescaling=False + ) + eval1 = model.evaluate(self.sf) + self.sf["dict"] = self.sf["dict"].apply( + lambda x: dict( + list(x.items()) + list({"extra_col": 0, "extra_col_2": 1}.items()) + ) + ) + eval2 = model.evaluate(self.sf) + self.sf["dict"] = self.sf["dict"].apply( + lambda x: { + k: v for k, v in x.items() if k not in ["extra_col", "extra_col_2"] + } + ) + self.assertEqual(eval1["accuracy"], eval2["accuracy"]) - # target column - target = np.random.randint(2, size=n) - target[0] = 0 - target[1] = 1 - self.sf['target'] = target - - ## Get the right answer with statsmodels - df = self.sf.to_dataframe() - formula = 'target ~ ' + \ - ' + '.join(['X{}'.format(i+1) for i in range(d)]) - sm_model = smf.glm(formula, data=df, family=sm.families.Binomial()).fit() - - self.loss = -sm_model.llf # sum of squared residuals - self.coef = list(sm_model.params) - self.stderr = list(sm_model.bse) - self.yhat = list(sm_model.fittedvalues) - - ## Set the turicreate model params - self.target = 'target' - self.sf['dict'] = self.sf.apply(lambda row: {i: row['X{}'.format(i+1)] for i in - range(d)}) - self.features = ['dict'] - self.unpacked_features = ['dict[%s]' % i for i in range(d)] - self.def_kwargs = copy.deepcopy(_DEFAULT_SOLVER_OPTIONS) - - def _test_coefficients(self, model): - coefs = model.coefficients - coef_list = list(coefs['value']) - stderr_list = list(coefs['stderr']) - self.assertTrue(np.allclose(coef_list, self.coef, rtol=1e-02, atol=1e-02)) - self.assertTrue(np.allclose(stderr_list, self.stderr, rtol=1e-02, atol=1e-02)) - - def test_features(self): - - model = tc.logistic_classifier.create(self.sf, self.target, self.features, - feature_rescaling = False) - self.assertEqual(model.num_features, len(self.features)) - self.assertEqual(model.features, self.features) - self.assertEqual(model.num_unpacked_features, len(self.unpacked_features)) - self.assertEqual(model.unpacked_features, self.unpacked_features) - - def _test_create(self, sf, target, features, solver, opts, rescaling): - - model = tc.logistic_classifier.create(sf, target, features, - solver= solver, l2_penalty = 0.0, feature_rescaling = rescaling, - validation_set=None, **opts) - test_case = 'solver = {solver}, opts = {opts}'.format(solver = solver, - opts = opts) - self.assertTrue(model is not None) - self.assertTrue(abs(model.training_loss - self.loss) < \ - 0.01 * abs(self.loss),\ - 'Loss failed: %s. Expected %s' % (test_case, self.loss)) - self._test_coefficients(model) - - def test_create(self): - - for solver in ['newton']: - args = (self.sf, self.target, self.features, - solver, self.def_kwargs, True) - self._test_create(*args) - args = (self.sf, self.target, self.features, - solver, self.def_kwargs, False) - self._test_create(*args) - - def test_predict_extra_cols(self): - - model = tc.logistic_classifier.create(self.sf, self.target, self.features, - feature_rescaling = False) - pred = model.predict(self.sf) - self.sf['dict'] = self.sf['dict'].apply(lambda x: dict(list(x.items()) - + list({'extra_col': 0, 'extra_col_2': 1}.items()))) - pred2 = model.predict(self.sf) - self.assertEqual(sum(pred - pred2), 0) - self.sf['dict'] = self.sf['dict'].apply(lambda x: {k:v for k,v in x.items() \ - if k not in ['extra_col', 'extra_col_2']}) - - def test_evaluate_extra_cols(self): - - model = tc.logistic_classifier.create(self.sf, self.target, self.features, - feature_rescaling = False) - eval1 = model.evaluate(self.sf) - self.sf['dict'] = self.sf['dict'].apply(lambda x: dict(list(x.items()) - + list({'extra_col': 0, 'extra_col_2': 1}.items()))) - eval2 = model.evaluate(self.sf) - self.sf['dict'] = self.sf['dict'].apply(lambda x: {k:v for k,v in x.items() \ - if k not in ['extra_col', 'extra_col_2']}) - self.assertEqual(eval1["accuracy"], eval2["accuracy"]) class RegularizedLogisticRegressionTest(unittest.TestCase): """ @@ -1136,7 +1245,7 @@ def setUpClass(self): """ ## Fake data, generated in R - feature_data = '''0.723040834941846,1.0648961025071,-0.479191624484056,0.433073682915559 + feature_data = """0.723040834941846,1.0648961025071,-0.479191624484056,0.433073682915559 -1.29705301514688,-0.0898754334392415,-0.244320454255808,-0.578687648218724 -1.99524976461205,-0.125152158307165,-0.086446106920042,-0.233340479601935 0.402456295304511,-0.550374347857019,1.35685637262204,0.544712458718116 @@ -1235,64 +1344,165 @@ def setUpClass(self): -0.313676473532486,0.244242322538692,-0.172553981996335,0.31935807851552 -0.620909598452922,0.655163343467281,2.00816338389406,-0.422875475337577 -0.339769903386523,0.189204653082022,-2.34980611959092,0.783263944917566 - 1.19717835010489,0.479479297178576,-0.682999419503163,1.55590456330123''' - target_data = [0,1,0,0,0,1,1,1,0,0,0,1,0,1,1,0,0,0,1,0,0,0,1,0,1,1,1,1, - 0,0,0,0,0,1,0,1,0,0,0,1,1,0,0,1,0,0,0,1,0,1,1,1,1,1,1,0,0,1,0,0,0,1, - 0,0,1,0,1,0,0,1,0,0,1,1,0,1,0,0,1,1,0,0,1,0,0,1,1,0,0,1,1,0,1,0,0,0, - 0,0,0,1] + 1.19717835010489,0.479479297178576,-0.682999419503163,1.55590456330123""" + target_data = [ + 0, + 1, + 0, + 0, + 0, + 1, + 1, + 1, + 0, + 0, + 0, + 1, + 0, + 1, + 1, + 0, + 0, + 0, + 1, + 0, + 0, + 0, + 1, + 0, + 1, + 1, + 1, + 1, + 0, + 0, + 0, + 0, + 0, + 1, + 0, + 1, + 0, + 0, + 0, + 1, + 1, + 0, + 0, + 1, + 0, + 0, + 0, + 1, + 0, + 1, + 1, + 1, + 1, + 1, + 1, + 0, + 0, + 1, + 0, + 0, + 0, + 1, + 0, + 0, + 1, + 0, + 1, + 0, + 0, + 1, + 0, + 0, + 1, + 1, + 0, + 1, + 0, + 0, + 1, + 1, + 0, + 0, + 1, + 0, + 0, + 1, + 1, + 0, + 0, + 1, + 1, + 0, + 1, + 0, + 0, + 0, + 0, + 0, + 0, + 1, + ] ## Write data to file so it can be loaded into an SFrame - f_data = 'data_file_{}.csv'.format(uuid.uuid4()) + f_data = "data_file_{}.csv".format(uuid.uuid4()) self.dataset = f_data - with open(f_data, 'w') as f: + with open(f_data, "w") as f: f.write(feature_data) ## Load the data into an SFrame - self.sf = tc.SFrame.read_csv(f_data, header=False, - column_type_hints=float) - self.sf['target'] = target_data + self.sf = tc.SFrame.read_csv(f_data, header=False, column_type_hints=float) + self.sf["target"] = target_data ## Default options self.def_kwargs = copy.deepcopy(_DEFAULT_SOLVER_OPTIONS) - self.def_kwargs['max_iterations'] = 100 - self.def_kwargs['convergence_threshold'] = 1e-5 - self.l2_penalty = 5. - self.l1_penalty = 3. + self.def_kwargs["max_iterations"] = 100 + self.def_kwargs["convergence_threshold"] = 1e-5 + self.l2_penalty = 5.0 + self.l1_penalty = 3.0 ## Constant parameters - self.target = 'target' - self.features = ['X{}'.format(i) for i in range(1, 4+1)] - self.solver = 'auto' + self.target = "target" + self.features = ["X{}".format(i) for i in range(1, 4 + 1)] + self.solver = "auto" ## Correct answers, from glmnet in R ## require(glmnet) ## fit = glmnet(x, y, family='binomial', alpha=0, lambda=0.1, standardize=False) ## Note: l2_penalty is 0.1 in R but 5 here because a) the penalty in # glmnet is lambda/2, and b) the loss in glmnet is the log-likelihood/n + penalty. - self.l2_coef = np.array([-0.3554688, 0.06594038, -0.48338736, -0.11910414, - -0.09901472]) + self.l2_coef = np.array( + [-0.3554688, 0.06594038, -0.48338736, -0.11910414, -0.09901472] + ) ## fit = glmnet(x, y, family='binomial', alpha=1.0, lambda=0.03, standardize=False) ## Note: l1 penalty is 0.03 in R but 3 here because the loss in glmnet # is log-lik/n + penalty. self.l1_coef = np.array([-0.3728739, 0.0, -0.58645032, -0.07656562, 0.0]) - def _test_l2_create(self, sf, target, features, solver, opts, l2_penalty): """ Test l2-regularized logistic regression create under particular parameter settings. """ - test_case = 'solver = {}, opts = {}'.format(solver, opts) - model = tc.logistic_classifier.create(sf, target, features, - l2_penalty=l2_penalty, - l1_penalty=0., - solver=solver, - feature_rescaling = False, - validation_set=None, - **opts) - coefs = list(model.coefficients['value']) + test_case = "solver = {}, opts = {}".format(solver, opts) + model = tc.logistic_classifier.create( + sf, + target, + features, + l2_penalty=l2_penalty, + l1_penalty=0.0, + solver=solver, + feature_rescaling=False, + validation_set=None, + **opts + ) + coefs = list(model.coefficients["value"]) self.assertTrue(model is not None) self.assertTrue(np.allclose(coefs, self.l2_coef, rtol=1e-02, atol=1e-02)) @@ -1302,15 +1512,19 @@ def _test_l1_create(self, sf, target, features, solver, opts, l1_penalty): parameter settings. """ - test_case = 'solver = {}, opts = {}'.format(solver, opts) - model = tc.logistic_classifier.create(sf, target, features, - l2_penalty=0., - l1_penalty=l1_penalty, - solver=solver, - feature_rescaling = False, - validation_set=None, - **opts) - coefs = list(model.coefficients['value']) + test_case = "solver = {}, opts = {}".format(solver, opts) + model = tc.logistic_classifier.create( + sf, + target, + features, + l2_penalty=0.0, + l1_penalty=l1_penalty, + solver=solver, + feature_rescaling=False, + validation_set=None, + **opts + ) + coefs = list(model.coefficients["value"]) self.assertTrue(model is not None) self.assertTrue(np.allclose(coefs, self.l1_coef, rtol=1e-02, atol=1e-02)) @@ -1320,48 +1534,62 @@ def test_create(self): regularization. """ - for solver in ['newton', 'lbfgs', 'fista']: - self._test_l2_create(self.sf, self.target, self.features, solver, - self.def_kwargs, self.l2_penalty) - for solver in ['fista']: - self._test_l1_create(self.sf, self.target, self.features, solver, - self.def_kwargs, self.l1_penalty) + for solver in ["newton", "lbfgs", "fista"]: + self._test_l2_create( + self.sf, + self.target, + self.features, + solver, + self.def_kwargs, + self.l2_penalty, + ) + for solver in ["fista"]: + self._test_l1_create( + self.sf, + self.target, + self.features, + solver, + self.def_kwargs, + self.l1_penalty, + ) + class ImproperProblemsTest(unittest.TestCase): - """ + """ Unit test class for problems with the setup, e.g. dataset. """ - @classmethod - def setUpClass(self): - """ + @classmethod + def setUpClass(self): + """ Set up (Run only once) """ - self.target = 'y' - self.sf = tc.SFrame() - self.sf['y'] = tc.SArray([0,1,0], int) - self.sf['int'] = tc.SArray([1,2,3], int) - self.sf['float'] = tc.SArray([1,2,3], float) - self.sf['dict'] = tc.SArray([{'1':3, '2':2},{'2':1},{}], dict) - self.sf['array'] = tc.SArray([[1,2],[3,4],[5,6]], array.array) - self.sf['str'] = tc.SArray(['1','2','3'], str) - print(self.sf) + self.target = "y" + self.sf = tc.SFrame() + self.sf["y"] = tc.SArray([0, 1, 0], int) + self.sf["int"] = tc.SArray([1, 2, 3], int) + self.sf["float"] = tc.SArray([1, 2, 3], float) + self.sf["dict"] = tc.SArray([{"1": 3, "2": 2}, {"2": 1}, {}], dict) + self.sf["array"] = tc.SArray([[1, 2], [3, 4], [5, 6]], array.array) + self.sf["str"] = tc.SArray(["1", "2", "3"], str) + print(self.sf) - """ + """ Test predict missing value """ - def test_single_label_error(self): - sf = self.sf.__copy__() - sf['y'] = tc.SArray.from_const(0, 3) - with self.assertRaises(ToolkitError): - m = tc.logistic_classifier.create(sf, 'y') + + def test_single_label_error(self): + sf = self.sf.__copy__() + sf["y"] = tc.SArray.from_const(0, 3) + with self.assertRaises(ToolkitError): + m = tc.logistic_classifier.create(sf, "y") class ValidationSetLogisticClassifierTest(unittest.TestCase): @classmethod def setUpClass(self): - ## Simulate test data + ## Simulate test data np.random.seed(10) n, d = 100, 10 self.sf = tc.SFrame() @@ -1372,31 +1600,35 @@ def setUpClass(self): target[1] = 1 ## Create the model - self.sf['target'] = target - self.target = 'target' + self.sf["target"] = target + self.target = "target" def test_valid_set(self): - model = tc.logistic_classifier.create(self.sf, target='target', - validation_set = 'auto') + model = tc.logistic_classifier.create( + self.sf, target="target", validation_set="auto" + ) self.assertTrue(model is not None) self.assertTrue(isinstance(model.progress, tc.SFrame)) - model = tc.logistic_classifier.create(self.sf, target='target', - validation_set =self.sf) + model = tc.logistic_classifier.create( + self.sf, target="target", validation_set=self.sf + ) self.assertTrue(model is not None) self.assertTrue(isinstance(model.progress, tc.SFrame)) valid_set = self.sf.head(5) valid_set[self.target] = 0 - model = tc.logistic_classifier.create(self.sf, target='target', - validation_set = valid_set) + model = tc.logistic_classifier.create( + self.sf, target="target", validation_set=valid_set + ) self.assertTrue(model is not None) self.assertTrue(isinstance(model.progress, tc.SFrame)) - model = tc.logistic_classifier.create(self.sf, target='target', - validation_set = None) + model = tc.logistic_classifier.create( + self.sf, target="target", validation_set=None + ) self.assertTrue(model is not None) self.assertTrue(isinstance(model.progress, tc.SFrame)) @@ -1404,30 +1636,34 @@ def test_valid_set(self): # validation set. with self.assertRaises(RuntimeError): validation_set = self.sf.__copy__() - validation_set['X1'] = validation_set['X1'].astype(str) - model = tc.logistic_classifier.create(self.sf, target='target', - validation_set = validation_set) + validation_set["X1"] = validation_set["X1"].astype(str) + model = tc.logistic_classifier.create( + self.sf, target="target", validation_set=validation_set + ) -class TestStringTarget(unittest.TestCase): +class TestStringTarget(unittest.TestCase): def test_cat(self): import numpy as np + # Arrange np.random.seed(8) n, d = 1000, 100 sf = tc.SFrame() for i in range(d): - sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) - target = np.random.randint(2, size=n) - sf['target'] = target + sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) + target = np.random.randint(2, size=n) + sf["target"] = target - sf['target'] = sf['target'].astype(str) - sf['target'] = 'cat-' + sf['target'] - model = tc.logistic_classifier.create(sf, 'target') + sf["target"] = sf["target"].astype(str) + sf["target"] = "cat-" + sf["target"] + model = tc.logistic_classifier.create(sf, "target") # Act evaluation = model.evaluate(sf) # Assert - self.assertEqual(['cat-0', 'cat-1'], - sorted(list(evaluation['confusion_matrix']['target_label'].unique()))) + self.assertEqual( + ["cat-0", "cat-1"], + sorted(list(evaluation["confusion_matrix"]["target_label"].unique())), + ) diff --git a/src/python/turicreate/test/test_models.py b/src/python/turicreate/test/test_models.py index 95961cb55d..8f0861d513 100644 --- a/src/python/turicreate/test/test_models.py +++ b/src/python/turicreate/test/test_models.py @@ -14,7 +14,8 @@ import random import sys -temp_number = random.randint(0, 2**64) +temp_number = random.randint(0, 2 ** 64) + class TestModel(unittest.TestCase): def __assert_model_equals__(self, m1, m2): @@ -30,18 +31,18 @@ def setUp(self): self.pr_model = tc.pagerank.create(tc.SGraph()) self.cc_model = tc.connected_components.create(tc.SGraph()) - self.__remove_file('~/tmp/tmp_model-%d' % temp_number) - self.__remove_file('./tmp_model-%d' % temp_number) - self.__remove_file('/tmp/tmp_model-%d' % temp_number) - self.__remove_file('/tmp/tmp_model2-%d' % temp_number) + self.__remove_file("~/tmp/tmp_model-%d" % temp_number) + self.__remove_file("./tmp_model-%d" % temp_number) + self.__remove_file("/tmp/tmp_model-%d" % temp_number) + self.__remove_file("/tmp/tmp_model2-%d" % temp_number) def tearDown(self): - self.__remove_file('~/tmp/tmp_model-%d' % temp_number) - self.__remove_file('./tmp_model-%d' % temp_number) - self.__remove_file('/tmp/tmp_model-%d' % temp_number) - self.__remove_file('/tmp/tmp_model2-%d' % temp_number) + self.__remove_file("~/tmp/tmp_model-%d" % temp_number) + self.__remove_file("./tmp_model-%d" % temp_number) + self.__remove_file("/tmp/tmp_model-%d" % temp_number) + self.__remove_file("/tmp/tmp_model2-%d" % temp_number) - @unittest.skip('failing since 08/30/2016') + @unittest.skip("failing since 08/30/2016") def test_basic_save_load(self): # save and load the pagerank model with util.TempDirectory() as tmp_pr_model_file: @@ -57,14 +58,16 @@ def test_basic_save_load(self): # handle different types of urls. # TODO: test hdfs and s3 urls. - for url in ['./tmp_model-%d' % temp_number, - '/tmp/tmp_model-%d' % temp_number, - '~/tmp/tmp_model-%d' % temp_number]: + for url in [ + "./tmp_model-%d" % temp_number, + "/tmp/tmp_model-%d" % temp_number, + "~/tmp/tmp_model-%d" % temp_number, + ]: self.pr_model.save(url) self.__assert_model_equals__(self.pr_model, tc.load_model(url)) - @unittest.skip('failing since 08/30/2016') + @unittest.skip("failing since 08/30/2016") def test_exception(self): # load model from empty file with util.TempDirectory() as tmp_empty_file: @@ -72,17 +75,17 @@ def test_exception(self): tc.load_model(tmp_empty_file) # load model from non-existing file - if (os.path.exists('./tmp_model-%d' % temp_number)): - shutil.rmtree('./tmp_model-%d' % temp_number) + if os.path.exists("./tmp_model-%d" % temp_number): + shutil.rmtree("./tmp_model-%d" % temp_number) with self.assertRaises(IOError): - tc.load_model('./tmp_model-%d' % temp_number) + tc.load_model("./tmp_model-%d" % temp_number) # save model to invalid url restricted_place = None - if sys.platform == 'win32': - restricted_place = 'C:\\Windows\\System32\\config\\RegBack\\testmodel' + if sys.platform == "win32": + restricted_place = "C:\\Windows\\System32\\config\\RegBack\\testmodel" else: - restricted_place = '/root/tmp/testmodel' - for url in ['http://test', restricted_place]: + restricted_place = "/root/tmp/testmodel" + for url in ["http://test", restricted_place]: with self.assertRaises(IOError): self.pr_model.save(url) diff --git a/src/python/turicreate/test/test_nearest_neighbors.py b/src/python/turicreate/test/test_nearest_neighbors.py index 2c06aaef52..e7733b6d20 100644 --- a/src/python/turicreate/test/test_nearest_neighbors.py +++ b/src/python/turicreate/test/test_nearest_neighbors.py @@ -46,30 +46,36 @@ def setUpClass(self): array_features = [] dict_features = [] for i in range(n): - array_features.append(array.array('f', np.random.rand(d))) - dict_features.append({'alice': np.random.randint(10), - 'brian': np.random.randint(10), - 'chris': np.random.randint(10)}) + array_features.append(array.array("f", np.random.rand(d))) + dict_features.append( + { + "alice": np.random.randint(10), + "brian": np.random.randint(10), + "chris": np.random.randint(10), + } + ) self.refs = tc.SFrame() for i in range(d): - self.refs['X{}'.format(i+1)] = tc.SArray(np.random.rand(n)) + self.refs["X{}".format(i + 1)] = tc.SArray(np.random.rand(n)) - self.label = 'label' + self.label = "label" self.refs[self.label] = [str(x) for x in range(n)] - self.refs['array_ftr'] = array_features - self.refs['dict_ftr'] = dict_features - self.refs['str_ftr'] = random_string(n, length=3, num_letters=5) - self.refs['list_str_ftr'] = random_list_of_str(n, length=3) + self.refs["array_ftr"] = array_features + self.refs["dict_ftr"] = dict_features + self.refs["str_ftr"] = random_string(n, length=3, num_letters=5) + self.refs["list_str_ftr"] = random_list_of_str(n, length=3) - def _test_create(self, sf, label, features, distance, method, field=None, - value=None): + def _test_create( + self, sf, label, features, distance, method, field=None, value=None + ): """ Test creation of nearest neighbors models. """ - m = tc.nearest_neighbors.create(sf, label, features, distance, method, - verbose=False) + m = tc.nearest_neighbors.create( + sf, label, features, distance, method, verbose=False + ) assert m is not None, "Model creation failed." if field is not None: @@ -82,55 +88,91 @@ def test_create_default(self): """ ## check auto configurations for the method when features are provided. - self._test_create(self.refs, self.label, features=['X1', 'X2', 'X3'], - distance='auto', method='auto', field='method', - value='ball_tree') - - self._test_create(self.refs, self.label, features=['X1', 'X2', 'X3'], - distance='euclidean', method='auto', field='method', - value='ball_tree') + self._test_create( + self.refs, + self.label, + features=["X1", "X2", "X3"], + distance="auto", + method="auto", + field="method", + value="ball_tree", + ) + + self._test_create( + self.refs, + self.label, + features=["X1", "X2", "X3"], + distance="euclidean", + method="auto", + field="method", + value="ball_tree", + ) ## check auto configurations for distance if features specified. - m = tc.nearest_neighbors.create(self.refs, self.label, - features=['X1', 'X2', 'X3'], - distance='auto', method='brute_force', - verbose=False) - self.assertEqual(m.distance, [[['X1', 'X2', 'X3'], 'euclidean', 1.]]) + m = tc.nearest_neighbors.create( + self.refs, + self.label, + features=["X1", "X2", "X3"], + distance="auto", + method="brute_force", + verbose=False, + ) + self.assertEqual(m.distance, [[["X1", "X2", "X3"], "euclidean", 1.0]]) ## check auto configurations for distance if features *not* specified. - ans_dist = [[['X1', 'X2', 'X3'], 'euclidean', 1.], - [['dict_ftr'], 'jaccard', 1.], - [['str_ftr'], 'levenshtein', 1.], - [['array_ftr'], 'euclidean', 1.], - [['list_str_ftr'], 'jaccard', 1.]] - - m = tc.nearest_neighbors.create(self.refs, self.label, features=None, - distance='auto', method='brute_force', - verbose=False) + ans_dist = [ + [["X1", "X2", "X3"], "euclidean", 1.0], + [["dict_ftr"], "jaccard", 1.0], + [["str_ftr"], "levenshtein", 1.0], + [["array_ftr"], "euclidean", 1.0], + [["list_str_ftr"], "jaccard", 1.0], + ] + + m = tc.nearest_neighbors.create( + self.refs, + self.label, + features=None, + distance="auto", + method="brute_force", + verbose=False, + ) self.assertItemsEqual(m.distance, ans_dist) - - m = tc.nearest_neighbors.create(self.refs, self.label, features=None, - distance=None, method='brute_force', - verbose=False) + m = tc.nearest_neighbors.create( + self.refs, + self.label, + features=None, + distance=None, + method="brute_force", + verbose=False, + ) self.assertItemsEqual(m.distance, ans_dist) - ## check default leaf size for ball tree correct_leaf_size = 1000 - self._test_create(self.refs, self.label, features=['array_ftr'], - distance='euclidean', method='ball_tree', - field='leaf_size', value=correct_leaf_size) - - m = tc.nearest_neighbors.create(self.refs, self.label, - features=['array_ftr'], - method='ball_tree', - leaf_size=0, - verbose=False) + self._test_create( + self.refs, + self.label, + features=["array_ftr"], + distance="euclidean", + method="ball_tree", + field="leaf_size", + value=correct_leaf_size, + ) + + m = tc.nearest_neighbors.create( + self.refs, + self.label, + features=["array_ftr"], + method="ball_tree", + leaf_size=0, + verbose=False, + ) assert m is not None, "Model creation failed." - assert m.leaf_size == correct_leaf_size, "Leaf size explicit default" +\ - "failed." + assert m.leaf_size == correct_leaf_size, ( + "Leaf size explicit default" + "failed." + ) def test_create_labels(self): """ @@ -143,47 +185,73 @@ def test_create_labels(self): ## String labels are tested everywhere else in this class. ## Passing no label should work, with and without listed features - self._test_create(sf, label=None, features=None, distance='auto', - method='auto', field='label', value=None) - - self._test_create(sf, label=None, features=['X1', 'X2', 'X3'], - distance='euclidean', method='auto', field='label', - value=None) + self._test_create( + sf, + label=None, + features=None, + distance="auto", + method="auto", + field="label", + value=None, + ) + + self._test_create( + sf, + label=None, + features=["X1", "X2", "X3"], + distance="euclidean", + method="auto", + field="label", + value=None, + ) ## Integer label should work - sf = sf.add_row_number(column_name='id') - - self._test_create(sf, label='id', features=None, distance='auto', - method='auto', field='label', value='id') - - m = tc.nearest_neighbors.create(sf, label='id', - features=['X1', 'X2', 'X3'], - distance='euclidean', - method='brute_force', - verbose=False) - self.assertEqual(set(m.features), set(['X1', 'X2', 'X3'])) + sf = sf.add_row_number(column_name="id") + + self._test_create( + sf, + label="id", + features=None, + distance="auto", + method="auto", + field="label", + value="id", + ) + + m = tc.nearest_neighbors.create( + sf, + label="id", + features=["X1", "X2", "X3"], + distance="euclidean", + method="brute_force", + verbose=False, + ) + self.assertEqual(set(m.features), set(["X1", "X2", "X3"])) ## Float label should fail - sf['id'] = sf['id'].astype(float) + sf["id"] = sf["id"].astype(float) with self.assertRaises(TypeError): - m = tc.nearest_neighbors.create(sf, label='id') + m = tc.nearest_neighbors.create(sf, label="id") ## Specified label, included in the features list should drop the label # from the features. - m = tc.nearest_neighbors.create(sf, label=None, - features=['X1', 'X2', '__id'], - distance='euclidean', - method='brute_force', verbose=False) - self.assertEqual(set(m.features), set(['X1', 'X2'])) + m = tc.nearest_neighbors.create( + sf, + label=None, + features=["X1", "X2", "__id"], + distance="euclidean", + method="brute_force", + verbose=False, + ) + self.assertEqual(set(m.features), set(["X1", "X2"])) ## If there is only one feature, and it's specified as the label, this # should raise an informative error. - sf = sf.add_row_number('id_test') + sf = sf.add_row_number("id_test") with self.assertRaises(ToolkitError): - m = tc.nearest_neighbors.create(sf, label='id_test', - features=['id_test']) + m = tc.nearest_neighbors.create(sf, label="id_test", features=["id_test"]) def test_create_methods(self): """ @@ -191,39 +259,57 @@ def test_create_methods(self): """ methods = { - 'auto': 'ball_tree', - 'brute_force': 'brute_force', - 'ball_tree': 'ball_tree', - 'lsh': 'lsh'} + "auto": "ball_tree", + "brute_force": "brute_force", + "ball_tree": "ball_tree", + "lsh": "lsh", + } for m, name in methods.items(): - self._test_create(self.refs, self.label, features=['array_ftr'], - method=m, distance='euclidean', - field='method', value=name) + self._test_create( + self.refs, + self.label, + features=["array_ftr"], + method=m, + distance="euclidean", + field="method", + value=name, + ) ## Cosine and transformed_dot_product distances should not work with ball tree - for dist in ['cosine', 'transformed_dot_product', - tc.distances.cosine, - tc.distances.transformed_dot_product]: + for dist in [ + "cosine", + "transformed_dot_product", + tc.distances.cosine, + tc.distances.transformed_dot_product, + ]: with self.assertRaises(TypeError): - tc.nearest_neighbors.create(self.refs, self.label, - features=['array_ftr'], - distance=dist, - method='ball_tree', verbose=False) + tc.nearest_neighbors.create( + self.refs, + self.label, + features=["array_ftr"], + distance=dist, + method="ball_tree", + verbose=False, + ) ## Multiple distance components should cause an automatic switch to # brute force, even if ball tree is specified. distance_components = [ - [['X1', 'X2', 'X3'], 'euclidean', 1], - [['array_ftr'], 'manhattan', 1], - [['str_ftr'], 'levenshtein', 1]] - - m = tc.nearest_neighbors.create(self.refs, self.label, - method='ball_tree', - distance=distance_components, - verbose=False) - self.assertEqual(m.method, 'brute_force') + [["X1", "X2", "X3"], "euclidean", 1], + [["array_ftr"], "manhattan", 1], + [["str_ftr"], "levenshtein", 1], + ] + + m = tc.nearest_neighbors.create( + self.refs, + self.label, + method="ball_tree", + distance=distance_components, + verbose=False, + ) + self.assertEqual(m.method, "brute_force") def test_kwargs(self): """ @@ -231,11 +317,14 @@ def test_kwargs(self): raise errors to avoid confusion in downstream errors. """ with self.assertRaises(ToolkitError): - m = tc.nearest_neighbors.create(self.refs, self.label, - feature='array_ftr', # this is bogus - method='ball_tree', - distance='euclidean', - verbose=False) + m = tc.nearest_neighbors.create( + self.refs, + self.label, + feature="array_ftr", # this is bogus + method="ball_tree", + distance="euclidean", + verbose=False, + ) def test_create_dense_distances(self): """ @@ -243,41 +332,53 @@ def test_create_dense_distances(self): """ dense_dists = { - 'euclidean': tc.distances.euclidean, - 'squared_euclidean': tc.distances.squared_euclidean, - 'gaussian_kernel': tc.distances.gaussian_kernel, - 'manhattan': tc.distances.manhattan, - 'cosine': tc.distances.cosine, - 'transformed_dot_product': tc.distances.transformed_dot_product} + "euclidean": tc.distances.euclidean, + "squared_euclidean": tc.distances.squared_euclidean, + "gaussian_kernel": tc.distances.gaussian_kernel, + "manhattan": tc.distances.manhattan, + "cosine": tc.distances.cosine, + "transformed_dot_product": tc.distances.transformed_dot_product, + } for dist_name, dist_fn in dense_dists.items(): - ans_dist = [[['array_ftr'], dist_name, 1.]] + ans_dist = [[["array_ftr"], dist_name, 1.0]] ## Test the string form of the distance argument - m = tc.nearest_neighbors.create(self.refs, self.label, - features=['array_ftr'], - distance=dist_name, - method='brute_force', - verbose=False) + m = tc.nearest_neighbors.create( + self.refs, + self.label, + features=["array_ftr"], + distance=dist_name, + method="brute_force", + verbose=False, + ) self.assertEqual(m.distance, ans_dist) ## Test the function form of the distance argument - m = tc.nearest_neighbors.create(self.refs, self.label, - features=['array_ftr'], - distance=dist_fn, - method='brute_force', - verbose=False) + m = tc.nearest_neighbors.create( + self.refs, + self.label, + features=["array_ftr"], + distance=dist_fn, + method="brute_force", + verbose=False, + ) self.assertEqual(m.distance, ans_dist) ## Numeric distances should *not* work with string features try: - tc.nearest_neighbors.create(self.refs, self.label, ['str_ftr'], - distance=dist_name) + tc.nearest_neighbors.create( + self.refs, self.label, ["str_ftr"], distance=dist_name + ) except ToolkitError as e: - self.assertTrue(str(e).startswith("The only distance allowed for string features is 'levenshtein'. " - "Please try this distance, or use 'text_analytics.count_ngrams' to " - "convert the strings to dictionaries, which permit more distance functions.\n")) + self.assertTrue( + str(e).startswith( + "The only distance allowed for string features is 'levenshtein'. " + "Please try this distance, or use 'text_analytics.count_ngrams' to " + "convert the strings to dictionaries, which permit more distance functions.\n" + ) + ) def test_create_sparse_distances(self): """ @@ -285,84 +386,114 @@ def test_create_sparse_distances(self): sparse data, e.g. vectors.""" sparse_dists = { - 'jaccard': tc.distances.jaccard, - 'weighted_jaccard': tc.distances.weighted_jaccard, - 'cosine': tc.distances.cosine, - 'transformed_dot_product': tc.distances.transformed_dot_product} + "jaccard": tc.distances.jaccard, + "weighted_jaccard": tc.distances.weighted_jaccard, + "cosine": tc.distances.cosine, + "transformed_dot_product": tc.distances.transformed_dot_product, + } for dist_name, dist_fn in sparse_dists.items(): - ans_dist = [[['dict_ftr'], dist_name, 1.]] + ans_dist = [[["dict_ftr"], dist_name, 1.0]] ## Test the string form of the distance argument - m = tc.nearest_neighbors.create(self.refs, self.label, - features=['dict_ftr'], - distance=dist_name, - method='brute_force', - verbose=False) + m = tc.nearest_neighbors.create( + self.refs, + self.label, + features=["dict_ftr"], + distance=dist_name, + method="brute_force", + verbose=False, + ) self.assertEqual(m.distance, ans_dist) ## Test the function form of the distance argument - m = tc.nearest_neighbors.create(self.refs, self.label, - features=['dict_ftr'], - distance=dist_fn, - method='brute_force', - verbose=False) + m = tc.nearest_neighbors.create( + self.refs, + self.label, + features=["dict_ftr"], + distance=dist_fn, + method="brute_force", + verbose=False, + ) self.assertEqual(m.distance, ans_dist) ## Test the string form of the distance argument with list of str - ans_dist = [[['list_str_ftr'], dist_name, 1.]] - m = tc.nearest_neighbors.create(self.refs, self.label, - features=['list_str_ftr'], - distance=dist_name, - method='brute_force', - verbose=False) + ans_dist = [[["list_str_ftr"], dist_name, 1.0]] + m = tc.nearest_neighbors.create( + self.refs, + self.label, + features=["list_str_ftr"], + distance=dist_name, + method="brute_force", + verbose=False, + ) self.assertEqual(m.distance, ans_dist) - ## Jaccard distances should not work with numeric or string features - for dist in ['jaccard', - 'weighted_jaccard', - tc.distances.jaccard, - tc.distances.weighted_jaccard]: + for dist in [ + "jaccard", + "weighted_jaccard", + tc.distances.jaccard, + tc.distances.weighted_jaccard, + ]: try: - tc.nearest_neighbors.create(self.refs, self.label, - features=['array_ftr'], - distance=dist, - method='brute_force', - verbose=False) + tc.nearest_neighbors.create( + self.refs, + self.label, + features=["array_ftr"], + distance=dist, + method="brute_force", + verbose=False, + ) except ToolkitError as e: - self.assertTrue(str(e).startswith("Cannot compute jaccard distances with column 'array_ftr'." - " Jaccard distances currently can only be computed for" - " dictionary and list features.\n")) + self.assertTrue( + str(e).startswith( + "Cannot compute jaccard distances with column 'array_ftr'." + " Jaccard distances currently can only be computed for" + " dictionary and list features.\n" + ) + ) try: - tc.nearest_neighbors.create(self.refs, self.label, ['str_ftr'], - distance=dist_name, verbose=False) + tc.nearest_neighbors.create( + self.refs, + self.label, + ["str_ftr"], + distance=dist_name, + verbose=False, + ) except ToolkitError as e: - self.assertTrue(str(e).startswith("The only distance allowed for string features is 'levenshtein'. " - "Please try this distance, or use 'text_analytics.count_ngrams' " - "to convert the strings to dictionaries, which permit more distance functions.\n")) + self.assertTrue( + str(e).startswith( + "The only distance allowed for string features is 'levenshtein'. " + "Please try this distance, or use 'text_analytics.count_ngrams' " + "to convert the strings to dictionaries, which permit more distance functions.\n" + ) + ) ## Jacard distance throws TypeError on lists of non-strings refs = self.refs.__copy__() - refs['list_float_ftr'] = refs['array_ftr'].apply(lambda x: list(x), dtype=list) + refs["list_float_ftr"] = refs["array_ftr"].apply(lambda x: list(x), dtype=list) # Check autodistance with self.assertRaises(TypeError): - m = tc.nearest_neighbors.create(refs, self.label, - features=['list_float_ftr'], - verbose=False) + m = tc.nearest_neighbors.create( + refs, self.label, features=["list_float_ftr"], verbose=False + ) # Check user-specified distance - for distance in ['jaccard', 'weighted_jaccard', 'euclidean']: + for distance in ["jaccard", "weighted_jaccard", "euclidean"]: with self.assertRaises(TypeError): - m = tc.nearest_neighbors.create(refs, self.label, - features=['list_float_ftr'], - distance=distance, - method='brute_force', - verbose=False) + m = tc.nearest_neighbors.create( + refs, + self.label, + features=["list_float_ftr"], + distance=distance, + method="brute_force", + verbose=False, + ) def test_create_string_distances(self): """ @@ -370,39 +501,56 @@ def test_create_string_distances(self): neighbors model. """ - string_dists = { - 'levenshtein': tc.distances.levenshtein} + string_dists = {"levenshtein": tc.distances.levenshtein} for dist_name, dist_fn in string_dists.items(): - ans_dist = [[['str_ftr'], dist_name, 1.]] + ans_dist = [[["str_ftr"], dist_name, 1.0]] ## Test the string form of the distance argument - m = tc.nearest_neighbors.create(self.refs, self.label, - features=['str_ftr'], - distance=dist_name, - method='brute_force', - verbose=False) + m = tc.nearest_neighbors.create( + self.refs, + self.label, + features=["str_ftr"], + distance=dist_name, + method="brute_force", + verbose=False, + ) self.assertEqual(m.distance, ans_dist) ## Test the function form of the distance argument - m = tc.nearest_neighbors.create(self.refs, self.label, - features=['str_ftr'], - distance=dist_fn, - method='brute_force', - verbose=False) + m = tc.nearest_neighbors.create( + self.refs, + self.label, + features=["str_ftr"], + distance=dist_fn, + method="brute_force", + verbose=False, + ) self.assertEqual(m.distance, ans_dist) ## String distances should not work with numeric or dictionary # features try: - tc.nearest_neighbors.create(self.refs, self.label, - features=['dict_ftr'], - distance=dist_name, - method='brute_force', verbose=False) + tc.nearest_neighbors.create( + self.refs, + self.label, + features=["dict_ftr"], + distance=dist_name, + method="brute_force", + verbose=False, + ) except ToolkitError as e: - self.assertTrue(str(e).startswith("Cannot compute {} distance with column 'dict_ftr'.".format(dist_name) + - " {} distance can only computed for string features.\n".format(dist_name))) + self.assertTrue( + str(e).startswith( + "Cannot compute {} distance with column 'dict_ftr'.".format( + dist_name + ) + + " {} distance can only computed for string features.\n".format( + dist_name + ) + ) + ) def test_create_composite_distances(self): """ @@ -411,29 +559,37 @@ def test_create_composite_distances(self): """ distance_components = [ - [['X1', 'X2'], 'euclidean', 1], - [['X2', 'X3'], 'manhattan', 1], # note overlap with first component's features - [['array_ftr'], 'manhattan', 1], - [['str_ftr'], 'levenshtein', 1]] + [["X1", "X2"], "euclidean", 1], + [ + ["X2", "X3"], + "manhattan", + 1, + ], # note overlap with first component's features + [["array_ftr"], "manhattan", 1], + [["str_ftr"], "levenshtein", 1], + ] ## Test that things work correctly in the vanilla case. - m = tc.nearest_neighbors.create(self.refs, self.label, - distance=distance_components, - verbose=False) + m = tc.nearest_neighbors.create( + self.refs, self.label, distance=distance_components, verbose=False + ) assert m is not None, "Model creation failed." self.assertEqual(m.distance, distance_components) self.assertEqual(m.num_distance_components, 4) - self.assertEqual(m.method, 'brute_force') + self.assertEqual(m.method, "brute_force") self.assertEqual(m.num_features, 5) self.assertEqual(m.num_unpacked_features, 7) ## Make sure the features parameter is ignored if a composite distance # is specified. - m = tc.nearest_neighbors.create(self.refs, self.label, - features=['X1', 'X2'], - distance=distance_components, - verbose=False) + m = tc.nearest_neighbors.create( + self.refs, + self.label, + features=["X1", "X2"], + distance=distance_components, + verbose=False, + ) assert m is not None, "Model creation failed." self.assertEqual(m.distance, distance_components) @@ -445,21 +601,37 @@ def test_create_num_variables(self): """ Test vector features, numeric features, and combinations thereof. """ - for ftr_list, v in [(['X1', 'X2', 'X3'], 3), - (['array_ftr'], 3), - (['dict_ftr'], 3)]: - - self._test_create(self.refs, self.label, features=ftr_list, - method='auto', distance='auto', - field='num_unpacked_features', value=v) - - for ftr_list, v in [(['X1', 'X2', 'X3'], 3), - (['array_ftr'], 1), - (['dict_ftr'], 1)]: - - self._test_create(self.refs, self.label, features=ftr_list, - method='auto', distance='auto', - field='num_features', value=v) + for ftr_list, v in [ + (["X1", "X2", "X3"], 3), + (["array_ftr"], 3), + (["dict_ftr"], 3), + ]: + + self._test_create( + self.refs, + self.label, + features=ftr_list, + method="auto", + distance="auto", + field="num_unpacked_features", + value=v, + ) + + for ftr_list, v in [ + (["X1", "X2", "X3"], 3), + (["array_ftr"], 1), + (["dict_ftr"], 1), + ]: + + self._test_create( + self.refs, + self.label, + features=ftr_list, + method="auto", + distance="auto", + field="num_features", + value=v, + ) def test_create_mutations(self): """ @@ -468,12 +640,17 @@ def test_create_mutations(self): """ sf = self.refs[:] label = self.label - ftrs_orig = ['X1', 'X2', 'X3', 'array_ftr'] + ftrs_orig = ["X1", "X2", "X3", "array_ftr"] ftrs_copy = ftrs_orig[:] - m = tc.nearest_neighbors.create(sf, label=label, features=ftrs_copy, - method='auto', distance='auto', - verbose=False) + m = tc.nearest_neighbors.create( + sf, + label=label, + features=ftrs_copy, + method="auto", + distance="auto", + verbose=False, + ) assert_frame_equal(self.refs.to_dataframe(), sf.to_dataframe()) self.assertEqual(label, self.label) @@ -485,20 +662,20 @@ def test_missing_data(self): missing data of any type in any cell of the input dataset. """ - sf = tc.SFrame({'x0': [1, 2, 3], - 'x1': ['a', 'b', 'c']}) - sf['ints'] = [1, 2, None] - sf['floats'] = [None, 2.2, 3.3] - sf['strings'] = ['a', None, 'c'] - sf['dicts'] = [{'a': 1}, {'b': 2}, None] - sf['arrays'] = [array.array('f', [1., 2.]), - array.array('f', [3., 4.]), - None] + sf = tc.SFrame({"x0": [1, 2, 3], "x1": ["a", "b", "c"]}) + sf["ints"] = [1, 2, None] + sf["floats"] = [None, 2.2, 3.3] + sf["strings"] = ["a", None, "c"] + sf["dicts"] = [{"a": 1}, {"b": 2}, None] + sf["arrays"] = [ + array.array("f", [1.0, 2.0]), + array.array("f", [3.0, 4.0]), + None, + ] - for ftr in ['ints', 'floats', 'strings', 'dicts', 'arrays']: + for ftr in ["ints", "floats", "strings", "dicts", "arrays"]: with self.assertRaises(ToolkitError): - m = tc.nearest_neighbors.create(sf[['x0', 'x1', ftr]], - verbose=False) + m = tc.nearest_neighbors.create(sf[["x0", "x1", ftr]], verbose=False) class NearestNeighborsEdgeCaseTests(unittest.TestCase): @@ -512,14 +689,14 @@ def setUpClass(self): ## Make data np.random.seed(19) n, d = 100, 3 - self.label = 'label' + self.label = "label" array_features = [] for i in range(n): - array_features.append(array.array('f', np.random.rand(d))) + array_features.append(array.array("f", np.random.rand(d))) self.refs = tc.SFrame() - self.refs['array'] = array_features + self.refs["array"] = array_features self.refs[self.label] = [str(x) for x in range(n)] def test_empty_data(self): @@ -530,10 +707,14 @@ def test_empty_data(self): ## Useful objects sf_empty = tc.SFrame() - m = tc.nearest_neighbors.create(self.refs, label=self.label, - features=None, - method='brute_force', - distance='euclidean', verbose=False) + m = tc.nearest_neighbors.create( + self.refs, + label=self.label, + features=None, + method="brute_force", + distance="euclidean", + verbose=False, + ) with self.assertRaises(ToolkitError): m = tc.nearest_neighbors.create(sf_empty, self.label) @@ -547,12 +728,12 @@ def test_bogus_labels(self): """ with self.assertRaises(ToolkitError): - m = tc.nearest_neighbors.create(self.refs, label='fossa') + m = tc.nearest_neighbors.create(self.refs, label="fossa") m = tc.nearest_neighbors.create(self.refs) with self.assertRaises(ValueError): - m.query(self.refs, label='fossa') + m.query(self.refs, label="fossa") def test_empty_composite_distance(self): """ @@ -568,64 +749,69 @@ def test_bogus_parameters(self): """ ## k is out of bounds raises an error - m = tc.nearest_neighbors.create(self.refs, self.label, - method='brute_force') + m = tc.nearest_neighbors.create(self.refs, self.label, method="brute_force") - for k in [-1, 0, 'cat']: + for k in [-1, 0, "cat"]: with self.assertRaises(ValueError): knn = m.query(self.refs, self.label, k=k) - ## k > n should default to n knn = m.query(self.refs, self.label, k=2 * self.refs.num_rows()) - assert knn.num_rows() == self.refs.num_rows()**2, \ - "Query with k > n returned the wrong number of rows." - + assert ( + knn.num_rows() == self.refs.num_rows() ** 2 + ), "Query with k > n returned the wrong number of rows." ## radius out of bounds should raise an error - for r in [-1, 'cat']: + for r in [-1, "cat"]: with self.assertRaises(ValueError): knn = m.query(self.refs, self.label, radius=r) - # ## Leaf size is out of bounds # import ipdb # ipdb.set_trace() - for ls in [-1, 12.3, 'fossa']: + for ls in [-1, 12.3, "fossa"]: with self.assertRaises(ToolkitError): - m = tc.nearest_neighbors.create(self.refs, self.label, - features=['array'], - method='ball_tree', - distance='euclidean', - leaf_size=ls) - + m = tc.nearest_neighbors.create( + self.refs, + self.label, + features=["array"], + method="ball_tree", + distance="euclidean", + leaf_size=ls, + ) ## Leaf size > n should be fine - m = tc.nearest_neighbors.create(self.refs, self.label, - features=['array'], - method='ball_tree', - distance='euclidean', - leaf_size = 2 * self.refs.num_rows()) + m = tc.nearest_neighbors.create( + self.refs, + self.label, + features=["array"], + method="ball_tree", + distance="euclidean", + leaf_size=2 * self.refs.num_rows(), + ) assert m.leaf_size == 2 * self.refs.num_rows() - ## Distance component weights are out of bounds with self.assertRaises(ValueError): - m = tc.nearest_neighbors.create(self.refs, self.label, - distance=[[['array'], 'euclidean', -1e-7]]) + m = tc.nearest_neighbors.create( + self.refs, self.label, distance=[[["array"], "euclidean", -1e-7]] + ) with self.assertRaises(ValueError): - m = tc.nearest_neighbors.create(self.refs, self.label, - distance=[[['array'], 'euclidean', -1]]) + m = tc.nearest_neighbors.create( + self.refs, self.label, distance=[[["array"], "euclidean", -1]] + ) with self.assertRaises(ValueError): - m = tc.nearest_neighbors.create(self.refs, self.label, - distance=[[['array'], 'euclidean', 'a']]) + m = tc.nearest_neighbors.create( + self.refs, self.label, distance=[[["array"], "euclidean", "a"]] + ) with self.assertRaises(ToolkitError): - m = tc.nearest_neighbors.create(self.refs, self.label, - distance=[[['array'], 'euclidean', 1e15]]) + m = tc.nearest_neighbors.create( + self.refs, self.label, distance=[[["array"], "euclidean", 1e15]] + ) class NearestNeighborsBruteForceAPITest(unittest.TestCase): @@ -638,68 +824,79 @@ def setUp(self): ## Make data np.random.seed(19) - d = 3 # dimension - n = 100 # number of reference points + d = 3 # dimension + n = 100 # number of reference points self.refs = tc.SFrame() for i in range(d): self.refs.add_column(tc.SArray(np.random.rand(n)), inplace=True) - self.refs['row_label'] = [str(x) for x in range(n)] + self.refs["row_label"] = [str(x) for x in range(n)] - self.label = 'row_label' - self.features = ['X{}'.format(i+1) for i in range(d)] - self.unpacked_features = ['X{}'.format(i+1) for i in range(d)] + self.label = "row_label" + self.features = ["X{}".format(i + 1) for i in range(d)] + self.unpacked_features = ["X{}".format(i + 1) for i in range(d)] ## Create the nearest neighbors model - self.model = tc.nearest_neighbors.create(self.refs, self.label, - features=None, - method='brute_force', - distance='euclidean', - verbose=False) + self.model = tc.nearest_neighbors.create( + self.refs, + self.label, + features=None, + method="brute_force", + distance="euclidean", + verbose=False, + ) ## Answers self.fields_ans = [ - 'training_time', - 'label', - 'unpacked_features', - 'features', - 'method', - 'num_examples', - 'num_unpacked_features', - 'num_features', - 'num_distance_components', - 'distance', - 'distance_for_summary_struct'] + "training_time", + "label", + "unpacked_features", + "features", + "method", + "num_examples", + "num_unpacked_features", + "num_features", + "num_distance_components", + "distance", + "distance_for_summary_struct", + ] self.default_opts = { - 'leaf_size': {u'default_value': 0, - u'lower_bound': 0, - u'upper_bound': 2147483647, - u'description': u'Max number of points in a leaf node of the ball tree', - u'parameter_type': u'INTEGER'}, - 'label': {u'default_value': u'', - u'description': u'Name of the reference dataset column with row labels.', - u'parameter_type': u'STRING'}} - - self.opts = {'label': self.label} + "leaf_size": { + u"default_value": 0, + u"lower_bound": 0, + u"upper_bound": 2147483647, + u"description": u"Max number of points in a leaf node of the ball tree", + u"parameter_type": u"INTEGER", + }, + "label": { + u"default_value": u"", + u"description": u"Name of the reference dataset column with row labels.", + u"parameter_type": u"STRING", + }, + } + + self.opts = {"label": self.label} self.get_ans = { - 'distance': lambda x: len(x) == 1, - 'training_time': lambda x: x >= 0, - 'label': lambda x: x == self.label, - 'method': lambda x: x == 'brute_force', - 'num_examples': lambda x: x == 100, - 'num_features': lambda x: x == 3, - 'num_unpacked_features': lambda x: x == 3, - 'num_distance_components': lambda x: x == 1} + "distance": lambda x: len(x) == 1, + "training_time": lambda x: x >= 0, + "label": lambda x: x == self.label, + "method": lambda x: x == "brute_force", + "num_examples": lambda x: x == 100, + "num_features": lambda x: x == 3, + "num_unpacked_features": lambda x: x == 3, + "num_distance_components": lambda x: x == 1, + } def test__list_fields(self): """ Check the _list_fields method. """ - assert set(self.model._list_fields()) == set(self.fields_ans), \ - "List fields failed with {}.".format(self.model._list_fields()) + assert set(self.model._list_fields()) == set( + self.fields_ans + ), "List fields failed with {}.".format(self.model._list_fields()) def test_get(self): """ @@ -707,8 +904,9 @@ def test_get(self): """ for field in self.get_ans.keys(): ans = self.model._get(field) - assert self.get_ans[field](ans), \ - "Get failed in field '{}'. Output: {}".format(field, ans) + assert self.get_ans[field]( + ans + ), "Get failed in field '{}'. Output: {}".format(field, ans) ## Check names of features and unpacked features assert set(self.model.features) == set(self.features) @@ -776,77 +974,90 @@ def setUp(self): ## Make data np.random.seed(19) - d = 3 # dimension - n = 100 # number of reference points + d = 3 # dimension + n = 100 # number of reference points refs = [] for i in range(n): - refs.append(array.array('f', np.random.rand(d))) + refs.append(array.array("f", np.random.rand(d))) - self.refs = tc.SFrame({'features': refs}) - self.refs['row_label'] = [str(x) for x in range(n)] - self.query = tc.SFrame({'features': refs}) - self.query['row_label'] = [str(x) for x in range(50, n+50)] + self.refs = tc.SFrame({"features": refs}) + self.refs["row_label"] = [str(x) for x in range(n)] + self.query = tc.SFrame({"features": refs}) + self.query["row_label"] = [str(x) for x in range(50, n + 50)] - self.label = 'row_label' + self.label = "row_label" self.features = ["features"] - self.unpacked_features= ['features[0]', 'features[1]', 'features[2]'] + self.unpacked_features = ["features[0]", "features[1]", "features[2]"] - self.opts = {'num_tables': 4, 'num_projections_per_table': 4, 'label': self.label} + self.opts = { + "num_tables": 4, + "num_projections_per_table": 4, + "label": self.label, + } ## Create the nearest neighbors model - self.model = tc.nearest_neighbors.create(self.refs, self.label, - features=self.features, - method='lsh', - distance='euclidean', - num_tables=self.opts['num_tables'], - num_projections_per_table=self.opts['num_projections_per_table']) + self.model = tc.nearest_neighbors.create( + self.refs, + self.label, + features=self.features, + method="lsh", + distance="euclidean", + num_tables=self.opts["num_tables"], + num_projections_per_table=self.opts["num_projections_per_table"], + ) ## Answers self.fields_ans = [ - 'distance', - 'distance_for_summary_struct', - 'num_distance_components', - 'features', - 'unpacked_features', - 'label', - 'num_tables', - 'num_projections_per_table', - 'method', - 'num_examples', - 'num_unpacked_features', - 'num_features', - 'training_time', - ] + "distance", + "distance_for_summary_struct", + "num_distance_components", + "features", + "unpacked_features", + "label", + "num_tables", + "num_projections_per_table", + "method", + "num_examples", + "num_unpacked_features", + "num_features", + "training_time", + ] self.default_opts = { - 'num_tables': {u'default_value': 10, - u'lower_bound': 1, - u'upper_bound': 2147483647, - u'description': u'number of hash tables for LSH', - u'parameter_type': u'INTEGER'}, - - 'num_projections_per_table': {u'default_value': 8, - u'lower_bound': 1, - u'upper_bound': 2147483647, - u'description': u'number of projections in each hash table', - u'parameter_type': u'INTEGER'}, - - 'label': {u'default_value': u'', - u'description': u'Name of the reference dataset column with row labels.', - u'parameter_type': u'STRING'}} + "num_tables": { + u"default_value": 10, + u"lower_bound": 1, + u"upper_bound": 2147483647, + u"description": u"number of hash tables for LSH", + u"parameter_type": u"INTEGER", + }, + "num_projections_per_table": { + u"default_value": 8, + u"lower_bound": 1, + u"upper_bound": 2147483647, + u"description": u"number of projections in each hash table", + u"parameter_type": u"INTEGER", + }, + "label": { + u"default_value": u"", + u"description": u"Name of the reference dataset column with row labels.", + u"parameter_type": u"STRING", + }, + } self.get_ans = { - 'distance': lambda x: len(x) == 1, - 'num_distance_components': lambda x: x == 1, - 'label': lambda x: x == self.label, - 'num_tables': lambda x: x == self.opts['num_tables'], - 'num_projections_per_table': lambda x: x == self.opts['num_projections_per_table'], - 'method': lambda x: x == 'lsh', - 'num_examples': lambda x: x == 100, - 'num_features': lambda x: x == 1, - 'num_unpacked_features': lambda x: x == 3, - 'training_time': lambda x: x >= 0, - } + "distance": lambda x: len(x) == 1, + "num_distance_components": lambda x: x == 1, + "label": lambda x: x == self.label, + "num_tables": lambda x: x == self.opts["num_tables"], + "num_projections_per_table": lambda x: x + == self.opts["num_projections_per_table"], + "method": lambda x: x == "lsh", + "num_examples": lambda x: x == 100, + "num_features": lambda x: x == 1, + "num_unpacked_features": lambda x: x == 3, + "training_time": lambda x: x >= 0, + } def test_query(self): q = self.model.query(self.query, label=self.label, k=1, verbose=False) @@ -854,15 +1065,16 @@ def test_query(self): assert q.num_rows() >= self.query.num_rows() # all the 1-nearest-neighbor should be the queries themselves (and identical points) # so that means a distance of zero - distances = q['distance'] - assert(len(distances.filter(lambda x: x != 0.0)) == 0) + distances = q["distance"] + assert len(distances.filter(lambda x: x != 0.0)) == 0 def test__list_fields(self): """ Check the _list_fields method. """ - assert set(self.model._list_fields()) == set(self.fields_ans), \ - "List fields failed with {}.".format(self.model._list_fields()) + assert set(self.model._list_fields()) == set( + self.fields_ans + ), "List fields failed with {}.".format(self.model._list_fields()) def test_get(self): """ @@ -870,8 +1082,9 @@ def test_get(self): """ for field in self.get_ans.keys(): ans = self.model._get(field) - assert self.get_ans[field](ans), \ - "Get failed in field '{}'. Output: {}".format(field, ans) + assert self.get_ans[field]( + ans + ), "Get failed in field '{}'. Output: {}".format(field, ans) ## Check names of features and unpacked features assert set(self.model.features) == set(self.features) @@ -929,70 +1142,79 @@ def setUp(self): ## Make data np.random.seed(19) - d = 3 # dimension - n = 100 # number of reference points + d = 3 # dimension + n = 100 # number of reference points refs = [] for i in range(n): - refs.append(array.array('f', np.random.rand(d))) + refs.append(array.array("f", np.random.rand(d))) - self.refs = tc.SFrame({'features': refs}) - self.refs['row_label'] = [str(x) for x in range(n)] - self.query = tc.SFrame({'features': refs}) - self.query['row_label'] = [str(x) for x in range(50, n+50)] + self.refs = tc.SFrame({"features": refs}) + self.refs["row_label"] = [str(x) for x in range(n)] + self.query = tc.SFrame({"features": refs}) + self.query["row_label"] = [str(x) for x in range(50, n + 50)] - self.label = 'row_label' + self.label = "row_label" self.features = ["features"] - self.unpacked_features= ['features[0]', 'features[1]', 'features[2]'] + self.unpacked_features = ["features[0]", "features[1]", "features[2]"] - self.opts = {'leaf_size': 16, 'label': self.label} + self.opts = {"leaf_size": 16, "label": self.label} ## Create the nearest neighbors model - self.model = tc.nearest_neighbors.create(self.refs, self.label, - features=self.features, - method='ball_tree', - distance='euclidean', - leaf_size=self.opts['leaf_size'], - verbose=False) + self.model = tc.nearest_neighbors.create( + self.refs, + self.label, + features=self.features, + method="ball_tree", + distance="euclidean", + leaf_size=self.opts["leaf_size"], + verbose=False, + ) ## Answers self.fields_ans = [ - 'distance', - 'distance_for_summary_struct', - 'num_distance_components', - 'features', - 'unpacked_features', - 'label', - 'leaf_size', - 'method', - 'num_examples', - 'num_unpacked_features', - 'num_features', - 'training_time', - 'tree_depth'] + "distance", + "distance_for_summary_struct", + "num_distance_components", + "features", + "unpacked_features", + "label", + "leaf_size", + "method", + "num_examples", + "num_unpacked_features", + "num_features", + "training_time", + "tree_depth", + ] self.default_opts = { - 'leaf_size': {u'default_value': 0, - u'lower_bound': 0, - u'upper_bound': 2147483647, - u'description': u'Max number of points in a leaf node of the ball tree', - u'parameter_type': u'INTEGER'}, - 'label': {u'default_value': u'', - u'description': u'Name of the reference dataset column with row labels.', - u'parameter_type': u'STRING'}} + "leaf_size": { + u"default_value": 0, + u"lower_bound": 0, + u"upper_bound": 2147483647, + u"description": u"Max number of points in a leaf node of the ball tree", + u"parameter_type": u"INTEGER", + }, + "label": { + u"default_value": u"", + u"description": u"Name of the reference dataset column with row labels.", + u"parameter_type": u"STRING", + }, + } self.get_ans = { - 'distance': lambda x: len(x) == 1, - 'num_distance_components': lambda x: x == 1, - 'label': lambda x: x == self.label, - 'leaf_size': lambda x: x == self.opts['leaf_size'], - 'method': lambda x: x == 'ball_tree', - 'num_examples': lambda x: x == 100, - 'num_features': lambda x: x == 1, - 'num_unpacked_features': lambda x: x == 3, - 'training_time': lambda x: x >= 0, - 'tree_depth': lambda x: x == 4, # assumes n=100, leaf_size=16 - } + "distance": lambda x: len(x) == 1, + "num_distance_components": lambda x: x == 1, + "label": lambda x: x == self.label, + "leaf_size": lambda x: x == self.opts["leaf_size"], + "method": lambda x: x == "ball_tree", + "num_examples": lambda x: x == 100, + "num_features": lambda x: x == 1, + "num_unpacked_features": lambda x: x == 3, + "training_time": lambda x: x >= 0, + "tree_depth": lambda x: x == 4, # assumes n=100, leaf_size=16 + } def test_query(self): q = self.model.query(self.query, self.label, k=3, verbose=False) @@ -1002,8 +1224,9 @@ def test__list_fields(self): """ Check the _list_fields method. """ - assert set(self.model._list_fields()) == set(self.fields_ans), \ - "List fields failed with {}.".format(self.model._list_fields()) + assert set(self.model._list_fields()) == set( + self.fields_ans + ), "List fields failed with {}.".format(self.model._list_fields()) def test_get(self): """ @@ -1011,8 +1234,9 @@ def test_get(self): """ for field in self.get_ans.keys(): ans = self.model._get(field) - assert self.get_ans[field](ans), \ - "Get failed in field '{}'. Output: {}".format(field, ans) + assert self.get_ans[field]( + ans + ), "Get failed in field '{}'. Output: {}".format(field, ans) ## Check names of features and unpacked features assert set(self.model.features) == set(self.features) @@ -1060,6 +1284,7 @@ def test_save_and_load(self): del self.model + class GeneralSimilarityGraphTest(unittest.TestCase): """ Tests that apply to all nearest neighbors similarity graph methods, @@ -1070,50 +1295,73 @@ class GeneralSimilarityGraphTest(unittest.TestCase): def setUpClass(self): np.random.seed(19) - self.dimension = 3 # dimension - n = 10 # number of reference points + self.dimension = 3 # dimension + n = 10 # number of reference points self.refs = tc.SFrame(np.random.rand(n, self.dimension)) - self.features = ['X1.{}'.format(i) for i in range(self.dimension)] - self.refs = self.refs.unpack('X1') + self.features = ["X1.{}".format(i) for i in range(self.dimension)] + self.refs = self.refs.unpack("X1") - self.label = 'id' + self.label = "id" self.refs = self.refs.add_row_number(self.label) - self.refs[self.label] = self.refs[self.label].astype(str) + 'a' + self.refs[self.label] = self.refs[self.label].astype(str) + "a" df_refs = self.refs.to_dataframe().drop(self.label, axis=1) - self.answer_dists = scipy_dist(df_refs, df_refs, 'euclidean') + self.answer_dists = scipy_dist(df_refs, df_refs, "euclidean") def test_neighborhood_constraints(self): """ Test various combinations of the k and radius constraints. """ - m = tc.nearest_neighbors.create(self.refs, features=self.features, - distance='euclidean', - method='brute_force', - verbose=False) - - knn = m.similarity_graph(k=None, radius=None, include_self_edges=True, - output_type='SFrame', verbose=False) + m = tc.nearest_neighbors.create( + self.refs, + features=self.features, + distance="euclidean", + method="brute_force", + verbose=False, + ) + + knn = m.similarity_graph( + k=None, + radius=None, + include_self_edges=True, + output_type="SFrame", + verbose=False, + ) assert_frame_equal(self.answer_dists.to_dataframe(), knn.to_dataframe()) ## k only, no radius - knn = m.similarity_graph(k=3, radius=None, include_self_edges=True, - output_type='SFrame', verbose=False) - ans = self.answer_dists[self.answer_dists['rank'] <= 3] + knn = m.similarity_graph( + k=3, + radius=None, + include_self_edges=True, + output_type="SFrame", + verbose=False, + ) + ans = self.answer_dists[self.answer_dists["rank"] <= 3] assert_frame_equal(ans.to_dataframe(), knn.to_dataframe()) ## radius only, no k - knn = m.similarity_graph(k=None, radius=0.4, include_self_edges=True, - output_type='SFrame', verbose=False) - ans = self.answer_dists[self.answer_dists['distance'] <= 0.4] + knn = m.similarity_graph( + k=None, + radius=0.4, + include_self_edges=True, + output_type="SFrame", + verbose=False, + ) + ans = self.answer_dists[self.answer_dists["distance"] <= 0.4] assert_frame_equal(ans.to_dataframe(), knn.to_dataframe()) ## Both radius and k - knn = m.similarity_graph(k=3, radius=0.4, include_self_edges=True, - output_type='SFrame', verbose=False) - ans = self.answer_dists[self.answer_dists['rank'] <= 3] - ans = ans[ans['distance'] <= 0.4] + knn = m.similarity_graph( + k=3, + radius=0.4, + include_self_edges=True, + output_type="SFrame", + verbose=False, + ) + ans = self.answer_dists[self.answer_dists["rank"] <= 3] + ans = ans[ans["distance"] <= 0.4] assert_frame_equal(ans.to_dataframe(), knn.to_dataframe()) def test_self_edges(self): @@ -1123,44 +1371,70 @@ def test_self_edges(self): """ ## Without row labels - m = tc.nearest_neighbors.create(self.refs, features=self.features, - distance='euclidean', - method='brute_force', - verbose=False) - - knn = m.similarity_graph(k=None, radius=None, include_self_edges=True, - output_type='SFrame', verbose=False) + m = tc.nearest_neighbors.create( + self.refs, + features=self.features, + distance="euclidean", + method="brute_force", + verbose=False, + ) + + knn = m.similarity_graph( + k=None, + radius=None, + include_self_edges=True, + output_type="SFrame", + verbose=False, + ) assert_frame_equal(self.answer_dists.to_dataframe(), knn.to_dataframe()) - knn2 = m.similarity_graph(k=None, radius=None, include_self_edges=False, - output_type='SFrame', verbose=False) - mask = self.answer_dists['query_label'] != self.answer_dists['reference_label'] + knn2 = m.similarity_graph( + k=None, + radius=None, + include_self_edges=False, + output_type="SFrame", + verbose=False, + ) + mask = self.answer_dists["query_label"] != self.answer_dists["reference_label"] ans = self.answer_dists[mask] - ans['rank'] = ans['rank'] - 1 + ans["rank"] = ans["rank"] - 1 assert_frame_equal(ans.to_dataframe(), knn2.to_dataframe()) - ## With string row labels - m = tc.nearest_neighbors.create(self.refs, self.label, features=None, - distance='euclidean', - method='brute_force', - verbose=False) - - knn = m.similarity_graph(k=None, radius=None, output_type='SFrame', - include_self_edges=True, verbose=False) + m = tc.nearest_neighbors.create( + self.refs, + self.label, + features=None, + distance="euclidean", + method="brute_force", + verbose=False, + ) + + knn = m.similarity_graph( + k=None, + radius=None, + output_type="SFrame", + include_self_edges=True, + verbose=False, + ) ans = copy.copy(self.answer_dists) - ans['query_label'] = ans['query_label'].astype(str) + 'a' - ans['reference_label'] = ans['reference_label'].astype(str) + 'a' + ans["query_label"] = ans["query_label"].astype(str) + "a" + ans["reference_label"] = ans["reference_label"].astype(str) + "a" assert_frame_equal(ans.to_dataframe(), knn.to_dataframe()) - knn2 = m.similarity_graph(k=None, radius=None, include_self_edges=False, - output_type='SFrame', verbose=False) - mask = self.answer_dists['query_label'] != self.answer_dists['reference_label'] + knn2 = m.similarity_graph( + k=None, + radius=None, + include_self_edges=False, + output_type="SFrame", + verbose=False, + ) + mask = self.answer_dists["query_label"] != self.answer_dists["reference_label"] ans = self.answer_dists[mask] - ans['rank'] = ans['rank'] - 1 - ans['query_label'] = ans['query_label'].astype(str) + 'a' - ans['reference_label'] = ans['reference_label'].astype(str) + 'a' + ans["rank"] = ans["rank"] - 1 + ans["query_label"] = ans["query_label"].astype(str) + "a" + ans["reference_label"] = ans["reference_label"].astype(str) + "a" assert_frame_equal(ans.to_dataframe(), knn2.to_dataframe()) @@ -1169,21 +1443,35 @@ def test_output_type(self): Check that the results can be returned as either an SFrame or an SGraph and that the results match in both of these forms. """ - m = tc.nearest_neighbors.create(self.refs, features=self.features, - distance='euclidean', - method='brute_force', - verbose=False) - - knn = m.similarity_graph(k=None, radius=None, include_self_edges=False, - output_type='SFrame', verbose=False) - - sg = m.similarity_graph(k=None, radius=None, include_self_edges=False, - output_type='SGraph', verbose=False) + m = tc.nearest_neighbors.create( + self.refs, + features=self.features, + distance="euclidean", + method="brute_force", + verbose=False, + ) + + knn = m.similarity_graph( + k=None, + radius=None, + include_self_edges=False, + output_type="SFrame", + verbose=False, + ) + + sg = m.similarity_graph( + k=None, + radius=None, + include_self_edges=False, + output_type="SGraph", + verbose=False, + ) sg_edges = copy.copy(sg.edges) - sg_edges = sg_edges.rename({'__src_id': 'query_label', - '__dst_id': 'reference_label'}, inplace=True) - sg_edges = sg_edges.sort(['query_label', 'distance']) + sg_edges = sg_edges.rename( + {"__src_id": "query_label", "__dst_id": "reference_label"}, inplace=True + ) + sg_edges = sg_edges.sort(["query_label", "distance"]) assert_frame_equal(sg_edges.to_dataframe(), knn.to_dataframe()) @@ -1193,19 +1481,26 @@ def test_other_methods(self): """ ## Ball tree - m = tc.nearest_neighbors.create(self.refs, self.label, features=None, - distance='euclidean', - method='ball_tree', - verbose=False) + m = tc.nearest_neighbors.create( + self.refs, + self.label, + features=None, + distance="euclidean", + method="ball_tree", + verbose=False, + ) knn = m.similarity_graph(k=5, radius=None, verbose=False) - ## LSH - m = tc.nearest_neighbors.create(self.refs, self.label, features=None, - distance='euclidean', - method='lsh', - verbose=False) + m = tc.nearest_neighbors.create( + self.refs, + self.label, + features=None, + distance="euclidean", + method="lsh", + verbose=False, + ) knn = m.similarity_graph(k=5, radius=None, verbose=False) @@ -1220,20 +1515,20 @@ class GeneralQueryTest(unittest.TestCase): def setUpClass(self): np.random.seed(19) - p = 3 # dimension - n = 10 # number of reference points - self.n_query = 2 # number of query points + p = 3 # dimension + n = 10 # number of reference points + self.n_query = 2 # number of query points self.refs = tc.SFrame(np.random.rand(n, p)) - self.refs = self.refs.unpack('X1') + self.refs = self.refs.unpack("X1") - self.label = 'id' + self.label = "id" self.refs = self.refs.add_row_number(self.label) - self.queries = self.refs[0:self.n_query] + self.queries = self.refs[0 : self.n_query] df_refs = self.refs.to_dataframe().drop(self.label, axis=1) df_queries = self.queries.to_dataframe().drop(self.label, axis=1) - self.answer_dists = scipy_dist(df_queries, df_refs, 'euclidean') + self.answer_dists = scipy_dist(df_queries, df_refs, "euclidean") def test_neighborhood_constraints(self): """ @@ -1241,10 +1536,13 @@ def test_neighborhood_constraints(self): """ ## No constraints - m = tc.nearest_neighbors.create(self.refs, self.label, - distance='euclidean', - method='brute_force', - verbose=False) + m = tc.nearest_neighbors.create( + self.refs, + self.label, + distance="euclidean", + method="brute_force", + verbose=False, + ) knn = m.query(self.queries, k=None, radius=None, verbose=False) @@ -1252,18 +1550,18 @@ def test_neighborhood_constraints(self): ## k only, no radius knn = m.query(self.queries, k=3, radius=None, verbose=False) - ans = self.answer_dists[self.answer_dists['rank'] <= 3] + ans = self.answer_dists[self.answer_dists["rank"] <= 3] assert_frame_equal(ans.to_dataframe(), knn.to_dataframe()) ## radius only, no k knn = m.query(self.queries, k=None, radius=0.4, verbose=False) - ans = self.answer_dists[self.answer_dists['distance'] <= 0.4] + ans = self.answer_dists[self.answer_dists["distance"] <= 0.4] assert_frame_equal(ans.to_dataframe(), knn.to_dataframe()) ## Both radius and k knn = m.query(self.queries, k=3, radius=0.4, verbose=False) - ans = self.answer_dists[self.answer_dists['rank'] <= 3] - ans = ans[ans['distance'] <= 0.4] + ans = self.answer_dists[self.answer_dists["rank"] <= 3] + ans = ans[ans["distance"] <= 0.4] assert_frame_equal(ans.to_dataframe(), knn.to_dataframe()) def test_labels(self): @@ -1271,29 +1569,34 @@ def test_labels(self): Test query accuracy for various configurations of row labels. """ sfq = self.queries[:] - sfq.remove_column('id', inplace=True) + sfq.remove_column("id", inplace=True) k = 3 - m = tc.nearest_neighbors.create(self.refs, label=self.label, - features=None, distance='euclidean', - method='brute_force', verbose=False) + m = tc.nearest_neighbors.create( + self.refs, + label=self.label, + features=None, + distance="euclidean", + method="brute_force", + verbose=False, + ) knn_correct = m.query(self.queries, self.label, k=k, verbose=False) ## No label should work fine knn = m.query(sfq, label=None, k=k, verbose=False) - self.assertTrue(knn['query_label'].dtype is int) + self.assertTrue(knn["query_label"].dtype is int) assert_frame_equal(knn_correct.to_dataframe(), knn.to_dataframe()) ## Integer label should work fine - sfq = sfq.add_row_number(column_name='id') - knn = m.query(sfq, label='id', k=k, verbose=False) - self.assertTrue(knn['query_label'].dtype is int) + sfq = sfq.add_row_number(column_name="id") + knn = m.query(sfq, label="id", k=k, verbose=False) + self.assertTrue(knn["query_label"].dtype is int) assert_frame_equal(knn_correct.to_dataframe(), knn.to_dataframe()) ## Float label should fail - sfq['id'] = sfq['id'].astype(float) + sfq["id"] = sfq["id"].astype(float) with self.assertRaises(TypeError): - knn = m.query(sfq, label='id', k=k) + knn = m.query(sfq, label="id", k=k) class NearestNeighborsNumericQueryTest(unittest.TestCase): @@ -1306,39 +1609,50 @@ def setUpClass(self): ## Make data np.random.seed(19) - p = 3 # dimension - n = 10 # number of reference points - self.n_query = 2 # number of query points + p = 3 # dimension + n = 10 # number of reference points + self.n_query = 2 # number of query points self.refs = tc.SFrame(np.random.rand(n, p)) - self.refs = self.refs.unpack('X1') + self.refs = self.refs.unpack("X1") - self.label = 'id' + self.label = "id" self.refs = self.refs.add_row_number(self.label) - self.queries = self.refs[0:self.n_query] + self.queries = self.refs[0 : self.n_query] ## Answer items common to all tests self.r = self.refs.to_dataframe().drop(self.label, axis=1) self.q = self.queries.to_dataframe().drop(self.label, axis=1) - def _test_query(self, answer, sf_ref, sf_query, label, features, distance, - method, k=None, radius=None): + def _test_query( + self, + answer, + sf_ref, + sf_query, + label, + features, + distance, + method, + k=None, + radius=None, + ): """ Test the accuracy of exact queries against python brute force solution, from Scipy. """ ## Construct nearest neighbors model and get query results - m = tc.nearest_neighbors.create(sf_ref, label, features, distance, - method, verbose=False) + m = tc.nearest_neighbors.create( + sf_ref, label, features, distance, method, verbose=False + ) knn = m.query(sf_query, label, k=k, radius=radius, verbose=False) ## Trim the answers to the k and radius parameters if k is not None: - answer = answer[answer['rank'] <= k] + answer = answer[answer["rank"] <= k] if radius is not None: - answer = answer[answer['distance'] <= radius] + answer = answer[answer["distance"] <= radius] ## Test data frame equality assert_frame_equal(answer.to_dataframe(), knn.to_dataframe()) @@ -1351,28 +1665,53 @@ def test_query_distances(self): idx_row = np.array([[x] for x in range(self.n_query)]) ## Euclidean and cosine distance - for dist in ['euclidean', 'cosine']: + for dist in ["euclidean", "cosine"]: answer = scipy_dist(self.q, self.r, dist) - self._test_query(answer, self.refs, self.queries, self.label, - features=None, distance=dist, method='brute_force') - + self._test_query( + answer, + self.refs, + self.queries, + self.label, + features=None, + distance=dist, + method="brute_force", + ) ## Squared euclidean distances - answer = scipy_dist(self.q, self.r, 'sqeuclidean') - self._test_query(answer, self.refs, self.queries, self.label, - features=None, distance='squared_euclidean', method='brute_force') - + answer = scipy_dist(self.q, self.r, "sqeuclidean") + self._test_query( + answer, + self.refs, + self.queries, + self.label, + features=None, + distance="squared_euclidean", + method="brute_force", + ) ## Manhattan distance - answer = scipy_dist(self.q, self.r, 'cityblock') - self._test_query(answer, self.refs, self.queries, self.label, - features=None, distance='manhattan', method='brute_force') - + answer = scipy_dist(self.q, self.r, "cityblock") + self._test_query( + answer, + self.refs, + self.queries, + self.label, + features=None, + distance="manhattan", + method="brute_force", + ) ## Auto distance (brute force, dense features) - answer = scipy_dist(self.q, self.r, 'euclidean') - self._test_query(answer, self.refs, self.queries, self.label, - features=None, distance='auto', method='brute_force') + answer = scipy_dist(self.q, self.r, "euclidean") + self._test_query( + answer, + self.refs, + self.queries, + self.label, + features=None, + distance="auto", + method="brute_force", + ) ## Transformed dot product distance D = self.q.dot(self.r.T) @@ -1384,30 +1723,47 @@ def test_query_distances(self): idx_col = np.argsort(D, axis=1) idx_row = np.array([[x] for x in range(n_query)]) query_labels = list(np.repeat(range(n_query), n)) - ranks = np.tile(range(1, n+1), n_query) - - answer = tc.SFrame({'query_label': query_labels, - 'reference_label': idx_col.flatten(), - 'distance': D[idx_row, idx_col].flatten(), - 'rank': ranks}) - - answer.swap_columns('distance', 'query_label', inplace=True) - answer.swap_columns('distance', 'reference_label', inplace=True) - answer.swap_columns('distance', 'rank', inplace=True) - - self._test_query(answer, self.refs, self.queries, self.label, - features=None, distance='transformed_dot_product', - method='brute_force') + ranks = np.tile(range(1, n + 1), n_query) + + answer = tc.SFrame( + { + "query_label": query_labels, + "reference_label": idx_col.flatten(), + "distance": D[idx_row, idx_col].flatten(), + "rank": ranks, + } + ) + + answer.swap_columns("distance", "query_label", inplace=True) + answer.swap_columns("distance", "reference_label", inplace=True) + answer.swap_columns("distance", "rank", inplace=True) + + self._test_query( + answer, + self.refs, + self.queries, + self.label, + features=None, + distance="transformed_dot_product", + method="brute_force", + ) def test_query_methods(self): """ Test query accuracy for various nearest neighbor methods. """ - answer = scipy_dist(self.q, self.r, 'euclidean') + answer = scipy_dist(self.q, self.r, "euclidean") - for method in ['auto', 'brute_force', 'ball_tree']: - self._test_query(answer, self.refs, self.queries, self.label, - features=None, distance='euclidean', method=method) + for method in ["auto", "brute_force", "ball_tree"]: + self._test_query( + answer, + self.refs, + self.queries, + self.label, + features=None, + distance="euclidean", + method=method, + ) def test_blockwise_brute_force(self): """ @@ -1423,16 +1779,17 @@ def test_blockwise_brute_force(self): sf = tc.SFrame(np.random.rand(n, d)) - m = tc.nearest_neighbors.create(sf, method='brute_force', - distance='euclidean', verbose=False) + m = tc.nearest_neighbors.create( + sf, method="brute_force", distance="euclidean", verbose=False + ) sf_query = tc.SFrame(np.random.rand(n_query, d)) knn = m.query(sf_query, verbose=False) # blockwise brute force query - knn2 = m.query(sf_query[:10], verbose=False) # pairwise brute force query + knn2 = m.query(sf_query[:10], verbose=False) # pairwise brute force query self.assertEqual(knn.num_rows(), 21 * 5) - assert_frame_equal(knn[:10 * 5].to_dataframe(), knn2.to_dataframe()) + assert_frame_equal(knn[: 10 * 5].to_dataframe(), knn2.to_dataframe()) def test_similarity_graph(self): """ @@ -1445,67 +1802,69 @@ def test_similarity_graph(self): n, d = 500, 10 sf = tc.SFrame(np.random.rand(n, d)) - m = tc.nearest_neighbors.create(sf, method='brute_force', - distance='euclidean', verbose=False) + m = tc.nearest_neighbors.create( + sf, method="brute_force", distance="euclidean", verbose=False + ) knn = m.query(sf[:10], k=3, verbose=False) - knn_graph = m.similarity_graph(k=2, output_type='SFrame', verbose=False) + knn_graph = m.similarity_graph(k=2, output_type="SFrame", verbose=False) ## Basic metadata about the output self.assertEqual(knn_graph.num_rows(), 1000) - self.assertEqual(knn_graph['rank'].max(), 2) - self.assertGreaterEqual(knn_graph['distance'].min(), 0.) + self.assertEqual(knn_graph["rank"].max(), 2) + self.assertGreaterEqual(knn_graph["distance"].min(), 0.0) ## No self-edges - label_diff = knn_graph['query_label'] - knn_graph['reference_label'] + label_diff = knn_graph["query_label"] - knn_graph["reference_label"] self.assertEqual(sum(label_diff == 0), 0) ## Exact match for query results with different - knn = knn[knn['rank'] > 1] - knn['rank'] = knn['rank'] - 1 + knn = knn[knn["rank"] > 1] + knn["rank"] = knn["rank"] - 1 assert_frame_equal(knn.to_dataframe(), knn_graph[:20].to_dataframe()) - ### Tweaking query-time parameters ### ------------------------------ - knn_graph2 = m.similarity_graph(k=5, output_type='SFrame', verbose=False) + knn_graph2 = m.similarity_graph(k=5, output_type="SFrame", verbose=False) self.assertEqual(knn_graph2.num_rows(), 2500) - self.assertEqual(knn_graph2['rank'].max(), 5) - self.assertGreaterEqual(knn_graph2['distance'].min(), 0.) + self.assertEqual(knn_graph2["rank"].max(), 5) + self.assertGreaterEqual(knn_graph2["distance"].min(), 0.0) - knn_graph3 = m.similarity_graph(k=None, radius=0.5, - output_type='SFrame', verbose=False) - self.assertLessEqual(knn_graph3['distance'].max(), 0.5) - - knn_graph4 = m.similarity_graph(k=2, radius=0.5, output_type='SFrame', - verbose=False) - self.assertEqual(knn_graph4['rank'].max(), 2) - self.assertLessEqual(knn_graph4['distance'].max(), 0.5) + knn_graph3 = m.similarity_graph( + k=None, radius=0.5, output_type="SFrame", verbose=False + ) + self.assertLessEqual(knn_graph3["distance"].max(), 0.5) + knn_graph4 = m.similarity_graph( + k=2, radius=0.5, output_type="SFrame", verbose=False + ) + self.assertEqual(knn_graph4["rank"].max(), 2) + self.assertLessEqual(knn_graph4["distance"].max(), 0.5) ### Pairwise similarity graph - manhattan distance ### ---------------------------------------------- n, d = 500, 10 sf = tc.SFrame(np.random.rand(n, d)) - m = tc.nearest_neighbors.create(sf, method='brute_force', - distance='manhattan', verbose=False) + m = tc.nearest_neighbors.create( + sf, method="brute_force", distance="manhattan", verbose=False + ) knn = m.query(sf[:10], k=3, verbose=False) - knn_graph = m.similarity_graph(k=2, output_type='SFrame', verbose=False) + knn_graph = m.similarity_graph(k=2, output_type="SFrame", verbose=False) ## Basic metadata about the output self.assertEqual(knn_graph.num_rows(), 1000) - self.assertEqual(knn_graph['rank'].max(), 2) - self.assertGreaterEqual(knn_graph['distance'].min(), 0.) + self.assertEqual(knn_graph["rank"].max(), 2) + self.assertGreaterEqual(knn_graph["distance"].min(), 0.0) ## No self-edges - label_diff = knn_graph['query_label'] - knn_graph['reference_label'] + label_diff = knn_graph["query_label"] - knn_graph["reference_label"] self.assertEqual(sum(label_diff == 0), 0) ## Exact match for query results with different - knn = knn[knn['rank'] > 1] - knn['rank'] = knn['rank'] - 1 + knn = knn[knn["rank"] > 1] + knn["rank"] = knn["rank"] - 1 assert_frame_equal(knn.to_dataframe(), knn_graph[:20].to_dataframe()) @@ -1519,51 +1878,53 @@ def setUpClass(self): n = 10 n_query = 2 - # Generate sparse data: an SFrame with a column of type dict - self.field = 'docs' + # Generate sparse data: an SFrame with a column of type dict + self.field = "docs" self.refs = tc.SFrame() self.refs[self.field] = [random_dict(5, 3) for i in range(n)] - self.k = 3 # number of neighbors to return + self.k = 3 # number of neighbors to return self.radius = 1.0 # radius to use - self.label = 'row_label' + self.label = "row_label" self.refs[self.label] = [str(x) for x in range(n)] - def _test_query(self, sf_ref, sf_query, label, features, distance, method, - k=5, radius=1.0): + def _test_query( + self, sf_ref, sf_query, label, features, distance, method, k=5, radius=1.0 + ): """ Test the accuracy of exact queries against hand-coded solution above. """ ## Construct nearest neighbors model and get query results - m = tc.nearest_neighbors.create(sf_ref, label, features, distance, - method, verbose=False) + m = tc.nearest_neighbors.create( + sf_ref, label, features, distance, method, verbose=False + ) knn = m.query(sf_query, label, k=k, radius=radius) # TODO: Speed up this test for row in knn: - q = row['query_label'] - r = row['reference_label'] + q = row["query_label"] + r = row["reference_label"] query = sf_ref[int(q)][self.field] ref = sf_ref[int(r)][self.field] - score = row['distance'] + score = row["distance"] - if distance == 'cosine': + if distance == "cosine": ans = cosine(query, ref) - elif distance == 'dot_product': + elif distance == "dot_product": ans = dot_product(query, ref) - elif distance == 'transformed_dot_product': + elif distance == "transformed_dot_product": ans = transformed_dot_product(query, ref) - elif distance == 'jaccard': + elif distance == "jaccard": ans = jaccard(query, ref) - elif distance == 'weighted_jaccard': + elif distance == "weighted_jaccard": ans = weighted_jaccard(query, ref) - elif distance == 'euclidean': + elif distance == "euclidean": ans = euclidean(query, ref) - elif distance == 'squared_euclidean': + elif distance == "squared_euclidean": ans = squared_euclidean(query, ref) - elif distance == 'manhattan': + elif distance == "manhattan": ans = manhattan(query, ref) else: raise RuntimeError("Unknown distance") @@ -1573,23 +1934,57 @@ def test_query_distances(self): """ Test query accuracy for various distances. """ - for dist in ['euclidean', 'squared_euclidean', 'manhattan', 'cosine', - 'transformed_dot_product', 'jaccard', 'weighted_jaccard']: - - self._test_query(self.refs, self.refs, self.label, features=None, - distance=dist, method='brute_force') - self._test_query(self.refs, self.refs, self.label, features=None, - distance=dist, method='brute_force', k=self.k) - self._test_query(self.refs, self.refs, self.label, features=None, - distance=dist, method='brute_force', k=self.k, radius=self.radius) + for dist in [ + "euclidean", + "squared_euclidean", + "manhattan", + "cosine", + "transformed_dot_product", + "jaccard", + "weighted_jaccard", + ]: + + self._test_query( + self.refs, + self.refs, + self.label, + features=None, + distance=dist, + method="brute_force", + ) + self._test_query( + self.refs, + self.refs, + self.label, + features=None, + distance=dist, + method="brute_force", + k=self.k, + ) + self._test_query( + self.refs, + self.refs, + self.label, + features=None, + distance=dist, + method="brute_force", + k=self.k, + radius=self.radius, + ) def test_query_methods(self): """ Test query accuracy for various nearest neighbor methods. """ - for method in ['auto', 'ball_tree', 'brute_force']: - self._test_query(self.refs, self.refs, self.label, features=None, - distance='euclidean', method=method) + for method in ["auto", "ball_tree", "brute_force"]: + self._test_query( + self.refs, + self.refs, + self.label, + features=None, + distance="euclidean", + method=method, + ) def test_similarity_graph(self): """ @@ -1599,35 +1994,36 @@ def test_similarity_graph(self): """ n = 30 sf = tc.SFrame() - sf['docs'] = [random_dict(5 ,3) for i in range(n)] - sf = sf.add_row_number('id') + sf["docs"] = [random_dict(5, 3) for i in range(n)] + sf = sf.add_row_number("id") - m = tc.nearest_neighbors.create(sf, label='id', features=None, - distance='euclidean', - method='brute_force') + m = tc.nearest_neighbors.create( + sf, label="id", features=None, distance="euclidean", method="brute_force" + ) knn = m.query(sf, k=3, verbose=False) - knn_graph = m.similarity_graph(k=2, output_type='SFrame', verbose=False) + knn_graph = m.similarity_graph(k=2, output_type="SFrame", verbose=False) ## Basic metadata about the output self.assertEqual(knn_graph.num_rows(), 60) - self.assertEqual(knn_graph['rank'].max(), 2) - self.assertGreaterEqual(knn_graph['distance'].min(), 0.) + self.assertEqual(knn_graph["rank"].max(), 2) + self.assertGreaterEqual(knn_graph["distance"].min(), 0.0) ## No self edges - label_diff = knn_graph['query_label'] - knn_graph['reference_label'] + label_diff = knn_graph["query_label"] - knn_graph["reference_label"] self.assertEqual(sum(label_diff == 0), 0) ## Exact match to query method, adjusting for no self-edges. NOTE: for # this type of data and distance, there are many ties, so the reference # labels won't necessarily match across the two methods. Only check the # three columns that must match exactly. - knn = knn[knn['rank'] > 1] - knn['rank'] = knn['rank'] - 1 - test_ftrs = ['query_label', 'distance', 'rank'] + knn = knn[knn["rank"] > 1] + knn["rank"] = knn["rank"] - 1 + test_ftrs = ["query_label", "distance", "rank"] - assert_frame_equal(knn[test_ftrs].to_dataframe(), - knn_graph[test_ftrs].to_dataframe()) + assert_frame_equal( + knn[test_ftrs].to_dataframe(), knn_graph[test_ftrs].to_dataframe() + ) class NearestNeighborsStringQueryTest(unittest.TestCase): @@ -1642,8 +2038,8 @@ def setUpClass(self): n = 5 word_length = 3 alphabet_size = 5 - self.label = 'id' - self.refs = tc.SFrame({'X1': random_string(n, word_length, alphabet_size)}) + self.label = "id" + self.refs = tc.SFrame({"X1": random_string(n, word_length, alphabet_size)}) self.refs = self.refs.add_row_number(self.label) def _test_query(self, sf_ref, sf_query, features, distance, method): @@ -1652,31 +2048,39 @@ def _test_query(self, sf_ref, sf_query, features, distance, method): """ ## Get the toolkit answer - m = tc.nearest_neighbors.create(sf_ref, label=self.label, - features=features, - distance=distance, - method=method, - verbose=False) + m = tc.nearest_neighbors.create( + sf_ref, + label=self.label, + features=features, + distance=distance, + method=method, + verbose=False, + ) knn = m.query(sf_query, verbose=False) ## Compute the answer from scratch - knn = knn.join(self.refs, on={'query_label': 'id'}, how='left') - knn = knn.join(self.refs, on={'reference_label': 'id'}, how='left') + knn = knn.join(self.refs, on={"query_label": "id"}, how="left") + knn = knn.join(self.refs, on={"reference_label": "id"}, how="left") - if distance == 'levenshtein': - knn['test_dist'] = knn.apply(lambda x: levenshtein(x['X1'], x['X1.1'])) + if distance == "levenshtein": + knn["test_dist"] = knn.apply(lambda x: levenshtein(x["X1"], x["X1.1"])) else: raise ValueError("Distance not found in string query test.") - self.assertAlmostEqual(sum(knn['distance'] - knn['test_dist']), 0) + self.assertAlmostEqual(sum(knn["distance"] - knn["test_dist"]), 0) def test_query_distances(self): """ Test query accuracy for various dista0nces. As of v1.1, only levenshtein distance is implemented. """ - self._test_query(self.refs, self.refs, features=None, - distance='levenshtein', method='brute_force') + self._test_query( + self.refs, + self.refs, + features=None, + distance="levenshtein", + method="brute_force", + ) def test_similarity_graph(self): """ @@ -1685,55 +2089,61 @@ def test_similarity_graph(self): reference dataset. """ n = 30 - sf = tc.SFrame({'X1': random_string(n, length=3, num_letters=5)}) - sf = sf.add_row_number('id') + sf = tc.SFrame({"X1": random_string(n, length=3, num_letters=5)}) + sf = sf.add_row_number("id") - m = tc.nearest_neighbors.create(sf, label='id', features=None, - distance='levenshtein', - method='brute_force') + m = tc.nearest_neighbors.create( + sf, label="id", features=None, distance="levenshtein", method="brute_force" + ) knn = m.query(sf, k=3, verbose=False) - knn_graph = m.similarity_graph(k=2, output_type='SFrame', verbose=False) + knn_graph = m.similarity_graph(k=2, output_type="SFrame", verbose=False) ## Basic metadata about the output self.assertEqual(knn_graph.num_rows(), 60) - self.assertEqual(knn_graph['rank'].max(), 2) - self.assertGreaterEqual(knn_graph['distance'].min(), 0.) + self.assertEqual(knn_graph["rank"].max(), 2) + self.assertGreaterEqual(knn_graph["distance"].min(), 0.0) ## No self edges - label_diff = knn_graph['query_label'] - knn_graph['reference_label'] + label_diff = knn_graph["query_label"] - knn_graph["reference_label"] self.assertEqual(sum(label_diff == 0), 0) ## Exact match to query method, adjusting for no self-edges. NOTE: for # this type of data and distance, there are many ties, so the reference # labels won't necessarily match across the two methods. Only check the # three columns that must match exactly. - knn = knn[knn['rank'] > 1] - knn['rank'] = knn['rank'] - 1 - test_ftrs = ['query_label', 'distance', 'rank'] + knn = knn[knn["rank"] > 1] + knn["rank"] = knn["rank"] - 1 + test_ftrs = ["query_label", "distance", "rank"] - assert_frame_equal(knn[test_ftrs].to_dataframe(), - knn_graph[test_ftrs].to_dataframe()) + assert_frame_equal( + knn[test_ftrs].to_dataframe(), knn_graph[test_ftrs].to_dataframe() + ) def test_missing_queries(self): """ Check that missing string queries are correctly imputed to be empty strings. """ - sf = tc.SFrame({'x0': ['a', 'b'], - 'x1': ['d', 'e']}) + sf = tc.SFrame({"x0": ["a", "b"], "x1": ["d", "e"]}) - sf_query = tc.SFrame({'x0': ['a', None, 'b', None], - 'x1': ['b', 'c', None, None]}) + sf_query = tc.SFrame( + {"x0": ["a", None, "b", None], "x1": ["b", "c", None, None]} + ) m = tc.nearest_neighbors.create(sf, verbose=False) knn = m.query(sf_query, k=None, radius=None) - answer = tc.SFrame({'query_label': [0, 0, 1, 1, 2, 2, 3, 3], - 'distance': [1., 2., 2., 2., 1., 2., 2., 2.]}) + answer = tc.SFrame( + { + "query_label": [0, 0, 1, 1, 2, 2, 3, 3], + "distance": [1.0, 2.0, 2.0, 2.0, 1.0, 2.0, 2.0, 2.0], + } + ) - assert_frame_equal(knn[['distance', 'query_label']].to_dataframe(), - answer.to_dataframe()) + assert_frame_equal( + knn[["distance", "query_label"]].to_dataframe(), answer.to_dataframe() + ) class NearestNeighborsCompositeQueryTest(unittest.TestCase): @@ -1755,7 +2165,7 @@ def setUpClass(self): # self.refs['address'], n=2, method='character', to_lower=False, # ignore_space=True) - self.label = 'id' + self.label = "id" self.refs = self.refs.add_row_number(self.label) def _test_query(self, composite_params): @@ -1763,22 +2173,23 @@ def _test_query(self, composite_params): Test query accuracy using arbitrary composite distance inputs. """ ## Get the toolkit answer - m = tc.nearest_neighbors.create(self.refs, label=self.label, - distance=composite_params, - verbose=False) + m = tc.nearest_neighbors.create( + self.refs, label=self.label, distance=composite_params, verbose=False + ) knn = m.query(self.refs, verbose=False) ## Compute the answer from scratch dist_ans = [] for row in knn: - query = self.refs[row['query_label']] - ref = self.refs[row['reference_label']] - dist_row = tc.distances.compute_composite_distance(composite_params, - query, ref) + query = self.refs[row["query_label"]] + ref = self.refs[row["reference_label"]] + dist_row = tc.distances.compute_composite_distance( + composite_params, query, ref + ) dist_ans.append(dist_row) - knn['test_dist'] = dist_ans - self.assertAlmostEqual(sum(knn['distance'] - knn['test_dist']), 0) + knn["test_dist"] = dist_ans + self.assertAlmostEqual(sum(knn["distance"] - knn["test_dist"]), 0) def test_composite_queries(self): """ @@ -1787,13 +2198,17 @@ def test_composite_queries(self): """ ## Test accuracy over overlapping feature sets - distance_components = [[['X1', 'X2'], 'euclidean', 1], - [['X2', 'X3'], 'manhattan', 1]] + distance_components = [ + [["X1", "X2"], "euclidean", 1], + [["X2", "X3"], "manhattan", 1], + ] self._test_query(distance_components) ## Test accuracy of different weights - distance_components = [[['X1', 'X2'], 'euclidean', 2], - [['X2', 'X3'], 'manhattan', 3.4]] + distance_components = [ + [["X1", "X2"], "euclidean", 2], + [["X2", "X3"], "manhattan", 3.4], + ] self._test_query(distance_components) def test_similarity_graph(self): @@ -1804,32 +2219,31 @@ def test_similarity_graph(self): """ n, d = 30, 3 sf = tc.SFrame(np.random.random((n, d))) - sf = sf.unpack('X1', column_name_prefix='') - sf = sf.add_row_number('id') + sf = sf.unpack("X1", column_name_prefix="") + sf = sf.add_row_number("id") - my_dist = [[['0', '1'], 'euclidean', 1], - [['1', '2'], 'manhattan', 1]] + my_dist = [[["0", "1"], "euclidean", 1], [["1", "2"], "manhattan", 1]] - m = tc.nearest_neighbors.create(sf, label='id', features=None, - distance=my_dist, - method='brute_force') + m = tc.nearest_neighbors.create( + sf, label="id", features=None, distance=my_dist, method="brute_force" + ) knn = m.query(sf, k=3, verbose=False) - knn_graph = m.similarity_graph(k=2, output_type='SFrame', verbose=False) + knn_graph = m.similarity_graph(k=2, output_type="SFrame", verbose=False) ## Basic metadata about the output self.assertEqual(knn_graph.num_rows(), 60) - self.assertEqual(knn_graph['rank'].max(), 2) - self.assertGreaterEqual(knn_graph['distance'].min(), 0.) + self.assertEqual(knn_graph["rank"].max(), 2) + self.assertGreaterEqual(knn_graph["distance"].min(), 0.0) ## No self edges - label_diff = knn_graph['query_label'] - knn_graph['reference_label'] + label_diff = knn_graph["query_label"] - knn_graph["reference_label"] self.assertEqual(sum(label_diff == 0), 0) ## Exact match to query method, adjusting for no self-edges. - knn = knn[knn['rank'] > 1] - knn['rank'] = knn['rank'] - 1 - test_ftrs = ['query_label', 'distance', 'rank'] + knn = knn[knn["rank"] > 1] + knn["rank"] = knn["rank"] - 1 + test_ftrs = ["query_label", "distance", "rank"] assert_frame_equal(knn.to_dataframe(), knn_graph.to_dataframe()) @@ -1838,18 +2252,24 @@ class ValidateListUtilityTest(unittest.TestCase): """ Unit test class for checking correctness of utility function. """ - def setUp(self): - self.check_for_numeric_fixed_length_lists = partial(_validate_lists, - allowed_types=[int, float, long], - require_equal_length=True, - require_same_type=True) - self.check_for_str_lists = partial(_validate_lists, allowed_types=[str], - require_equal_length=False, - require_same_type=True) + def setUp(self): + self.check_for_numeric_fixed_length_lists = partial( + _validate_lists, + allowed_types=[int, float, long], + require_equal_length=True, + require_same_type=True, + ) + + self.check_for_str_lists = partial( + _validate_lists, + allowed_types=[str], + require_equal_length=False, + require_same_type=True, + ) def test_str_cases(self): - lst = [['a', 'b'], ['a', 'b', 'c']] + lst = [["a", "b"], ["a", "b", "c"]] lst = tc.SArray(lst, dtype=list) observed = self.check_for_str_lists(lst) self.assertTrue(observed) @@ -1859,12 +2279,11 @@ def test_str_cases(self): observed = self.check_for_str_lists(lst) self.assertFalse(observed) - lst = [['a', 2, 3], [1, 2, 3, 4]] + lst = [["a", 2, 3], [1, 2, 3, 4]] lst = tc.SArray(lst, dtype=list) observed = self.check_for_str_lists(lst) self.assertFalse(observed) - def test_true_numeric_cases(self): # Same length, all floats @@ -1881,19 +2300,20 @@ def test_true_numeric_cases(self): # Only checks the first 10 lst = [ - [1, 2], - [2, 3], - [1, 2], - [1, 2], - [2, 3], - [1, 2], - [1, 2], - [2, 3], - [1, 2], - [1, 2], - [2, 3, 4], - [1, 2, 3], - [2, 3, 4, 5, 6]] + [1, 2], + [2, 3], + [1, 2], + [1, 2], + [2, 3], + [1, 2], + [1, 2], + [2, 3], + [1, 2], + [1, 2], + [2, 3, 4], + [1, 2, 3], + [2, 3, 4, 5, 6], + ] lst = tc.SArray(lst, dtype=list) observed = self.check_for_numeric_fixed_length_lists(lst) self.assertTrue(observed) @@ -1938,7 +2358,7 @@ def test_false_numeric_cases(self): def test_bad_cases(self): - lst = [{'a': 3}, {'b': 5}] + lst = [{"a": 3}, {"b": 5}] lst = tc.SArray(lst) with self.assertRaises(ValueError): observed = self.check_for_numeric_fixed_length_lists(lst) @@ -1954,7 +2374,7 @@ def random_dict(num_elements=10, max_count=20): words = string.ascii_lowercase d = {} for j in range(num_elements): - w = words[random.randint(0, len(words)-1)] + w = words[random.randint(0, len(words) - 1)] d[w] = random.randint(0, max_count) return d @@ -1970,10 +2390,11 @@ def random_string(number, length, num_letters): word = [] for j in range(length): word.append(random.choice(letters)) - result.append(''.join(word)) + result.append("".join(word)) return result + def random_list_of_str(number, length): """ Generate a list of random lists of strings. @@ -1986,6 +2407,7 @@ def random_list_of_str(number, length): results.append(result) return results + def scipy_dist(q, r, dist): n = len(r) n_query = len(q) @@ -1994,21 +2416,24 @@ def scipy_dist(q, r, dist): idx_col = np.argsort(D, axis=1) idx_row = np.array([[x] for x in range(n_query)]) query_labels = list(np.repeat(range(n_query), n)) - ranks = np.tile(range(1, n+1), n_query) + ranks = np.tile(range(1, n + 1), n_query) - answer = tc.SFrame({'query_label': query_labels, - 'reference_label': idx_col.flatten(), - 'distance': D[idx_row, idx_col].flatten(), - 'rank': ranks}) + answer = tc.SFrame( + { + "query_label": query_labels, + "reference_label": idx_col.flatten(), + "distance": D[idx_row, idx_col].flatten(), + "rank": ranks, + } + ) - answer.swap_columns('distance', 'query_label', inplace=True) - answer.swap_columns('distance', 'reference_label', inplace=True) - answer.swap_columns('distance', 'rank', inplace=True) + answer.swap_columns("distance", "query_label", inplace=True) + answer.swap_columns("distance", "reference_label", inplace=True) + answer.swap_columns("distance", "rank", inplace=True) return answer - if __name__ == "__main__": # Check if we are supposed to connect to another server diff --git a/src/python/turicreate/test/test_object_detector.py b/src/python/turicreate/test/test_object_detector.py index e2ca2ee63c..991f430caf 100644 --- a/src/python/turicreate/test/test_object_detector.py +++ b/src/python/turicreate/test/test_object_detector.py @@ -17,28 +17,39 @@ import sys import os from turicreate.toolkits._main import ToolkitError as _ToolkitError -from turicreate.toolkits._internal_utils import _raise_error_if_not_sarray, _mac_ver, _read_env_var_cpp +from turicreate.toolkits._internal_utils import ( + _raise_error_if_not_sarray, + _mac_ver, + _read_env_var_cpp, +) from six import StringIO import coremltools -_CLASSES = ['person', 'cat', 'dog', 'chair'] +_CLASSES = ["person", "cat", "dog", "chair"] + def _get_data(feature, annotations): from PIL import Image as _PIL_Image + rs = np.random.RandomState(1234) - def from_pil_image(pil_img, image_format='png'): + + def from_pil_image(pil_img, image_format="png"): # The above didn't work, so as a temporary fix write to temp files - if image_format == 'raw': + if image_format == "raw": image = np.array(pil_img) FORMAT_RAW = 2 - return tc.Image(_image_data=image.tobytes(), - _width=image.shape[1], - _height=image.shape[0], - _channels=image.shape[2], - _format_enum=FORMAT_RAW, - _image_data_size=image.size) + return tc.Image( + _image_data=image.tobytes(), + _width=image.shape[1], + _height=image.shape[0], + _channels=image.shape[2], + _format_enum=FORMAT_RAW, + _image_data_size=image.size, + ) else: - with tempfile.NamedTemporaryFile(mode='w+b', suffix='.' + image_format) as f: + with tempfile.NamedTemporaryFile( + mode="w+b", suffix="." + image_format + ) as f: pil_img.save(f, format=image_format) return tc.Image(f.name) @@ -47,13 +58,13 @@ def from_pil_image(pil_img, image_format='png'): classes = _CLASSES images = [] anns = [] - FORMATS = ['png', 'jpeg', 'raw'] + FORMATS = ["png", "jpeg", "raw"] for i in range(num_examples): # Randomly determine image size (should handle large and small) img_shape = tuple(rs.randint(100, 1000, size=2)) + (3,) img = rs.randint(255, size=img_shape) - pil_img = _PIL_Image.fromarray(img, mode='RGB') + pil_img = _PIL_Image.fromarray(img, mode="RGB") # Randomly select image format image_format = FORMATS[rs.randint(len(FORMATS))] images.append(from_pil_image(pil_img, image_format=image_format)) @@ -70,22 +81,14 @@ def from_pil_image(pil_img, image_format='png'): height = max(bottom - top, 1) label = { - 'coordinates': { - 'x': x, - 'y': y, - 'width': width, - 'height': height, - }, - 'label': classes[rs.randint(len(classes))], - 'type': 'rectangle', + "coordinates": {"x": x, "y": y, "width": width, "height": height,}, + "label": classes[rs.randint(len(classes))], + "type": "rectangle", } ann.append(label) anns.append(ann) - data = tc.SFrame({ - feature: tc.SArray(images), - annotations: tc.SArray(anns), - }) + data = tc.SFrame({feature: tc.SArray(images), annotations: tc.SArray(anns),}) return data @@ -95,86 +98,104 @@ def setUpClass(self): """ The setup class method for the basic test case with all default values. """ - self.feature = 'myimage' - self.annotations = 'myannotations' - self.pre_trained_model = 'darknet-yolo' + self.feature = "myimage" + self.annotations = "myannotations" + self.pre_trained_model = "darknet-yolo" ## Create the model self.def_opts = { - 'model': 'darknet-yolo', - 'max_iterations': 0, + "model": "darknet-yolo", + "max_iterations": 0, } # Model self.sf = _get_data(feature=self.feature, annotations=self.annotations) - self.model = tc.object_detector.create(self.sf, - feature=self.feature, - annotations=self.annotations, - batch_size=2, - max_iterations=1, - model=self.pre_trained_model) + self.model = tc.object_detector.create( + self.sf, + feature=self.feature, + annotations=self.annotations, + batch_size=2, + max_iterations=1, + model=self.pre_trained_model, + ) ## Answers self.opts = self.def_opts.copy() - self.opts['max_iterations'] = 1 + self.opts["max_iterations"] = 1 self.get_ans = { - '_model': lambda x: True, - '_class_to_index': lambda x: isinstance(x, dict), - '_training_time_as_string': lambda x: isinstance(x, str), - '_grid_shape': lambda x: tuple(x) == (13, 13), - 'model': lambda x: x == self.pre_trained_model, - 'anchors': lambda x: (isinstance(x, (list, tuple, np.ndarray)) and - len(x) > 0 and len(x[0]) == 2), - 'input_image_shape': lambda x: tuple(x) == (3, 416, 416), - 'batch_size': lambda x: x == 2, - 'classes': lambda x: x == sorted(_CLASSES), - 'feature': lambda x: x == self.feature, - 'max_iterations': lambda x: x >= 0, - 'non_maximum_suppression_threshold': lambda x: 0 <= x <= 1, - 'training_time': lambda x: x > 0, - 'training_iterations': lambda x: x > 0, - 'training_epochs': lambda x: x >= 0, - 'num_bounding_boxes': lambda x: x > 0, - 'num_examples': lambda x: x > 0, - 'training_loss': lambda x: x > 0, - 'annotations': lambda x: x == self.annotations, - 'num_classes': lambda x: x == len(_CLASSES), + "_model": lambda x: True, + "_class_to_index": lambda x: isinstance(x, dict), + "_training_time_as_string": lambda x: isinstance(x, str), + "_grid_shape": lambda x: tuple(x) == (13, 13), + "model": lambda x: x == self.pre_trained_model, + "anchors": lambda x: ( + isinstance(x, (list, tuple, np.ndarray)) + and len(x) > 0 + and len(x[0]) == 2 + ), + "input_image_shape": lambda x: tuple(x) == (3, 416, 416), + "batch_size": lambda x: x == 2, + "classes": lambda x: x == sorted(_CLASSES), + "feature": lambda x: x == self.feature, + "max_iterations": lambda x: x >= 0, + "non_maximum_suppression_threshold": lambda x: 0 <= x <= 1, + "training_time": lambda x: x > 0, + "training_iterations": lambda x: x > 0, + "training_epochs": lambda x: x >= 0, + "num_bounding_boxes": lambda x: x > 0, + "num_examples": lambda x: x > 0, + "training_loss": lambda x: x > 0, + "annotations": lambda x: x == self.annotations, + "num_classes": lambda x: x == len(_CLASSES), } - self.get_ans['annotation_position'] = lambda x: isinstance(x, str) - self.get_ans['annotation_scale'] = lambda x: isinstance(x, str) - self.get_ans['annotation_origin'] = lambda x: isinstance(x, str) - self.get_ans['grid_height'] = lambda x: x > 0 - self.get_ans['grid_width'] = lambda x: x > 0 - self.get_ans['random_seed'] = lambda x: True - self.get_ans['verbose'] = lambda x: True - del self.get_ans['_model'] - del self.get_ans['_class_to_index'] - del self.get_ans['_grid_shape'] - del self.get_ans['anchors'] - del self.get_ans['non_maximum_suppression_threshold'] + self.get_ans["annotation_position"] = lambda x: isinstance(x, str) + self.get_ans["annotation_scale"] = lambda x: isinstance(x, str) + self.get_ans["annotation_origin"] = lambda x: isinstance(x, str) + self.get_ans["grid_height"] = lambda x: x > 0 + self.get_ans["grid_width"] = lambda x: x > 0 + self.get_ans["random_seed"] = lambda x: True + self.get_ans["verbose"] = lambda x: True + del self.get_ans["_model"] + del self.get_ans["_class_to_index"] + del self.get_ans["_grid_shape"] + del self.get_ans["anchors"] + del self.get_ans["non_maximum_suppression_threshold"] self.fields_ans = self.get_ans.keys() def test_create_with_missing_value(self): - sf = self.sf.append(tc.SFrame({self.feature: tc.SArray([None], dtype=tc.Image), self.annotations: [self.sf[self.annotations][0]]})) + sf = self.sf.append( + tc.SFrame( + { + self.feature: tc.SArray([None], dtype=tc.Image), + self.annotations: [self.sf[self.annotations][0]], + } + ) + ) with self.assertRaises(_ToolkitError): - tc.object_detector.create(sf, feature=self.feature, annotations=self.annotations) - + tc.object_detector.create( + sf, feature=self.feature, annotations=self.annotations + ) def test_create_with_missing_feature(self): with self.assertRaises(_ToolkitError): - tc.object_detector.create(self.sf, feature='wrong_feature', annotations=self.annotations) + tc.object_detector.create( + self.sf, feature="wrong_feature", annotations=self.annotations + ) def test_create_with_missing_annotations(self): with self.assertRaises(_ToolkitError): - tc.object_detector.create(self.sf, feature=self.feature, annotations='wrong_annotations') + tc.object_detector.create( + self.sf, feature=self.feature, annotations="wrong_annotations" + ) def test_create_with_invalid_annotations_list_coord(self): with self.assertRaises(_ToolkitError): sf = self.sf.head() sf[self.annotations] = sf[self.annotations].apply( - lambda x: [{'label': _CLASSES[0], 'coordinates': [100, 50, 20, 40]}]) + lambda x: [{"label": _CLASSES[0], "coordinates": [100, 50, 20, 40]}] + ) tc.object_detector.create(sf) @@ -182,46 +203,68 @@ def test_create_with_invalid_annotations_coordinate(self): with self.assertRaises(_ToolkitError): sf = self.sf.head() sf[self.annotations] = sf[self.annotations].apply( - lambda x: [{'label': _CLASSES[0], 'coordinates':{'x':None, 'y':1, 'width':1, 'height': 1}}]) + lambda x: [ + { + "label": _CLASSES[0], + "coordinates": {"x": None, "y": 1, "width": 1, "height": 1}, + } + ] + ) tc.object_detector.create(sf) with self.assertRaises(_ToolkitError): sf = self.sf.head() sf[self.annotations] = sf[self.annotations].apply( - lambda x: [{'label': _CLASSES[0], 'coordinates':{'x':1, 'y':[], 'width':1, 'height': 1}}]) + lambda x: [ + { + "label": _CLASSES[0], + "coordinates": {"x": 1, "y": [], "width": 1, "height": 1}, + } + ] + ) tc.object_detector.create(sf) with self.assertRaises(_ToolkitError): sf = self.sf.head() sf[self.annotations] = sf[self.annotations].apply( - lambda x: [{'label': _CLASSES[0], 'coordinates':{'x':1, 'y':1, 'width':{}, 'height': 1}}]) + lambda x: [ + { + "label": _CLASSES[0], + "coordinates": {"x": 1, "y": 1, "width": {}, "height": 1}, + } + ] + ) tc.object_detector.create(sf) with self.assertRaises(_ToolkitError): sf = self.sf.head() sf[self.annotations] = sf[self.annotations].apply( - lambda x: [{'label': _CLASSES[0], 'coordinates':{'x':1, 'y':1, 'width':1, 'height': '1'}}]) + lambda x: [ + { + "label": _CLASSES[0], + "coordinates": {"x": 1, "y": 1, "width": 1, "height": "1"}, + } + ] + ) tc.object_detector.create(sf) - def test_create_with_missing_annotations_label(self): - def create_missing_annotations_label(x): for y in x: - y['label'] = None + y["label"] = None return x with self.assertRaises(_ToolkitError): sf = self.sf.head() sf[self.annotations] = sf[self.annotations].apply( - lambda x: create_missing_annotations_label(x)) + lambda x: create_missing_annotations_label(x) + ) tc.object_detector.create(sf) def test_create_with_invalid_annotations_not_dict(self): with self.assertRaises(_ToolkitError): sf = self.sf.head() - sf[self.annotations] = sf[self.annotations].apply( - lambda x: [1]) + sf[self.annotations] = sf[self.annotations].apply(lambda x: [1]) tc.object_detector.create(sf) @@ -230,7 +273,13 @@ def test_create_with_invalid_user_define_classes(self): old_stdout = sys.stdout result_out = StringIO() sys.stdout = result_out - model = tc.object_detector.create(sf, feature=self.feature, annotations=self.annotations, classes=['invalid'], max_iterations=1) + model = tc.object_detector.create( + sf, + feature=self.feature, + annotations=self.annotations, + classes=["invalid"], + max_iterations=1, + ) sys.stdout = old_stdout self.assertTrue("Warning" in result_out.getvalue()) @@ -240,41 +289,46 @@ def test_create_with_empty_dataset(self): def test_create_with_verbose_False(self): args = [self.sf, self.annotations, self.feature] - kwargs = { - 'max_iterations': 1, - 'model': self.pre_trained_model - } - test_util.assert_longer_verbose_logs( - tc.object_detector.create, args, kwargs) + kwargs = {"max_iterations": 1, "model": self.pre_trained_model} + test_util.assert_longer_verbose_logs(tc.object_detector.create, args, kwargs) def test_dict_annotations(self): sf_copy = self.sf[:] - sf_copy[self.annotations] = sf_copy[self.annotations].apply(lambda x: x[0] if len(x) > 0 else None) - dict_model = tc.object_detector.create(sf_copy, - feature=self.feature, - annotations=self.annotations, - max_iterations=1, - model=self.pre_trained_model) + sf_copy[self.annotations] = sf_copy[self.annotations].apply( + lambda x: x[0] if len(x) > 0 else None + ) + dict_model = tc.object_detector.create( + sf_copy, + feature=self.feature, + annotations=self.annotations, + max_iterations=1, + model=self.pre_trained_model, + ) pred = dict_model.predict(sf_copy) metrics = dict_model.evaluate(sf_copy) - annotated_img = tc.object_detector.util.draw_bounding_boxes(sf_copy[self.feature], - sf_copy[self.annotations]) + annotated_img = tc.object_detector.util.draw_bounding_boxes( + sf_copy[self.feature], sf_copy[self.annotations] + ) def test_extra_classes(self): # Create while the data has extra classes - model = tc.object_detector.create(self.sf, classes=_CLASSES[:2], max_iterations=1) + model = tc.object_detector.create( + self.sf, classes=_CLASSES[:2], max_iterations=1 + ) self.assertEqual(len(model.classes), 2) # Evaluate while the data has extra classes ret = model.evaluate(self.sf.head()) - self.assertEqual(len(ret['average_precision_50']), 2) + self.assertEqual(len(ret["average_precision_50"]), 2) def test_different_grip_shape(self): # Should able to give different input grip shape - shapes = [[1,1], [5,5], [13,13], [26,26]] + shapes = [[1, 1], [5, 5], [13, 13], [26, 26]] for shape in shapes: - model = tc.object_detector.create(self.sf, max_iterations=1, grid_shape=shape) + model = tc.object_detector.create( + self.sf, max_iterations=1, grid_shape=shape + ) pred = model.predict(self.sf) def test_predict(self): @@ -289,17 +343,16 @@ def test_predict(self): self.assertEqual(len(pred), len(sf)) # Make sure SFrame was not altered - self.assertEqual([col for col in sf.column_names() if col.startswith('_')], - []) + self.assertEqual([col for col in sf.column_names() if col.startswith("_")], []) # Predict should work on no input (and produce no predictions) pred0 = self.model.predict(sf[:0]) self.assertEqual(len(pred0), 0) def test_predict_with_invalid_annotation(self): - #predict function shouldn't throw exception when annotations column is invalid + # predict function shouldn't throw exception when annotations column is invalid sf = self.sf.head() - sf[self.annotations] = sf[self.annotations].apply(lambda x:'invalid') + sf[self.annotations] = sf[self.annotations].apply(lambda x: "invalid") pred = self.model.predict(sf) def test_single_image(self): @@ -327,46 +380,50 @@ def test_confidence_threshold(self): self.assertTrue(len(stacked) > 0) def test_evaluate(self): - ret = self.model.evaluate(self.sf.head(), metric='average_precision') + ret = self.model.evaluate(self.sf.head(), metric="average_precision") - self.assertTrue(set(ret), {'average_precision'}) - self.assertEqual(set(ret['average_precision'].keys()), set(_CLASSES)) + self.assertTrue(set(ret), {"average_precision"}) + self.assertEqual(set(ret["average_precision"].keys()), set(_CLASSES)) ret = self.model.evaluate(self.sf.head()) - self.assertEqual(set(ret), {'mean_average_precision_50', 'average_precision_50'}) - self.assertTrue(isinstance(ret['mean_average_precision_50'], float)) - self.assertEqual(set(ret['average_precision_50'].keys()), set(_CLASSES)) + self.assertEqual( + set(ret), {"mean_average_precision_50", "average_precision_50"} + ) + self.assertTrue(isinstance(ret["mean_average_precision_50"], float)) + self.assertEqual(set(ret["average_precision_50"].keys()), set(_CLASSES)) # Empty dataset should not fail with error (although it should to 0 # metrics) ret = self.model.evaluate(self.sf[:0]) - self.assertEqual(ret['mean_average_precision_50'], 0.0) + self.assertEqual(ret["mean_average_precision_50"], 0.0) def test_predict_invalid_threshold(self): with self.assertRaises(_ToolkitError): self.model.predict(self.sf.head(), confidence_threshold=-1) with self.assertRaises(_ToolkitError): - self.model.predict(self.sf.head(), iou_threshold =-1) + self.model.predict(self.sf.head(), iou_threshold=-1) def test_evaluate_invalid_threshold(self): with self.assertRaises(_ToolkitError): self.model.evaluate(self.sf.head(), confidence_threshold=-1) with self.assertRaises(_ToolkitError): - self.model.evaluate(self.sf.head(), iou_threshold =-1) + self.model.evaluate(self.sf.head(), iou_threshold=-1) def test_evaluate_sframe_format(self): - metrics = ["mean_average_precision_50","mean_average_precision"] + metrics = ["mean_average_precision_50", "mean_average_precision"] for metric in metrics: - pred = self.model.evaluate(self.sf.head(), metric=metric, output_type="sframe") + pred = self.model.evaluate( + self.sf.head(), metric=metric, output_type="sframe" + ) self.assertEqual(pred.column_names(), ["label"]) def test_evaluate_invalid_metric(self): with self.assertRaises(_ToolkitError): - self.model.evaluate(self.sf.head(), metric='not-supported-metric') + self.model.evaluate(self.sf.head(), metric="not-supported-metric") def test_evaluate_invalid_format(self): with self.assertRaises(_ToolkitError): - self.model.evaluate(self.sf.head(), output_type='not-supported-format') + self.model.evaluate(self.sf.head(), output_type="not-supported-format") def test_evaluate_missing_annotations(self): with self.assertRaises(_ToolkitError): @@ -375,74 +432,80 @@ def test_evaluate_missing_annotations(self): self.model.evaluate(sf.head()) def test_evaluate_with_missing_annotations_label(self): - def create_missing_annotations_label(x): for y in x: - y['label'] = None + y["label"] = None return x - with self.assertRaises(_ToolkitError): sf = self.sf.head() sf[self.annotations] = sf[self.annotations].apply( - lambda x: create_missing_annotations_label(x)) + lambda x: create_missing_annotations_label(x) + ) self.model.evaluate(sf) def test_export_coreml(self): from PIL import Image import coremltools import platform - filename = tempfile.mkstemp('bingo.mlmodel')[1] - self.model.export_coreml(filename, - include_non_maximum_suppression=False) + + filename = tempfile.mkstemp("bingo.mlmodel")[1] + self.model.export_coreml(filename, include_non_maximum_suppression=False) coreml_model = coremltools.models.MLModel(filename) - self.assertDictEqual({ - 'com.github.apple.turicreate.version': tc.__version__, - 'com.github.apple.os.platform': platform.platform(), - 'annotations': self.annotations, - 'type': 'object_detector', - 'classes': ','.join(sorted(_CLASSES)), - 'feature': self.feature, - 'include_non_maximum_suppression': 'False', - 'max_iterations': '1', - 'model': 'darknet-yolo', - 'training_iterations': '1', - 'version': '1', - }, dict(coreml_model.user_defined_metadata) + self.assertDictEqual( + { + "com.github.apple.turicreate.version": tc.__version__, + "com.github.apple.os.platform": platform.platform(), + "annotations": self.annotations, + "type": "object_detector", + "classes": ",".join(sorted(_CLASSES)), + "feature": self.feature, + "include_non_maximum_suppression": "False", + "max_iterations": "1", + "model": "darknet-yolo", + "training_iterations": "1", + "version": "1", + }, + dict(coreml_model.user_defined_metadata), + ) + expected_result = "Object detector created by Turi Create (version %s)" % ( + tc.__version__ ) - expected_result = 'Object detector created by Turi Create (version %s)' \ - % (tc.__version__) self.assertEquals(expected_result, coreml_model.short_description) img = self.sf[0:1][self.feature][0] img_fixed = tc.image_analysis.resize(img, 416, 416, 3) pil_img = Image.fromarray(img_fixed.pixel_data) if _mac_ver() >= (10, 13): - ret = coreml_model.predict({self.feature: pil_img}, - usesCPUOnly = True) - self.assertEqual(ret['coordinates'].shape[1], 4) - self.assertEqual(ret['confidence'].shape[1], len(_CLASSES)) - self.assertEqual(ret['coordinates'].shape[0], - ret['confidence'].shape[0]) + ret = coreml_model.predict({self.feature: pil_img}, usesCPUOnly=True) + self.assertEqual(ret["coordinates"].shape[1], 4) + self.assertEqual(ret["confidence"].shape[1], len(_CLASSES)) + self.assertEqual(ret["coordinates"].shape[0], ret["confidence"].shape[0]) # A numeric comparison of the resulting of top bounding boxes is # not that meaningful unless the model has converged # Also check if we can train a second model and export it. - filename2 = tempfile.mkstemp('bingo2.mlmodel')[1] + filename2 = tempfile.mkstemp("bingo2.mlmodel")[1] # We also test at the same time if we can export a model with a single # class - sf = tc.SFrame({'image': [self.sf[self.feature][0]], - 'ann': [self.sf[self.annotations][0][:1]]}) + sf = tc.SFrame( + { + "image": [self.sf[self.feature][0]], + "ann": [self.sf[self.annotations][0][:1]], + } + ) model2 = tc.object_detector.create(sf, max_iterations=1) - model2.export_coreml(filename2, - include_non_maximum_suppression=False) + model2.export_coreml(filename2, include_non_maximum_suppression=False) - @unittest.skipIf(_mac_ver() < (10, 14), - "Non-maximum suppression is only supported on MacOS 10.14+.") + @unittest.skipIf( + _mac_ver() < (10, 14), + "Non-maximum suppression is only supported on MacOS 10.14+.", + ) def test_export_coreml_with_non_maximum_suppression(self): from PIL import Image - filename = tempfile.mkstemp('bingo.mlmodel')[1] + + filename = tempfile.mkstemp("bingo.mlmodel")[1] self.model.export_coreml(filename, include_non_maximum_suppression=True) coreml_model = coremltools.models.MLModel(filename) @@ -450,27 +513,31 @@ def test_export_coreml_with_non_maximum_suppression(self): img_fixed = tc.image_analysis.resize(img, 416, 416, 3) pil_img = Image.fromarray(img_fixed.pixel_data) if _mac_ver() >= (10, 13): - ret = coreml_model.predict({self.feature: pil_img}, - usesCPUOnly = True) - self.assertEqual(ret['coordinates'].shape[1], 4) - self.assertEqual(ret['confidence'].shape[1], len(_CLASSES)) - self.assertEqual(ret['coordinates'].shape[0], - ret['confidence'].shape[0]) + ret = coreml_model.predict({self.feature: pil_img}, usesCPUOnly=True) + self.assertEqual(ret["coordinates"].shape[1], 4) + self.assertEqual(ret["confidence"].shape[1], len(_CLASSES)) + self.assertEqual(ret["coordinates"].shape[0], ret["confidence"].shape[0]) # A numeric comparison of the resulting of top bounding boxes is # not that meaningful unless the model has converged # Also check if we can train a second model and export it. - filename2 = tempfile.mkstemp('bingo2.mlmodel')[1] + filename2 = tempfile.mkstemp("bingo2.mlmodel")[1] # We also test at the same time if we can export a model with a single # class - sf = tc.SFrame({'image': [self.sf[self.feature][0]], - 'ann': [self.sf[self.annotations][0][:1]]}) + sf = tc.SFrame( + { + "image": [self.sf[self.feature][0]], + "ann": [self.sf[self.annotations][0][:1]], + } + ) model2 = tc.object_detector.create(sf, max_iterations=1) model2.export_coreml(filename2, include_non_maximum_suppression=True) @pytest.mark.xfail - @unittest.skipIf(sys.platform != 'darwin' or _mac_ver() >= (10, 14), - "GPU selection should fail on macOS 10.13 or below") + @unittest.skipIf( + sys.platform != "darwin" or _mac_ver() >= (10, 14), + "GPU selection should fail on macOS 10.13 or below", + ) def test_no_gpu_support_on_unsupported_macos(self): num_gpus = tc.config.get_num_gpus() tc.config.set_num_gpus(1) @@ -487,8 +554,10 @@ def test_get(self): model = self.model for field in self.fields_ans: ans = model._get(field) - self.assertTrue(self.get_ans[field](ans), - '''Get failed in field {}. Output was {}.'''.format(field, ans)) + self.assertTrue( + self.get_ans[field](ans), + """Get failed in field {}. Output was {}.""".format(field, ans), + ) def test_summary(self): model = self.model @@ -496,16 +565,16 @@ def test_summary(self): def test_summary_str(self): model = self.model - self.assertTrue(isinstance(model.summary('str'), str)) + self.assertTrue(isinstance(model.summary("str"), str)) def test_summary_dict(self): model = self.model - self.assertTrue(isinstance(model.summary('dict'), dict)) + self.assertTrue(isinstance(model.summary("dict"), dict)) def test_summary_invalid_input(self): model = self.model with self.assertRaises(_ToolkitError): - model.summary(model.summary('invalid')) + model.summary(model.summary("invalid")) with self.assertRaises(_ToolkitError): model.summary(model.summary(0)) @@ -534,13 +603,13 @@ def test_save_and_load(self): print("List fields passed") -@unittest.skipIf(tc.util._num_available_gpus() == 0, 'Requires GPU') +@unittest.skipIf(tc.util._num_available_gpus() == 0, "Requires GPU") @pytest.mark.gpu class ObjectDetectorGPUTest(unittest.TestCase): @classmethod def setUpClass(self): - self.feature = 'myimage' - self.annotations = 'myannotations' + self.feature = "myimage" + self.annotations = "myannotations" self.sf = _get_data(feature=self.feature, annotations=self.annotations) def test_gpu_save_load_export(self): @@ -554,6 +623,6 @@ def test_gpu_save_load_export(self): model.save(path) tc.config.set_num_gpus(out_gpus) model = tc.load_model(path) - model.export_coreml(os.path.join(path, 'model.mlmodel')) + model.export_coreml(os.path.join(path, "model.mlmodel")) tc.config.set_num_gpus(old_num_gpus) diff --git a/src/python/turicreate/test/test_one_shot_object_detector.py b/src/python/turicreate/test/test_one_shot_object_detector.py index 21a6c78ade..ec39e05836 100644 --- a/src/python/turicreate/test/test_one_shot_object_detector.py +++ b/src/python/turicreate/test/test_one_shot_object_detector.py @@ -17,26 +17,37 @@ import sys import os from turicreate.toolkits._main import ToolkitError as _ToolkitError -from turicreate.toolkits._internal_utils import _raise_error_if_not_sarray, _mac_ver, _read_env_var_cpp +from turicreate.toolkits._internal_utils import ( + _raise_error_if_not_sarray, + _mac_ver, + _read_env_var_cpp, +) import coremltools -_CLASSES = ['logo_a', 'logo_b', 'logo_c', 'logo_d'] +_CLASSES = ["logo_a", "logo_b", "logo_c", "logo_d"] + def _get_data(feature, target): from PIL import Image as _PIL_Image + rs = np.random.RandomState(1234) - def from_pil_image(pil_img, image_format='png'): - if image_format == 'raw': + + def from_pil_image(pil_img, image_format="png"): + if image_format == "raw": image = np.array(pil_img) FORMAT_RAW = 2 - return tc.Image(_image_data=image.tobytes(), - _width=image.shape[1], - _height=image.shape[0], - _channels=image.shape[2], - _format_enum=FORMAT_RAW, - _image_data_size=image.size) + return tc.Image( + _image_data=image.tobytes(), + _width=image.shape[1], + _height=image.shape[0], + _channels=image.shape[2], + _format_enum=FORMAT_RAW, + _image_data_size=image.size, + ) else: - with tempfile.NamedTemporaryFile(mode='w+b', suffix='.' + image_format) as f: + with tempfile.NamedTemporaryFile( + mode="w+b", suffix="." + image_format + ) as f: pil_img.save(f, format=image_format) return tc.Image(f.name) @@ -45,36 +56,31 @@ def from_pil_image(pil_img, image_format='png'): max_num_boxes_per_image = 10 classes = _CLASSES images = [] - FORMATS = ['png', 'jpeg', 'raw'] + FORMATS = ["png", "jpeg", "raw"] for _ in range(num_examples): # Randomly determine image size (should handle large and small) img_shape = tuple(rs.randint(100, 1000, size=2)) + (3,) img = rs.randint(255, size=img_shape) - pil_img = _PIL_Image.fromarray(img, mode='RGB') + pil_img = _PIL_Image.fromarray(img, mode="RGB") # Randomly select image format image_format = FORMATS[rs.randint(len(FORMATS))] images.append(from_pil_image(pil_img, image_format=image_format)) - - starter_images = [] starter_target = [] for i in range(num_starter_images): img_shape = tuple(rs.randint(100, 1000, size=2)) + (3,) img = rs.randint(255, size=img_shape) - pil_img = _PIL_Image.fromarray(img, mode='RGB') + pil_img = _PIL_Image.fromarray(img, mode="RGB") image_format = FORMATS[rs.randint(len(FORMATS))] starter_images.append(from_pil_image(pil_img, image_format=image_format)) starter_target.append(_CLASSES[i % len(_CLASSES)]) - train = tc.SFrame({ - feature: tc.SArray(starter_images), - target: tc.SArray(starter_target), - }) - test = tc.SFrame({ - feature: tc.SArray(images), - }) + train = tc.SFrame( + {feature: tc.SArray(starter_images), target: tc.SArray(starter_target),} + ) + test = tc.SFrame({feature: tc.SArray(images),}) backgrounds = test[feature].head(5) return train, test, backgrounds @@ -85,29 +91,35 @@ def setUpClass(self): """ The setup class method for the basic test case with all default values. """ - self.feature = 'myimage' - self.target = 'mytarget' + self.feature = "myimage" + self.target = "mytarget" ## Create the model self.def_opts = { - 'model': 'darknet-yolo', - 'max_iterations': 2, + "model": "darknet-yolo", + "max_iterations": 2, } # Model - self.train, self.test, self.backgrounds = _get_data(feature=self.feature, target=self.target) - self.model = tc.one_shot_object_detector.create(self.train, - target=self.target, - backgrounds=self.backgrounds, - batch_size=2, - max_iterations=1) + self.train, self.test, self.backgrounds = _get_data( + feature=self.feature, target=self.target + ) + self.model = tc.one_shot_object_detector.create( + self.train, + target=self.target, + backgrounds=self.backgrounds, + batch_size=2, + max_iterations=1, + ) self.get_ans = { - 'target': lambda x: x == self.target, - 'num_starter_images': lambda x: len(self.train), - 'num_classes': lambda x: x == len(_CLASSES), - 'detector': lambda x: isinstance(x, tc.object_detector.object_detector.ObjectDetector), - '_detector_version': lambda x: x==1 + "target": lambda x: x == self.target, + "num_starter_images": lambda x: len(self.train), + "num_classes": lambda x: x == len(_CLASSES), + "detector": lambda x: isinstance( + x, tc.object_detector.object_detector.ObjectDetector + ), + "_detector_version": lambda x: x == 1, } self.fields_ans = self.get_ans.keys() @@ -115,35 +127,54 @@ def setUpClass(self): def test_synthesis_with_single_image(self): image = self.train[0][self.feature] data = tc.one_shot_object_detector.util.preview_synthetic_training_data( - image, 'custom_logo', backgrounds=self.backgrounds) + image, "custom_logo", backgrounds=self.backgrounds + ) def test_create_with_single_image(self): image = self.train[0][self.feature] model = tc.one_shot_object_detector.create( - image, 'custom_logo', backgrounds=self.backgrounds, max_iterations=1) + image, "custom_logo", backgrounds=self.backgrounds, max_iterations=1 + ) def test_create_with_missing_value(self): - sf = self.train.append(tc.SFrame({self.feature: tc.SArray([None], dtype=tc.Image), self.target: [self.train[self.target][0]]})) + sf = self.train.append( + tc.SFrame( + { + self.feature: tc.SArray([None], dtype=tc.Image), + self.target: [self.train[self.target][0]], + } + ) + ) with self.assertRaises(_ToolkitError): tc.one_shot_object_detector.create(sf, target=self.target) def test_create_with_missing_target(self): with self.assertRaises(_ToolkitError): - tc.one_shot_object_detector.create(self.train, target='wrong_feature', - backgrounds=self.backgrounds, max_iterations=1) - - @pytest.mark.xfail(reason="Non-deterministic test failure tracked in https://github.com/apple/turicreate/issues/2936") + tc.one_shot_object_detector.create( + self.train, + target="wrong_feature", + backgrounds=self.backgrounds, + max_iterations=1, + ) + + @pytest.mark.xfail( + reason="Non-deterministic test failure tracked in https://github.com/apple/turicreate/issues/2936" + ) def test_create_with_empty_dataset(self): with self.assertRaises(_ToolkitError): tc.one_shot_object_detector.create(self.train[:0], target=self.target) def test_create_with_no_background_images(self): with self.assertRaises(_ToolkitError): - tc.one_shot_object_detector.create(self.train, target=self.target, backgrounds=tc.SArray()) + tc.one_shot_object_detector.create( + self.train, target=self.target, backgrounds=tc.SArray() + ) def test_create_with_wrong_type_background_images(self): with self.assertRaises(TypeError): - tc.one_shot_object_detector.create(self.train, target=self.target, backgrounds='wrong_backgrounds') + tc.one_shot_object_detector.create( + self.train, target=self.target, backgrounds="wrong_backgrounds" + ) def test_predict(self): sf = self.test.head() @@ -154,8 +185,7 @@ def test_predict(self): self.assertEqual(len(pred), len(sf)) # Make sure SFrame was not altered - self.assertEqual([col for col in sf.column_names() if col.startswith('_')], - []) + self.assertEqual([col for col in sf.column_names() if col.startswith("_")], []) # Predict should work on no input (and produce no predictions) pred0 = self.model.predict(sf[self.feature][:0]) @@ -187,29 +217,33 @@ def test_export_coreml(self): from PIL import Image import coremltools import platform - filename = tempfile.mkstemp('bingo.mlmodel')[1] - self.model.export_coreml(filename, - include_non_maximum_suppression=False) + + filename = tempfile.mkstemp("bingo.mlmodel")[1] + self.model.export_coreml(filename, include_non_maximum_suppression=False) ## Test metadata coreml_model = coremltools.models.MLModel(filename) self.maxDiff = None - self.assertDictEqual({ - 'com.github.apple.turicreate.version': tc.__version__, - 'com.github.apple.os.platform': platform.platform(), - 'type': 'object_detector', - 'classes': ','.join(sorted(_CLASSES)), - 'feature': self.feature, - 'include_non_maximum_suppression': 'False', - 'annotations': 'annotation', - 'max_iterations': '1', - 'model': 'darknet-yolo', - 'training_iterations': '1', - 'version': '1', - }, dict(coreml_model.user_defined_metadata) + self.assertDictEqual( + { + "com.github.apple.turicreate.version": tc.__version__, + "com.github.apple.os.platform": platform.platform(), + "type": "object_detector", + "classes": ",".join(sorted(_CLASSES)), + "feature": self.feature, + "include_non_maximum_suppression": "False", + "annotations": "annotation", + "max_iterations": "1", + "model": "darknet-yolo", + "training_iterations": "1", + "version": "1", + }, + dict(coreml_model.user_defined_metadata), + ) + expected_result = ( + "One shot object detector created by Turi Create (version %s)" + % (tc.__version__) ) - expected_result = 'One shot object detector created by Turi Create (version %s)' \ - % (tc.__version__) self.assertEquals(expected_result, coreml_model.short_description) ## Test prediction @@ -217,31 +251,32 @@ def test_export_coreml(self): img_fixed = tc.image_analysis.resize(img, 416, 416, 3) pil_img = Image.fromarray(img_fixed.pixel_data) if _mac_ver() >= (10, 13): - ret = coreml_model.predict({self.feature: pil_img}, - usesCPUOnly = True) - self.assertEqual(ret['coordinates'].shape[1], 4) - self.assertEqual(ret['confidence'].shape[1], len(_CLASSES)) - self.assertEqual(ret['coordinates'].shape[0], - ret['confidence'].shape[0]) + ret = coreml_model.predict({self.feature: pil_img}, usesCPUOnly=True) + self.assertEqual(ret["coordinates"].shape[1], 4) + self.assertEqual(ret["confidence"].shape[1], len(_CLASSES)) + self.assertEqual(ret["coordinates"].shape[0], ret["confidence"].shape[0]) # Test export without non max supression - filename2 = tempfile.mkstemp('bingo2.mlmodel')[1] + filename2 = tempfile.mkstemp("bingo2.mlmodel")[1] self.model.export_coreml(filename2, include_non_maximum_suppression=True) coreml_model = coremltools.models.MLModel(filename) self.assertTrue( - coreml_model.user_defined_metadata['include_non_maximum_suppression']) + coreml_model.user_defined_metadata["include_non_maximum_suppression"] + ) def test__list_fields(self): model = self.model fields = model._list_fields() self.assertEqual(set(fields), set(self.fields_ans)) - def test_get(self, ): + def test_get(self,): model = self.model for field in self.fields_ans: ans = model._get(field) - self.assertTrue(self.get_ans[field](ans), - '''Get failed in field {}. Output was {}.'''.format(field, ans)) + self.assertTrue( + self.get_ans[field](ans), + """Get failed in field {}. Output was {}.""".format(field, ans), + ) def test_summary(self): model = self.model diff --git a/src/python/turicreate/test/test_prototype_models.py b/src/python/turicreate/test/test_prototype_models.py index 7431597f2b..cd44c750f1 100644 --- a/src/python/turicreate/test/test_prototype_models.py +++ b/src/python/turicreate/test/test_prototype_models.py @@ -3,8 +3,8 @@ import tempfile import shutil -class SparseNNTest(unittest.TestCase): +class SparseNNTest(unittest.TestCase): def test_sparse_nn(self): X = tc.util.generate_random_sframe(100, "ssszzz") Y = X.copy() @@ -16,7 +16,7 @@ def test_sparse_nn(self): for i, row in enumerate(Y): res = m.query(Y[i], 1) - self.assertEqual(res, {i : 1.0}) + self.assertEqual(res, {i: 1.0}) # Save and load model_file = tempfile.gettempdir() + "/sparse_nn.model" @@ -25,6 +25,6 @@ def test_sparse_nn(self): for i, row in enumerate(Y): res = m2.query(Y[i], 1) - self.assertEqual(res, {i : 1.0}) + self.assertEqual(res, {i: 1.0}) shutil.rmtree(model_file) diff --git a/src/python/turicreate/test/test_python_decision_tree.py b/src/python/turicreate/test/test_python_decision_tree.py index 60d1c56b0b..6a609556fb 100644 --- a/src/python/turicreate/test/test_python_decision_tree.py +++ b/src/python/turicreate/test/test_python_decision_tree.py @@ -11,23 +11,26 @@ from turicreate.toolkits._decision_tree import DecisionTree, Node from turicreate.toolkits._main import ToolkitError + def _make_tree(sf): - model = tc.decision_tree_classifier.create(sf, - 'target', validation_set = None, max_depth=10) + model = tc.decision_tree_classifier.create( + sf, "target", validation_set=None, max_depth=10 + ) tree = DecisionTree.from_model(model) return tree -class PythonDecisionTreeCorrectness(unittest.TestCase): - +class PythonDecisionTreeCorrectness(unittest.TestCase): def test_categorical(self): # Arrange - sf = tc.SFrame({ - 'cat1': ['1', '1', '2', '2', '2'] * 100, - 'cat2': ['1', '3', '3', '1', '1'] * 100, - 'target': ['1', '2', '1', '2', '1'] * 100, - }) + sf = tc.SFrame( + { + "cat1": ["1", "1", "2", "2", "2"] * 100, + "cat2": ["1", "3", "3", "1", "1"] * 100, + "target": ["1", "2", "1", "2", "1"] * 100, + } + ) # Act tree = _make_tree(sf) @@ -35,58 +38,166 @@ def test_categorical(self): # Check the root node. self.assertEqual(len(tree.nodes), 7) - self.assertEqual(root.to_dict(), {'is_leaf': False, - 'left_id': 2, - 'node_id': 0, - 'missing_id': 1, - 'node_type': u'indicator', - 'parent_id': None, - 'right_id': 1, - 'split_feature_column': 'cat1', - 'split_feature_index': '1', - 'value': 1}) + self.assertEqual( + root.to_dict(), + { + "is_leaf": False, + "left_id": 2, + "node_id": 0, + "missing_id": 1, + "node_type": u"indicator", + "parent_id": None, + "right_id": 1, + "split_feature_column": "cat1", + "split_feature_index": "1", + "value": 1, + }, + ) # Check prediction paths. self.assertEqual(tree.get_prediction_path(0), []) - self.assertEqual(tree.get_prediction_path(1), [ - {'child_id': 1, 'feature': 'cat1', 'index': '1', - 'node_type': 'indicator', 'node_id': 0, 'sign': '!=', - 'value': 1, 'is_missing': False}]) - self.assertEqual(tree.get_prediction_path(2), [{'child_id': 2, - 'feature': 'cat1', 'index': '1', 'node_id': 0, 'sign': '=', - 'value': 1, 'node_type': 'indicator', 'is_missing': False}]) - self.assertEqual(tree.get_prediction_path(3), [{'child_id': 1, - 'feature': 'cat1', 'index': '1', 'node_id': 0, 'sign': '!=', - 'value': 1, 'node_type': 'indicator', 'is_missing': False}, - {'child_id': 3, 'feature': 'cat2', 'index': '1', 'node_id': 1, - 'sign': '!=', 'value': 1, 'node_type': 'indicator', - 'is_missing': False}]) - self.assertEqual(tree.get_prediction_path(4), [{'child_id': 1, - 'feature': 'cat1', 'index': '1', 'node_id': 0, 'sign': '!=', - 'value': 1, 'node_type': 'indicator', 'is_missing': False}, {'child_id': 4, - 'feature': 'cat2', 'index': '1', 'node_id': 1, 'sign': '=', 'value': 1, - 'node_type': 'indicator', 'is_missing': False}]) - self.assertEqual(tree.get_prediction_path(5), [{'child_id': 2, - 'feature': 'cat1', 'index': '1', 'node_id': 0, 'sign': '=', - 'value': 1, 'node_type': 'indicator', 'is_missing': False}, - {'child_id': 5, 'feature': 'cat2', 'index': '1', 'node_id': 2, - 'sign': '!=', 'value': 1,'node_type': 'indicator', 'is_missing': False}]) - self.assertEqual(tree.get_prediction_path(6), [{'child_id': 2, - 'feature': 'cat1', 'index': '1', 'node_id': 0, 'sign': '=', - 'value': 1, 'node_type': 'indicator', 'is_missing': False}, - {'child_id': 6, 'feature': 'cat2', 'index': '1', 'node_id': 2, - 'sign': '=', 'value': 1, 'node_type': 'indicator', - 'is_missing': False}]) + self.assertEqual( + tree.get_prediction_path(1), + [ + { + "child_id": 1, + "feature": "cat1", + "index": "1", + "node_type": "indicator", + "node_id": 0, + "sign": "!=", + "value": 1, + "is_missing": False, + } + ], + ) + self.assertEqual( + tree.get_prediction_path(2), + [ + { + "child_id": 2, + "feature": "cat1", + "index": "1", + "node_id": 0, + "sign": "=", + "value": 1, + "node_type": "indicator", + "is_missing": False, + } + ], + ) + self.assertEqual( + tree.get_prediction_path(3), + [ + { + "child_id": 1, + "feature": "cat1", + "index": "1", + "node_id": 0, + "sign": "!=", + "value": 1, + "node_type": "indicator", + "is_missing": False, + }, + { + "child_id": 3, + "feature": "cat2", + "index": "1", + "node_id": 1, + "sign": "!=", + "value": 1, + "node_type": "indicator", + "is_missing": False, + }, + ], + ) + self.assertEqual( + tree.get_prediction_path(4), + [ + { + "child_id": 1, + "feature": "cat1", + "index": "1", + "node_id": 0, + "sign": "!=", + "value": 1, + "node_type": "indicator", + "is_missing": False, + }, + { + "child_id": 4, + "feature": "cat2", + "index": "1", + "node_id": 1, + "sign": "=", + "value": 1, + "node_type": "indicator", + "is_missing": False, + }, + ], + ) + self.assertEqual( + tree.get_prediction_path(5), + [ + { + "child_id": 2, + "feature": "cat1", + "index": "1", + "node_id": 0, + "sign": "=", + "value": 1, + "node_type": "indicator", + "is_missing": False, + }, + { + "child_id": 5, + "feature": "cat2", + "index": "1", + "node_id": 2, + "sign": "!=", + "value": 1, + "node_type": "indicator", + "is_missing": False, + }, + ], + ) + self.assertEqual( + tree.get_prediction_path(6), + [ + { + "child_id": 2, + "feature": "cat1", + "index": "1", + "node_id": 0, + "sign": "=", + "value": 1, + "node_type": "indicator", + "is_missing": False, + }, + { + "child_id": 6, + "feature": "cat2", + "index": "1", + "node_id": 2, + "sign": "=", + "value": 1, + "node_type": "indicator", + "is_missing": False, + }, + ], + ) def test_dict(self): # Arrange - sf = tc.SFrame({ - 'cat1': ['1', '1', '2', '2', '2'] * 100, - 'cat2': ['1', '3', '3', '1', '1'] * 100, - 'target': ['1', '2', '1', '2', '1'] * 100, - }) - sf['cat1'] = sf['cat1'].apply(lambda x: {x:1}) - sf['cat2'] = sf['cat2'].apply(lambda x: {x:1}) + sf = tc.SFrame( + { + "cat1": ["1", "1", "2", "2", "2"] * 100, + "cat2": ["1", "3", "3", "1", "1"] * 100, + "target": ["1", "2", "1", "2", "1"] * 100, + } + ) + sf["cat1"] = sf["cat1"].apply(lambda x: {x: 1}) + sf["cat2"] = sf["cat2"].apply(lambda x: {x: 1}) # Act tree = _make_tree(sf) @@ -94,61 +205,171 @@ def test_dict(self): # Check the root node. self.assertEqual(len(tree.nodes), 7) - self.assertEqual(root.to_dict(), {'is_leaf': False, - 'left_id': 1, - 'node_id': 0, - 'node_type': u'float', - 'parent_id': None, - 'right_id': 2, - 'missing_id': 1, - 'split_feature_column': 'cat1', - 'split_feature_index': '1', - 'value': -1e-5}) + self.assertEqual( + root.to_dict(), + { + "is_leaf": False, + "left_id": 1, + "node_id": 0, + "node_type": u"float", + "parent_id": None, + "right_id": 2, + "missing_id": 1, + "split_feature_column": "cat1", + "split_feature_index": "1", + "value": -1e-5, + }, + ) # Check prediction paths. self.assertEqual(tree.get_prediction_path(0), []) - self.assertEqual(tree.get_prediction_path(1), [ - {'child_id': 1, 'feature': 'cat1', 'index': '1', - 'node_id': 0, 'sign': '<', 'value': -1e-5,'node_type': 'float', - 'is_missing': False}]) - self.assertEqual(tree.get_prediction_path(2), [{'child_id': 2, - 'feature': 'cat1', 'index': '1', 'node_id': 0, 'sign': '>=', - 'value': -1e-5, 'node_type': 'float', 'is_missing': False}]) - self.assertEqual(tree.get_prediction_path(3), [{'child_id': 1, - 'feature': 'cat1', 'index': '1', 'node_id': 0, 'sign': '<', - 'value': -1e-05, 'node_type': 'float', 'is_missing': False}, {'child_id': 3, - 'feature': 'cat2', 'index': '1', 'node_id': 1, 'sign': '<', - 'value': -1e-05, 'node_type': 'float', 'is_missing': False}]) - self.assertEqual(tree.get_prediction_path(4), [{'child_id': 1, - 'feature': 'cat1', 'index': '1', 'node_id': 0, 'sign': '<', - 'value': -1e-05, 'node_type': 'float', 'is_missing': False}, - {'child_id': 4, 'feature': 'cat2', 'index': '1', 'node_id': 1, - 'sign': '>=', 'value': -1e-05, 'node_type': 'float', - 'is_missing': False}]) - self.assertEqual(tree.get_prediction_path(5), [{'child_id': 2, - 'feature': 'cat1', 'index': '1', 'node_id': 0, 'sign': '>=', - 'value': -1e-05, 'node_type': 'float', 'is_missing': False}, - {'child_id': 5, 'feature': 'cat2', 'index': '1', 'node_id': 2, - 'sign': '<', 'value': -1e-05, 'node_type': 'float', - 'is_missing': False}]) - self.assertEqual(tree.get_prediction_path(6), [{'child_id': 2, - 'feature': 'cat1', 'index': '1', 'node_id': 0, 'sign': '>=', - 'value': -1e-05, 'node_type': 'float', 'is_missing': False}, - {'child_id': 6, 'feature': 'cat2', 'index': '1', 'node_id': 2, - 'sign': '>=', 'value': -1e-05, 'node_type': 'float', - 'is_missing': False}]) + self.assertEqual( + tree.get_prediction_path(1), + [ + { + "child_id": 1, + "feature": "cat1", + "index": "1", + "node_id": 0, + "sign": "<", + "value": -1e-5, + "node_type": "float", + "is_missing": False, + } + ], + ) + self.assertEqual( + tree.get_prediction_path(2), + [ + { + "child_id": 2, + "feature": "cat1", + "index": "1", + "node_id": 0, + "sign": ">=", + "value": -1e-5, + "node_type": "float", + "is_missing": False, + } + ], + ) + self.assertEqual( + tree.get_prediction_path(3), + [ + { + "child_id": 1, + "feature": "cat1", + "index": "1", + "node_id": 0, + "sign": "<", + "value": -1e-05, + "node_type": "float", + "is_missing": False, + }, + { + "child_id": 3, + "feature": "cat2", + "index": "1", + "node_id": 1, + "sign": "<", + "value": -1e-05, + "node_type": "float", + "is_missing": False, + }, + ], + ) + self.assertEqual( + tree.get_prediction_path(4), + [ + { + "child_id": 1, + "feature": "cat1", + "index": "1", + "node_id": 0, + "sign": "<", + "value": -1e-05, + "node_type": "float", + "is_missing": False, + }, + { + "child_id": 4, + "feature": "cat2", + "index": "1", + "node_id": 1, + "sign": ">=", + "value": -1e-05, + "node_type": "float", + "is_missing": False, + }, + ], + ) + self.assertEqual( + tree.get_prediction_path(5), + [ + { + "child_id": 2, + "feature": "cat1", + "index": "1", + "node_id": 0, + "sign": ">=", + "value": -1e-05, + "node_type": "float", + "is_missing": False, + }, + { + "child_id": 5, + "feature": "cat2", + "index": "1", + "node_id": 2, + "sign": "<", + "value": -1e-05, + "node_type": "float", + "is_missing": False, + }, + ], + ) + self.assertEqual( + tree.get_prediction_path(6), + [ + { + "child_id": 2, + "feature": "cat1", + "index": "1", + "node_id": 0, + "sign": ">=", + "value": -1e-05, + "node_type": "float", + "is_missing": False, + }, + { + "child_id": 6, + "feature": "cat2", + "index": "1", + "node_id": 2, + "sign": ">=", + "value": -1e-05, + "node_type": "float", + "is_missing": False, + }, + ], + ) def test_cat_dict(self): # Arrange - sf = tc.SFrame({ - 'cat1': [str(i) for i in range(500)], - 'dict2': [{'1' : 1, '2' : 3.2}, - {'1' : 3.1,}, - {1 : 1, 'b' : 2}, - {1 : 1, 'b' : 3}, - {'a' : 2, 'b' : 3} ] * 100, - 'target': ['1', '2', '1', '2', '1'] * 100, - }) + sf = tc.SFrame( + { + "cat1": [str(i) for i in range(500)], + "dict2": [ + {"1": 1, "2": 3.2}, + {"1": 3.1,}, + {1: 1, "b": 2}, + {1: 1, "b": 3}, + {"a": 2, "b": 3}, + ] + * 100, + "target": ["1", "2", "1", "2", "1"] * 100, + } + ) # Act tree = _make_tree(sf) @@ -156,19 +377,31 @@ def test_cat_dict(self): # Assert. self.assertEqual(len(tree.nodes), 7) - self.assertEqual(root.to_dict(), {'is_leaf': False, 'left_id': 1, - 'node_id': 0, 'parent_id': None, 'right_id': - 2, 'split_feature_column': 'dict2', 'split_feature_index': '1', - 'value': 2.05, 'node_type': 'float', 'missing_id': 1}) - + self.assertEqual( + root.to_dict(), + { + "is_leaf": False, + "left_id": 1, + "node_id": 0, + "parent_id": None, + "right_id": 2, + "split_feature_column": "dict2", + "split_feature_index": "1", + "value": 2.05, + "node_type": "float", + "missing_id": 1, + }, + ) def test_numeric(self): - sf = tc.SFrame({ - 'num1' : [1,2,3.5,4,5] * 100, - 'num2' : [1,2,3.5,4,5] * 100, - 'num3' : [1,2,3.5,4,5] * 100, - 'target': ['1', '2', '1', '2', '1'] * 100, - }) + sf = tc.SFrame( + { + "num1": [1, 2, 3.5, 4, 5] * 100, + "num2": [1, 2, 3.5, 4, 5] * 100, + "num3": [1, 2, 3.5, 4, 5] * 100, + "target": ["1", "2", "1", "2", "1"] * 100, + } + ) # Act tree = _make_tree(sf) @@ -176,19 +409,31 @@ def test_numeric(self): # Assert. self.assertEqual(len(tree.nodes), 9) - self.assertEqual(root.to_dict(), {'is_leaf': False, 'left_id': 1, - 'node_id': 0, 'parent_id': None, 'right_id': - 2, 'split_feature_column': 'num1', 'split_feature_index': None, - 'value': 4.5, 'node_type': 'float', 'missing_id': 1}) - + self.assertEqual( + root.to_dict(), + { + "is_leaf": False, + "left_id": 1, + "node_id": 0, + "parent_id": None, + "right_id": 2, + "split_feature_column": "num1", + "split_feature_index": None, + "value": 4.5, + "node_type": "float", + "missing_id": 1, + }, + ) def test_vector(self): - sf = tc.SFrame({ - 'num1' : [1,2,3.5,4,5] * 100, - 'num2' : [1,2,3.5,4,5] * 100, - 'vect' : [[1,2,3.5,4,5]] * 500, - 'target': ['1', '2', '1', '2', '1'] * 100, - }) + sf = tc.SFrame( + { + "num1": [1, 2, 3.5, 4, 5] * 100, + "num2": [1, 2, 3.5, 4, 5] * 100, + "vect": [[1, 2, 3.5, 4, 5]] * 500, + "target": ["1", "2", "1", "2", "1"] * 100, + } + ) # Act tree = _make_tree(sf) @@ -196,24 +441,39 @@ def test_vector(self): # Assert. self.assertEqual(len(tree.nodes), 9) - self.assertEqual(root.to_dict(), {'is_leaf': False, 'left_id': 1, - 'node_id': 0, 'parent_id': None, 'right_id': - 2, 'split_feature_column': 'num1', 'split_feature_index': None, - 'value': 4.5, 'node_type': 'float', 'missing_id': 1}) - + self.assertEqual( + root.to_dict(), + { + "is_leaf": False, + "left_id": 1, + "node_id": 0, + "parent_id": None, + "right_id": 2, + "split_feature_column": "num1", + "split_feature_index": None, + "value": 4.5, + "node_type": "float", + "missing_id": 1, + }, + ) def test_numeric_dict(self): - sf = tc.SFrame({ - 'num1' : [1,2,3.5,4,5] * 100, - 'num2' : [1,2,3.5,4,5] * 100, - 'vect' : [[1,2,3.5,4,5]] * 500, - 'target': ['1', '2', '1', '2', '1'] * 100, - 'dict[2]': [{'1' : 1, '2' : 3.2}, - {'1' : 3.1,}, - {1 : 1, 'b' : 2}, - {1 : 1, 'b' : 3}, - {'a' : 2, 'b' : 3} ] * 100, - }) + sf = tc.SFrame( + { + "num1": [1, 2, 3.5, 4, 5] * 100, + "num2": [1, 2, 3.5, 4, 5] * 100, + "vect": [[1, 2, 3.5, 4, 5]] * 500, + "target": ["1", "2", "1", "2", "1"] * 100, + "dict[2]": [ + {"1": 1, "2": 3.2}, + {"1": 3.1,}, + {1: 1, "b": 2}, + {1: 1, "b": 3}, + {"a": 2, "b": 3}, + ] + * 100, + } + ) # Act tree = _make_tree(sf) @@ -221,26 +481,37 @@ def test_numeric_dict(self): # Assert. self.assertEqual(len(tree.nodes), 7) - self.assertEqual(root.to_dict(), {'is_leaf': False, 'left_id': 1, - 'node_id': 0, 'parent_id': None, 'right_id': - 2, 'split_feature_column': 'dict[2]', 'split_feature_index': '1', - 'value': 2.05, 'node_type': 'float', 'missing_id': 1}) + self.assertEqual( + root.to_dict(), + { + "is_leaf": False, + "left_id": 1, + "node_id": 0, + "parent_id": None, + "right_id": 2, + "split_feature_column": "dict[2]", + "split_feature_index": "1", + "value": 2.05, + "node_type": "float", + "missing_id": 1, + }, + ) class PythonDecisionTreeAllModelsTest(unittest.TestCase): - def _run_test(self, sf): - sf['target'] = [i < sf.num_rows()/2 for i in range(sf.num_rows())] + sf["target"] = [i < sf.num_rows() / 2 for i in range(sf.num_rows())] for model in [ - tc.regression.boosted_trees_regression, - tc.classifier.boosted_trees_classifier, - tc.regression.random_forest_regression, - tc.classifier.random_forest_classifier, - tc.regression.decision_tree_regression, - tc.classifier.decision_tree_classifier]: - m = model.create(sf, 'target', validation_set = None, max_depth=2) + tc.regression.boosted_trees_regression, + tc.classifier.boosted_trees_classifier, + tc.regression.random_forest_regression, + tc.classifier.random_forest_classifier, + tc.regression.decision_tree_regression, + tc.classifier.decision_tree_classifier, + ]: + m = model.create(sf, "target", validation_set=None, max_depth=2) tree = DecisionTree.from_model(m) for nid, node in tree.nodes.items(): val = tree.get_prediction_score(nid) @@ -249,119 +520,159 @@ def _run_test(self, sf): else: self.assertEqual(val, None) - def test_categorical_1(self): - sf = tc.SFrame({ - 'cat1': ['1', '1', '2', '2', '2'] * 100, - 'cat2': ['1', '3', '3', '1', '1'] * 100 - }) + sf = tc.SFrame( + { + "cat1": ["1", "1", "2", "2", "2"] * 100, + "cat2": ["1", "3", "3", "1", "1"] * 100, + } + ) self._run_test(sf) def test_categorical_2(self): - sf = tc.SFrame({ - 'cat[1]': ['1', '1', '2', '2', '2'] * 100, - 'cat[2]': ['1', '3', '3', '1', '1'] * 100 - }) + sf = tc.SFrame( + { + "cat[1]": ["1", "1", "2", "2", "2"] * 100, + "cat[2]": ["1", "3", "3", "1", "1"] * 100, + } + ) self._run_test(sf) def test_dict_1(self): - sf = tc.SFrame({ - 'dict1': [{'1' : 1, '2' : 3.2}, - {'1' : 3.1,}, - {'1' : 1, 'b' : 2}, - {'1' : 1, 'b' : 3}, - {'a' : 2, 'b' : 3} ] * 100 - }) + sf = tc.SFrame( + { + "dict1": [ + {"1": 1, "2": 3.2}, + {"1": 3.1,}, + {"1": 1, "b": 2}, + {"1": 1, "b": 3}, + {"a": 2, "b": 3}, + ] + * 100 + } + ) self._run_test(sf) def test_dict_2(self): - sf = tc.SFrame({ - 'dict1': [{'1' : 1, '2' : 3.2}, - {'1' : 3.1,}, - {1 : 1, 'b' : 2}, - {1 : 1, 'b' : 3}, - {'a' : 2, 'b' : 3} ] * 100 - }) + sf = tc.SFrame( + { + "dict1": [ + {"1": 1, "2": 3.2}, + {"1": 3.1,}, + {1: 1, "b": 2}, + {1: 1, "b": 3}, + {"a": 2, "b": 3}, + ] + * 100 + } + ) self._run_test(sf) def test_dict_3(self): - sf = tc.SFrame({ - 'dict': [{'1' : 1, '2' : 3.2}, - {'1' : 3.1,}, - {1 : 1, 'b' : 2}, - {1 : 1, 'b' : 3}, - {'a' : 2, 'b' : 3} ] * 100, - 'dict[2]': [{'1' : 1, '2' : 3.2}, - {'1' : 3.1,}, - {1 : 1, 'b' : 2}, - {1 : 1, 'b' : 3}, - {'a' : 2, 'b' : 3} ] * 100, - 'dict[3]': [{'1' : 1, '2' : 3.2}, - {'1' : 3.1,}, - {1 : 1, 'b' : 2}, - {1 : 1, 'b' : 3}, - {'a' : 2, 'b' : 3} ] * 100 - }) + sf = tc.SFrame( + { + "dict": [ + {"1": 1, "2": 3.2}, + {"1": 3.1,}, + {1: 1, "b": 2}, + {1: 1, "b": 3}, + {"a": 2, "b": 3}, + ] + * 100, + "dict[2]": [ + {"1": 1, "2": 3.2}, + {"1": 3.1,}, + {1: 1, "b": 2}, + {1: 1, "b": 3}, + {"a": 2, "b": 3}, + ] + * 100, + "dict[3]": [ + {"1": 1, "2": 3.2}, + {"1": 3.1,}, + {1: 1, "b": 2}, + {1: 1, "b": 3}, + {"a": 2, "b": 3}, + ] + * 100, + } + ) self._run_test(sf) def test_cat_dict_1(self): - sf = tc.SFrame({ - 'cat1': [str(i) for i in range(500)], - 'dict2': [{'1' : 1, '2' : 3.2}, - {'1' : 3.1,}, - {1 : 1, 'b' : 2}, - {1 : 1, 'b' : 3}, - {'a' : 2, 'b' : 3} ] * 100 - }) + sf = tc.SFrame( + { + "cat1": [str(i) for i in range(500)], + "dict2": [ + {"1": 1, "2": 3.2}, + {"1": 3.1,}, + {1: 1, "b": 2}, + {1: 1, "b": 3}, + {"a": 2, "b": 3}, + ] + * 100, + } + ) self._run_test(sf) def test_numeric_1(self): - sf = tc.SFrame({ - 'num1' : [1,2,3.5,4,5] * 100, - 'num2' : [1,2,3.5,4,5] * 100, - 'num3' : [1,2,3.5,4,5] * 100 - }) + sf = tc.SFrame( + { + "num1": [1, 2, 3.5, 4, 5] * 100, + "num2": [1, 2, 3.5, 4, 5] * 100, + "num3": [1, 2, 3.5, 4, 5] * 100, + } + ) self._run_test(sf) def test_numeric_2(self): - sf = tc.SFrame({ - 'num1' : [1,2,3.5,4,5] * 100, - 'num2' : [1,2,3.5,4,5] * 100, - 'vect' : [[1,2,3.5,4,5]] * 500 - }) + sf = tc.SFrame( + { + "num1": [1, 2, 3.5, 4, 5] * 100, + "num2": [1, 2, 3.5, 4, 5] * 100, + "vect": [[1, 2, 3.5, 4, 5]] * 500, + } + ) self._run_test(sf) - def test_numeric_dict(self): - sf = tc.SFrame({ - 'num1' : [1,2,3.5,4,5] * 100, - 'num2' : [1,2,3.5,4,5] * 100, - 'vect' : [[1,2,3.5,4,5]] * 500, - 'dict[2]': [{'1' : 1, '2' : 3.2}, - {'1' : 3.1,}, - {1 : 1, 'b' : 2}, - {1 : 1, 'b' : 3}, - {'a' : 2, 'b' : 3} ] * 100, - }) + sf = tc.SFrame( + { + "num1": [1, 2, 3.5, 4, 5] * 100, + "num2": [1, 2, 3.5, 4, 5] * 100, + "vect": [[1, 2, 3.5, 4, 5]] * 500, + "dict[2]": [ + {"1": 1, "2": 3.2}, + {"1": 3.1,}, + {1: 1, "b": 2}, + {1: 1, "b": 3}, + {"a": 2, "b": 3}, + ] + * 100, + } + ) self._run_test(sf) -class PythonDecisionTreeTest(unittest.TestCase): +class PythonDecisionTreeTest(unittest.TestCase): @classmethod def setUpClass(self): - sf = tc.SFrame({ - 'cat1': ['1', '1', '2', '2', '2'] * 100, - 'cat2': ['1', '3', '3', '1', '1'] * 100, - 'target': ['1', '2', '1', '2', '1'] * 100, - }) - model = tc.classifier.boosted_trees_classifier.create(sf, 'target', - validation_set = None, max_depth=2) + sf = tc.SFrame( + { + "cat1": ["1", "1", "2", "2", "2"] * 100, + "cat2": ["1", "3", "3", "1", "1"] * 100, + "target": ["1", "2", "1", "2", "1"] * 100, + } + ) + model = tc.classifier.boosted_trees_classifier.create( + sf, "target", validation_set=None, max_depth=2 + ) tree = DecisionTree.from_model(model) self.tree = tree @@ -405,7 +716,6 @@ def get_prediction_score(self): with self.assertRaises(ToolkitError): score = tree.get_prediction_score(-1) - def get_prediction_path(self, node_id): # Arrange tree = self.tree diff --git a/src/python/turicreate/test/test_random_forest.py b/src/python/turicreate/test/test_random_forest.py index 1482208200..8ca34115e5 100644 --- a/src/python/turicreate/test/test_random_forest.py +++ b/src/python/turicreate/test/test_random_forest.py @@ -22,46 +22,42 @@ import os as _os dirname = _os.path.dirname(__file__) -mushroom_dataset = _os.path.join(dirname, 'mushroom.csv') +mushroom_dataset = _os.path.join(dirname, "mushroom.csv") RMSE_CUTOFF = 0.35 _DEFAULT_OPTIONS_REGRESSION = { -'max_iterations': 10, -'max_depth': 6, -'min_child_weight': 0.1, -'min_loss_reduction': 0.0, -'row_subsample': 0.8, -'column_subsample': 0.8, -'random_seed': None, -'metric': 'auto', -'model_checkpoint_interval': 5, -'model_checkpoint_path': None, -'resume_from_checkpoint': None, + "max_iterations": 10, + "max_depth": 6, + "min_child_weight": 0.1, + "min_loss_reduction": 0.0, + "row_subsample": 0.8, + "column_subsample": 0.8, + "random_seed": None, + "metric": "auto", + "model_checkpoint_interval": 5, + "model_checkpoint_path": None, + "resume_from_checkpoint": None, } _DEFAULT_OPTIONS_CLASSIFIER = copy.deepcopy(_DEFAULT_OPTIONS_REGRESSION) -_DEFAULT_OPTIONS_CLASSIFIER['class_weights'] = None +_DEFAULT_OPTIONS_CLASSIFIER["class_weights"] = None class RandomForestRegressionTest(unittest.TestCase): - @classmethod def setUpClass(self): self.data = tc.SFrame.read_csv(mushroom_dataset) - self.data['label'] = (self.data['label'] == 'p') + 20 + self.data["label"] = (self.data["label"] == "p") + 20 self.dtrain, self.dtest = self.data.random_split(0.8, seed=1) - self.param = {'max_depth': 5, - 'min_loss_reduction': 1, - 'min_child_weight': 1} - self.target = 'label' + self.param = {"max_depth": 5, "min_loss_reduction": 1, "min_child_weight": 1} + self.target = "label" self.unpacked_features = self.data.column_names() self.unpacked_features.remove(self.target) self.features = self.unpacked_features[:] - self.model = tc.random_forest_regression.create(self.dtrain, - target=self.target, - validation_set=self.dtest, - **self.param) + self.model = tc.random_forest_regression.create( + self.dtrain, target=self.target, validation_set=self.dtest, **self.param + ) self.def_opts = copy.deepcopy(_DEFAULT_OPTIONS_REGRESSION) self.opts = self.def_opts.copy() @@ -70,69 +66,72 @@ def setUpClass(self): # Answers # ------------------------------------------------------------------------ self.get_ans = { - 'column_subsample': lambda x: self.opts['column_subsample'], - 'unpacked_features': lambda x: x == self.unpacked_features, - 'features': lambda x: x == self.features, - 'max_depth': lambda x: x == self.opts['max_depth'], - 'max_iterations': lambda x: x == self.opts['max_iterations'], - 'min_child_weight': lambda x: x == self.opts['min_child_weight'], - 'min_loss_reduction': lambda x: x == self.opts['min_loss_reduction'], - 'num_examples': lambda x: x == self.dtrain.num_rows(), - 'num_unpacked_features': lambda x: x == 22, - 'num_features': lambda x: x == 22, - 'num_trees': lambda x: x == self.opts['max_iterations'], - 'num_validation_examples': lambda x: x == self.dtest.num_rows(), - 'row_subsample': lambda x: x == self.opts['row_subsample'], - 'target': lambda x: x == self.target, - 'training_rmse': lambda x: x > 0, - 'training_max_error': lambda x: x > 0, - 'training_time': lambda x: x >= 0, - 'trees_json': lambda x: isinstance(x, list), - 'validation_rmse': lambda x: x > 0, - 'validation_max_error': lambda x: x > 0, - 'random_seed': lambda x: x is None, - 'progress': lambda x: isinstance(x, tc.SFrame) or (x is None), - 'metric': lambda x: x == 'auto', - 'model_checkpoint_interval': lambda x: x == 5, - 'model_checkpoint_path': lambda x: x is None, - 'resume_from_checkpoint': lambda x: x is None, - 'validation_data': lambda x: isinstance(x, tc.SFrame) and len(x) == len(self.dtest), - 'disable_posttrain_evaluation' : lambda x: x == False, - } + "column_subsample": lambda x: self.opts["column_subsample"], + "unpacked_features": lambda x: x == self.unpacked_features, + "features": lambda x: x == self.features, + "max_depth": lambda x: x == self.opts["max_depth"], + "max_iterations": lambda x: x == self.opts["max_iterations"], + "min_child_weight": lambda x: x == self.opts["min_child_weight"], + "min_loss_reduction": lambda x: x == self.opts["min_loss_reduction"], + "num_examples": lambda x: x == self.dtrain.num_rows(), + "num_unpacked_features": lambda x: x == 22, + "num_features": lambda x: x == 22, + "num_trees": lambda x: x == self.opts["max_iterations"], + "num_validation_examples": lambda x: x == self.dtest.num_rows(), + "row_subsample": lambda x: x == self.opts["row_subsample"], + "target": lambda x: x == self.target, + "training_rmse": lambda x: x > 0, + "training_max_error": lambda x: x > 0, + "training_time": lambda x: x >= 0, + "trees_json": lambda x: isinstance(x, list), + "validation_rmse": lambda x: x > 0, + "validation_max_error": lambda x: x > 0, + "random_seed": lambda x: x is None, + "progress": lambda x: isinstance(x, tc.SFrame) or (x is None), + "metric": lambda x: x == "auto", + "model_checkpoint_interval": lambda x: x == 5, + "model_checkpoint_path": lambda x: x is None, + "resume_from_checkpoint": lambda x: x is None, + "validation_data": lambda x: isinstance(x, tc.SFrame) + and len(x) == len(self.dtest), + "disable_posttrain_evaluation": lambda x: x == False, + } self.metrics = ["rmse", "max_error"] self.fields_ans = self.get_ans.keys() def test_create(self): - model =tc.random_forest_regression.create(self.dtrain, target='label', - validation_set=self.dtest, - **self.param) + model = tc.random_forest_regression.create( + self.dtrain, target="label", validation_set=self.dtest, **self.param + ) - rmse = model.evaluate(self.dtest, 'rmse')['rmse'] + rmse = model.evaluate(self.dtest, "rmse")["rmse"] self.assertTrue(model is not None) self.assertTrue(rmse < RMSE_CUTOFF) dtrain = self.dtrain - dtrain['label'] = 10 - self.assertRaises(ToolkitError, - lambda:tc.random_forest_regression.create(self.dtrain, - target='label_wrong', **self.param)) + dtrain["label"] = 10 + self.assertRaises( + ToolkitError, + lambda: tc.random_forest_regression.create( + self.dtrain, target="label_wrong", **self.param + ), + ) def test_create_with_deprecated_num_trees(self): num_trees = 10 - model = tc.random_forest_regression.create(self.dtrain, target='label', - validation_set=self.dtest, - num_trees=num_trees) + model = tc.random_forest_regression.create( + self.dtrain, target="label", validation_set=self.dtest, num_trees=num_trees + ) self.assertTrue(model is not None) self.assertEqual(model.num_trees, num_trees) - def test__list_fields(self): """ Check the _list_fields function. Compare with the answer. """ model = self.model - fields = model._list_fields() + fields = model._list_fields() self.assertEqual(set(fields), set(self.fields_ans)) def test_get(self): @@ -143,8 +142,10 @@ def test_get(self): model = self.model for field in self.fields_ans: ans = model._get(field) - self.assertTrue(self.get_ans[field](ans), \ - '''Get failed in field {}. Output was {}.'''.format(field, ans)) + self.assertTrue( + self.get_ans[field](ans), + """Get failed in field {}. Output was {}.""".format(field, ans), + ) def test_summary(self): """ @@ -159,14 +160,14 @@ def test_repr(self): Check the repr function. """ model = self.model - ans = str(model) + ans = str(model) self.assertTrue(type(ans) == str) def test_save_and_load(self): """ Make sure saving and loading retains things. """ - filename = 'save_file%s' % (str(uuid.uuid4())) + filename = "save_file%s" % (str(uuid.uuid4())) self.model.save(filename) self.model = tc.load_model(filename) @@ -191,7 +192,6 @@ def test_save_and_load(self): except: self.assertTrue(False, "Failed during save & load diagnostics") - def test_predict(self): # Make predictions from SFrame. @@ -211,27 +211,29 @@ def test_evaluate(self): t = self.dtrain[self.target] p = model.predict(self.dtrain) self.sm_metrics = { - "max_error" : tc.toolkits.evaluation.max_error(t, p), - "rmse" : tc.toolkits.evaluation.rmse(t, p) + "max_error": tc.toolkits.evaluation.max_error(t, p), + "rmse": tc.toolkits.evaluation.rmse(t, p), } + def check_metric(ans, metric): self.assertTrue(ans is not None) self.assertTrue(metric in ans) - self.assertAlmostEqual(ans[metric], - self.sm_metrics[metric], - places = 4, - msg = "%s = (%s,%s)" % \ - (metric, ans[metric], self.sm_metrics[metric])) + self.assertAlmostEqual( + ans[metric], + self.sm_metrics[metric], + places=4, + msg="%s = (%s,%s)" % (metric, ans[metric], self.sm_metrics[metric]), + ) # Default ans = model.evaluate(self.dtrain) self.assertEqual(sorted(ans.keys()), sorted(self.metrics)) for m in self.metrics: - check_metric(ans, m) + check_metric(ans, m) # Individual for m in self.metrics: - ans = model.evaluate(self.dtrain, metric = m) + ans = model.evaluate(self.dtrain, metric=m) check_metric(ans, m) def test_extract_features(self): @@ -252,20 +254,28 @@ def test_list_and_dict_type(self): # make a more complicated dataset containing list and dictionary type columns complex_data = copy.copy(simple_data) - complex_data['random_list_noise'] = \ - tc.SArray([[random.gauss(0, 1) for j in range(3)] for i in range(complex_data.num_rows())]) - complex_data['random_dict_noise'] = \ - tc.SArray([{'x0': random.gauss(0, 1)} for i in range(complex_data.num_rows())]) + complex_data["random_list_noise"] = tc.SArray( + [ + [random.gauss(0, 1) for j in range(3)] + for i in range(complex_data.num_rows()) + ] + ) + complex_data["random_dict_noise"] = tc.SArray( + [{"x0": random.gauss(0, 1)} for i in range(complex_data.num_rows())] + ) complex_train, complex_test = complex_data.random_split(0.8, seed=1) - for (train, test) in [(simple_train, simple_test), (complex_train, complex_test)]: + for (train, test) in [ + (simple_train, simple_test), + (complex_train, complex_test), + ]: self._test_regression_model(train, test, rmse_threshold) - def _test_regression_model(self, train, test, rmse_threshold, target='label'): + def _test_regression_model(self, train, test, rmse_threshold, target="label"): # create - model = tc.random_forest_regression.create(train, target=target, - validation_set=test, - **self.param) + model = tc.random_forest_regression.create( + train, target=target, validation_set=test, **self.param + ) # predict pred = model.predict(test) pred_lst = model.predict(list(test)) @@ -273,7 +283,7 @@ def _test_regression_model(self, train, test, rmse_threshold, target='label'): self.assertLess(rmse, rmse_threshold) # evaluate - rmse_eval = model.evaluate(test, metric='rmse')['rmse'] + rmse_eval = model.evaluate(test, metric="rmse")["rmse"] self.assertTrue(rmse_eval < rmse_threshold) self.assertAlmostEqual(rmse_eval, rmse, delta=1e-2) @@ -282,7 +292,9 @@ def test_predict_new_category(self): # Arrange new_test = copy.copy(self.dtest) # change 'r' cap-color into a new color 'z' - new_test['cap-color'] = new_test['cap-color'].apply(lambda x: 'z'if x == 'r' else x) + new_test["cap-color"] = new_test["cap-color"].apply( + lambda x: "z" if x == "r" else x + ) # Act y1 = self.model.predict(new_test) @@ -294,16 +306,19 @@ def test_predict_new_category(self): def test_predict_new_dictionary_key(self): # Arrange new_data = copy.copy(self.data) - new_data['dict_color_feature'] = \ - new_data['cap-color'].apply(lambda x: {'cap-color': ord(x)}) + new_data["dict_color_feature"] = new_data["cap-color"].apply( + lambda x: {"cap-color": ord(x)} + ) train, test = new_data.random_split(0.8, seed=1) # add a new key to dictionary in predict time - test['dict_color_feature'] = test['dict_color_feature'].apply( - lambda x: dict(list(x.items()) + list({'cap-color2': x['cap-color']+1}.items()))) + test["dict_color_feature"] = test["dict_color_feature"].apply( + lambda x: dict( + list(x.items()) + list({"cap-color2": x["cap-color"] + 1}.items()) + ) + ) - model = tc.random_forest_regression.create(train, target='label', - **self.param) + model = tc.random_forest_regression.create(train, target="label", **self.param) # Act. y1 = model.predict(test) y2 = model.predict(list(test)) @@ -318,29 +333,30 @@ def test_predict_new_dictionary_key(self): ## ## --------------------------------------------------------------------------- + def test_suite_random_forest_classifier(): """ Create a test suite for each test case in the RandomForestClassifierTest. """ testCases = [ - binary_classification_integer_target, - binary_classification_string_target, - multiclass_classification_integer_target, - multiclass_classification_string_target - ] + binary_classification_integer_target, + binary_classification_string_target, + multiclass_classification_integer_target, + multiclass_classification_string_target, + ] for t in testCases: testcase_members = {} testcase_members[t.__name__] = classmethod(t) testcase_class = type( - 'RandomForestClassifierTest_%s' % t.__name__, + "RandomForestClassifierTest_%s" % t.__name__, (RandomForestClassifierTest,), - testcase_members + testcase_members, ) testcase_class.__test__ = True getattr(testcase_class, t.__name__)() for method in dir(testcase_class): - if method.startswith('test_'): + if method.startswith("test_"): testcase_instance = testcase_class(method) getattr(testcase_instance, method)() @@ -352,20 +368,21 @@ def binary_classification_integer_target(cls): # Get the data from the mushroom dataset. cls.data = tc.SFrame.read_csv(mushroom_dataset) cls.dtrain, cls.dtest = cls.data.random_split(0.8, seed=1) - cls.dtrain['label'] = cls.dtrain['label'] == 'p' - cls.dtest['label'] = cls.dtest['label'] == 'p' - cls.param = {'max_depth': 3, - 'min_loss_reduction': 1, - 'max_iterations': 2, - 'min_child_weight': 1} - cls.target = 'label' + cls.dtrain["label"] = cls.dtrain["label"] == "p" + cls.dtest["label"] = cls.dtest["label"] == "p" + cls.param = { + "max_depth": 3, + "min_loss_reduction": 1, + "max_iterations": 2, + "min_child_weight": 1, + } + cls.target = "label" cls.unpacked_features = cls.data.column_names() cls.unpacked_features.remove(cls.target) cls.features = cls.unpacked_features[:] - cls.model = tc.random_forest_classifier.create(cls.dtrain, - target=cls.target, - validation_set = cls.dtest, - **cls.param) + cls.model = tc.random_forest_classifier.create( + cls.dtrain, target=cls.target, validation_set=cls.dtest, **cls.param + ) cls.def_opts = copy.deepcopy(_DEFAULT_OPTIONS_CLASSIFIER) cls.opts = cls.def_opts.copy() @@ -374,141 +391,153 @@ def binary_classification_integer_target(cls): # Answers # ------------------------------------------------------------------------ - if 'classes' in cls.model._list_fields(): - num_examples_per_class = {\ - c:(cls.dtrain[cls.target] == c).sum() for c in cls.model.classes} + if "classes" in cls.model._list_fields(): + num_examples_per_class = { + c: (cls.dtrain[cls.target] == c).sum() for c in cls.model.classes + } cls.get_ans = { - 'column_subsample': lambda x: cls.opts['column_subsample'], - 'unpacked_features': lambda x: x == cls.unpacked_features, - 'features': lambda x: x == cls.features, - 'max_depth': lambda x: x == cls.opts['max_depth'], - 'max_iterations': lambda x: x == cls.opts['max_iterations'], - 'min_child_weight': lambda x: x == cls.opts['min_child_weight'], - 'min_loss_reduction': lambda x: x == cls.opts['min_loss_reduction'], - 'num_examples': lambda x: x == cls.dtrain.num_rows(), - 'num_examples_per_class': lambda x: x == num_examples_per_class, - 'num_classes': lambda x: x == 2, - 'classes': lambda x: x == [0,1], - 'num_unpacked_features': lambda x: x == 22, - 'num_features': lambda x: x == 22, - 'num_trees': lambda x: x == cls.opts['max_iterations'], - 'num_validation_examples': lambda x: x == cls.dtest.num_rows(), - 'row_subsample': lambda x: x == cls.opts['row_subsample'], - 'target': lambda x: x == cls.target, - 'training_accuracy': lambda x: x > 0, - 'training_log_loss': lambda x: x > 0, - 'training_time': lambda x: x >= 0, - 'class_weights': lambda x: x == {0:1.0, 1:1.0}, - 'trees_json': lambda x: isinstance(x, list), - 'validation_accuracy': lambda x: x > 0, - 'validation_log_loss': lambda x: x > 0, - 'random_seed': lambda x: x is None, - 'progress': lambda x: isinstance(x, tc.SFrame) or (x is None), - 'metric': lambda x: x == 'auto', - 'model_checkpoint_interval': lambda x: x == 5, - 'model_checkpoint_path': lambda x: x is None, - 'resume_from_checkpoint': lambda x: x is None, - 'training_auc': lambda x: x > 0, - 'training_confusion_matrix': lambda x: len(x) > 0, - 'training_f1_score': lambda x: x > 0, - 'training_precision': lambda x: x > 0, - 'training_recall': lambda x: x > 0, - 'training_report_by_class': lambda x: len(x) > 0, - 'training_roc_curve': lambda x: len(x) > 0, - 'validation_data': lambda x: isinstance(x, tc.SFrame) and len(x) == len(cls.dtest), - 'validation_auc': lambda x: x > 0, - 'validation_confusion_matrix': lambda x: len(x) > 0, - 'validation_f1_score': lambda x: x > 0, - 'validation_precision': lambda x: x > 0, - 'validation_recall': lambda x: x > 0, - 'validation_report_by_class': lambda x: len(x) > 0, - 'validation_roc_curve': lambda x: len(x) > 0, - 'disable_posttrain_evaluation' : lambda x: x == False, - } + "column_subsample": lambda x: cls.opts["column_subsample"], + "unpacked_features": lambda x: x == cls.unpacked_features, + "features": lambda x: x == cls.features, + "max_depth": lambda x: x == cls.opts["max_depth"], + "max_iterations": lambda x: x == cls.opts["max_iterations"], + "min_child_weight": lambda x: x == cls.opts["min_child_weight"], + "min_loss_reduction": lambda x: x == cls.opts["min_loss_reduction"], + "num_examples": lambda x: x == cls.dtrain.num_rows(), + "num_examples_per_class": lambda x: x == num_examples_per_class, + "num_classes": lambda x: x == 2, + "classes": lambda x: x == [0, 1], + "num_unpacked_features": lambda x: x == 22, + "num_features": lambda x: x == 22, + "num_trees": lambda x: x == cls.opts["max_iterations"], + "num_validation_examples": lambda x: x == cls.dtest.num_rows(), + "row_subsample": lambda x: x == cls.opts["row_subsample"], + "target": lambda x: x == cls.target, + "training_accuracy": lambda x: x > 0, + "training_log_loss": lambda x: x > 0, + "training_time": lambda x: x >= 0, + "class_weights": lambda x: x == {0: 1.0, 1: 1.0}, + "trees_json": lambda x: isinstance(x, list), + "validation_accuracy": lambda x: x > 0, + "validation_log_loss": lambda x: x > 0, + "random_seed": lambda x: x is None, + "progress": lambda x: isinstance(x, tc.SFrame) or (x is None), + "metric": lambda x: x == "auto", + "model_checkpoint_interval": lambda x: x == 5, + "model_checkpoint_path": lambda x: x is None, + "resume_from_checkpoint": lambda x: x is None, + "training_auc": lambda x: x > 0, + "training_confusion_matrix": lambda x: len(x) > 0, + "training_f1_score": lambda x: x > 0, + "training_precision": lambda x: x > 0, + "training_recall": lambda x: x > 0, + "training_report_by_class": lambda x: len(x) > 0, + "training_roc_curve": lambda x: len(x) > 0, + "validation_data": lambda x: isinstance(x, tc.SFrame) + and len(x) == len(cls.dtest), + "validation_auc": lambda x: x > 0, + "validation_confusion_matrix": lambda x: len(x) > 0, + "validation_f1_score": lambda x: x > 0, + "validation_precision": lambda x: x > 0, + "validation_recall": lambda x: x > 0, + "validation_report_by_class": lambda x: len(x) > 0, + "validation_roc_curve": lambda x: len(x) > 0, + "disable_posttrain_evaluation": lambda x: x == False, + } cls.fields_ans = cls.get_ans.keys() + def binary_classification_string_target(cls): binary_classification_integer_target(cls) cls.type = str - cls.dtrain['label'] = cls.dtrain['label'].astype(str) - cls.dtest['label'] = cls.dtest['label'].astype(str) - cls.dtrain['label'] = cls.dtrain['label'] + '-cat' - cls.dtest['label'] = cls.dtest['label'] + '-cat' - cls.model = tc.random_forest_classifier.create(cls.dtrain, - target=cls.target, - validation_set = cls.dtest, - **cls.param) - - num_examples_per_class = {c:(cls.dtrain[cls.target] == c).sum() for c in cls.model.classes} - cls.get_ans['num_examples_per_class'] = lambda x: x == num_examples_per_class - cls.get_ans['class_weights'] = lambda x: x == {'0-cat':1.0, '1-cat':1.0} - cls.get_ans['classes'] = lambda x: x == ['0-cat', '1-cat'] + cls.dtrain["label"] = cls.dtrain["label"].astype(str) + cls.dtest["label"] = cls.dtest["label"].astype(str) + cls.dtrain["label"] = cls.dtrain["label"] + "-cat" + cls.dtest["label"] = cls.dtest["label"] + "-cat" + cls.model = tc.random_forest_classifier.create( + cls.dtrain, target=cls.target, validation_set=cls.dtest, **cls.param + ) + + num_examples_per_class = { + c: (cls.dtrain[cls.target] == c).sum() for c in cls.model.classes + } + cls.get_ans["num_examples_per_class"] = lambda x: x == num_examples_per_class + cls.get_ans["class_weights"] = lambda x: x == {"0-cat": 1.0, "1-cat": 1.0} + cls.get_ans["classes"] = lambda x: x == ["0-cat", "1-cat"] def multiclass_classification_integer_target(cls): binary_classification_integer_target(cls) + def create_multiclass_label(row): - if row['label'] == 0: + if row["label"] == 0: return 0 - elif row['cap-surface'] == 'y': + elif row["cap-surface"] == "y": return 1 else: return 2 - cls.dtrain['label'] = cls.dtrain.apply(create_multiclass_label) - cls.dtest['label'] = cls.dtest.apply(create_multiclass_label) - cls.model = tc.random_forest_classifier.create(cls.dtrain, - target=cls.target, - validation_set = cls.dtest, - **cls.param) - - num_examples_per_class = {c:(cls.dtrain[cls.target] == c).sum() for c in cls.model.classes} - cls.get_ans['num_examples_per_class'] = lambda x: x == num_examples_per_class - cls.get_ans['num_classes'] = lambda x: x == 3 - cls.get_ans['num_trees'] = lambda x: x == 6 - cls.get_ans['classes'] = lambda x: set(x) == set([0,1,2]) - cls.get_ans['class_weights'] = lambda x: x == {0:1.0, 1:1.0, 2:1.0} + + cls.dtrain["label"] = cls.dtrain.apply(create_multiclass_label) + cls.dtest["label"] = cls.dtest.apply(create_multiclass_label) + cls.model = tc.random_forest_classifier.create( + cls.dtrain, target=cls.target, validation_set=cls.dtest, **cls.param + ) + + num_examples_per_class = { + c: (cls.dtrain[cls.target] == c).sum() for c in cls.model.classes + } + cls.get_ans["num_examples_per_class"] = lambda x: x == num_examples_per_class + cls.get_ans["num_classes"] = lambda x: x == 3 + cls.get_ans["num_trees"] = lambda x: x == 6 + cls.get_ans["classes"] = lambda x: set(x) == set([0, 1, 2]) + cls.get_ans["class_weights"] = lambda x: x == {0: 1.0, 1: 1.0, 2: 1.0} + def multiclass_classification_string_target(cls): multiclass_classification_integer_target(cls) cls.type = str - cls.dtrain['label'] = cls.dtrain['label'].astype(str) - cls.dtest['label'] = cls.dtest['label'].astype(str) - cls.model = tc.random_forest_classifier.create(cls.dtrain, - target=cls.target, - validation_set = cls.dtest, - **cls.param) - num_examples_per_class = {c:(cls.dtrain[cls.target] == c).sum() for c in cls.model.classes} - cls.get_ans['num_examples_per_class'] = lambda x: x == num_examples_per_class - cls.get_ans['classes'] = lambda x: set(x) == set(map(str, [0,1,2])) - cls.get_ans['class_weights'] = lambda x: x == {'0':1.0, '1':1.0, '2':1.0} + cls.dtrain["label"] = cls.dtrain["label"].astype(str) + cls.dtest["label"] = cls.dtest["label"].astype(str) + cls.model = tc.random_forest_classifier.create( + cls.dtrain, target=cls.target, validation_set=cls.dtest, **cls.param + ) + num_examples_per_class = { + c: (cls.dtrain[cls.target] == c).sum() for c in cls.model.classes + } + cls.get_ans["num_examples_per_class"] = lambda x: x == num_examples_per_class + cls.get_ans["classes"] = lambda x: set(x) == set(map(str, [0, 1, 2])) + cls.get_ans["class_weights"] = lambda x: x == {"0": 1.0, "1": 1.0, "2": 1.0} class RandomForestClassifierTest(unittest.TestCase): __test__ = False def test_create(self): - model =tc.random_forest_classifier.create(self.dtrain, target='label', - validation_set=self.dtest, - **self.param) + model = tc.random_forest_classifier.create( + self.dtrain, target="label", validation_set=self.dtest, **self.param + ) self.assertTrue(model is not None) - self.assertGreater(model.evaluate(self.dtest, 'accuracy')['accuracy'], 0.9) + self.assertGreater(model.evaluate(self.dtest, "accuracy")["accuracy"], 0.9) dtrain = self.dtrain[:] - dtrain['label'] = 10 - self.assertRaises(ToolkitError, - lambda: tc.random_forest_classifier.create(self.dtrain, - target='label_wrong', **self.param)) + dtrain["label"] = 10 + self.assertRaises( + ToolkitError, + lambda: tc.random_forest_classifier.create( + self.dtrain, target="label_wrong", **self.param + ), + ) def test__list_fields(self): """ Check the _list_fields function. Compare with the answer. """ model = self.model - fields = model._list_fields() + fields = model._list_fields() self.assertEqual(set(fields), set(self.fields_ans)) def test_get(self): @@ -519,8 +548,10 @@ def test_get(self): model = self.model for field in self.fields_ans: ans = model._get(field) - self.assertTrue(self.get_ans[field](ans), \ - '''Get failed in field {}. Output was {}.'''.format(field, ans)) + self.assertTrue( + self.get_ans[field](ans), + """Get failed in field {}. Output was {}.""".format(field, ans), + ) def test_summary(self): """ @@ -535,14 +566,14 @@ def test_repr(self): Check the repr function. """ model = self.model - ans = str(model) + ans = str(model) self.assertTrue(type(ans) == str, "Repr failed") def test_save_and_load(self): """ Make sure saving and loading retains things. """ - filename = 'save_file%s' % (str(uuid.uuid4())) + filename = "save_file%s" % (str(uuid.uuid4())) self.model.save(filename) self.model = tc.load_model(filename) @@ -567,21 +598,20 @@ def test_save_and_load(self): except: self.assertTrue(False, "Failed during save & load diagnostics") - def test_predict_topk(self): k = self.model.num_classes - y1 = self.model.predict_topk(self.dtest, k = k, output_type = 'rank') - self.assertEqual(y1['class'].dtype, self.type) - self.assertEqual(y1['id'].dtype, int) + y1 = self.model.predict_topk(self.dtest, k=k, output_type="rank") + self.assertEqual(y1["class"].dtype, self.type) + self.assertEqual(y1["id"].dtype, int) self.assertEqual(y1.num_rows(), self.dtest.num_rows() * k) - y1 = self.model.predict_topk(self.dtest, k = k, output_type = 'margin') - self.assertEqual(y1['id'].dtype, int) + y1 = self.model.predict_topk(self.dtest, k=k, output_type="margin") + self.assertEqual(y1["id"].dtype, int) self.assertEqual(y1.num_rows(), self.dtest.num_rows() * k) - y1 = self.model.predict_topk(self.dtest, k = k, output_type = 'probability') - self.assertEqual(y1['id'].dtype, int) + y1 = self.model.predict_topk(self.dtest, k=k, output_type="probability") + self.assertEqual(y1["id"].dtype, int) self.assertEqual(y1.num_rows(), self.dtest.num_rows() * k) def test_predict(self): @@ -589,20 +619,20 @@ def test_predict(self): self.assertEqual(len(y1), self.dtest.num_rows()) self.assertEqual(y1.dtype, self.type) - y1 = self.model.predict(self.dtest, output_type = 'probability_vector') + y1 = self.model.predict(self.dtest, output_type="probability_vector") self.assertEqual(len(y1), self.dtest.num_rows()) self.assertEqual(y1.dtype, array) k = self.model.num_classes if k == 2: - y1 = self.model.predict(self.dtest, 'margin') - y2 = self.model.predict(list(self.dtest), 'margin') + y1 = self.model.predict(self.dtest, "margin") + y2 = self.model.predict(list(self.dtest), "margin") self.assertEqual(len(y1), self.dtest.num_rows()) self.assertEqual(len(y2), self.dtest.num_rows()) self.assertEqual(list(y1), list(y2)) - y1 = self.model.predict(self.dtest, 'probability') - y2 = self.model.predict(list(self.dtest), 'probability') + y1 = self.model.predict(self.dtest, "probability") + y2 = self.model.predict(list(self.dtest), "probability") self.assertEqual(len(y1), self.dtest.num_rows()) self.assertEqual(len(y2), self.dtest.num_rows()) self.assertEqual(list(y1), list(y2)) @@ -610,68 +640,76 @@ def test_predict(self): def test_classify(self): y1 = self.model.classify(self.dtest) self.assertEqual(len(y1), len(self.dtest)) - self.assertEqual(y1['class'].dtype, self.type) - self.assertEqual(set(y1.column_names()), set(['class', 'probability'])) + self.assertEqual(y1["class"].dtype, self.type) + self.assertEqual(set(y1.column_names()), set(["class", "probability"])) def test_evaluate(self): t = self.dtrain[self.target] c = self.model.predict(self.dtrain, "class") p = self.model.predict(self.dtrain, "probability_vector") - ans_metrics = ["accuracy", "auc", "confusion_matrix", "f1_score", - "log_loss", "precision", "recall", "roc_curve"] + ans_metrics = [ + "accuracy", + "auc", + "confusion_matrix", + "f1_score", + "log_loss", + "precision", + "recall", + "roc_curve", + ] self.sm_metrics = { - "accuracy" : tc.toolkits.evaluation.accuracy(t, c), - "auc" : tc.toolkits.evaluation.auc(t, p), - "confusion_matrix" : tc.toolkits.evaluation.confusion_matrix(t, c), - "f1_score" : tc.toolkits.evaluation.f1_score(t, c), - "log_loss" : tc.toolkits.evaluation.log_loss(t, p), - "precision" : tc.toolkits.evaluation.precision(t, c), - "recall" : tc.toolkits.evaluation.recall(t, c), - "roc_curve" : tc.toolkits.evaluation.roc_curve(t, p), - } + "accuracy": tc.toolkits.evaluation.accuracy(t, c), + "auc": tc.toolkits.evaluation.auc(t, p), + "confusion_matrix": tc.toolkits.evaluation.confusion_matrix(t, c), + "f1_score": tc.toolkits.evaluation.f1_score(t, c), + "log_loss": tc.toolkits.evaluation.log_loss(t, p), + "precision": tc.toolkits.evaluation.precision(t, c), + "recall": tc.toolkits.evaluation.recall(t, c), + "roc_curve": tc.toolkits.evaluation.roc_curve(t, p), + } model = self.model + def check_cf_matrix(ans): self.assertTrue(ans is not None) - self.assertTrue('confusion_matrix' in ans) - cf = ans['confusion_matrix']\ - .sort(['target_label', 'predicted_label']) - ans_cf = self.sm_metrics['confusion_matrix']\ - .sort(['target_label', 'predicted_label']) - self.assertEqual(list(cf['count']), list(ans_cf['count'])) + self.assertTrue("confusion_matrix" in ans) + cf = ans["confusion_matrix"].sort(["target_label", "predicted_label"]) + ans_cf = self.sm_metrics["confusion_matrix"].sort( + ["target_label", "predicted_label"] + ) + self.assertEqual(list(cf["count"]), list(ans_cf["count"])) def check_roc_curve(ans): self.assertTrue(ans is not None) - self.assertTrue('roc_curve' in ans) - roc = ans['roc_curve'] + self.assertTrue("roc_curve" in ans) + roc = ans["roc_curve"] self.assertEqual(type(roc), tc.SFrame) def check_metric(ans, metric): - if metric == 'confusion_matrix': + if metric == "confusion_matrix": check_cf_matrix(ans) - elif metric == 'roc_curve': + elif metric == "roc_curve": check_roc_curve(ans) else: self.assertTrue(ans is not None) self.assertTrue(metric in ans) - self.assertAlmostEqual(ans[metric], - self.sm_metrics[metric], - places = 4, - msg = "%s = (%s,%s)" % \ - (metric, ans[metric], self.sm_metrics[metric])) + self.assertAlmostEqual( + ans[metric], + self.sm_metrics[metric], + places=4, + msg="%s = (%s,%s)" % (metric, ans[metric], self.sm_metrics[metric]), + ) # Default ans = model.evaluate(self.dtrain) self.assertEqual(sorted(ans.keys()), sorted(ans_metrics)) for m in ans_metrics: - check_metric(ans, m) + check_metric(ans, m) # Individual for m in ans_metrics: - ans = model.evaluate(self.dtrain, metric = m) + ans = model.evaluate(self.dtrain, metric=m) check_metric(ans, m) - - def test_extract_features(self): y1 = self.model.extract_features(self.dtest) self.assertTrue(len(y1) == len(self.dtest)) @@ -690,34 +728,43 @@ def test_list_and_dict_type(self): # make a more complicated dataset containing list and dictionary type columns complex_data = copy.copy(simple_data) - complex_data['random_list_noise'] = \ - tc.SArray([[random.gauss(0, 1) for j in range(3)] for i in range(complex_data.num_rows())]) - complex_data['random_dict_noise'] = \ - tc.SArray([{'x0': random.gauss(0, 1)} for i in range(complex_data.num_rows())]) + complex_data["random_list_noise"] = tc.SArray( + [ + [random.gauss(0, 1) for j in range(3)] + for i in range(complex_data.num_rows()) + ] + ) + complex_data["random_dict_noise"] = tc.SArray( + [{"x0": random.gauss(0, 1)} for i in range(complex_data.num_rows())] + ) complex_train, complex_test = complex_data.random_split(0.8, seed=1) - for (train, test) in [(simple_train, simple_test), (complex_train, complex_test)]: + for (train, test) in [ + (simple_train, simple_test), + (complex_train, complex_test), + ]: self._test_classifier_model(train, test, accuracy_threshold) - def _test_classifier_model(self, train, test, accuracy_threshold, target='label'): + def _test_classifier_model(self, train, test, accuracy_threshold, target="label"): # create - model = tc.random_forest_classifier.create(train, target=target, - validation_set=test, - **self.param) + model = tc.random_forest_classifier.create( + train, target=target, validation_set=test, **self.param + ) # predict - pred = model.predict(test, output_type = 'class') - pred_lst = model.predict(list(test), output_type = 'class') + pred = model.predict(test, output_type="class") + pred_lst = model.predict(list(test), output_type="class") self.assertEqual(list(pred), list(pred_lst)) - accuracy = model.evaluate(test, metric='accuracy') - self.assertGreater(accuracy['accuracy'], accuracy_threshold) - + accuracy = model.evaluate(test, metric="accuracy") + self.assertGreater(accuracy["accuracy"], accuracy_threshold) def test_predict_new_category(self): # Arrange new_test = copy.copy(self.dtest) # change 'r' cap-color into a new color 'z' - new_test['cap-color'] = new_test['cap-color'].apply(lambda x: 'z'if x == 'r' else x) + new_test["cap-color"] = new_test["cap-color"].apply( + lambda x: "z" if x == "r" else x + ) # Act y1 = self.model.predict(new_test) @@ -729,16 +776,19 @@ def test_predict_new_category(self): def test_predict_new_dictionary_key(self): # Arrange new_data = copy.copy(self.data) - new_data['dict_color_feature'] = \ - new_data['cap-color'].apply(lambda x: {'cap-color': ord(x)}) + new_data["dict_color_feature"] = new_data["cap-color"].apply( + lambda x: {"cap-color": ord(x)} + ) train, test = new_data.random_split(0.8, seed=1) # add a new key to dictionary in predict time - test['dict_color_feature'] = test['dict_color_feature'].apply( - lambda x: dict(list(x.items()) + list({'cap-color2': x['cap-color']+1}.items()))) + test["dict_color_feature"] = test["dict_color_feature"].apply( + lambda x: dict( + list(x.items()) + list({"cap-color2": x["cap-color"] + 1}.items()) + ) + ) - model = tc.random_forest_classifier.create(train, target='label', - **self.param) + model = tc.random_forest_classifier.create(train, target="label", **self.param) # Act. y1 = model.predict(test) y2 = model.predict(list(test)) @@ -746,26 +796,29 @@ def test_predict_new_dictionary_key(self): # Assert self.assertEqual(list(y1), list(y2)) -class TestStringTarget(unittest.TestCase): +class TestStringTarget(unittest.TestCase): def test_cat(self): import numpy as np + # Arrange np.random.seed(8) n, d = 1000, 100 sf = tc.SFrame() for i in range(d): - sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) - target = np.random.randint(2, size=n) - sf['target'] = target + sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) + target = np.random.randint(2, size=n) + sf["target"] = target - sf['target'] = sf['target'].astype(str) - sf['target'] = 'cat-' + sf['target'] - model = tc.random_forest_classifier.create(sf, 'target') + sf["target"] = sf["target"].astype(str) + sf["target"] = "cat-" + sf["target"] + model = tc.random_forest_classifier.create(sf, "target") # Act evaluation = model.evaluate(sf) # Assert - self.assertEqual(['cat-0', 'cat-1'], - sorted(list(evaluation['confusion_matrix']['target_label'].unique()))) + self.assertEqual( + ["cat-0", "cat-1"], + sorted(list(evaluation["confusion_matrix"]["target_label"].unique())), + ) diff --git a/src/python/turicreate/test/test_recommender.py b/src/python/turicreate/test/test_recommender.py index 9a68d09de2..391045812b 100644 --- a/src/python/turicreate/test/test_recommender.py +++ b/src/python/turicreate/test/test_recommender.py @@ -23,9 +23,15 @@ from turicreate.toolkits._main import ToolkitError from turicreate.toolkits.recommender.util import random_split_by_user import itertools -from turicreate.toolkits.recommender.item_similarity_recommender import ItemSimilarityRecommender -from turicreate.toolkits.recommender.factorization_recommender import FactorizationRecommender -from turicreate.toolkits.recommender.ranking_factorization_recommender import RankingFactorizationRecommender +from turicreate.toolkits.recommender.item_similarity_recommender import ( + ItemSimilarityRecommender, +) +from turicreate.toolkits.recommender.factorization_recommender import ( + FactorizationRecommender, +) +from turicreate.toolkits.recommender.ranking_factorization_recommender import ( + RankingFactorizationRecommender, +) from turicreate.toolkits.recommender.popularity_recommender import PopularityRecommender import array from turicreate.util import _assert_sframe_equal as assert_sframe_equal @@ -44,33 +50,36 @@ raise ImportError("Tests need Pandas") -DELTA = .000001 +DELTA = 0.000001 + +model_names = [ + "popularity_recommender", + "popularity_recommender_with_target", + "item_content_recommender", + "item_similarity_recommender", + "item_similarity_recommender_cosine", + "item_similarity_recommender_pearson", + "factorization_recommender", + "factorization_recommender_als", + "factorization_recommender_binary", + "factorization_recommender_nmf", + "ranking_factorization_recommender", + "ranking_factorization_recommender_ials", + "ranking_factorization_recommender_no_target", +] -model_names = ['popularity_recommender', - 'popularity_recommender_with_target', - 'item_content_recommender', - 'item_similarity_recommender', - 'item_similarity_recommender_cosine', - 'item_similarity_recommender_pearson', - 'factorization_recommender', - 'factorization_recommender_als', - 'factorization_recommender_binary', - 'factorization_recommender_nmf', - 'ranking_factorization_recommender', - 'ranking_factorization_recommender_ials', - 'ranking_factorization_recommender_no_target'] def _coreml_to_tc(preds): - return {'rank': preds['recommendations'], 'score': preds['probabilities']} + return {"rank": preds["recommendations"], "score": preds["probabilities"]} + class RecommenderTestBase(unittest.TestCase): - def _test_coreml_export(self, m, item_ids, ratings = None): + def _test_coreml_export(self, m, item_ids, ratings=None): temp_file_path = _mkstemp()[1] if m.target and ratings: - obs_data_sf = tc.SFrame({ - m.item_id: tc.SArray(item_ids), - m.target: tc.SArray(ratings) - }) + obs_data_sf = tc.SFrame( + {m.item_id: tc.SArray(item_ids), m.target: tc.SArray(ratings)} + ) predictions_tc = m.recommend_from_interactions(obs_data_sf, k=5) interactions = {item_ids[i]: ratings[i] for i in range(len(item_ids))} else: @@ -79,188 +88,192 @@ def _test_coreml_export(self, m, item_ids, ratings = None): # convert TC SFrame into same dict structure as CoreML will return predictions_tc_dict = dict() - for field in [u'score', u'rank']: + for field in [u"score", u"rank"]: predictions_tc_dict[field] = dict() item_ids_from_preds = predictions_tc[m.item_id] - scores_from_preds = predictions_tc['score'] - ranks_from_preds = predictions_tc['rank'] + scores_from_preds = predictions_tc["score"] + ranks_from_preds = predictions_tc["rank"] for i in range(len(item_ids_from_preds)): - predictions_tc_dict['score'][item_ids_from_preds[i]] = scores_from_preds[i] - predictions_tc_dict['rank'][item_ids_from_preds[i]] = ranks_from_preds[i] + predictions_tc_dict["score"][item_ids_from_preds[i]] = scores_from_preds[i] + predictions_tc_dict["rank"][item_ids_from_preds[i]] = ranks_from_preds[i] # Do the CoreML export and predict (if on macOS) m.export_coreml(temp_file_path) - if _mac_ver() >= (10,14): + if _mac_ver() >= (10, 14): coremlmodel = _coremltools.models.MLModel(temp_file_path) - predictions_coreml = coremlmodel.predict({'interactions': interactions, 'k': 5}) + predictions_coreml = coremlmodel.predict( + {"interactions": interactions, "k": 5} + ) # compare them self.assertEqual(predictions_tc_dict, _coreml_to_tc(predictions_coreml)) # compare user defined data import platform - self.assertDictEqual({ - 'com.github.apple.turicreate.version': tc.__version__, - 'com.github.apple.os.platform': platform.platform(), - }, dict(coremlmodel.user_defined_metadata) + + self.assertDictEqual( + { + "com.github.apple.turicreate.version": tc.__version__, + "com.github.apple.os.platform": platform.platform(), + }, + dict(coremlmodel.user_defined_metadata), ) os.unlink(temp_file_path) - def _get_trained_model(self, model_name, data, - user_id='user', - item_id='item', - target=None, - test_export_to_coreml=True, - **args): - - if model_name == 'default': - m = tc.recommender.create(data, user_id, item_id, - target=target, **args) - elif model_name == 'popularity_recommender': + def _get_trained_model( + self, + model_name, + data, + user_id="user", + item_id="item", + target=None, + test_export_to_coreml=True, + **args + ): + + if model_name == "default": + m = tc.recommender.create(data, user_id, item_id, target=target, **args) + elif model_name == "popularity_recommender": m = tc.popularity_recommender.create(data, user_id, item_id, **args) - elif model_name == 'popularity_recommender_with_target': - m = tc.popularity_recommender.create(data, user_id, item_id, target=target, **args) + elif model_name == "popularity_recommender_with_target": + m = tc.popularity_recommender.create( + data, user_id, item_id, target=target, **args + ) - elif model_name == 'ranking_factorization_recommender': + elif model_name == "ranking_factorization_recommender": args.setdefault("max_iterations", 10) - m = tc.ranking_factorization_recommender.create(data, - user_id=user_id, - item_id=item_id, - target=target, - **args) + m = tc.ranking_factorization_recommender.create( + data, user_id=user_id, item_id=item_id, target=target, **args + ) - elif model_name == 'item_content_recommender': + elif model_name == "item_content_recommender": items = data[item_id].unique() alt_data = tc.util.generate_random_sframe(len(items), "ccnv") alt_data[item_id] = items - m = tc.item_content_recommender.create(alt_data, - item_id=item_id, - observation_data = data, - user_id=user_id, - target=target, - **args) + m = tc.item_content_recommender.create( + alt_data, + item_id=item_id, + observation_data=data, + user_id=user_id, + target=target, + **args + ) - elif model_name == 'ranking_factorization_recommender_ials': + elif model_name == "ranking_factorization_recommender_ials": args.setdefault("max_iterations", 10) - m = tc.ranking_factorization_recommender.create(data, - user_id=user_id, - item_id=item_id, - target=target, - solver = 'ials', - **args) + m = tc.ranking_factorization_recommender.create( + data, + user_id=user_id, + item_id=item_id, + target=target, + solver="ials", + **args + ) - elif model_name == 'ranking_factorization_recommender_no_target': + elif model_name == "ranking_factorization_recommender_no_target": args.setdefault("max_iterations", 5) m = tc.ranking_factorization_recommender.create( - data[[user_id, item_id]], - user_id=user_id, - item_id=item_id, - **args) + data[[user_id, item_id]], user_id=user_id, item_id=item_id, **args + ) - elif model_name == 'factorization_recommender': + elif model_name == "factorization_recommender": args.setdefault("max_iterations", 10) - m = tc.factorization_recommender.create(data, - user_id=user_id, - item_id=item_id, - target=target, - **args) + m = tc.factorization_recommender.create( + data, user_id=user_id, item_id=item_id, target=target, **args + ) - elif model_name == 'factorization_recommender_nmf': + elif model_name == "factorization_recommender_nmf": args.setdefault("max_iterations", 10) - m = tc.recommender.factorization_recommender.create(data, - user_id=user_id, - item_id=item_id, - target=target, - nmf=True, - side_data_factorization=False, - **args) + m = tc.recommender.factorization_recommender.create( + data, + user_id=user_id, + item_id=item_id, + target=target, + nmf=True, + side_data_factorization=False, + **args + ) - elif model_name == 'factorization_recommender_als': + elif model_name == "factorization_recommender_als": args.setdefault("max_iterations", 10) - m = tc.recommender.factorization_recommender.create(data, - user_id=user_id, - item_id=item_id, - target=target, - solver = 'als', - **args) + m = tc.recommender.factorization_recommender.create( + data, + user_id=user_id, + item_id=item_id, + target=target, + solver="als", + **args + ) - elif model_name == 'factorization_recommender_binary': + elif model_name == "factorization_recommender_binary": args.setdefault("max_iterations", 10) if target in data.column_names(): - data[target] = data[target] > .5 # Make it binary - m = tc.recommender.ranking_factorization_recommender.create(data, - user_id=user_id, - item_id=item_id, - target=target, - **args) - - elif model_name == 'item_similarity_recommender': + data[target] = data[target] > 0.5 # Make it binary + m = tc.recommender.ranking_factorization_recommender.create( + data, user_id=user_id, item_id=item_id, target=target, **args + ) + + elif model_name == "item_similarity_recommender": + m = tc.recommender.item_similarity_recommender.create( + data, user_id, item_id, target=target, **args + ) + + elif model_name == "item_similarity_recommender_cosine": + m = tc.recommender.item_similarity_recommender.create( + data, user_id, item_id, target=target, similarity_type="cosine", **args + ) + + elif model_name == "item_similarity_recommender_pearson": m = tc.recommender.item_similarity_recommender.create( - data, user_id, item_id, target=target, **args) - - elif model_name == 'item_similarity_recommender_cosine': - m = tc.recommender.item_similarity_recommender.create(data, user_id, item_id, - target=target, - similarity_type = "cosine", - **args) - - elif model_name == 'item_similarity_recommender_pearson': - m = tc.recommender.item_similarity_recommender.create(data, - user_id, - item_id, - target=target, - similarity_type='pearson', - **args) - - elif model_name == 'itemcf-user-distance': - m = tc.recommender.item_similarity_recommender.create(data, - user_id, - item_id, - target=target, - similarity_type = "pearson", - **args) + data, user_id, item_id, target=target, similarity_type="pearson", **args + ) + + elif model_name == "itemcf-user-distance": + m = tc.recommender.item_similarity_recommender.create( + data, user_id, item_id, target=target, similarity_type="pearson", **args + ) nearest_items = m.get_similar_items() - m = tc.recommender.item_similarity_recommender.create(data, - user_id, - item_id, - target=target, - similarity_type = 'cosine', - nearest_items=nearest_items, - **args) - - - elif model_name == 'itemcf-jaccard-topk': - m = tc.recommender.item_similarity_recommender.create(data, - user_id, - item_id, - threshold=0.001, - only_top_k=100, - **args) + m = tc.recommender.item_similarity_recommender.create( + data, + user_id, + item_id, + target=target, + similarity_type="cosine", + nearest_items=nearest_items, + **args + ) - else: - raise NotImplementedError('Unknown model %s requested' % model_name) + elif model_name == "itemcf-jaccard-topk": + m = tc.recommender.item_similarity_recommender.create( + data, user_id, item_id, threshold=0.001, only_top_k=100, **args + ) + else: + raise NotImplementedError("Unknown model %s requested" % model_name) from itertools import chain, permutations + all_items = data[item_id].unique() - for some_items in chain(*list(permutations(all_items, i) for i in range(min(3, len(all_items))))): + for some_items in chain( + *list(permutations(all_items, i) for i in range(min(3, len(all_items)))) + ): some_items = list(some_items) if target: - ratings = [random.uniform(0,1) for i in some_items] + ratings = [random.uniform(0, 1) for i in some_items] else: ratings = None @@ -271,23 +284,31 @@ def _get_trained_model(self, model_name, data, class UserDefinedSimilarityTest(RecommenderTestBase): - def setUp(self): - sf = tc.SFrame({'user_id': ["0", "0", "0", "1", "1", "2", "2", "2"], - 'item_id': ["a", "b", "c", "a", "b", "b", "c", "d"], - 'rating': [.2, .3, .4, .1, .3, .3, .5, 0.9]}) - nearest_items = tc.SFrame({'item_id': ["a", "a", "b", "b", "b", "e"], - 'similar': ["b", "c", "a", "c", "e", "f"], - 'score': [.2, .3, .4, .1, .5, .8]}) + sf = tc.SFrame( + { + "user_id": ["0", "0", "0", "1", "1", "2", "2", "2"], + "item_id": ["a", "b", "c", "a", "b", "b", "c", "d"], + "rating": [0.2, 0.3, 0.4, 0.1, 0.3, 0.3, 0.5, 0.9], + } + ) + nearest_items = tc.SFrame( + { + "item_id": ["a", "a", "b", "b", "b", "e"], + "similar": ["b", "c", "a", "c", "e", "f"], + "score": [0.2, 0.3, 0.4, 0.1, 0.5, 0.8], + } + ) self.sf = sf self.nearest_items = nearest_items def test_default(self): - m = tc.recommender.item_similarity_recommender.create(self.sf, 'user_id', 'item_id', - nearest_items=self.nearest_items) + m = tc.recommender.item_similarity_recommender.create( + self.sf, "user_id", "item_id", nearest_items=self.nearest_items + ) - y = self.nearest_items.sort(['item_id', 'score']) - z = m.get_similar_items().sort(['item_id', 'score']) + y = self.nearest_items.sort(["item_id", "score"]) + z = m.get_similar_items().sort(["item_id", "score"]) assert all(y["item_id"] == z["item_id"]) assert all(y["similar"] == z["similar"]) @@ -305,24 +326,26 @@ def test_default(self): self._test_coreml_export(m, ['a','b']) """ - def tmp_test_bad_input(self): x = self.nearest_items - x.rename({'score': 'rating'}, inplace=True) + x.rename({"score": "rating"}, inplace=True) - self.assertRaises(ToolkitError, - lambda a: - tc.item_similarity_recommender.create(self.sf, 'user_id', 'item_id', - nearest_items=x), \ - 'Could not initialize using nearest_items argument.') + self.assertRaises( + ToolkitError, + lambda a: tc.item_similarity_recommender.create( + self.sf, "user_id", "item_id", nearest_items=x + ), + "Could not initialize using nearest_items argument.", + ) class ImmutableTest(RecommenderTestBase): - def setUp(self): - df_dict = {'user_id': ["0", "0", "0", "1", "1", "2", "2", "2"], - 'item_id': ["a", "b", "c", "a", "b", "b", "c", "d"], - 'rating': [.2, .3, .4, .1, .3, .3, .5, .9]} + df_dict = { + "user_id": ["0", "0", "0", "1", "1", "2", "2", "2"], + "item_id": ["a", "b", "c", "a", "b", "b", "c", "d"], + "rating": [0.2, 0.3, 0.4, 0.1, 0.3, 0.3, 0.5, 0.9], + } df = SFrame(df_dict) self.df = df self.df_dict = df_dict @@ -330,8 +353,12 @@ def setUp(self): self.test = self.df.tail(4) def test_immutable(self): - m = tc.recommender.factorization_recommender.create(self.df, target='rating', num_factors=2) - assert type(m) == tc.recommender.factorization_recommender.FactorizationRecommender + m = tc.recommender.factorization_recommender.create( + self.df, target="rating", num_factors=2 + ) + assert ( + type(m) == tc.recommender.factorization_recommender.FactorizationRecommender + ) yhat = m.predict(self.df) N, P = self.df.shape @@ -349,16 +376,19 @@ def test_immutable(self): class EdgeCasesTest(RecommenderTestBase): - def setUp(self): - df = SFrame({'user_id': ["0", "0", "0", "1", "1", "2", "2"], - 'item_id': ["a", "b", "c", "a", "b", "b", "c"], - 'rating': [ .2, .3, .4, .1, .3, .5, 0.9]}) + df = SFrame( + { + "user_id": ["0", "0", "0", "1", "1", "2", "2"], + "item_id": ["a", "b", "c", "a", "b", "b", "c"], + "rating": [0.2, 0.3, 0.4, 0.1, 0.3, 0.5, 0.9], + } + ) self.df = df m1 = tc.recommender.popularity_recommender.create(self.df) m2 = tc.recommender.item_similarity_recommender.create(self.df) - m3 = tc.recommender.factorization_recommender.create(self.df, target='rating') + m3 = tc.recommender.factorization_recommender.create(self.df, target="rating") self.trained_models = [m1, m2, m3] def test_recommend_empty(self): @@ -369,128 +399,156 @@ def test_recommend_empty(self): # observed all unique items. recs = m.recommend(k=3) - assert '0' not in set(list(recs['user_id'])) + assert "0" not in set(list(recs["user_id"])) # Ensure we do not return k recommendations if fewer are available. recs = m.recommend(k=10) - assert '0' not in set(list(recs['user_id'])) + assert "0" not in set(list(recs["user_id"])) assert recs.num_rows() == 2 # TODO: test CoreML export, when we can support serializing # factorization or popularity models - if isinstance(m, (tc.recommender.factorization_recommender.FactorizationRecommender, - tc.recommender.popularity_recommender.PopularityRecommender)): + if isinstance( + m, + ( + tc.recommender.factorization_recommender.FactorizationRecommender, + tc.recommender.popularity_recommender.PopularityRecommender, + ), + ): continue # TODO - why is item similarity failing here !? if m.target: - self._test_coreml_export(m, ['a','b'], [.2,.3]) + self._test_coreml_export(m, ["a", "b"], [0.2, 0.3]) else: - self._test_coreml_export(m, ['a','b']) - + self._test_coreml_export(m, ["a", "b"]) def test_no_target(self): - df = SFrame({'user_id': ["0", "0", "0", "1", "1", "2", "2"], - 'item_id': ["0", "0", "0", "1", "1", "2", "2"]}) + df = SFrame( + { + "user_id": ["0", "0", "0", "1", "1", "2", "2"], + "item_id": ["0", "0", "0", "1", "1", "2", "2"], + } + ) model_names_with_targets = [ - 'popularity_recommender_with_target', - 'item_similarity_recommender_cosine', - 'item_similarity_recommender_pearson', - 'factorization_recommender', - 'factorization_recommender_binary', - 'factorization_recommender_nmf', - 'ranking_factorization_recommender'] + "popularity_recommender_with_target", + "item_similarity_recommender_cosine", + "item_similarity_recommender_pearson", + "factorization_recommender", + "factorization_recommender_binary", + "factorization_recommender_nmf", + "ranking_factorization_recommender", + ] for m in model_names_with_targets: print(m) - self.assertRaises(Exception, lambda: - self._get_trained_model(m, df, - user_id='user_id', - item_id='item_id', - target='rating-that-isnt-really-there-just-like-your-happiness')) + self.assertRaises( + Exception, + lambda: self._get_trained_model( + m, + df, + user_id="user_id", + item_id="item_id", + target="rating-that-isnt-really-there-just-like-your-happiness", + ), + ) def test_bad_columns(self): - df = SFrame({'user_id': ["0", "0", "0", "1", "1", "2", "2"], - 'rating': [.2, .3, .4, .1, .3, .3, .9]}) + df = SFrame( + { + "user_id": ["0", "0", "0", "1", "1", "2", "2"], + "rating": [0.2, 0.3, 0.4, 0.1, 0.3, 0.3, 0.9], + } + ) for m in model_names: - self.assertRaises(Exception, lambda: - self._get_trained_model(m, df, - user_id='user_id', - item_id='user_id', - target='rating')) - self.assertRaises(Exception, lambda: - self._get_trained_model(m, df, - user_id='user_id', - item_id='item_id', - target='item_id')) - - self.assertRaises(Exception, lambda: - self._get_trained_model(m, df, - user_id='user_id', - item_id='user_id', - target='user_id')) + self.assertRaises( + Exception, + lambda: self._get_trained_model( + m, df, user_id="user_id", item_id="user_id", target="rating" + ), + ) + self.assertRaises( + Exception, + lambda: self._get_trained_model( + m, df, user_id="user_id", item_id="item_id", target="item_id" + ), + ) + self.assertRaises( + Exception, + lambda: self._get_trained_model( + m, df, user_id="user_id", item_id="user_id", target="user_id" + ), + ) class ReturnStatisticsTest(RecommenderTestBase): - def setUp(self): - self.df = SFrame({'user_id': ["0", "0", "0", "1", "1", "2", "2"], - 'item_id': ["a", "b", "c", "a", "b", "b", "c"], - 'rating': [.2, .3, .4, .1, .3, .5, 0.9]}) + self.df = SFrame( + { + "user_id": ["0", "0", "0", "1", "1", "2", "2"], + "item_id": ["a", "b", "c", "a", "b", "b", "c"], + "rating": [0.2, 0.3, 0.4, 0.1, 0.3, 0.5, 0.9], + } + ) def test_train_rmse_returns(self): - models_with_targets = ['factorization_recommender', - 'ranking_factorization_recommender', - 'item_similarity_recommender_pearson', - 'popularity_recommender_with_target'] + models_with_targets = [ + "factorization_recommender", + "ranking_factorization_recommender", + "item_similarity_recommender_pearson", + "popularity_recommender_with_target", + ] for name in models_with_targets: - ''' + """ TODO: Test CoreML export, when we have a dirarchiver that doesn't depend on the filesystem - ''' - m = self._get_trained_model(name, self.df, - user_id='user_id', - item_id='item_id', - target='rating', - test_export_to_coreml = False) + """ + m = self._get_trained_model( + name, + self.df, + user_id="user_id", + item_id="item_id", + target="rating", + test_export_to_coreml=False, + ) assert m.training_rmse is None or m.training_rmse >= 0 def test_get_counts(self): - m = self._get_trained_model('item_similarity_recommender_pearson', - self.df, - user_id='user_id', - item_id='item_id', - target='rating') + m = self._get_trained_model( + "item_similarity_recommender_pearson", + self.df, + user_id="user_id", + item_id="item_id", + target="rating", + ) item_counts = m.get_num_users_per_item() expected = tc.SFrame() - expected['item_id'] = ['a', 'b', 'c'] - expected['num_users'] = [2, 3, 2] + expected["item_id"] = ["a", "b", "c"] + expected["num_users"] = [2, 3, 2] assert_sframe_equal(item_counts, expected) user_counts = m.get_num_items_per_user() expected = tc.SFrame() - expected['user_id'] = ['0', '1', '2'] - expected['num_items'] = [3, 2, 2] + expected["user_id"] = ["0", "1", "2"] + expected["num_items"] = [3, 2, 2] assert_sframe_equal(user_counts, expected) - class NewUserTest(RecommenderTestBase): - def setUp(self): n_total = 200 df_size = 200 - users = ['U%d' % i for i in range(n_total)] - items = ['I%d' % i for i in range(n_total)] + users = ["U%d" % i for i in range(n_total)] + items = ["I%d" % i for i in range(n_total)] ratings = [float(i) / n_total for i in range(n_total)] random.seed(0) @@ -498,33 +556,47 @@ def setUp(self): def sample_set(L): return [random.choice(L) for i in range(df_size)] - self.df1 = SFrame({'user_id': sample_set(users), - 'item_id': sample_set(items), - 'rating': sample_set(ratings)}) - - self.df2 = SFrame({'user_id': sample_set(users), - 'item_id': sample_set(items), - 'rating': sample_set(ratings)}) - - self.df3 = SFrame({'user_id': sample_set(users), - 'item_id': sample_set(items), - 'rating': sample_set(ratings)}) - + self.df1 = SFrame( + { + "user_id": sample_set(users), + "item_id": sample_set(items), + "rating": sample_set(ratings), + } + ) + + self.df2 = SFrame( + { + "user_id": sample_set(users), + "item_id": sample_set(items), + "rating": sample_set(ratings), + } + ) + + self.df3 = SFrame( + { + "user_id": sample_set(users), + "item_id": sample_set(items), + "rating": sample_set(ratings), + } + ) def test_score(self): for model_name in model_names: print("New Users: Predict:", model_name) - ''' + """ TODO: Test CoreML export, when we have a dirarchiver that doesn't depend on the filesystem - ''' - m = self._get_trained_model(model_name, self.df1, - user_id='user_id', - item_id='item_id', - target='rating', - test_export_to_coreml=False) + """ + m = self._get_trained_model( + model_name, + self.df1, + user_id="user_id", + item_id="item_id", + target="rating", + test_export_to_coreml=False, + ) for data in [self.df1, self.df2, self.df3]: @@ -535,41 +607,54 @@ def test_recommend(self): for model_name in model_names: print("New Users: Recommend:", model_name) - ''' + """ TODO: Test CoreML export, when we have a dirarchiver that doesn't depend on the filesystem - ''' - m = self._get_trained_model(model_name, self.df1, - user_id='user_id', item_id='item_id', target='rating', - test_export_to_coreml=False) + """ + m = self._get_trained_model( + model_name, + self.df1, + user_id="user_id", + item_id="item_id", + target="rating", + test_export_to_coreml=False, + ) for data in [self.df2, self.df3]: recs = m.recommend(new_observation_data=data) assert type(recs) == SFrame + class GetSimilarItemsTest(RecommenderTestBase): def setUp(self): - item_column = 'my_item_column' - user_column = 'my_user_id' - sf = SFrame({user_column : ["0", "0", "0", "1", "1", "2", "2", "3", "3"], - item_column : ["a", "b", "c", "b", "c", "c", "d", "a", "d"], - 'rating': [1., .3, .5, .5, .6, 1., .1, .1, 1.5]}) - + item_column = "my_item_column" + user_column = "my_user_id" + sf = SFrame( + { + user_column: ["0", "0", "0", "1", "1", "2", "2", "3", "3"], + item_column: ["a", "b", "c", "b", "c", "c", "d", "a", "d"], + "rating": [1.0, 0.3, 0.5, 0.5, 0.6, 1.0, 0.1, 0.1, 1.5], + } + ) models = [] - for mod in [tc.recommender.item_similarity_recommender, - tc.recommender.factorization_recommender, - tc.recommender.popularity_recommender]: - m = mod.create(sf, user_column, item_column, target='rating') - ''' + for mod in [ + tc.recommender.item_similarity_recommender, + tc.recommender.factorization_recommender, + tc.recommender.popularity_recommender, + ]: + m = mod.create(sf, user_column, item_column, target="rating") + """ TODO: Test CoreML export, when we have a dirarchiver that doesn't depend on the filesystem - ''' - if isinstance(m, tc.recommender.item_similarity_recommender.ItemSimilarityRecommender): - self._test_coreml_export(m, ['a','b'], [1.,.3]) + """ + if isinstance( + m, tc.recommender.item_similarity_recommender.ItemSimilarityRecommender + ): + self._test_coreml_export(m, ["a", "b"], [1.0, 0.3]) models.append(m) self.sf = sf @@ -579,18 +664,18 @@ def setUp(self): def run_get_similar_items(self, m): - sf = m.get_similar_items(k = 2) - sf1 = m.get_similar_items(items=[], k = 2) - sf2 = m.get_similar_items(items=SArray(), k = 2) + sf = m.get_similar_items(k=2) + sf1 = m.get_similar_items(items=[], k=2) + sf2 = m.get_similar_items(items=SArray(), k=2) self.assertEqual(sf.num_rows(), m.num_items * 2) self.assertEqual(sf1.num_rows(), 0) self.assertEqual(sf2.num_rows(), 0) - sf3 = m.get_similar_items(items=["a", "b"], k = 2) - sf4 = m.get_similar_items(items=["a", "e"], k = 2) - sf5 = m.get_similar_items(items=["e", "f"], k = 2) - sf6 = m.get_similar_items(items=["e", "f"], k = 2, verbose=False) + sf3 = m.get_similar_items(items=["a", "b"], k=2) + sf4 = m.get_similar_items(items=["a", "e"], k=2) + sf5 = m.get_similar_items(items=["e", "f"], k=2) + sf6 = m.get_similar_items(items=["e", "f"], k=2, verbose=False) d = list(sf3) @@ -604,36 +689,33 @@ def run_get_similar_items(self, m): # similarity between "b" and "d" is 0. # So for "b", only two items are returned for item_similarity model - self.assertEqual(sf3.num_rows(), 2*2) + self.assertEqual(sf3.num_rows(), 2 * 2) self.assertEqual(sf4.num_rows(), 2) self.assertEqual(sf5.num_rows(), 0) self.assertEqual(sf6.num_rows(), 0) for s in [sf, sf1, sf2, sf3, sf4, sf5, sf6]: self.assertEqual(s.column_names()[0], self.item_column) - self.assertEqual(s.column_names()[1], 'similar') - self.assertEqual(s.column_names()[2], 'score') # TEMP - self.assertEqual(s.column_names()[3], 'rank') - - + self.assertEqual(s.column_names()[1], "similar") + self.assertEqual(s.column_names()[2], "score") # TEMP + self.assertEqual(s.column_names()[3], "rank") def run_get_similar_users(self, m): try: - sf = m.get_similar_users(k = 2) + sf = m.get_similar_users(k=2) except ToolkitError: return - sf1 = m.get_similar_users(users=[], k = 2) - sf2 = m.get_similar_users(users=SArray(), k = 2) + sf1 = m.get_similar_users(users=[], k=2) + sf2 = m.get_similar_users(users=SArray(), k=2) self.assertEqual(sf.num_rows(), m.num_users * 2) self.assertEqual(sf1.num_rows(), 0) self.assertEqual(sf2.num_rows(), 0) - - sf3 = m.get_similar_users(users=["0", "1"], k = 2) - sf4 = m.get_similar_users(users=["0", "4"], k = 2) - sf5 = m.get_similar_users(users=["4", "5"], k = 2) + sf3 = m.get_similar_users(users=["0", "1"], k=2) + sf4 = m.get_similar_users(users=["0", "4"], k=2) + sf5 = m.get_similar_users(users=["4", "5"], k=2) d = list(sf3) @@ -647,15 +729,15 @@ def run_get_similar_users(self, m): # similarity between "b" and "d" is 0. # So for "b", only two items are returned for item_similarity model - self.assertEqual(sf3.num_rows(), 2*2) + self.assertEqual(sf3.num_rows(), 2 * 2) self.assertEqual(sf4.num_rows(), 2) self.assertEqual(sf5.num_rows(), 0) for s in [sf, sf1, sf2, sf3, sf4, sf5]: self.assertEqual(s.column_names()[0], self.user_column) - self.assertEqual(s.column_names()[1], 'similar') - self.assertEqual(s.column_names()[2], 'score') - self.assertEqual(s.column_names()[3], 'rank') + self.assertEqual(s.column_names()[1], "similar") + self.assertEqual(s.column_names()[2], "score") + self.assertEqual(s.column_names()[3], "rank") def test_similar_items_correctness(self): @@ -663,8 +745,9 @@ def test_similar_items_correctness(self): # so everything is essentially unique. Test the get similar # items by duplicating an item and test that it's the most # similar. - data = (tc.util.generate_random_regression_sframe(500, "ZZ", random_seed=0) - .rename({"X1-Z" : "user", "X2-Z" : "item"}, inplace=True)) + data = tc.util.generate_random_regression_sframe( + 500, "ZZ", random_seed=0 + ).rename({"X1-Z": "user", "X2-Z": "item"}, inplace=True) item = data["item"][0] @@ -675,32 +758,47 @@ def test_similar_items_correctness(self): def test_model(m): - ret_sf1 = m.get_similar_items([item], k = 1) + ret_sf1 = m.get_similar_items([item], k=1) self.assertEqual(ret_sf1[0]["item"], item) self.assertEqual(ret_sf1[0]["similar"], 1000) - ret_sf2 = m.get_similar_items([1000], k = 1) + ret_sf2 = m.get_similar_items([1000], k=1) self.assertEqual(ret_sf2[0]["item"], 1000) self.assertEqual(ret_sf2[0]["similar"], item) - - - test_model(tc.recommender.item_similarity_recommender.create(data, "user", "item", "target")) - test_model(tc.recommender.popularity_recommender.create(data, "user", "item", "target")) - - test_model(tc.recommender.factorization_recommender.create( - data, "user", "item", "target", - num_factors = 8, - regularization = 0, - solver = "als", - max_iterations = 50) ) - - test_model(tc.recommender.ranking_factorization_recommender.create( - data[["user", "item"]], "user", "item", - regularization = 0, - num_factors = 8, - solver = "ials", - max_iterations = 50) ) + test_model( + tc.recommender.item_similarity_recommender.create( + data, "user", "item", "target" + ) + ) + test_model( + tc.recommender.popularity_recommender.create(data, "user", "item", "target") + ) + + test_model( + tc.recommender.factorization_recommender.create( + data, + "user", + "item", + "target", + num_factors=8, + regularization=0, + solver="als", + max_iterations=50, + ) + ) + + test_model( + tc.recommender.ranking_factorization_recommender.create( + data[["user", "item"]], + "user", + "item", + regularization=0, + num_factors=8, + solver="ials", + max_iterations=50, + ) + ) def test_similar_users_correctness(self): @@ -708,8 +806,9 @@ def test_similar_users_correctness(self): # so everything is essentially unique. Test the get similar # items by duplicating an item and test that it's the most # similar. - data = (tc.util.generate_random_regression_sframe(200, "zZ", random_seed=0) - .rename({"X1-z" : "user", "X2-Z" : "item"}, inplace=True)) + data = tc.util.generate_random_regression_sframe( + 200, "zZ", random_seed=0 + ).rename({"X1-z": "user", "X2-Z": "item"}, inplace=True) user = data["user"][0] item = data["item"][0] @@ -719,38 +818,51 @@ def test_similar_users_correctness(self): data = data.append(new_data) - for mod in [tc.recommender.factorization_recommender, - tc.recommender.popularity_recommender]: + for mod in [ + tc.recommender.factorization_recommender, + tc.recommender.popularity_recommender, + ]: m = mod.create(data, "user", "item", "target") def test_model(m): - ret_sf1 = m.get_similar_users([user], k = 1) + ret_sf1 = m.get_similar_users([user], k=1) self.assertEqual(ret_sf1[0]["user"], user) self.assertEqual(ret_sf1[0]["similar"], 10000) - ret_sf2 = m.get_similar_users([10000], k = 1) + ret_sf2 = m.get_similar_users([10000], k=1) self.assertEqual(ret_sf2[0]["user"], 10000) self.assertEqual(ret_sf2[0]["similar"], user) - ''' + """ TODO: Test CoreML export, when we have a dirarchiver that doesn't depend on the filesystem self._test_coreml_export(m, [item]) - ''' - - test_model(tc.recommender.factorization_recommender.create( - data, "user", "item", "target", - num_factors = 8, - regularization = 0, - solver = "als", - max_iterations = 50) ) - - test_model(tc.recommender.ranking_factorization_recommender.create( - data[["user", "item"]], "user", "item", - regularization = 0, - num_factors = 8, - solver = "ials", - max_iterations = 50) ) + """ + + test_model( + tc.recommender.factorization_recommender.create( + data, + "user", + "item", + "target", + num_factors=8, + regularization=0, + solver="als", + max_iterations=50, + ) + ) + + test_model( + tc.recommender.ranking_factorization_recommender.create( + data[["user", "item"]], + "user", + "item", + regularization=0, + num_factors=8, + solver="ials", + max_iterations=50, + ) + ) def test_get_similar_items(self): for m in self.models: @@ -762,60 +874,70 @@ def test_get_similar_users(self): class ItemSimTest(RecommenderTestBase): - def setUp(self): - df = SFrame({'user_id': ["0", "0", "0", "1", "1", "2", "2", "3", "3"], - 'item_id': ["a", "b", "c", "b", "c", "c", "d", "a", "d"], - 'rating': [1., .3, .5, .5, .6, 1., .1, .1, 1.5]}) + df = SFrame( + { + "user_id": ["0", "0", "0", "1", "1", "2", "2", "3", "3"], + "item_id": ["a", "b", "c", "b", "c", "c", "d", "a", "d"], + "rating": [1.0, 0.3, 0.5, 0.5, 0.6, 1.0, 0.1, 0.1, 1.5], + } + ) self.df = df - def test_save_load(self): - m = tc.recommender.item_similarity_recommender.create(self.df, user_id='user_id', - item_id = 'item_id', target='rating', - similarity_type = "cosine") + m = tc.recommender.item_similarity_recommender.create( + self.df, + user_id="user_id", + item_id="item_id", + target="rating", + similarity_type="cosine", + ) try: write_dir = tempfile.mkdtemp() - fn = join(write_dir, 'tmp.gl') + fn = join(write_dir, "tmp.gl") m.save(fn) m1 = tc.load_model(fn) rec = m.recommend() rec1 = m1.recommend() - assert (rec.head(100)['score'] - rec1.head(100)['score']).sum() < DELTA + assert (rec.head(100)["score"] - rec1.head(100)["score"]).sum() < DELTA finally: shutil.rmtree(write_dir) - self._test_coreml_export(m, ['a','b'], [1.,.3]) + self._test_coreml_export(m, ["a", "b"], [1.0, 0.3]) -class PopularityRecommenderTest(RecommenderTestBase): +class PopularityRecommenderTest(RecommenderTestBase): def setUp(self): - self.df = SFrame({'user_id': ["0", "0", "0", "1", "1", "2", "2", "2"], - 'item_id': ["a", "b", "c", "a", "b", "b", "c", "d"], - 'rating': [.2, .3, .4, .1, .3, .3, .5, .9]}) + self.df = SFrame( + { + "user_id": ["0", "0", "0", "1", "1", "2", "2", "2"], + "item_id": ["a", "b", "c", "a", "b", "b", "c", "d"], + "rating": [0.2, 0.3, 0.4, 0.1, 0.3, 0.3, 0.5, 0.9], + } + ) self.train = self.df.head(4) self.test = self.df.tail(4) self.sframe_comparer = util.SFrameComparer() def test_saved_predictions(self): - m = tc.popularity_recommender.create(self.train, target='rating') + m = tc.popularity_recommender.create(self.train, target="rating") preds = m.item_predictions assert preds is not None - assert preds.num_rows() == len(self.train['item_id'].unique()) + assert preds.num_rows() == len(self.train["item_id"].unique()) nn = m.get_similar_items() assert nn is not None - ''' + """ TODO: Test CoreML export, when we have a dirarchiver that doesn't depend on the filesystem self._test_coreml_export(m, ['a','b'], [.2,.3]) - ''' + """ def test_popularity_model(self): # Test that popularity without targets just counts each item's @@ -824,24 +946,24 @@ def test_popularity_model(self): actual = m.predict(self.df) expected = tc.SArray([2, 3, 2, 2, 3, 3, 2, 1]).astype(float) self.sframe_comparer._assert_sarray_equal(actual, expected) - ''' + """ TODO: Test CoreML export, when we have a dirarchiver that doesn't depend on the filesystem self._test_coreml_export(m, ['a','b']) - ''' + """ # Test a popularity model that uses the target column. - m = tc.popularity_recommender.create(self.train, target='rating') + m = tc.popularity_recommender.create(self.train, target="rating") yhat = m.predict(self.df) N, P = self.df.shape assert len(yhat) == N - ''' + """ TODO: Test CoreML export, when we have a dirarchiver that doesn't depend on the filesystem self._test_coreml_export(m, ['a','b'], [.2,.3]) - ''' + """ # Check the dimensions of predictions on the training set yhat = m.predict(self.train) @@ -849,8 +971,8 @@ def test_popularity_model(self): assert len(yhat) == N # Check the mean rating of the first item in the training set - first_item = self.train['item_id'][0] - mean_rating = self.train['rating'][self.train['item_id'] == first_item].mean() + first_item = self.train["item_id"][0] + mean_rating = self.train["rating"][self.train["item_id"] == first_item].mean() pred = yhat.head(1)[0] assert abs(pred - mean_rating) < DELTA @@ -861,17 +983,17 @@ def test_popularity_model(self): # Check the mean rating of the first item in the test set. # Should be the mean observed rating as seen in the training set. - first_item = self.test['item_id'][0] - mean_rating = self.train['rating'][self.train['item_id'] == first_item].mean() + first_item = self.test["item_id"][0] + mean_rating = self.train["rating"][self.train["item_id"] == first_item].mean() pred = yhat.head(1)[0] assert abs(pred - mean_rating) < DELTA chosen = "d" # pick an item that was in the test set and not in train - mean_rating = self.train['rating'].mean() + mean_rating = self.train["rating"].mean() yhat = m.predict(self.df) - ix = SArray(self.df['item_id'] == chosen) + ix = SArray(self.df["item_id"] == chosen) new_item_preds = yhat[ix][0] - assert(abs(new_item_preds - mean_rating) < DELTA) + assert abs(new_item_preds - mean_rating) < DELTA def test_largescale_recommendations(self): @@ -879,22 +1001,26 @@ def test_largescale_recommendations(self): for i in range(500): for j in range(i): - user_item_list.append( (i, j) ) + user_item_list.append((i, j)) random.shuffle(user_item_list) - sf = tc.SFrame({"user_id" : [u for u, i in user_item_list], - "item_id" : [i for u, i in user_item_list],}) + sf = tc.SFrame( + { + "user_id": [u for u, i in user_item_list], + "item_id": [i for u, i in user_item_list], + } + ) m = tc.popularity_recommender.create(sf) - ''' + """ TODO: Test CoreML export, when we have a dirarchiver that doesn't depend on the filesystem self._test_coreml_export(m, [1,2]) - ''' + """ - res = m.recommend(users = list(range(500)), k = 1) + res = m.recommend(users=list(range(500)), k=1) # Run some tests that make sure these are recommended correctly for d in res: @@ -908,7 +1034,7 @@ def test_largescale_recommendations(self): assert score == 499 - user # how many times it's been seen # Now run tests with top_k = 2 - res = m.recommend(users = list(range(500)), k = 2) + res = m.recommend(users=list(range(500)), k=2) # Run some tests that make sure these are recommended correctly for d in res[::2]: @@ -933,28 +1059,37 @@ def test_largescale_recommendations(self): def test_compare_against_baseline(self): - df_1 = tc.SFrame({'user_id': [1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4], - 'item_id':[1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4], - 'rating':[2,3,4,5,2,3.5,3.5,5,5,3,2.5,1.5,5,4,3,.5]}) + df_1 = tc.SFrame( + { + "user_id": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4], + "item_id": [1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4], + "rating": [2, 3, 4, 5, 2, 3.5, 3.5, 5, 5, 3, 2.5, 1.5, 5, 4, 3, 0.5], + } + ) df_2 = df_1[["user_id", "item_id"]] ############################################################ - base_model = tc.recommender.popularity_recommender.create(df_1, "user_id", "item_id", "rating") + base_model = tc.recommender.popularity_recommender.create( + df_1, "user_id", "item_id", "rating" + ) - for mod in [tc.recommender.popularity_recommender, - tc.recommender.factorization_recommender, - tc.recommender.item_similarity_recommender]: + for mod in [ + tc.recommender.popularity_recommender, + tc.recommender.factorization_recommender, + tc.recommender.item_similarity_recommender, + ]: m = mod.create(df_1, "user_id", "item_id", "rating") pop_model = m._get_popularity_baseline() self.sframe_comparer._assert_sframe_equal( - pop_model.recommend(), base_model.recommend()) + pop_model.recommend(), base_model.recommend() + ) - ''' + """ TODO: Test CoreML export, when we have a dirarchiver that doesn't depend on the filesystem @@ -962,47 +1097,75 @@ def test_compare_against_baseline(self): self._test_coreml_export(m, [1,2], [2,3]) else: self._test_coreml_export(m, [1,2]) - ''' + """ ############################################################ - base_model = tc.recommender.popularity_recommender.create(df_2, "user_id", "item_id") + base_model = tc.recommender.popularity_recommender.create( + df_2, "user_id", "item_id" + ) - for mod in [tc.recommender.popularity_recommender, - tc.recommender.ranking_factorization_recommender, - tc.recommender.item_similarity_recommender]: + for mod in [ + tc.recommender.popularity_recommender, + tc.recommender.ranking_factorization_recommender, + tc.recommender.item_similarity_recommender, + ]: m = mod.create(df_2, "user_id", "item_id") pop_model = m._get_popularity_baseline() self.sframe_comparer._assert_sframe_equal( - pop_model.recommend(), base_model.recommend()) + pop_model.recommend(), base_model.recommend() + ) - ''' + """ TODO: Test CoreML export, when we have a dirarchiver that doesn't depend on the filesystem self._test_coreml_export(m, [1,2]) - ''' + """ class RecommendTest(RecommenderTestBase): - def setUp(self): - self.sf = tc.SFrame({'user_id': [1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4], - 'item_id':[1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4], - 'rating':[2,3,4,5,2,3.5,3.5,5,5,3,2.5,1.5,5,4,3,.5], - 'time':[10,11,12,13,10,11,12,13,13,12,11,10,13,12,11,10]}) + self.sf = tc.SFrame( + { + "user_id": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4], + "item_id": [1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4], + "rating": [2, 3, 4, 5, 2, 3.5, 3.5, 5, 5, 3, 2.5, 1.5, 5, 4, 3, 0.5], + "time": [ + 10, + 11, + 12, + 13, + 10, + 11, + 12, + 13, + 13, + 12, + 11, + 10, + 13, + 12, + 11, + 10, + ], + } + ) def test_num_recommendations(self): - m = tc.ranking_factorization_recommender.create(self.sf, 'user_id', 'item_id', 'rating') + m = tc.ranking_factorization_recommender.create( + self.sf, "user_id", "item_id", "rating" + ) # Test that observation side data was used. - assert(set(m.observation_data_column_names) == - set(['user_id', 'item_id', 'rating', 'time'])) + assert set(m.observation_data_column_names) == set( + ["user_id", "item_id", "rating", "time"] + ) """ TODO: test CoreML export, when we can support having side data, and @@ -1016,87 +1179,164 @@ def test_num_recommendations(self): for diversity in [0, 1]: - r = m.recommend(users=None, k=5, items=None, - new_observation_data=None, new_user_data=None, - new_item_data=None, exclude=None, exclude_known=False, diversity=diversity) + r = m.recommend( + users=None, + k=5, + items=None, + new_observation_data=None, + new_user_data=None, + new_item_data=None, + exclude=None, + exclude_known=False, + diversity=diversity, + ) assert r.num_rows() == 16 r = m.recommend() assert r.num_rows() == 0 - r = m.recommend(users=None, k=2, items=None, - new_observation_data=None, new_user_data=None, - new_item_data=None, exclude=None, exclude_known=False, diversity=diversity) + r = m.recommend( + users=None, + k=2, + items=None, + new_observation_data=None, + new_user_data=None, + new_item_data=None, + exclude=None, + exclude_known=False, + diversity=diversity, + ) assert r.num_rows() == 8 - r = m.recommend(users=[1,2,3], k=2, items=None, - new_observation_data=None, new_user_data=None, - new_item_data=None, exclude=None, exclude_known=False, diversity=diversity) + r = m.recommend( + users=[1, 2, 3], + k=2, + items=None, + new_observation_data=None, + new_user_data=None, + new_item_data=None, + exclude=None, + exclude_known=False, + diversity=diversity, + ) assert r.num_rows() == 6 - r = m.recommend(users=[1,2,3], k=3, items=[2,3,4], - new_observation_data=None, new_user_data=None, - new_item_data=None, exclude=None, exclude_known=False, diversity=diversity) + r = m.recommend( + users=[1, 2, 3], + k=3, + items=[2, 3, 4], + new_observation_data=None, + new_user_data=None, + new_item_data=None, + exclude=None, + exclude_known=False, + diversity=diversity, + ) assert r.num_rows() == 9 - new_user_data = tc.SFrame({'user_id': [1,2,3], 'state': ['OR', 'WA', 'CA']}) - r = m.recommend(users=[1,2,3], k=3, items=[2,3,4], - new_observation_data=None, new_user_data=new_user_data, - new_item_data=None, exclude=None, exclude_known=False, diversity=diversity) + new_user_data = tc.SFrame( + {"user_id": [1, 2, 3], "state": ["OR", "WA", "CA"]} + ) + r = m.recommend( + users=[1, 2, 3], + k=3, + items=[2, 3, 4], + new_observation_data=None, + new_user_data=new_user_data, + new_item_data=None, + exclude=None, + exclude_known=False, + diversity=diversity, + ) assert r.num_rows() == 9 - restriction_sf = tc.SFrame({'user_id': [1,1,2,2], 'item_id': [1, 2, 2, 3]}) - r = m.recommend(users=[1,2,3], k=3, items=restriction_sf, - new_observation_data=None, new_user_data=None, - new_item_data=None, exclude=None, exclude_known=False, diversity=diversity) + restriction_sf = tc.SFrame( + {"user_id": [1, 1, 2, 2], "item_id": [1, 2, 2, 3]} + ) + r = m.recommend( + users=[1, 2, 3], + k=3, + items=restriction_sf, + new_observation_data=None, + new_user_data=None, + new_item_data=None, + exclude=None, + exclude_known=False, + diversity=diversity, + ) assert r.num_rows() == 4 s = set((t["user_id"], t["item_id"]) for t in r[["user_id", "item_id"]]) - assert s == {(1,1), (1,2), (2, 2), (2, 3)} - - exclude = tc.SFrame({'user_id': [2, 3], 'item_id': [2, 3]}) - r2 = m.recommend(users=None, k=5, items=None, - new_observation_data=None, new_user_data=None, - new_item_data=None, exclude=exclude, exclude_known=False, diversity=diversity) - assert r2.num_rows() == 16-2 + assert s == {(1, 1), (1, 2), (2, 2), (2, 3)} + + exclude = tc.SFrame({"user_id": [2, 3], "item_id": [2, 3]}) + r2 = m.recommend( + users=None, + k=5, + items=None, + new_observation_data=None, + new_user_data=None, + new_item_data=None, + exclude=exclude, + exclude_known=False, + diversity=diversity, + ) + assert r2.num_rows() == 16 - 2 def test_other_arguments(self): - obs = tc.SFrame({'user_id': [5,5,5,5], - 'item_id': [1,2,3,4], - 'time':[10, 10, 11, 11]}) - new_user_data = tc.SFrame({'user_id': [1,2,3], 'state': ['OR', 'WA', 'CA']}) - new_item_data = tc.SFrame({'item_id': [1,2,3], 'category': ['A', 'A', 'B']}) - exclude = tc.SFrame({'user_id': [2, 3], 'item_id': [2, 3]}) - - items = [None, tc.SArray([1,2,3])] + obs = tc.SFrame( + {"user_id": [5, 5, 5, 5], "item_id": [1, 2, 3, 4], "time": [10, 10, 11, 11]} + ) + new_user_data = tc.SFrame({"user_id": [1, 2, 3], "state": ["OR", "WA", "CA"]}) + new_item_data = tc.SFrame({"item_id": [1, 2, 3], "category": ["A", "A", "B"]}) + exclude = tc.SFrame({"user_id": [2, 3], "item_id": [2, 3]}) + + items = [None, tc.SArray([1, 2, 3])] new_observation_datas = [None, obs] new_user_datas = [None, new_user_data] new_item_datas = [None, new_item_data] excludes = [None, exclude] diversities = [0, 1] - options = [items, new_observation_datas, - new_user_datas, new_item_datas, excludes, diversities] - m = tc.ranking_factorization_recommender.create(self.sf, target='rating') + options = [ + items, + new_observation_datas, + new_user_datas, + new_item_datas, + excludes, + diversities, + ] + m = tc.ranking_factorization_recommender.create(self.sf, target="rating") """ TODO: test CoreML export, when we can support having side data, and thus, all factorization models. self._test_coreml_export(m, [1,2], [2,3]) """ - for (item, new_observation_data, new_user_data, new_item_data, exclude, diversity) \ - in itertools.product(*options): - r = m.recommend(users=None, k=5, items=item, - new_observation_data=new_observation_data, - new_user_data=new_user_data, - new_item_data=new_item_data, - diversity=diversity, - exclude=exclude, exclude_known=False) + for ( + item, + new_observation_data, + new_user_data, + new_item_data, + exclude, + diversity, + ) in itertools.product(*options): + r = m.recommend( + users=None, + k=5, + items=item, + new_observation_data=new_observation_data, + new_user_data=new_user_data, + new_item_data=new_item_data, + diversity=diversity, + exclude=exclude, + exclude_known=False, + ) assert r is not None def test_exclude(self): - exclude = tc.SFrame({'user_id': ["2", "3"], 'item_id': [2, 3]}) - m = tc.ranking_factorization_recommender.create(self.sf, target='rating') + exclude = tc.SFrame({"user_id": ["2", "3"], "item_id": [2, 3]}) + m = tc.ranking_factorization_recommender.create(self.sf, target="rating") r = m.recommend(users=None, exclude=exclude, exclude_known=False) assert r.num_rows() == 14 @@ -1104,16 +1344,24 @@ def test_side_data_used(self): # Test whether or not recommendations change when not using the explicit # observation side data. - sf2 = self.sf[['user_id', 'item_id', 'rating']] - user_query = tc.SFrame({'user_id': [1,1,2,2,3,3,4,4], - 'time': [10,13,10,13,10,13,10,13]}) - user_query_2 = tc.SFrame({'user_id': [1,1,2,2,3,3,4,4], - 'time': [13,10,13,10,13,10,13,10]}) + sf2 = self.sf[["user_id", "item_id", "rating"]] + user_query = tc.SFrame( + { + "user_id": [1, 1, 2, 2, 3, 3, 4, 4], + "time": [10, 13, 10, 13, 10, 13, 10, 13], + } + ) + user_query_2 = tc.SFrame( + { + "user_id": [1, 1, 2, 2, 3, 3, 4, 4], + "time": [13, 10, 13, 10, 13, 10, 13, 10], + } + ) # Use ranking factorization model with rating and make sure model # predictions change based on whether time is included - m1 = tc.ranking_factorization_recommender.create(self.sf, target='rating') - m2 = tc.ranking_factorization_recommender.create(sf2, target='rating') + m1 = tc.ranking_factorization_recommender.create(self.sf, target="rating") + m2 = tc.ranking_factorization_recommender.create(sf2, target="rating") """ TODO: test CoreML export, when we can support having side data, and @@ -1125,58 +1373,75 @@ def test_side_data_used(self): r1 = m1.recommend(exclude_known=False) r2 = m2.recommend(exclude_known=False) - assert not all(r1['score'] == r2['score']) + assert not all(r1["score"] == r2["score"]) r3 = m1.recommend(users=user_query, exclude_known=False) - self.assertRaises(ToolkitError, lambda: m2.recommend(users=user_query, exclude_known=False)) - assert not all(r1['score'] == r3['score'][::2]) + self.assertRaises( + ToolkitError, lambda: m2.recommend(users=user_query, exclude_known=False) + ) + assert not all(r1["score"] == r3["score"][::2]) # allow to take a list of dictionaries of the form [{'user_id':1,'time':10}] etc. - flattened_query = list(user_query.apply(lambda x:x)) + flattened_query = list(user_query.apply(lambda x: x)) r3 = m1.recommend(users=flattened_query, exclude_known=False) - self.assertRaises(ToolkitError, lambda: m2.recommend(users=flattened_query, exclude_known=False)) - assert not all(r1['score'] == r3['score'][::2]) + self.assertRaises( + ToolkitError, + lambda: m2.recommend(users=flattened_query, exclude_known=False), + ) + assert not all(r1["score"] == r3["score"][::2]) # Use ranking factorization model without rating and make sure model # predictions change based on whether time is included. - m1 = tc.ranking_factorization_recommender.create(self.sf[['user_id', 'item_id', 'time']]) - m2 = tc.ranking_factorization_recommender.create(self.sf[['user_id', 'item_id']]) + m1 = tc.ranking_factorization_recommender.create( + self.sf[["user_id", "item_id", "time"]] + ) + m2 = tc.ranking_factorization_recommender.create( + self.sf[["user_id", "item_id"]] + ) r1 = m1.recommend(exclude_known=False) r2 = m2.recommend(exclude_known=False) - assert not all(r1['score'] == r2['score']) + assert not all(r1["score"] == r2["score"]) r3 = m1.recommend(users=user_query, exclude_known=False) - assert not all(r1['score'] == r3['score'][::2]) + assert not all(r1["score"] == r3["score"][::2]) - self.assertRaises(ToolkitError, lambda: m2.recommend(users=user_query, exclude_known=False)) + self.assertRaises( + ToolkitError, lambda: m2.recommend(users=user_query, exclude_known=False) + ) r5 = m1.recommend(users=user_query_2, exclude_known=False) # Use factorization model with rating and make sure model # predictions change based on whether time is included. - m1 = tc.factorization_recommender.create(self.sf, target='rating') - m2 = tc.factorization_recommender.create(sf2, target='rating') + m1 = tc.factorization_recommender.create(self.sf, target="rating") + m2 = tc.factorization_recommender.create(sf2, target="rating") r1 = m1.recommend(exclude_known=False) r2 = m2.recommend(exclude_known=False) - assert not all(r1['score'] == r2['score']) + assert not all(r1["score"] == r2["score"]) r3 = m1.recommend(users=user_query, exclude_known=False) - self.assertRaises(ToolkitError, lambda: m2.recommend(users=user_query, exclude_known=False)) - assert not all(r1['score'] == r3['score'][::2]) + self.assertRaises( + ToolkitError, lambda: m2.recommend(users=user_query, exclude_known=False) + ) + assert not all(r1["score"] == r3["score"][::2]) -class ItemIntersectionTest(RecommenderTestBase): +class ItemIntersectionTest(RecommenderTestBase): def test_with_rating(self): - df_dict = {'user_id': [0, 1, 2, 0, 1, 3, 3, 4], - 'item_id': [0, 0, 0, 1, 1, 1, 2, 2], - 'rating': [2, 3, 4, 5, 6, 7, 8, 9]} + df_dict = { + "user_id": [0, 1, 2, 0, 1, 3, 3, 4], + "item_id": [0, 0, 0, 1, 1, 1, 2, 2], + "rating": [2, 3, 4, 5, 6, 7, 8, 9], + } df = SFrame(df_dict) - m = tc.recommender.popularity_recommender.create(df, "user_id", "item_id", "rating") - ''' + m = tc.recommender.popularity_recommender.create( + df, "user_id", "item_id", "rating" + ) + """ TODO: Test CoreML export, when we have a dirarchiver that doesn't depend on the filesystem @@ -1184,11 +1449,15 @@ def test_with_rating(self): self._test_coreml_export(m, [0,1], [2,5]) else: self._test_coreml_export(m, [0,1]) - ''' + """ - query_items = [(0,0), (0,1), (0,2), (1,2), (1,0), (0, 5)] - query_sf = tc.SFrame({"item_id_1" : [v1 for v1, v2 in query_items], - "item_id_2" : [v2 for v1, v2 in query_items]}) + query_items = [(0, 0), (0, 1), (0, 2), (1, 2), (1, 0), (0, 5)] + query_sf = tc.SFrame( + { + "item_id_1": [v1 for v1, v2 in query_items], + "item_id_2": [v2 for v1, v2 in query_items], + } + ) out = m._get_item_intersection_info(query_items) @@ -1196,57 +1465,65 @@ def test_with_rating(self): true_out["num_users_1"] = [3, 3, 3, 3, 3, 3] true_out["num_users_2"] = [3, 3, 2, 2, 3, 0] - true_out["intersection"] = [{0 : (2, 2), 1 : (3, 3), 2 : (4, 4)}, - {0 : (2, 5), 1 : (3, 6)}, - {}, - {3 : (7, 8)}, - {0 : (5, 2), 1 : (6, 3)}, - {}] + true_out["intersection"] = [ + {0: (2, 2), 1: (3, 3), 2: (4, 4)}, + {0: (2, 5), 1: (3, 6)}, + {}, + {3: (7, 8)}, + {0: (5, 2), 1: (6, 3)}, + {}, + ] util.SFrameComparer()._assert_sframe_equal(out, true_out) def test_without_rating(self): - df_dict = {'user_id': [0, 1, 2, 0, 1, 3, 3, 4], - 'item_id': [0, 0, 0, 1, 1, 1, 2, 2]} + df_dict = { + "user_id": [0, 1, 2, 0, 1, 3, 3, 4], + "item_id": [0, 0, 0, 1, 1, 1, 2, 2], + } df = SFrame(df_dict) m = tc.recommender.popularity_recommender.create(df, "user_id", "item_id") - ''' + """ TODO: Test CoreML export, when we have a dirarchiver that doesn't depend on the filesystem self._test_coreml_export(m, [0,1]) - ''' + """ - query_items = [(0,0), (0,1), (0,2), (1,2), (1,0), (0, 5)] - query_sf = tc.SFrame({"item_id_1" : [v1 for v1, v2 in query_items], - "item_id_2" : [v2 for v1, v2 in query_items]}) + query_items = [(0, 0), (0, 1), (0, 2), (1, 2), (1, 0), (0, 5)] + query_sf = tc.SFrame( + { + "item_id_1": [v1 for v1, v2 in query_items], + "item_id_2": [v2 for v1, v2 in query_items], + } + ) out = m._get_item_intersection_info(query_items) true_out = copy(query_sf) - true_out["num_users_1"] = [3, 3, 3, 3, 3, 3] true_out["num_users_2"] = [3, 3, 2, 2, 3, 0] - true_out["intersection"] = [{0 : (1, 1), 1 : (1, 1), 2 : (1, 1)}, - {0 : (1, 1), 1 : (1, 1)}, - {}, - {3 : (1, 1)}, - {0 : (1, 1), 1 : (1, 1)}, - {}] + true_out["intersection"] = [ + {0: (1, 1), 1: (1, 1), 2: (1, 1)}, + {0: (1, 1), 1: (1, 1)}, + {}, + {3: (1, 1)}, + {0: (1, 1), 1: (1, 1)}, + {}, + ] util.SFrameComparer()._assert_sframe_equal(out, true_out) class RecommenderTest(RecommenderTestBase): - def setUp(self): - ratings_test_data = '''userID,placeID,rating + ratings_test_data = """userID,placeID,rating U1077,135085,0 U1077,135038,1 U1077,132825,1 @@ -1267,20 +1544,20 @@ def setUp(self): U1103,132630,0 U1103,132613,0 U1103,132667,0 -U1103,135104,0''' +U1103,135104,0""" try: write_dir = tempfile.mkdtemp() filename = join(write_dir, "tmp_data_file") - o = open(filename, 'w') + o = open(filename, "w") o.write(ratings_test_data) o.close() self.df = SFrame.read_csv(filename) - self.df_dict = {n : list(self.df[n]) for n in self.df.column_names()} + self.df_dict = {n: list(self.df[n]) for n in self.df.column_names()} - ratings_test_data = '''userID,placeID,NewColumn,rating + ratings_test_data = """userID,placeID,NewColumn,rating U1077,135,085,0 U1077,135,038,1 U1077,132,825,1 @@ -1301,32 +1578,34 @@ def setUp(self): U1103,132,630,0 U1103,132,613,0 U1103,132,667,0 - U1103,135,104,0''' + U1103,135,104,0""" - o = open(filename, 'w') + o = open(filename, "w") o.write(ratings_test_data) o.close() self.df_with_extra_side = SFrame.read_csv(filename) - self.user_id = 'userID' - self.item_id = 'placeID' - self.target = 'rating' + self.user_id = "userID" + self.item_id = "placeID" + self.target = "rating" self.train = self.df.head(10) self.test = self.df.tail(self.df.num_rows() - 10) - self.df_improper = SFrame.read_csv(filename, delimiter='|') + self.df_improper = SFrame.read_csv(filename, delimiter="|") self.models = [] for model_name in model_names: - ''' + """ TODO: Test CoreML export, when we have a dirarchiver that doesn't depend on the filesystem - ''' - m = self._get_trained_model(model_name, - self.df, - user_id=self.user_id, - item_id=self.item_id, - target=self.target, - test_export_to_coreml=False) + """ + m = self._get_trained_model( + model_name, + self.df, + user_id=self.user_id, + item_id=self.item_id, + target=self.target, + test_export_to_coreml=False, + ) self.models.append(m) finally: @@ -1338,7 +1617,7 @@ def test_implicit(self): m4 = tc.recommender.create(implicit, self.user_id, self.item_id, ranking=False) assert m4 is not None - ''' + """ TODO: Test CoreML export, when we have a dirarchiver that doesn't depend on the filesystem @@ -1346,107 +1625,136 @@ def test_implicit(self): self._test_coreml_export(m4, ['135085','135038'], [0,1]) else: self._test_coreml_export(m4, ['135085','135038']) - ''' + """ def test_recommend_from_interactions(self): - data=tc.SFrame({'userId':[1,1,1,2,2,2,3,3,3],'movieId':[10,11,12,10,13,14,10,11,14]}) - exclude_pairs=tc.SFrame({'movieId':[14]}) - recommendations=tc.SFrame({'movieId':[10]}) - model=tc.item_similarity_recommender.create(data,user_id='userId',item_id='movieId') - recommendations=model.recommend_from_interactions(observed_items=recommendations,exclude=exclude_pairs) - assert 14 not in recommendations['movieId'] + data = tc.SFrame( + { + "userId": [1, 1, 1, 2, 2, 2, 3, 3, 3], + "movieId": [10, 11, 12, 10, 13, 14, 10, 11, 14], + } + ) + exclude_pairs = tc.SFrame({"movieId": [14]}) + recommendations = tc.SFrame({"movieId": [10]}) + model = tc.item_similarity_recommender.create( + data, user_id="userId", item_id="movieId" + ) + recommendations = model.recommend_from_interactions( + observed_items=recommendations, exclude=exclude_pairs + ) + assert 14 not in recommendations["movieId"] def test_compare_models(self): from turicreate.toolkits.recommender.util import compare_models - ''' + """ TODO: Test CoreML export, when we have a dirarchiver that doesn't depend on the filesystem - ''' - - model1 = self._get_trained_model('popularity_recommender_with_target', - self.df, - user_id=self.user_id, - item_id=self.item_id, - target=self.target, - test_export_to_coreml=False) - model2 = self._get_trained_model('item_similarity_recommender', - self.df, - user_id=self.user_id, - item_id=self.item_id, - target=self.target, - test_export_to_coreml=False) - - x = compare_models(self.test, [model1, model2], - skip_set=self.train, make_plot=False) - assert x is not None + """ - model1 = self._get_trained_model('popularity_recommender', - self.df, - user_id=self.user_id, - item_id=self.item_id, - target=self.target, - test_export_to_coreml=False) - model2 = self._get_trained_model('ranking_factorization_recommender_no_target', - self.df, - user_id=self.user_id, - item_id=self.item_id, - target=self.target, - test_export_to_coreml=False) - - x = compare_models(self.test, [model1, model2], - skip_set=self.train, make_plot=False) + model1 = self._get_trained_model( + "popularity_recommender_with_target", + self.df, + user_id=self.user_id, + item_id=self.item_id, + target=self.target, + test_export_to_coreml=False, + ) + model2 = self._get_trained_model( + "item_similarity_recommender", + self.df, + user_id=self.user_id, + item_id=self.item_id, + target=self.target, + test_export_to_coreml=False, + ) + + x = compare_models( + self.test, [model1, model2], skip_set=self.train, make_plot=False + ) assert x is not None + model1 = self._get_trained_model( + "popularity_recommender", + self.df, + user_id=self.user_id, + item_id=self.item_id, + target=self.target, + test_export_to_coreml=False, + ) + model2 = self._get_trained_model( + "ranking_factorization_recommender_no_target", + self.df, + user_id=self.user_id, + item_id=self.item_id, + target=self.target, + test_export_to_coreml=False, + ) + + x = compare_models( + self.test, [model1, model2], skip_set=self.train, make_plot=False + ) + assert x is not None - model2 = self._get_trained_model('factorization_recommender', - self.df, - user_id=self.user_id, - item_id=self.item_id, - target=self.target, - test_export_to_coreml=False) - - x = compare_models(self.test, [model1, model2], - skip_set=self.train, make_plot=False) + model2 = self._get_trained_model( + "factorization_recommender", + self.df, + user_id=self.user_id, + item_id=self.item_id, + target=self.target, + test_export_to_coreml=False, + ) + + x = compare_models( + self.test, [model1, model2], skip_set=self.train, make_plot=False + ) assert x is not None def _run_recommend_consistency_test(self, is_regression): if is_regression: - X1 = tc.util.generate_random_sframe(1000, 'ZZ') - X2 = tc.util.generate_random_sframe(500, 'ZZ') - methods = ['popularity_recommender', - 'ranking_factorization_recommender', - 'item_similarity_recommender'] + X1 = tc.util.generate_random_sframe(1000, "ZZ") + X2 = tc.util.generate_random_sframe(500, "ZZ") + methods = [ + "popularity_recommender", + "ranking_factorization_recommender", + "item_similarity_recommender", + ] else: - X1 = tc.util.generate_random_regression_sframe(1000, 'ZZ') - X2 = tc.util.generate_random_regression_sframe(500, 'ZZ') - - methods = ['popularity_recommender', - 'factorization_recommender', - 'item_similarity_recommender', - 'item_similarity_recommender_cosine', - 'item_similarity_recommender_pearson'] + X1 = tc.util.generate_random_regression_sframe(1000, "ZZ") + X2 = tc.util.generate_random_regression_sframe(500, "ZZ") + methods = [ + "popularity_recommender", + "factorization_recommender", + "item_similarity_recommender", + "item_similarity_recommender_cosine", + "item_similarity_recommender_pearson", + ] - users = list(X2['X1-Z'].unique()) + users = list(X2["X1-Z"].unique()) random.seed(0) - blocks = sorted(list(range(10)) + random.sample(range(X2.num_rows()), 50) + [X2.num_rows()]) - blocks_users = sorted(list(range(10)) + random.sample(range(len(users)), 30) + [len(users)]) + blocks = sorted( + list(range(10)) + random.sample(range(X2.num_rows()), 50) + [X2.num_rows()] + ) + blocks_users = sorted( + list(range(10)) + random.sample(range(len(users)), 30) + [len(users)] + ) for method in methods: - ''' + """ TODO: Test CoreML export, when we have a dirarchiver that doesn't depend on the filesystem - ''' + """ - m = self._get_trained_model(method, X1, *X1.column_names(), - test_export_to_coreml=False) + m = self._get_trained_model( + method, X1, *X1.column_names(), test_export_to_coreml=False + ) # Make sure the predictions are the same. preds_1 = m.predict(X2) @@ -1466,14 +1774,14 @@ def _run_recommend_consistency_test(self, is_regression): for r in pred_accumulator[1:]: pred_accumulator[0] = pred_accumulator[0].append(r) - assert( (preds_1 == pred_accumulator[0]).all() ) + assert (preds_1 == pred_accumulator[0]).all() ############################################################ # Broken up recommendations recs_1 = m.recommend(users) recs_accumulator = [] - for lb,ub in zip(blocks_users[:-1], blocks_users[1:]): + for lb, ub in zip(blocks_users[:-1], blocks_users[1:]): recs_accumulator.append(m.recommend(users[lb:ub])) for r in recs_accumulator[1:]: @@ -1484,12 +1792,13 @@ def _run_recommend_consistency_test(self, is_regression): ############################################################ # Broken up recommendations, new data - recs_2 = m.recommend(users, new_observation_data = X1) + recs_2 = m.recommend(users, new_observation_data=X1) recs_accumulator_2 = [] - for lb,ub in zip(blocks_users[:-1], blocks_users[1:]): + for lb, ub in zip(blocks_users[:-1], blocks_users[1:]): recs_accumulator_2.append( - m.recommend(users[lb:ub], new_observation_data = X1)) + m.recommend(users[lb:ub], new_observation_data=X1) + ) for r in recs_accumulator_2[1:]: recs_accumulator_2[0] = recs_accumulator_2[0].append(r) @@ -1503,49 +1812,75 @@ def test_recommend_consistency(self): self._run_recommend_consistency_test(True) def test_reg_value_regression(self): - for method in ['factorization_recommender', 'ranking_factorization_recommender']: + for method in [ + "factorization_recommender", + "ranking_factorization_recommender", + ]: for sgd_step_size in [0, 1e-20, 1e20]: - ''' + """ TODO: Test CoreML export, when we have a dirarchiver that doesn't depend on the filesystem - ''' - args = {'max_iterations': 5, - 'regularization': 1e-20, - 'linear_regularization': 1e20, - 'sgd_step_size': sgd_step_size} - m = self._get_trained_model(method, self.df, self.user_id, - self.item_id, target=self.target, - test_export_to_coreml=False, **args) + """ + args = { + "max_iterations": 5, + "regularization": 1e-20, + "linear_regularization": 1e20, + "sgd_step_size": sgd_step_size, + } + m = self._get_trained_model( + method, + self.df, + self.user_id, + self.item_id, + target=self.target, + test_export_to_coreml=False, + **args + ) assert m is not None - ''' + """ TODO: Test CoreML export, when we have a dirarchiver that doesn't depend on the filesystem - ''' - args = {'max_iterations': 5, - 'regularization': 1e20, - 'linear_regularization': 1e-20, - 'sgd_step_size': sgd_step_size} - m = self._get_trained_model(method, self.df, self.user_id, - self.item_id, target=self.target, - test_export_to_coreml=False, **args) + """ + args = { + "max_iterations": 5, + "regularization": 1e20, + "linear_regularization": 1e-20, + "sgd_step_size": sgd_step_size, + } + m = self._get_trained_model( + method, + self.df, + self.user_id, + self.item_id, + target=self.target, + test_export_to_coreml=False, + **args + ) assert m is not None def test_common_functions(self): - df_dict_tup = {k : tuple(v) for k, v in six.iteritems(self.df_dict)} + df_dict_tup = {k: tuple(v) for k, v in six.iteritems(self.df_dict)} df_dict_ar = df_dict_tup.copy() - df_dict_ar["rating"] = array.array('d', df_dict_ar["rating"]) + df_dict_ar["rating"] = array.array("d", df_dict_ar["rating"]) for m in self.models: m._name() for k in m._list_fields(): m._get(k) m.summary() - for data in [self.df, self.train, self.test, self.df_dict, df_dict_ar, df_dict_tup]: + for data in [ + self.df, + self.train, + self.test, + self.df_dict, + df_dict_ar, + df_dict_tup, + ]: preds = m.predict(data) assert type(preds) == SArray @@ -1557,11 +1892,11 @@ def test_common_functions(self): assert e is not None assert type(e) == dict - e = m.evaluate(data, metric='rmse', verbose=False) + e = m.evaluate(data, metric="rmse", verbose=False) assert e is not None assert type(e) == dict - e = m.evaluate(data, metric='precision_recall', verbose=False) + e = m.evaluate(data, metric="precision_recall", verbose=False) assert e is not None assert type(e) == dict @@ -1576,16 +1911,16 @@ def test_common_functions(self): assert (preds_1 == preds_2).all() assert (preds_1 == preds_3).all() - def test_random_split(self): sf = tc.util.generate_random_sframe(20000, "cc") - sf = sf.rename(dict(zip(sf.column_names(), ("user", "item")) ), inplace=True) + sf = sf.rename(dict(zip(sf.column_names(), ("user", "item"))), inplace=True) for proportion in [0, 0.2, 0.5, 1]: - for seed in [0,1,2,3, None]: + for seed in [0, 1, 2, 3, None]: - train, test = random_split_by_user(sf, "user", "item", - item_test_proportion = proportion) + train, test = random_split_by_user( + sf, "user", "item", item_test_proportion=proportion + ) if proportion == 0: self.assertEqual(test.num_rows(), 0) @@ -1605,27 +1940,36 @@ def test_random_split(self): assert sf.column_names() == test.column_names() assert type(train) == tc.SFrame, "Training split has incorrect type." - assert type(test) == tc.SFrame, "Test split has incorrect type." - assert sf.num_rows() == train.num_rows() + test.num_rows(), \ - "Train/test split not a proper partition." + assert type(test) == tc.SFrame, "Test split has incorrect type." + assert ( + sf.num_rows() == train.num_rows() + test.num_rows() + ), "Train/test split not a proper partition." def test_random_split_consistency(self): sf = tc.util.generate_random_sframe(20000, "cc") - sf = sf.rename(dict(zip(sf.column_names(), ("user", "item")) ), inplace=True) + sf = sf.rename(dict(zip(sf.column_names(), ("user", "item"))), inplace=True) for proportion in [0, 0.2, 0.8, 1]: - for seed in [0,1,2,3]: - - train, test = random_split_by_user(sf, "user", "item", - max_num_users=30, - random_seed = seed, - item_test_proportion = proportion) - - train_2, test_2 = random_split_by_user(sf, "user", "item", - max_num_users=30, - random_seed = seed, - item_test_proportion = proportion) + for seed in [0, 1, 2, 3]: + + train, test = random_split_by_user( + sf, + "user", + "item", + max_num_users=30, + random_seed=seed, + item_test_proportion=proportion, + ) + + train_2, test_2 = random_split_by_user( + sf, + "user", + "item", + max_num_users=30, + random_seed=seed, + item_test_proportion=proportion, + ) assert_sframe_equal(train, train_2) assert_sframe_equal(test, test_2) @@ -1639,21 +1983,25 @@ def test_random_split_random_generation(self): for proportion in [0, 0.2, 0.8, 1]: - for seed in [0,1,2,3]: - - train, test = random_split_by_user(sf, - user_id, - item_id, - random_seed = seed, - max_num_users = max_num_users, - item_test_proportion = proportion) - - train_2, test_2 = random_split_by_user(sf, - user_id, - item_id, - random_seed = seed, - max_num_users = max_num_users, - item_test_proportion = proportion) + for seed in [0, 1, 2, 3]: + + train, test = random_split_by_user( + sf, + user_id, + item_id, + random_seed=seed, + max_num_users=max_num_users, + item_test_proportion=proportion, + ) + + train_2, test_2 = random_split_by_user( + sf, + user_id, + item_id, + random_seed=seed, + max_num_users=max_num_users, + item_test_proportion=proportion, + ) if proportion == 0: self.assertEqual(test.num_rows(), 0) @@ -1665,8 +2013,9 @@ def test_random_split_random_generation(self): assert sf.column_names() == train.column_names() assert sf.column_names() == test.column_names() - assert sf.num_rows() == train.num_rows() + test.num_rows(), \ - "Train/test split not a proper partition." + assert ( + sf.num_rows() == train.num_rows() + test.num_rows() + ), "Train/test split not a proper partition." rows_1 = set([str(x) for x in (list(train) + list(test))]) rows_2 = set(str(x) for x in sf) @@ -1680,20 +2029,22 @@ def test_improper_parse(self): # improperly parsed data def _create_recommender(model_name): - ''' + """ TODO: Test CoreML export, when we have a dirarchiver that doesn't depend on the filesystem - ''' - return self._get_trained_model(model_name, self.df_improper, - user_id=self.user_id, - item_id=self.item_id, - target=self.target, - test_export_to_coreml=False) + """ + return self._get_trained_model( + model_name, + self.df_improper, + user_id=self.user_id, + item_id=self.item_id, + target=self.target, + test_export_to_coreml=False, + ) for model_name in model_names: - self.assertRaises(RuntimeError, - lambda: _create_recommender(model_name)) + self.assertRaises(RuntimeError, lambda: _create_recommender(model_name)) def test_save_and_load(self): @@ -1714,68 +2065,82 @@ def test_save_and_load(self): shutil.rmtree(write_dir) def test_bad_arguments(self): - def _create_recommender(m, args): - ''' + """ TODO: Test CoreML export, when we have a dirarchiver that doesn't depend on the filesystem - ''' - return self._get_trained_model(m, self.train, - user_id=self.user_id, - item_id=self.item_id, - target=self.target, - test_export_to_coreml=False, - **args) + """ + return self._get_trained_model( + m, + self.train, + user_id=self.user_id, + item_id=self.item_id, + target=self.target, + test_export_to_coreml=False, + **args + ) for m in model_names: - self.assertRaises(Exception, - lambda: _create_recommender(m, {'arg_that_isnt_there':None})) + self.assertRaises( + Exception, lambda: _create_recommender(m, {"arg_that_isnt_there": None}) + ) # Test bad arguments for factorization recommender - model_name = 'factorization_recommender' - - self.assertRaises(ToolkitError, - lambda: _create_recommender(model_name, {'num_factors':'chuck_norris'})) - - self.assertRaises(ToolkitError, - lambda: _create_recommender(model_name, {'num_factors': np.NaN})) - - self.assertRaises(ToolkitError, - lambda: _create_recommender(model_name, {'num_factors': np.Inf})) - - self.assertRaises(ToolkitError, - lambda: _create_recommender(model_name, {'num_factors': -np.Inf})) - - self.assertRaises(ToolkitError, - lambda: _create_recommender(model_name, {'num_factors': -1})) - - self.assertRaises(ToolkitError, - lambda: _create_recommender(model_name, {'max_iterations':1.5})) - - self.assertRaises(ToolkitError, - lambda: _create_recommender(model_name, {'max_iterations':'chuck norris'})) - - self.assertRaises(ToolkitError, - lambda: _create_recommender(model_name, {'max_iterations': -1})) - - self.assertRaises(ToolkitError, - lambda: _create_recommender(model_name, {'max_iterations': np.NaN})) - - - + model_name = "factorization_recommender" + + self.assertRaises( + ToolkitError, + lambda: _create_recommender(model_name, {"num_factors": "chuck_norris"}), + ) + + self.assertRaises( + ToolkitError, + lambda: _create_recommender(model_name, {"num_factors": np.NaN}), + ) + + self.assertRaises( + ToolkitError, + lambda: _create_recommender(model_name, {"num_factors": np.Inf}), + ) + + self.assertRaises( + ToolkitError, + lambda: _create_recommender(model_name, {"num_factors": -np.Inf}), + ) + + self.assertRaises( + ToolkitError, lambda: _create_recommender(model_name, {"num_factors": -1}) + ) + + self.assertRaises( + ToolkitError, + lambda: _create_recommender(model_name, {"max_iterations": 1.5}), + ) + + self.assertRaises( + ToolkitError, + lambda: _create_recommender(model_name, {"max_iterations": "chuck norris"}), + ) + + self.assertRaises( + ToolkitError, + lambda: _create_recommender(model_name, {"max_iterations": -1}), + ) + + self.assertRaises( + ToolkitError, + lambda: _create_recommender(model_name, {"max_iterations": np.NaN}), + ) def test_recommend(self): - m = tc.recommender.create(self.train, - self.user_id, - self.item_id, - verbose=False) + m = tc.recommender.create(self.train, self.user_id, self.item_id, verbose=False) # Test that we can provide recommendations for a subset of users. num_recommendations = 5 - train_users = SArray(list(set(self.train['userID']))) - test_users = SArray(list(set(self.test['userID']))) - ''' + train_users = SArray(list(set(self.train["userID"]))) + test_users = SArray(list(set(self.test["userID"]))) + """ TODO: Test CoreML export, when we have a dirarchiver that doesn't depend on the filesystem @@ -1783,7 +2148,7 @@ def test_recommend(self): self._test_coreml_export(m, ['135085','135038'], [0,1]) else: self._test_coreml_export(m, ['135085','135038']) - ''' + """ recs = m.recommend(users=train_users, k=num_recommendations) assert recs.num_rows() == num_recommendations * 3 # 3 unique users in train @@ -1800,7 +2165,12 @@ def test_recommend(self): users = self.test[self.user_id].unique() assert recs.num_rows() == num_recommendations * len(users) assert recs.num_columns() == 4 - assert list(recs.column_names()) == [self.user_id, self.item_id, 'score', 'rank'] + assert list(recs.column_names()) == [ + self.user_id, + self.item_id, + "score", + "rank", + ] # Check they are the correct type assert type(recs[self.user_id][0]) == type(self.df[self.user_id][0]) @@ -1821,7 +2191,9 @@ def test_recommend(self): assert actual_users == recommended_users # All recommended items are in the recommended set - actual_items = set(self.train[self.item_id].unique()) | set(self.test[self.item_id].unique()) + actual_items = set(self.train[self.item_id].unique()) | set( + self.test[self.item_id].unique() + ) recommended_items = set(r_train[self.item_id].unique()) assert recommended_items.issubset(actual_items) @@ -1840,25 +2212,25 @@ def test_recommend(self): # The provided data set are properly excluded from the returned # recommendations recs = m.recommend(users=train_users).to_dataframe() - actual_pairs = [str(a) + ' ' + str(b) - for (a, b) in zip(self.train[self.user_id], - self.train[self.item_id])] + actual_pairs = [ + str(a) + " " + str(b) + for (a, b) in zip(self.train[self.user_id], self.train[self.item_id]) + ] actual_pairs = frozenset(actual_pairs) - recommended_pairs = [str(a) + ' ' + str(b) - for (a, b) in zip(recs[self.user_id], recs[self.item_id])] + recommended_pairs = [ + str(a) + " " + str(b) + for (a, b) in zip(recs[self.user_id], recs[self.item_id]) + ] recommended_pairs = frozenset(recommended_pairs) assert actual_pairs.intersection(recommended_pairs) == frozenset() - def test_rmse(self): df = self.df.to_dataframe() sf = self.df - m = tc.recommender.factorization_recommender.create(sf, - self.user_id, - self.item_id, - target='rating', - verbose=False) - ''' + m = tc.recommender.factorization_recommender.create( + sf, self.user_id, self.item_id, target="rating", verbose=False + ) + """ TODO: Test CoreML export, when we have a dirarchiver that doesn't depend on the filesystem @@ -1866,46 +2238,55 @@ def test_rmse(self): self._test_coreml_export(m, ['135085','135038'], [0,1]) else: self._test_coreml_export(m, ['135085','135038']) - ''' + """ res = m.evaluate_rmse(sf, m.target) # Compute real answers - df['prediction'] = m.predict(sf) - df['residual'] = np.square(df['prediction'] - df['rating']) - rmse_by_user = df.groupby(self.user_id)['residual'].mean().apply(lambda x: np.sqrt(x)) - rmse_by_item = df.groupby(self.item_id)['residual'].mean().apply(lambda x: np.sqrt(x)) - rmse_overall = np.sqrt(df['residual'].mean()) + df["prediction"] = m.predict(sf) + df["residual"] = np.square(df["prediction"] - df["rating"]) + rmse_by_user = ( + df.groupby(self.user_id)["residual"].mean().apply(lambda x: np.sqrt(x)) + ) + rmse_by_item = ( + df.groupby(self.item_id)["residual"].mean().apply(lambda x: np.sqrt(x)) + ) + rmse_overall = np.sqrt(df["residual"].mean()) # Compare overall RMSE - assert (rmse_overall - res['rmse_overall']) < DELTA + assert (rmse_overall - res["rmse_overall"]) < DELTA # Compare by RMSE by user - cpp_rmse_by_user = res['rmse_by_user'].to_dataframe() + cpp_rmse_by_user = res["rmse_by_user"].to_dataframe() rmse_by_user = rmse_by_user.reset_index() - assert set(cpp_rmse_by_user.columns.values) == set([self.user_id, "rmse", "count"]) + assert set(cpp_rmse_by_user.columns.values) == set( + [self.user_id, "rmse", "count"] + ) # No NaNs assert not pd.isnull(cpp_rmse_by_user["rmse"]).any() assert not pd.isnull(cpp_rmse_by_user["count"]).any() - comparison = pd.merge(rmse_by_user, cpp_rmse_by_user, - left_on=self.user_id, right_on=self.user_id) - assert all(comparison['residual'] - comparison['rmse'] < DELTA) + comparison = pd.merge( + rmse_by_user, cpp_rmse_by_user, left_on=self.user_id, right_on=self.user_id + ) + assert all(comparison["residual"] - comparison["rmse"] < DELTA) - cpp_rmse_by_item = res['rmse_by_item'].to_dataframe() + cpp_rmse_by_item = res["rmse_by_item"].to_dataframe() - assert set(cpp_rmse_by_item.columns.values) == set([self.item_id, "rmse", "count"]) + assert set(cpp_rmse_by_item.columns.values) == set( + [self.item_id, "rmse", "count"] + ) # No NaNs assert not pd.isnull(cpp_rmse_by_item["rmse"]).any() assert not pd.isnull(cpp_rmse_by_item["count"]).any() rmse_by_item = rmse_by_item.reset_index() - comparison = pd.merge(rmse_by_item, cpp_rmse_by_item, - left_on=self.item_id, right_on=self.item_id) - assert all(comparison['residual'] - comparison['rmse'] < DELTA) - + comparison = pd.merge( + rmse_by_item, cpp_rmse_by_item, left_on=self.item_id, right_on=self.item_id + ) + assert all(comparison["residual"] - comparison["rmse"] < DELTA) def precision(self, actual, predicted, k): assert k > 0 @@ -1935,64 +2316,62 @@ def recall(self, actual, predicted, k): num_hits += 1.0 return num_hits / len(actual) - def test_small_example(self): sf = tc.SFrame() - sf['user_id'] = ['0','0','0','1','1','2','2','3','3'] - sf['item_id'] = ['A','B','C','B','C','C','D','A','D'] + sf["user_id"] = ["0", "0", "0", "1", "1", "2", "2", "3", "3"] + sf["item_id"] = ["A", "B", "C", "B", "C", "C", "D", "A", "D"] train = sf sf = tc.SFrame() - sf['user_id'] = ['0','0','0','1','1','2'] - sf['item_id'] = ['D','E','F','A','F','F'] + sf["user_id"] = ["0", "0", "0", "1", "1", "2"] + sf["item_id"] = ["D", "E", "F", "A", "F", "F"] test = sf - user_id = 'user_id' - item_id = 'item_id' + user_id = "user_id" + item_id = "item_id" - m = tc.recommender.item_similarity_recommender.create(train, - user_id, - item_id, - verbose=False) - ''' + m = tc.recommender.item_similarity_recommender.create( + train, user_id, item_id, verbose=False + ) + """ TODO: Test CoreML export, when we have a dirarchiver that doesn't depend on the filesystem self._test_coreml_export(m, ['A','B']) - ''' + """ train_preds = m.predict(train) assert len(train_preds) == train.num_rows() - recs = m.recommend(users=SArray(['0','1','2','3'])) - sorted_scores = recs.sort(['user_id', 'item_id'])['score'] - diffs = sorted_scores - tc.SArray([(1./3+0+1./4)/3, - (1./3+1./4)/2, - (1./4)/2, - (1./4+1./3)/2, - (2./3+0)/2, - (1./3+0)/2, - (1./4+1./4)/2]) + recs = m.recommend(users=SArray(["0", "1", "2", "3"])) + sorted_scores = recs.sort(["user_id", "item_id"])["score"] + diffs = sorted_scores - tc.SArray( + [ + (1.0 / 3 + 0 + 1.0 / 4) / 3, + (1.0 / 3 + 1.0 / 4) / 2, + (1.0 / 4) / 2, + (1.0 / 4 + 1.0 / 3) / 2, + (2.0 / 3 + 0) / 2, + (1.0 / 3 + 0) / 2, + (1.0 / 4 + 1.0 / 4) / 2, + ] + ) assert all(abs(diffs) < DELTA) test_preds = m.predict(test) assert len(test_preds) == test.num_rows() - def test_precision_recall(self): train = self.train test = self.test - m = tc.recommender.create(train, - self.user_id, - self.item_id, - verbose=False) + m = tc.recommender.create(train, self.user_id, self.item_id, verbose=False) - ''' + """ TODO: Test CoreML export, when we have a dirarchiver that doesn't depend on the filesystem self._test_coreml_export(m, ['135085','135038']) - ''' + """ users = set(list(test[self.user_id])) cutoff = 5 @@ -2002,20 +2381,26 @@ def test_precision_recall(self): assert type(r) == dict # Convert to DataFrame for the tests below - r = m.evaluate_precision_recall(test, cutoffs=[cutoff], - skip_set=train) + r = m.evaluate_precision_recall(test, cutoffs=[cutoff], skip_set=train) assert r is not None assert type(r) == dict # Test out of order columns - r = m.evaluate_precision_recall(test[[self.item_id, self.user_id]], cutoffs=[cutoff], - skip_set=train) + r = m.evaluate_precision_recall( + test[[self.item_id, self.user_id]], cutoffs=[cutoff], skip_set=train + ) assert r is not None assert type(r) == dict recs = m.recommend(k=cutoff).to_dataframe() - results = r['precision_recall_by_user'] - assert results.column_names() == [self.user_id, 'cutoff', 'precision', 'recall', 'count'] + results = r["precision_recall_by_user"] + assert results.column_names() == [ + self.user_id, + "cutoff", + "precision", + "recall", + "count", + ] for user in users: @@ -2028,8 +2413,8 @@ def test_precision_recall(self): if len(predicted) > 0: # Get answers from C++ - p = results['precision'][results[self.user_id] == user][0] - r = results['recall'][results[self.user_id] == user][0] + p = results["precision"][results[self.user_id] == user][0] + r = results["recall"][results[self.user_id] == user][0] p2 = self.precision(actual, predicted, cutoff) r2 = self.recall(actual, predicted, cutoff) @@ -2040,55 +2425,82 @@ def test_precision_recall(self): class SideDataTests(RecommenderTestBase): - def setUp(self): - self.sf = tc.SFrame({'userID': ["0", "0", "0", "1", "1", "2", "8", "10"], - 'placeID': ["a", "b", "c", "a", "b", "b", "c", "d"], - 'rating': [.2, .3, .4, .1, .3, .3, .5, 0.9]}) - - self.test_sf = tc.SFrame({'userID': ["0", "0", "0", "1", "1", "2", "2", "2"], - 'placeID': ["a", "b", "c", "a", "b", "b", "c", "d"], - 'rating': [.2, .3, .4, .1, .3, .3, .5, 0.9]}) - - - self.user_side = tc.SFrame({'userID': ["0", "1", "20"], - 'blahID': ["a", "b", "b"], - 'blahREAL': [0.1, 12, 22], - 'blahVECTOR': [array.array('d',[0,1]), array.array('d',[0,2]), array.array('d',[2,3])], - 'blahDICT': [{'a' : 23}, {'a' : 13}, {'a' : 23, 'b' : 32}], - }) - - self.item_side = tc.SFrame({'placeID': ["a", "b", "f"], - 'blahID2': ["e", "e", "3"], - 'blahREAL2': [0.4, 12, 22], - 'blahVECTOR2': [array.array('d', [0,1,2]), array.array('d',[0,2,3]), array.array('d', [2,3,3])], - 'blahDICT2': [{'a' : 23}, {'b' : 13}, {'a' : 23, 'c' : 32}]}) - - self.user_id = 'userID' - self.item_id = 'placeID' - self.target = 'rating' + self.sf = tc.SFrame( + { + "userID": ["0", "0", "0", "1", "1", "2", "8", "10"], + "placeID": ["a", "b", "c", "a", "b", "b", "c", "d"], + "rating": [0.2, 0.3, 0.4, 0.1, 0.3, 0.3, 0.5, 0.9], + } + ) + + self.test_sf = tc.SFrame( + { + "userID": ["0", "0", "0", "1", "1", "2", "2", "2"], + "placeID": ["a", "b", "c", "a", "b", "b", "c", "d"], + "rating": [0.2, 0.3, 0.4, 0.1, 0.3, 0.3, 0.5, 0.9], + } + ) + + self.user_side = tc.SFrame( + { + "userID": ["0", "1", "20"], + "blahID": ["a", "b", "b"], + "blahREAL": [0.1, 12, 22], + "blahVECTOR": [ + array.array("d", [0, 1]), + array.array("d", [0, 2]), + array.array("d", [2, 3]), + ], + "blahDICT": [{"a": 23}, {"a": 13}, {"a": 23, "b": 32}], + } + ) + + self.item_side = tc.SFrame( + { + "placeID": ["a", "b", "f"], + "blahID2": ["e", "e", "3"], + "blahREAL2": [0.4, 12, 22], + "blahVECTOR2": [ + array.array("d", [0, 1, 2]), + array.array("d", [0, 2, 3]), + array.array("d", [2, 3, 3]), + ], + "blahDICT2": [{"a": 23}, {"b": 13}, {"a": 23, "c": 32}], + } + ) + + self.user_id = "userID" + self.item_id = "placeID" + self.target = "rating" def test_bad_input(self): try: - m = tc.recommender.create(self.sf, self.user_id, self.item_id, user_data='bad input') + m = tc.recommender.create( + self.sf, self.user_id, self.item_id, user_data="bad input" + ) except TypeError as e: - self.assertEqual(str(e), 'Provided user_data must be an SFrame.') + self.assertEqual(str(e), "Provided user_data must be an SFrame.") try: - m = tc.recommender.create(self.sf, self.user_id, self.item_id, item_data='bad input') + m = tc.recommender.create( + self.sf, self.user_id, self.item_id, item_data="bad input" + ) except TypeError as e: - self.assertEqual(str(e), 'Provided item_data must be an SFrame.') - + self.assertEqual(str(e), "Provided item_data must be an SFrame.") def test_model_creation(self): def check_model(m): - expected = ['num_users', 'num_items', - 'num_user_side_features', - 'num_item_side_features', - 'user_side_data_column_names', - 'user_side_data_column_types', - 'item_side_data_column_names', - 'item_side_data_column_types'] + expected = [ + "num_users", + "num_items", + "num_user_side_features", + "num_item_side_features", + "user_side_data_column_names", + "user_side_data_column_types", + "item_side_data_column_names", + "item_side_data_column_types", + ] observed = m._list_fields() for e in expected: @@ -2105,19 +2517,21 @@ def check_model(m): if model_name == "item_content_recommender": continue - ''' + """ TODO: Test CoreML export, when we have a dirarchiver that doesn't depend on the filesystem - ''' - m = self._get_trained_model(model_name, - self.sf, - user_id=self.user_id, - item_id=self.item_id, - target=self.target, - test_export_to_coreml=False, - user_data=u_side, - item_data=i_side) + """ + m = self._get_trained_model( + model_name, + self.sf, + user_id=self.user_id, + item_id=self.item_id, + target=self.target, + test_export_to_coreml=False, + user_data=u_side, + item_data=i_side, + ) m.save(fn) m1 = tc.load_model(fn) check_model(m) @@ -2131,19 +2545,15 @@ def test_recommender_create(self): sf_binary_target = self.sf sf_binary_target[self.target] = 1 - m = tc.recommender.create(sf_w_target, - self.user_id, self.item_id) + m = tc.recommender.create(sf_w_target, self.user_id, self.item_id) assert isinstance(m, ItemSimilarityRecommender) - self._test_coreml_export(m, ['a','b']) + self._test_coreml_export(m, ["a", "b"]) - m = tc.recommender.create(sf_no_target, - self.user_id, self.item_id) + m = tc.recommender.create(sf_no_target, self.user_id, self.item_id) assert isinstance(m, ItemSimilarityRecommender) - self._test_coreml_export(m, ['a','b']) + self._test_coreml_export(m, ["a", "b"]) - m = tc.recommender.create(sf_w_target, - self.user_id, self.item_id, - self.target) + m = tc.recommender.create(sf_w_target, self.user_id, self.item_id, self.target) assert isinstance(m, RankingFactorizationRecommender) """ TODO: test CoreML export, when we can support serializing user @@ -2151,10 +2561,9 @@ def test_recommender_create(self): self._test_coreml_export(m, ['a','b'], [.2,.3]) """ - m = tc.recommender.create(sf_w_target, - self.user_id, self.item_id, - self.target, - ranking=False) + m = tc.recommender.create( + sf_w_target, self.user_id, self.item_id, self.target, ranking=False + ) assert isinstance(m, FactorizationRecommender) """ TODO: test CoreML export, when we can support serializing user @@ -2162,10 +2571,9 @@ def test_recommender_create(self): self._test_coreml_export(m, ['a','b'], [.2,.3]) """ - m = tc.recommender.create(sf_binary_target, - self.user_id, self.item_id, - self.target, - ranking=False) + m = tc.recommender.create( + sf_binary_target, self.user_id, self.item_id, self.target, ranking=False + ) assert isinstance(m, FactorizationRecommender) """ TODO: test CoreML export, when we can support serializing user @@ -2173,11 +2581,14 @@ def test_recommender_create(self): self._test_coreml_export(m, ['a','b'], [.2,.3]) """ - m = tc.recommender.create(sf_w_target, - self.user_id, self.item_id, - self.target, - ranking=False, - user_data=self.user_side) + m = tc.recommender.create( + sf_w_target, + self.user_id, + self.item_id, + self.target, + ranking=False, + user_data=self.user_side, + ) assert isinstance(m, FactorizationRecommender) """ TODO: test CoreML export, when we can support serializing user @@ -2185,11 +2596,14 @@ def test_recommender_create(self): self._test_coreml_export(m, ['a','b'], [.2,.3]) """ - m = tc.recommender.create(sf_w_target, - self.user_id, self.item_id, - self.target, - user_data=self.user_side, - item_data=self.item_side) + m = tc.recommender.create( + sf_w_target, + self.user_id, + self.item_id, + self.target, + user_data=self.user_side, + item_data=self.item_side, + ) assert isinstance(m, RankingFactorizationRecommender) """ TODO: test CoreML export, when we can support serializing user @@ -2197,11 +2611,14 @@ def test_recommender_create(self): self._test_coreml_export(m, ['a','b'], [.2,.3]) """ - m = tc.recommender.create(sf_no_target, - self.user_id, self.item_id, - ranking=False, - user_data=self.user_side, - item_data=self.item_side) + m = tc.recommender.create( + sf_no_target, + self.user_id, + self.item_id, + ranking=False, + user_data=self.user_side, + item_data=self.item_side, + ) assert isinstance(m, RankingFactorizationRecommender) """ TODO: test CoreML export, when we can support serializing user @@ -2209,44 +2626,67 @@ def test_recommender_create(self): self._test_coreml_export(m, ['a','b'], [.2,.3]) """ - m = tc.recommender.create(sf_no_target, - self.user_id, self.item_id, - ranking=False) + m = tc.recommender.create( + sf_no_target, self.user_id, self.item_id, ranking=False + ) assert isinstance(m, ItemSimilarityRecommender) - self._test_coreml_export(m, ['a','b']) + self._test_coreml_export(m, ["a", "b"]) class FactorizationTests(RecommenderTestBase): - def setUp(self): - self.model_names = ['default', - 'factorization_recommender', - 'ranking_factorization_recommender'] - - self.df = tc.SFrame({'userID': ["0", "0", "0", "1", "1", "2", "2", "2"], - 'placeID': ["a", "b", "c", "a", "b", "b", "c", "d"], - 'rating': [.2, .3, .4, .1, .3, .3, .5, 0.9]}) - self.test_df = tc.SFrame({'userID': ["0", "0", "0", "1", "1", "2", "2", "2"], - 'placeID': ["a", "b", "c", "a", "b", "b", "c", "d"], - 'rating': [.2, .3, .4, .1, .3, .3, .5, 0.9]}) - - - self.user_side = tc.SFrame({'userID': ["0", "1", "2"], - 'blahID': ["a", "b", "b"], - 'blahREAL': [0.1, 12, 22], - 'blahVECTOR': [array.array('d',[0,1]), array.array('d',[0,2]), array.array('d',[2,3])], - 'blahDICT': [{'a' : 23}, {'a' : 13}, {'a' : 23, 'b' : 32}], - }) - - self.item_side = tc.SFrame({'placeID': ["a", "b", "d"], - 'blahID2': ["e", "e", "3"], - 'blahREAL2': [0.4, 12, 22], - 'blahVECTOR2': [array.array('d',[0,1,2]), array.array('d',[0,2,3]), array.array('d',[2,3,3])], - 'blahDICT2': [{'a' : 23}, {'b' : 13}, {'a' : 23, 'c' : 32, None : 12}]}) - - self.user_id = 'userID' - self.item_id = 'placeID' - self.target = 'rating' + self.model_names = [ + "default", + "factorization_recommender", + "ranking_factorization_recommender", + ] + + self.df = tc.SFrame( + { + "userID": ["0", "0", "0", "1", "1", "2", "2", "2"], + "placeID": ["a", "b", "c", "a", "b", "b", "c", "d"], + "rating": [0.2, 0.3, 0.4, 0.1, 0.3, 0.3, 0.5, 0.9], + } + ) + self.test_df = tc.SFrame( + { + "userID": ["0", "0", "0", "1", "1", "2", "2", "2"], + "placeID": ["a", "b", "c", "a", "b", "b", "c", "d"], + "rating": [0.2, 0.3, 0.4, 0.1, 0.3, 0.3, 0.5, 0.9], + } + ) + + self.user_side = tc.SFrame( + { + "userID": ["0", "1", "2"], + "blahID": ["a", "b", "b"], + "blahREAL": [0.1, 12, 22], + "blahVECTOR": [ + array.array("d", [0, 1]), + array.array("d", [0, 2]), + array.array("d", [2, 3]), + ], + "blahDICT": [{"a": 23}, {"a": 13}, {"a": 23, "b": 32}], + } + ) + + self.item_side = tc.SFrame( + { + "placeID": ["a", "b", "d"], + "blahID2": ["e", "e", "3"], + "blahREAL2": [0.4, 12, 22], + "blahVECTOR2": [ + array.array("d", [0, 1, 2]), + array.array("d", [0, 2, 3]), + array.array("d", [2, 3, 3]), + ], + "blahDICT2": [{"a": 23}, {"b": 13}, {"a": 23, "c": 32, None: 12}], + } + ) + + self.user_id = "userID" + self.item_id = "placeID" + self.target = "rating" self.models = [] @@ -2255,80 +2695,89 @@ def setUp(self): for model_name in self.model_names: - m = self._get_trained_model(model_name, - self.df, - user_id = self.user_id, - item_id = self.item_id, - target = self.target, - test_export_to_coreml=False, - user_data = u_side, - item_data = i_side) + m = self._get_trained_model( + model_name, + self.df, + user_id=self.user_id, + item_id=self.item_id, + target=self.target, + test_export_to_coreml=False, + user_data=u_side, + item_data=i_side, + ) self.models.append((model_name, m)) - def test_evaluate_with_side_data(self): for u_side in [None, self.user_side]: for i_side in [None, self.item_side]: for (mname, m) in self.models: - e = m.evaluate(self.test_df, - new_user_data=u_side, - new_item_data=i_side, - verbose=False) - assert 'precision_recall_by_user' in e + e = m.evaluate( + self.test_df, + new_user_data=u_side, + new_item_data=i_side, + verbose=False, + ) + assert "precision_recall_by_user" in e - recs = m.recommend(k = 1, - new_user_data=u_side, - new_item_data=i_side) + recs = m.recommend(k=1, new_user_data=u_side, new_item_data=i_side) assert recs is not None assert recs.num_rows() == len(self.df[self.user_id].unique()) - def test_data_summary_fields(self): for (model_name, m) in self.models: - expected = ['num_users', 'num_items', - 'num_user_side_features', - 'num_item_side_features', - 'observation_data_column_names', - 'user_side_data_column_names', - 'user_side_data_column_types', - 'item_side_data_column_names', - 'item_side_data_column_types'] + expected = [ + "num_users", + "num_items", + "num_user_side_features", + "num_item_side_features", + "observation_data_column_names", + "user_side_data_column_names", + "user_side_data_column_types", + "item_side_data_column_names", + "item_side_data_column_types", + ] observed = m._list_fields() for e in expected: assert e in observed def test_matrix_factorization_values(self): - test_vars = [("side_data_factorization", [True, False]), - # ("verbose", [True, False]), - ("binary_target", [True, False]), - ("nmf", [True, False]), - ("random_seed", [0, 5]), - ("solver", ["sgd", "als"])] + test_vars = [ + ("side_data_factorization", [True, False]), + # ("verbose", [True, False]), + ("binary_target", [True, False]), + ("nmf", [True, False]), + ("random_seed", [0, 5]), + ("solver", ["sgd", "als"]), + ] for var, values in test_vars: for v in values: m = tc.factorization_recommender.create( - self.df, 'userID', 'placeID', 'rating', **{var: v}) + self.df, "userID", "placeID", "rating", **{var: v} + ) assert m._get(var) == v def test_ranking_factorization_values(self): - test_vars = [("side_data_factorization", [True, False]), - # ("verbose", [True, False]), # not used - ("binary_target", [True, False]), - ("nmf", [True, False]), - ("random_seed", [0, 5]), - ("solver", ["sgd", "ials"])] + test_vars = [ + ("side_data_factorization", [True, False]), + # ("verbose", [True, False]), # not used + ("binary_target", [True, False]), + ("nmf", [True, False]), + ("random_seed", [0, 5]), + ("solver", ["sgd", "ials"]), + ] for var, values in test_vars: for v in values: m = tc.ranking_factorization_recommender.create( - self.df, 'userID', 'placeID', 'rating', **{var: v}) + self.df, "userID", "placeID", "rating", **{var: v} + ) assert m._get(var) == v @@ -2338,63 +2787,64 @@ def test_retrieve_factors(self): d = m._get("coefficients") - if 'nmf' not in model_name: - assert 'intercept' in d - - assert 'userID' in d - assert 'placeID' in d + if "nmf" not in model_name: + assert "intercept" in d - assert set(d['userID']['userID']) == set(self.df['userID']) - assert set(d['placeID']['placeID']) == set(self.df['placeID']) + assert "userID" in d + assert "placeID" in d - if 'linear_regression' not in model_name: - assert len(d['userID']['factors'][0]) == m._get("num_factors") + assert set(d["userID"]["userID"]) == set(self.df["userID"]) + assert set(d["placeID"]["placeID"]) == set(self.df["placeID"]) - if 'blahID' in d: - assert set(d['blahID']['blahID']) == set(self.user_side['blahID']) + if "linear_regression" not in model_name: + assert len(d["userID"]["factors"][0]) == m._get("num_factors") - if 'blahID2' in d: - assert set(d['blahID2']['blahID2']) == set(self.item_side['blahID2']) + if "blahID" in d: + assert set(d["blahID"]["blahID"]) == set(self.user_side["blahID"]) - if 'blahREAL' in d: - assert list(d['blahREAL']['index']) == [0] + if "blahID2" in d: + assert set(d["blahID2"]["blahID2"]) == set(self.item_side["blahID2"]) - if 'blahREAL2' in d: - assert list(d['blahREAL2']['index']) == [0] + if "blahREAL" in d: + assert list(d["blahREAL"]["index"]) == [0] - if 'blahVECTOR' in d: - assert list(d['blahVECTOR']['index']) == [0, 1] + if "blahREAL2" in d: + assert list(d["blahREAL2"]["index"]) == [0] - if 'blahVECTOR2' in d: - assert list(d['blahVECTOR2']['index']) == [0, 1, 2] + if "blahVECTOR" in d: + assert list(d["blahVECTOR"]["index"]) == [0, 1] - if 'blahDICT' in d: - assert set(d['blahDICT']['blahDICT']) == {'a', 'b'} + if "blahVECTOR2" in d: + assert list(d["blahVECTOR2"]["index"]) == [0, 1, 2] - if 'blahDICT2' in d: - assert set(d['blahDICT2']['blahDICT2']) == {'a', 'b', 'c', None} + if "blahDICT" in d: + assert set(d["blahDICT"]["blahDICT"]) == {"a", "b"} + if "blahDICT2" in d: + assert set(d["blahDICT2"]["blahDICT2"]) == {"a", "b", "c", None} def test_MF_recommend_bug(self): X = tc.SFrame() - X["user"] = list(range(10000)) - X["item"] = [i % 5 for i in range(10000)] + X["user"] = list(range(10000)) + X["item"] = [i % 5 for i in range(10000)] X["rating"] = [float(i) / 10000 for i in range(10000)] m = tc.recommender.factorization_recommender.create( - X, "user", "item", "rating", max_iterations = 1) + X, "user", "item", "rating", max_iterations=1 + ) # sometimes segfaults in gl 1.3.0 m.recommend([10000]) -class TestContentRecommender(RecommenderTestBase): +class TestContentRecommender(RecommenderTestBase): def test_basic(self): - item_data = tc.SFrame({"my_item_id" : range(10), - "data" : [ [1, 0] ]*5 + [ [0, 1] ]*5}) + item_data = tc.SFrame( + {"my_item_id": range(10), "data": [[1, 0]] * 5 + [[0, 1]] * 5} + ) m = tc.recommender.item_content_recommender.create(item_data, "my_item_id") @@ -2402,10 +2852,12 @@ def test_basic(self): self.assertEqual(m._get("num_items"), 10) self.assertEqual(m._get("num_observations"), 0) - new_observation_data = tc.SFrame({"__implicit_user__" : [0]*4, "my_item_id" : range(4)}) + new_observation_data = tc.SFrame( + {"__implicit_user__": [0] * 4, "my_item_id": range(4)} + ) # Test the recommend API in this case. - out = m.recommend([0], k=1, new_observation_data = new_observation_data) + out = m.recommend([0], k=1, new_observation_data=new_observation_data) self.assertEqual(out.column_names()[1], "my_item_id") self.assertEqual(out["my_item_id"].dtype, int) @@ -2415,7 +2867,7 @@ def test_basic(self): self.assertEqual(out[0]["my_item_id"], 4) # Test the recommend_from_interactions. - out_2 = m.recommend_from_interactions(list(range(4)), k = 1) + out_2 = m.recommend_from_interactions(list(range(4)), k=1) self.assertEqual(out_2.column_names()[0], "my_item_id") self.assertEqual(out_2["my_item_id"].dtype, int) @@ -2423,41 +2875,46 @@ def test_basic(self): self.assertEqual(out_2.column_names()[2], "rank") self.assertEqual(out_2[0]["my_item_id"], 4) - self._test_coreml_export(m, [0,1]) - + self._test_coreml_export(m, [0, 1]) def test_weights(self): - item_data = tc.SFrame({"my_item_id" : range(4), - "data_1" : [ [1, 0], [1, 0], [0, 1], [0.5, 0.5] ], - "data_2" : [ [0, 1], [1, 0], [0, 1], [0.5, 0.5] ] }) + item_data = tc.SFrame( + { + "my_item_id": range(4), + "data_1": [[1, 0], [1, 0], [0, 1], [0.5, 0.5]], + "data_2": [[0, 1], [1, 0], [0, 1], [0.5, 0.5]], + } + ) # If the weights are set to auto, then they are currently set # equally. In this case, element 3 will be closer to 0 than 1 or 2. m_1 = tc.recommender.item_content_recommender.create(item_data, "my_item_id") - out_1 = m_1.recommend_from_interactions([0], k = 1) + out_1 = m_1.recommend_from_interactions([0], k=1) self.assertEqual(out_1[0]["my_item_id"], 3) # If the weights are set so that data_1 is 1 and data_2 is 0, # then 1 will be closer to 0 than 2 or 3. - m_2 = tc.recommender.item_content_recommender.create(item_data, "my_item_id", weights = {"data_1" : 1, "data_2" : 0}) - out_2 = m_2.recommend_from_interactions([0], k = 1) + m_2 = tc.recommender.item_content_recommender.create( + item_data, "my_item_id", weights={"data_1": 1, "data_2": 0} + ) + out_2 = m_2.recommend_from_interactions([0], k=1) self.assertEqual(out_2[0]["my_item_id"], 1) # If the weights are set so that data_1 is 0 and data_2 is 1, # then 2 will be closer to 0 than 1 or 3. - m_3 = tc.recommender.item_content_recommender.create(item_data, "my_item_id", weights = {"data_1" : 0, "data_2" : 1}) - out_3 = m_3.recommend_from_interactions([0], k = 1) + m_3 = tc.recommender.item_content_recommender.create( + item_data, "my_item_id", weights={"data_1": 0, "data_2": 1} + ) + out_3 = m_3.recommend_from_interactions([0], k=1) self.assertEqual(out_3[0]["my_item_id"], 2) for m in [m_1, m_2, m_3]: - self._test_coreml_export(m, [0,1]) - + self._test_coreml_export(m, [0, 1]) def test_basic_string_type(self): - item_data = tc.SFrame({"my_item_id" : range(10), - "data" : [ "a" ]*5 + [ "b" ]*5}) + item_data = tc.SFrame({"my_item_id": range(10), "data": ["a"] * 5 + ["b"] * 5}) m = tc.recommender.item_content_recommender.create(item_data, "my_item_id") @@ -2465,10 +2922,12 @@ def test_basic_string_type(self): self.assertEqual(m._get("num_items"), 10) self.assertEqual(m._get("num_observations"), 0) - new_observation_data = tc.SFrame({"__implicit_user__" : [0]*4, "my_item_id" : range(4)}) + new_observation_data = tc.SFrame( + {"__implicit_user__": [0] * 4, "my_item_id": range(4)} + ) # Test the recommend API in this case. - out = m.recommend([0], k=1, new_observation_data = new_observation_data) + out = m.recommend([0], k=1, new_observation_data=new_observation_data) self.assertEqual(out.column_names()[1], "my_item_id") self.assertEqual(out["my_item_id"].dtype, int) @@ -2478,7 +2937,7 @@ def test_basic_string_type(self): self.assertEqual(out[0]["my_item_id"], 4) # Test the recommend_from_interactions. - out_2 = m.recommend_from_interactions(list(range(4)), k = 1) + out_2 = m.recommend_from_interactions(list(range(4)), k=1) self.assertEqual(out_2.column_names()[0], "my_item_id") self.assertEqual(out_2["my_item_id"].dtype, int) @@ -2486,8 +2945,7 @@ def test_basic_string_type(self): self.assertEqual(out_2.column_names()[2], "rank") self.assertEqual(out_2[0]["my_item_id"], 4) - self._test_coreml_export(m, [0,1]) - + self._test_coreml_export(m, [0, 1]) def test_basic_mixed_types(self): @@ -2506,17 +2964,22 @@ def test_basic_mixed_types(self): self.assertEqual(out.num_rows(), 10) - observation_data = tc.SFrame({"users" : list(range(8))*5, - "item_id" : [str(r) for r in list(range(10))*4]}) + observation_data = tc.SFrame( + { + "users": list(range(8)) * 5, + "item_id": [str(r) for r in list(range(10)) * 4], + } + ) - self._test_coreml_export(m, ['0','1']) + self._test_coreml_export(m, ["0", "1"]) m = tc.recommender.item_content_recommender.create( - item_data, "item_id", observation_data, "users") + item_data, "item_id", observation_data, "users" + ) self.assertEqual(m._get("num_users"), 8) self.assertEqual(m._get("num_items"), 50) - self.assertEqual(m._get("num_observations"), 5*8) + self.assertEqual(m._get("num_observations"), 5 * 8) # Test the recommend API in this case. out_2 = m.recommend_from_interactions(["0", "1", "2"], k=10) @@ -2524,13 +2987,15 @@ def test_basic_mixed_types(self): assert_sframe_equal(out, out_2) # Test that it preserves the correct - out_3 = m.recommend([0], k = 10) + out_3 = m.recommend([0], k=10) - user_0_items = set( (observation_data["item_id"])[observation_data["users"] == 0]) + user_0_items = set( + (observation_data["item_id"])[observation_data["users"] == 0] + ) out_3_items = set(out_3["item_id"]) self.assertEqual(len(user_0_items & out_3_items), 0) - self._test_coreml_export(m, ['0','1']) + self._test_coreml_export(m, ["0", "1"]) def test_get_similar_items(self): @@ -2547,23 +3012,23 @@ def test_get_similar_items(self): self.assertEqual(m._get("num_items"), 50) self.assertEqual(m._get("num_observations"), 0) - sim_items = m.get_similar_items([str(i) for i in range(25)], k = 1) + sim_items = m.get_similar_items([str(i) for i in range(25)], k=1) self.assertEqual(sim_items.num_rows(), 25) for d in sim_items: self.assertEqual(int(d["similar"]), int(d["item_id"]) + 25) - self._test_coreml_export(m, ['0','1']) + self._test_coreml_export(m, ["0", "1"]) def test_regression_1(self): - temp_sframe = tc.SFrame({"my_item_id" : range(4), - "data_1" : [0,1,0,0] , - "data_2" : [0,1,0,0] }) - tc.item_content_recommender.create(temp_sframe,'my_item_id') + temp_sframe = tc.SFrame( + {"my_item_id": range(4), "data_1": [0, 1, 0, 0], "data_2": [0, 1, 0, 0]} + ) + tc.item_content_recommender.create(temp_sframe, "my_item_id") -class ItemSimilarityCoreMLExportTest(unittest.TestCase): +class ItemSimilarityCoreMLExportTest(unittest.TestCase): def test_export_model_size(self): # Test that the users are completely dropped. @@ -2578,12 +3043,15 @@ def test_export_model_size(self): Xr = Xr.append(X2) # Train two recommenders, one with 20x the number of users. - m1 = tc.recommender.item_similarity_recommender.create(X, user_id="X1-s", item_id="X2-s") - m2 = tc.recommender.item_similarity_recommender.create(Xr, user_id="X1-s", item_id="X2-s") + m1 = tc.recommender.item_similarity_recommender.create( + X, user_id="X1-s", item_id="X2-s" + ) + m2 = tc.recommender.item_similarity_recommender.create( + Xr, user_id="X1-s", item_id="X2-s" + ) self.assertEqual(m1.num_users, 10) - self.assertEqual(m2.num_users, 20*10) - + self.assertEqual(m2.num_users, 20 * 10) temp_file_path_1 = _mkstemp()[1] temp_file_path_2 = _mkstemp()[1] diff --git a/src/python/turicreate/test/test_recsys_api.py b/src/python/turicreate/test/test_recsys_api.py index 64699437d8..6240ec41aa 100644 --- a/src/python/turicreate/test/test_recsys_api.py +++ b/src/python/turicreate/test/test_recsys_api.py @@ -10,58 +10,62 @@ import sys import turicreate as tc -DELTA = .000001 +DELTA = 0.000001 class AdditionalDataTest(unittest.TestCase): - def setUp(self): data = tc.SFrame() - data['user_id'] = ["a", "b", "b", "c", "c", "c"] - data['item_id'] = ['x', 'x', 'y', 'v', 'w', 'z'] - data['rating'] = [0, 1, 2, 3, 4, 5] + data["user_id"] = ["a", "b", "b", "c", "c", "c"] + data["item_id"] = ["x", "x", "y", "v", "w", "z"] + data["rating"] = [0, 1, 2, 3, 4, 5] # Make internal indices so that we can check predictions/ranking. # IDs are in the order they are seen in the above data SFrame. - user_index = {'a':0, 'b':1, 'c':2} - item_index = {'x':0, 'y':1, 'v':2, 'w':3, 'z':4} + user_index = {"a": 0, "b": 1, "c": 2} + item_index = {"x": 0, "y": 1, "v": 2, "w": 3, "z": 4} user_data = tc.SFrame() - user_data['user_id'] = ['a', 'b'] - user_data['user_feature_value'] = [.5, .9] - user_data['user_dict_value'] = [{1 : .5}, {4 : .9}] - user_data['user_vect_value'] = [[0,1,2], [2,3,4]] - user_data['user_str_dict_value'] = [{"tt" : .5}, {"ttt" : .9}] + user_data["user_id"] = ["a", "b"] + user_data["user_feature_value"] = [0.5, 0.9] + user_data["user_dict_value"] = [{1: 0.5}, {4: 0.9}] + user_data["user_vect_value"] = [[0, 1, 2], [2, 3, 4]] + user_data["user_str_dict_value"] = [{"tt": 0.5}, {"ttt": 0.9}] item_data = tc.SFrame() - item_data['item_id'] = ['x', 'v', 'w', 'y'] - item_data['item_feature_value'] = [-.3, .7, .3, .05] - item_data['item_dict_value'] = [{1 : .5}, {4 : .9}, {4 : .9}, {5 : 1, 6 : 2}] - item_data['item_vect_value'] = [[0,1,2], [2,3,4], [2,3,4], [2,3,5] ] - item_data['item_str_dict_value'] = [{"tt" : .5}, {"tt" : .9}, {"t" : .9}, {"ttt" : .9}] + item_data["item_id"] = ["x", "v", "w", "y"] + item_data["item_feature_value"] = [-0.3, 0.7, 0.3, 0.05] + item_data["item_dict_value"] = [{1: 0.5}, {4: 0.9}, {4: 0.9}, {5: 1, 6: 2}] + item_data["item_vect_value"] = [[0, 1, 2], [2, 3, 4], [2, 3, 4], [2, 3, 5]] + item_data["item_str_dict_value"] = [ + {"tt": 0.5}, + {"tt": 0.9}, + {"t": 0.9}, + {"ttt": 0.9}, + ] new_data = tc.SFrame() - new_data['user_id'] = ['a', 'b'] - new_data['item_id'] = ['v', 'z'] - new_data['rating'] = [7, 8] + new_data["user_id"] = ["a", "b"] + new_data["item_id"] = ["v", "z"] + new_data["rating"] = [7, 8] new_user_data = tc.SFrame() - new_user_data['user_id'] = ['a', 'c'] - new_user_data['user_feature_value'] = [0.0, 2.9] - new_user_data['user_dict_value'] = [{1 : .5}, {4 : .9}] - new_user_data['user_vect_value'] = [[0,1,2], [2,3,4]] - new_user_data['user_str_dict_value'] = [{"tt" : .5}, {"ttt" : .9}] + new_user_data["user_id"] = ["a", "c"] + new_user_data["user_feature_value"] = [0.0, 2.9] + new_user_data["user_dict_value"] = [{1: 0.5}, {4: 0.9}] + new_user_data["user_vect_value"] = [[0, 1, 2], [2, 3, 4]] + new_user_data["user_str_dict_value"] = [{"tt": 0.5}, {"ttt": 0.9}] new_item_data = tc.SFrame() - new_item_data['item_id'] = ['y', 'z'] - new_item_data['item_feature_value'] = [.5, .6] - new_item_data['item_dict_value'] = [{1 : .5}, {4 : .9}] - new_item_data['item_vect_value'] = [[0,1,2], [2,3,4]] - new_item_data['item_str_dict_value'] = [{"tt" : .5}, {"ttt" : .9}] + new_item_data["item_id"] = ["y", "z"] + new_item_data["item_feature_value"] = [0.5, 0.6] + new_item_data["item_dict_value"] = [{1: 0.5}, {4: 0.9}] + new_item_data["item_vect_value"] = [[0, 1, 2], [2, 3, 4]] + new_item_data["item_str_dict_value"] = [{"tt": 0.5}, {"ttt": 0.9}] exclude = tc.SFrame() - exclude['user_id'] = ['a'] - exclude['item_id'] = ['x'] + exclude["user_id"] = ["a"] + exclude["item_id"] = ["x"] - users_all = tc.SArray(['a', 'b', 'c']) - items_all = tc.SArray(['v', 'w', 'x', 'y', 'z']) - items_some = tc.SArray(['v', 'w']) + users_all = tc.SArray(["a", "b", "c"]) + items_all = tc.SArray(["v", "w", "x", "y", "z"]) + items_some = tc.SArray(["v", "w"]) self.data = data self.user_data = user_data @@ -74,23 +78,27 @@ def setUp(self): self.items_all = items_all self.items_some = items_some self.user_index = user_index - self.item_index=item_index + self.item_index = item_index def test_recommender_models(self): data = self.data user_data = self.user_data item_data = self.item_data - for mod in [tc.factorization_recommender, - tc.ranking_factorization_recommender, - tc.popularity_recommender, - tc.item_similarity_recommender]: - - m = mod.create(data, - user_id='user_id', - item_id='item_id', - target='rating', - user_data=user_data, - item_data=item_data) + for mod in [ + tc.factorization_recommender, + tc.ranking_factorization_recommender, + tc.popularity_recommender, + tc.item_similarity_recommender, + ]: + + m = mod.create( + data, + user_id="user_id", + item_id="item_id", + target="rating", + user_data=user_data, + item_data=item_data, + ) assert m is not None self._test_score(m) @@ -111,7 +119,7 @@ def _test_basic(self, m): assert result is not None for field in m._list_fields(): - m._get(field) + m._get(field) def _test_score(self, m): data = self.data @@ -138,14 +146,20 @@ def _test_recommend(self, m): top_k = 5 # Test recommend returns something - recs = m.recommend(users_all, top_k, exclude, items_all, - new_data, new_user_data, new_item_data, - exclude_known=True) + recs = m.recommend( + users_all, + top_k, + exclude, + items_all, + new_data, + new_user_data, + new_item_data, + exclude_known=True, + ) assert recs is not None # Test recommend when no new data is provided - recs = m.recommend(users_all, top_k, exclude, items_all, - exclude_known=True) + recs = m.recommend(users_all, top_k, exclude, items_all, exclude_known=True) assert recs is not None recs2 = m.recommend() @@ -157,27 +171,34 @@ def test_new_side_data_regression(self): user_data = self.user_data item_data = self.item_data - for mod in [tc.recommender.item_similarity_recommender, - tc.recommender.factorization_recommender, - tc.recommender.ranking_factorization_recommender, - tc.recommender.popularity_recommender]: + for mod in [ + tc.recommender.item_similarity_recommender, + tc.recommender.factorization_recommender, + tc.recommender.ranking_factorization_recommender, + tc.recommender.popularity_recommender, + ]: - m = mod.create(data, 'user_id', 'item_id', 'rating') + m = mod.create(data, "user_id", "item_id", "rating") # Make sure it doesn't crash - m.recommend(new_user_data = user_data) - m.recommend(new_item_data = item_data) + m.recommend(new_user_data=user_data) + m.recommend(new_item_data=item_data) def test_kwargs(self): data = self.data - for mod in [tc.recommender.item_similarity_recommender, - tc.recommender.factorization_recommender, - tc.recommender.ranking_factorization_recommender, - tc.recommender.popularity_recommender]: - self.assertRaises(TypeError, lambda: \ - mod.create(data, 'user_id', 'item_id', 'rating', i_want_a_pony = True)) - + for mod in [ + tc.recommender.item_similarity_recommender, + tc.recommender.factorization_recommender, + tc.recommender.ranking_factorization_recommender, + tc.recommender.popularity_recommender, + ]: + self.assertRaises( + TypeError, + lambda: mod.create( + data, "user_id", "item_id", "rating", i_want_a_pony=True + ), + ) def test_side_data_errors(self): @@ -193,6 +214,7 @@ def test_side_data_errors(self): X["rating"] = [12, 13] from copy import copy + X2 = copy(X) # Add in one that overlaps the item hair in the field below @@ -207,14 +229,21 @@ def test_side_data_errors(self): item_side["item_id"] = [1231, 1232] item_side["item_hair"] = ["big", "bigger"] - - for mod in [tc.recommender.item_similarity_recommender, - tc.recommender.factorization_recommender, - tc.recommender.ranking_factorization_recommender, - tc.recommender.popularity_recommender]: - - m = mod.create(X, 'user_id', 'item_id', 'rating', - user_data = user_side, item_data = item_side) + for mod in [ + tc.recommender.item_similarity_recommender, + tc.recommender.factorization_recommender, + tc.recommender.ranking_factorization_recommender, + tc.recommender.popularity_recommender, + ]: + + m = mod.create( + X, + "user_id", + "item_id", + "rating", + user_data=user_side, + item_data=item_side, + ) self.assertRaises(Exception, lambda: m.recommend(X2)) diff --git a/src/python/turicreate/test/test_regression.py b/src/python/turicreate/test/test_regression.py index b8186f8a05..555b066efc 100644 --- a/src/python/turicreate/test/test_regression.py +++ b/src/python/turicreate/test/test_regression.py @@ -19,36 +19,39 @@ class RegressionCreateTest(unittest.TestCase): """ Creation test helper function. """ - def _test_create(self, n, d, validation_set = 'auto'): - # Simulate test data - np.random.seed(42) - sf = tc.SFrame() + def _test_create(self, n, d, validation_set="auto"): - for i in range(d): - sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) + # Simulate test data + np.random.seed(42) + sf = tc.SFrame() - target = np.random.rand(n) - sf['target'] = target - model = tc.regression.create(sf, target = 'target', features =None, - validation_set = validation_set) - self.assertTrue(model is not None) + for i in range(d): + sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) - features = sf.column_names() - features.remove('target') - model = tc.regression.create(sf, target = 'target', features = features, - validation_set = validation_set) - self.assertTrue(model is not None) + target = np.random.rand(n) + sf["target"] = target + model = tc.regression.create( + sf, target="target", features=None, validation_set=validation_set + ) + self.assertTrue(model is not None) + features = sf.column_names() + features.remove("target") + model = tc.regression.create( + sf, target="target", features=features, validation_set=validation_set + ) + self.assertTrue(model is not None) """ Test create. """ + def test_create(self): self._test_create(99, 10) self._test_create(100, 100) self._test_create(20000, 10) - self._test_create(99, 10, validation_set = None) - self._test_create(100, 100, validation_set = None) - self._test_create(20000, 10, validation_set = None) + self._test_create(99, 10, validation_set=None) + self._test_create(100, 100, validation_set=None) + self._test_create(20000, 10, validation_set=None) diff --git a/src/python/turicreate/test/test_sarray.py b/src/python/turicreate/test/test_sarray.py index 02da014625..7f6e54581b 100644 --- a/src/python/turicreate/test/test_sarray.py +++ b/src/python/turicreate/test/test_sarray.py @@ -38,20 +38,35 @@ class SArrayTest(unittest.TestCase): def setUp(self): self.int_data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] self.bool_data = [x % 2 == 0 for x in range(10)] - self.datetime_data = [dt.datetime(2013, 5, 7, 10, 4, 10), - dt.datetime(1902, 10, 21, 10, 34, 10).replace(tzinfo=GMT(0.0)),None] - self.datetime_data2 = [dt.datetime(2013, 5, 7, 10, 4, 10, 109321), - dt.datetime(1902, 10, 21, 10, 34, 10, 991111).replace(tzinfo=GMT(0.0)),None] - self.float_data = [1., 2., 3., 4., 5., 6., 7., 8., 9., 10.] - self.string_data = ["abc", "def", "hello", "world", "pika", "chu", "hello", "world"] - self.vec_data = [array.array('d', [i, i+1]) for i in self.int_data] + self.datetime_data = [ + dt.datetime(2013, 5, 7, 10, 4, 10), + dt.datetime(1902, 10, 21, 10, 34, 10).replace(tzinfo=GMT(0.0)), + None, + ] + self.datetime_data2 = [ + dt.datetime(2013, 5, 7, 10, 4, 10, 109321), + dt.datetime(1902, 10, 21, 10, 34, 10, 991111).replace(tzinfo=GMT(0.0)), + None, + ] + self.float_data = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0] + self.string_data = [ + "abc", + "def", + "hello", + "world", + "pika", + "chu", + "hello", + "world", + ] + self.vec_data = [array.array("d", [i, i + 1]) for i in self.int_data] self.np_array_data = [np.array(x) for x in self.vec_data] self.empty_np_array_data = [np.array([])] self.np_matrix_data = [np.matrix(x) for x in self.vec_data] self.list_data = [[i, str(i), i * 1.0] for i in self.int_data] - self.dict_data = [{str(i): i, i : float(i)} for i in self.int_data] + self.dict_data = [{str(i): i, i: float(i)} for i in self.int_data] # json dict only allows string keys - self.dict_json_data = [{str(i): i} for i in self.int_data] + self.dict_json_data = [{str(i): i} for i in self.int_data] self.url = "http://s3-us-west-2.amazonaws.com/testdatasets/a_to_z.txt.gz" def __test_equal(self, _sarray, _data, _type): @@ -121,94 +136,129 @@ def test_creation(self): self.assertRaises(TypeError, self.__test_creation, [self.string_data, int]) self.assertRaises(TypeError, self.__test_creation, [self.string_data, float]) - expected_output = [chr(x) for x in range(ord('a'), ord('a') + 26)] + expected_output = [chr(x) for x in range(ord("a"), ord("a") + 26)] self.__test_equal(SArray(self.url, str), expected_output, str) self.__test_creation(self.vec_data, array.array, self.vec_data) self.__test_creation(self.np_array_data, np.ndarray, self.np_array_data) - self.__test_creation(self.empty_np_array_data, np.ndarray, - self.empty_np_array_data) + self.__test_creation( + self.empty_np_array_data, np.ndarray, self.empty_np_array_data + ) self.__test_creation(self.np_matrix_data, np.ndarray, self.np_matrix_data) self.__test_creation(self.list_data, list, self.list_data) self.__test_creation(self.dict_data, dict, self.dict_data) # test with map/filter type - self.__test_creation_raw(map(lambda x: x + 10, self.int_data), - int, - [x + 10 for x in self.int_data]) - self.__test_creation_raw(map(lambda x: x * 10, self.int_data), - float, - [float(x) * 10 for x in self.int_data]) - self.__test_creation_raw(map(lambda x: x * 10, self.string_data), - str, - [x * 10 for x in self.string_data]) - - self.__test_creation_raw(filter(lambda x: x < 5, self.int_data), - int, - list(filter(lambda x: x < 5, self.int_data))) - self.__test_creation_raw(filter(lambda x: x > 5, self.float_data), - float, - list(filter(lambda x: x > 5, self.float_data))) - self.__test_creation_raw(filter(lambda x: len(x) > 3, self.string_data), - str, - list(filter(lambda x: len(x) > 3, self.string_data))) - - self.__test_creation_pd(map(lambda x: x + 10, self.int_data), - int, - [x + 10 for x in self.int_data]) - self.__test_creation_pd(map(lambda x: x * 10, self.int_data), - float, - [float(x) * 10 for x in self.int_data]) - self.__test_creation_pd(map(lambda x: x * 10, self.string_data), - str, - [x * 10 for x in self.string_data]) + self.__test_creation_raw( + map(lambda x: x + 10, self.int_data), int, [x + 10 for x in self.int_data] + ) + self.__test_creation_raw( + map(lambda x: x * 10, self.int_data), + float, + [float(x) * 10 for x in self.int_data], + ) + self.__test_creation_raw( + map(lambda x: x * 10, self.string_data), + str, + [x * 10 for x in self.string_data], + ) + + self.__test_creation_raw( + filter(lambda x: x < 5, self.int_data), + int, + list(filter(lambda x: x < 5, self.int_data)), + ) + self.__test_creation_raw( + filter(lambda x: x > 5, self.float_data), + float, + list(filter(lambda x: x > 5, self.float_data)), + ) + self.__test_creation_raw( + filter(lambda x: len(x) > 3, self.string_data), + str, + list(filter(lambda x: len(x) > 3, self.string_data)), + ) + + self.__test_creation_pd( + map(lambda x: x + 10, self.int_data), int, [x + 10 for x in self.int_data] + ) + self.__test_creation_pd( + map(lambda x: x * 10, self.int_data), + float, + [float(x) * 10 for x in self.int_data], + ) + self.__test_creation_pd( + map(lambda x: x * 10, self.string_data), + str, + [x * 10 for x in self.string_data], + ) # test with type inference self.__test_creation_type_inference(self.int_data, int, self.int_data) self.__test_creation_type_inference(self.float_data, float, self.float_data) - self.__test_creation_type_inference(self.bool_data, int, [int(x) for x in self.bool_data]) + self.__test_creation_type_inference( + self.bool_data, int, [int(x) for x in self.bool_data] + ) self.__test_creation_type_inference(self.string_data, str, self.string_data) self.__test_creation_type_inference(self.vec_data, array.array, self.vec_data) - self.__test_creation_type_inference(self.np_array_data, np.ndarray, - self.np_array_data) - self.__test_creation_type_inference(self.empty_np_array_data, - np.ndarray, - self.empty_np_array_data) - self.__test_creation_type_inference(self.np_matrix_data, np.ndarray, - self.np_matrix_data) - self.__test_creation_type_inference([np.bool_(True),np.bool_(False)],int,[1,0]) - self.__test_creation((1,2,3,4), int, [1,2,3,4]) - - self.__test_creation_type_inference_raw(map(lambda x: x + 10, self.int_data), - int, - [x + 10 for x in self.int_data]) - self.__test_creation_type_inference_raw(map(lambda x: x * 10, self.float_data), - float, - [x * 10 for x in self.float_data]) - self.__test_creation_type_inference_raw(map(lambda x: x * 10, self.string_data), - str, - [x * 10 for x in self.string_data]) - - self.__test_creation_type_inference_pd(map(lambda x: x + 10, self.int_data), - int, - [x + 10 for x in self.int_data]) - self.__test_creation_type_inference_pd(map(lambda x: x * 10, self.float_data), - float, - [float(x) * 10 for x in self.float_data]) - self.__test_creation_type_inference_pd(map(lambda x: x * 10, self.string_data), - str, - [x * 10 for x in self.string_data]) - - self.__test_creation_type_inference_raw(filter(lambda x: x < 5, self.int_data), - int, - list(filter(lambda x: x < 5, self.int_data))) - self.__test_creation_type_inference_raw(filter(lambda x: x > 5, self.float_data), - float, - list(filter(lambda x: x > 5, self.float_data))) - self.__test_creation_type_inference_raw(filter(lambda x: len(x) > 3, self.string_data), - str, - list(filter(lambda x: len(x) > 3, self.string_data))) + self.__test_creation_type_inference( + self.np_array_data, np.ndarray, self.np_array_data + ) + self.__test_creation_type_inference( + self.empty_np_array_data, np.ndarray, self.empty_np_array_data + ) + self.__test_creation_type_inference( + self.np_matrix_data, np.ndarray, self.np_matrix_data + ) + self.__test_creation_type_inference( + [np.bool_(True), np.bool_(False)], int, [1, 0] + ) + self.__test_creation((1, 2, 3, 4), int, [1, 2, 3, 4]) + + self.__test_creation_type_inference_raw( + map(lambda x: x + 10, self.int_data), int, [x + 10 for x in self.int_data] + ) + self.__test_creation_type_inference_raw( + map(lambda x: x * 10, self.float_data), + float, + [x * 10 for x in self.float_data], + ) + self.__test_creation_type_inference_raw( + map(lambda x: x * 10, self.string_data), + str, + [x * 10 for x in self.string_data], + ) + + self.__test_creation_type_inference_pd( + map(lambda x: x + 10, self.int_data), int, [x + 10 for x in self.int_data] + ) + self.__test_creation_type_inference_pd( + map(lambda x: x * 10, self.float_data), + float, + [float(x) * 10 for x in self.float_data], + ) + self.__test_creation_type_inference_pd( + map(lambda x: x * 10, self.string_data), + str, + [x * 10 for x in self.string_data], + ) + + self.__test_creation_type_inference_raw( + filter(lambda x: x < 5, self.int_data), + int, + list(filter(lambda x: x < 5, self.int_data)), + ) + self.__test_creation_type_inference_raw( + filter(lambda x: x > 5, self.float_data), + float, + list(filter(lambda x: x > 5, self.float_data)), + ) + self.__test_creation_type_inference_raw( + filter(lambda x: len(x) > 3, self.string_data), + str, + list(filter(lambda x: len(x) > 3, self.string_data)), + ) # genertors def __generator_parrot(data): @@ -216,79 +266,100 @@ def __generator_parrot(data): yield ii self.__test_creation_raw(__generator_parrot(self.int_data), int, self.int_data) - self.__test_creation_raw(__generator_parrot(self.float_data), float, self.float_data) - self.__test_creation_raw(__generator_parrot(self.string_data), str, self.string_data) + self.__test_creation_raw( + __generator_parrot(self.float_data), float, self.float_data + ) + self.__test_creation_raw( + __generator_parrot(self.string_data), str, self.string_data + ) self.__test_creation_pd(__generator_parrot(self.int_data), int, self.int_data) - self.__test_creation_pd(__generator_parrot(self.float_data), float, self.float_data) - self.__test_creation_pd(__generator_parrot(self.string_data), str, self.string_data) + self.__test_creation_pd( + __generator_parrot(self.float_data), float, self.float_data + ) + self.__test_creation_pd( + __generator_parrot(self.string_data), str, self.string_data + ) - self.__test_creation_type_inference_raw(__generator_parrot(self.int_data), int, self.int_data) - self.__test_creation_type_inference_raw(__generator_parrot(self.float_data), float, self.float_data) - self.__test_creation_type_inference_raw(__generator_parrot(self.string_data), str, self.string_data) + self.__test_creation_type_inference_raw( + __generator_parrot(self.int_data), int, self.int_data + ) + self.__test_creation_type_inference_raw( + __generator_parrot(self.float_data), float, self.float_data + ) + self.__test_creation_type_inference_raw( + __generator_parrot(self.string_data), str, self.string_data + ) - self.__test_creation_type_inference_pd(__generator_parrot(self.int_data), int, self.int_data) - self.__test_creation_type_inference_pd(__generator_parrot(self.float_data), float, self.float_data) - self.__test_creation_type_inference_pd(__generator_parrot(self.string_data), str, self.string_data) + self.__test_creation_type_inference_pd( + __generator_parrot(self.int_data), int, self.int_data + ) + self.__test_creation_type_inference_pd( + __generator_parrot(self.float_data), float, self.float_data + ) + self.__test_creation_type_inference_pd( + __generator_parrot(self.string_data), str, self.string_data + ) # Test numpy types, which are not compatible with the pd.Series path in # __test_creation and __test_creation_type_inference - self.__test_equal(SArray(np.array(self.vec_data), array.array), - self.vec_data, array.array) - self.__test_equal(SArray(np.matrix(self.vec_data), array.array), - self.vec_data, array.array) - self.__test_equal(SArray(np.array(self.vec_data)), - self.vec_data, array.array) - self.__test_equal(SArray(np.matrix(self.vec_data)), - self.vec_data, array.array) + self.__test_equal( + SArray(np.array(self.vec_data), array.array), self.vec_data, array.array + ) + self.__test_equal( + SArray(np.matrix(self.vec_data), array.array), self.vec_data, array.array + ) + self.__test_equal(SArray(np.array(self.vec_data)), self.vec_data, array.array) + self.__test_equal(SArray(np.matrix(self.vec_data)), self.vec_data, array.array) # Test python 3 - self.__test_equal(SArray(filter(lambda x: True, self.int_data)), self.int_data, int) + self.__test_equal( + SArray(filter(lambda x: True, self.int_data)), self.int_data, int + ) self.__test_equal(SArray(map(lambda x: x, self.int_data)), self.int_data, int) def test_list_with_none_creation(self): - tlist=[[2,3,4],[5,6],[4,5,10,None]] - g=SArray(tlist) + tlist = [[2, 3, 4], [5, 6], [4, 5, 10, None]] + g = SArray(tlist) self.assertEqual(len(g), len(tlist)) for i in range(len(tlist)): - self.assertEqual(g[i], tlist[i]) + self.assertEqual(g[i], tlist[i]) def test_append_empty_sarray(self): - existing_sa = SArray([1,2,3]) + existing_sa = SArray([1, 2, 3]) new_sa = SArray() - existing_sa=existing_sa.append(new_sa) - self.assertTrue(int,existing_sa.dtype) + existing_sa = existing_sa.append(new_sa) + self.assertTrue(int, existing_sa.dtype) def test_append_sarray_to_empty_sarray(self): existing_sa = SArray() - new_sa = SArray([1,2,3]) - existing_sa=existing_sa.append(new_sa) - self.assertTrue(int,existing_sa.dtype) - + new_sa = SArray([1, 2, 3]) + existing_sa = existing_sa.append(new_sa) + self.assertTrue(int, existing_sa.dtype) + def test_nan_to_none_type_conversion(self): - sa=SArray([1.0,2.0,float('nan'),3.0]) - sa=sa.astype(int) + sa = SArray([1.0, 2.0, float("nan"), 3.0]) + sa = sa.astype(int) self.assertTrue(None in sa) - def test_list_with_array_creation(self): import array - t = array.array('d',[1.1,2,3,4,5.5]) - g=SArray(t) + + t = array.array("d", [1.1, 2, 3, 4, 5.5]) + g = SArray(t) self.assertEqual(len(g), len(t)) self.assertEqual(g.dtype, float) glist = list(g) for i in range(len(glist)): - self.assertAlmostEqual(glist[i], t[i]) + self.assertAlmostEqual(glist[i], t[i]) - t = array.array('i',[1,2,3,4,5]) - g=SArray(t) + t = array.array("i", [1, 2, 3, 4, 5]) + g = SArray(t) self.assertEqual(len(g), len(t)) self.assertEqual(g.dtype, int) glist = list(g) for i in range(len(glist)): - self.assertEqual(glist[i], t[i]) - + self.assertEqual(glist[i], t[i]) def test_in(self): sint = SArray(self.int_data, int) @@ -298,33 +369,36 @@ def test_in(self): self.assertTrue("abc" in sstr) self.assertFalse("zzzzzz" in sstr) self.assertFalse("" in sstr) - self.__test_equal(sstr.contains("ll"), ["ll" in i for i in self.string_data], int) + self.__test_equal( + sstr.contains("ll"), ["ll" in i for i in self.string_data], int + ) self.__test_equal(sstr.contains("a"), ["a" in i for i in self.string_data], int) - svec = SArray([[1.0,2.0],[2.0,3.0],[3.0,4.0],[4.0,5.0]], array.array) - self.__test_equal(svec.contains(1.0), [1,0,0,0], int) - self.__test_equal(svec.contains(0.0), [0,0,0,0], int) - self.__test_equal(svec.contains(2), [1,1,0,0], int) - - slist = SArray([[1,"22"],[2,"33"],[3,"44"],[4,None]], list) - self.__test_equal(slist.contains(1.0), [1,0,0,0], int) - self.__test_equal(slist.contains(3), [0,0,1,0], int) - self.__test_equal(slist.contains("33"), [0,1,0,0], int) - self.__test_equal(slist.contains("3"), [0,0,0,0], int) - self.__test_equal(slist.contains(None), [0,0,0,1], int) - - sdict = SArray([{1:"2"},{2:"3"},{3:"4"},{"4":"5"}], dict) - self.__test_equal(sdict.contains(1.0), [1,0,0,0], int) - self.__test_equal(sdict.contains(3), [0,0,1,0], int) - self.__test_equal(sdict.contains("4"), [0,0,0,1], int) - self.__test_equal(sdict.contains("3"), [0,0,0,0], int) - - - self.__test_equal(SArray(['ab','bc','cd']).is_in('abc'), [1,1,0], int) - self.__test_equal(SArray(['a','b','c']).is_in(['a','b']), [1,1,0], int) - self.__test_equal(SArray([1,2,3]).is_in(array.array('d',[1.0,2.0])), [1,1,0], int) - self.__test_equal(SArray([1,2,None]).is_in([1, None]), [1,0,1], int) - self.__test_equal(SArray([1,2,None]).is_in([1]), [1,0,0], int) + svec = SArray([[1.0, 2.0], [2.0, 3.0], [3.0, 4.0], [4.0, 5.0]], array.array) + self.__test_equal(svec.contains(1.0), [1, 0, 0, 0], int) + self.__test_equal(svec.contains(0.0), [0, 0, 0, 0], int) + self.__test_equal(svec.contains(2), [1, 1, 0, 0], int) + + slist = SArray([[1, "22"], [2, "33"], [3, "44"], [4, None]], list) + self.__test_equal(slist.contains(1.0), [1, 0, 0, 0], int) + self.__test_equal(slist.contains(3), [0, 0, 1, 0], int) + self.__test_equal(slist.contains("33"), [0, 1, 0, 0], int) + self.__test_equal(slist.contains("3"), [0, 0, 0, 0], int) + self.__test_equal(slist.contains(None), [0, 0, 0, 1], int) + + sdict = SArray([{1: "2"}, {2: "3"}, {3: "4"}, {"4": "5"}], dict) + self.__test_equal(sdict.contains(1.0), [1, 0, 0, 0], int) + self.__test_equal(sdict.contains(3), [0, 0, 1, 0], int) + self.__test_equal(sdict.contains("4"), [0, 0, 0, 1], int) + self.__test_equal(sdict.contains("3"), [0, 0, 0, 0], int) + + self.__test_equal(SArray(["ab", "bc", "cd"]).is_in("abc"), [1, 1, 0], int) + self.__test_equal(SArray(["a", "b", "c"]).is_in(["a", "b"]), [1, 1, 0], int) + self.__test_equal( + SArray([1, 2, 3]).is_in(array.array("d", [1.0, 2.0])), [1, 1, 0], int + ) + self.__test_equal(SArray([1, 2, None]).is_in([1, None]), [1, 0, 1], int) + self.__test_equal(SArray([1, 2, None]).is_in([1]), [1, 0, 0], int) def test_save_load(self): @@ -350,20 +424,20 @@ def test_save_load(self): slist = SArray(self.list_data, list) sdict = SArray(self.dict_data, dict) - sint.save('intarr.sidx') - sflt.save('fltarr.sidx') - sstr.save('strarr.sidx') - svec.save('vecarr.sidx') - slist.save('listarr.sidx') - sdict.save('dictarr.sidx') - - sint2 = SArray('intarr.sidx') - sflt2 = SArray('fltarr.sidx') - sstr2 = SArray('strarr.sidx') - svec2 = SArray('vecarr.sidx') - slist2 = SArray('listarr.sidx') - sdict2 = SArray('dictarr.sidx') - self.assertRaises(IOError, lambda: SArray('__no_such_file__.sidx')) + sint.save("intarr.sidx") + sflt.save("fltarr.sidx") + sstr.save("strarr.sidx") + svec.save("vecarr.sidx") + slist.save("listarr.sidx") + sdict.save("dictarr.sidx") + + sint2 = SArray("intarr.sidx") + sflt2 = SArray("fltarr.sidx") + sstr2 = SArray("strarr.sidx") + svec2 = SArray("vecarr.sidx") + slist2 = SArray("listarr.sidx") + sdict2 = SArray("dictarr.sidx") + self.assertRaises(IOError, lambda: SArray("__no_such_file__.sidx")) self.__test_equal(sint2, self.int_data, int) self.__test_equal(sflt2, [float(x) for x in self.int_data], float) @@ -372,7 +446,7 @@ def test_save_load(self): self.__test_equal(slist2, self.list_data, list) self.__test_equal(sdict2, self.dict_data, dict) - #cleanup + # cleanup del sint2 del sflt2 del sstr2 @@ -387,35 +461,41 @@ def test_save_load(self): self._remove_sarray_files("dictarr") def test_save_load_text(self): - self._remove_single_file('txt_int_arr.txt') + self._remove_single_file("txt_int_arr.txt") sint = SArray(self.int_data, int) - sint.save('txt_int_arr.txt') - self.assertTrue(os.path.exists('txt_int_arr.txt')) - f = open('txt_int_arr.txt') + sint.save("txt_int_arr.txt") + self.assertTrue(os.path.exists("txt_int_arr.txt")) + f = open("txt_int_arr.txt") lines = f.readlines() for i in range(len(sint)): self.assertEqual(int(lines[i]), sint[i]) - self._remove_single_file('txt_int_arr.txt') + self._remove_single_file("txt_int_arr.txt") - self._remove_single_file('txt_int_arr') - sint.save('txt_int_arr', format='text') - self.assertTrue(os.path.exists('txt_int_arr')) - f = open('txt_int_arr') + self._remove_single_file("txt_int_arr") + sint.save("txt_int_arr", format="text") + self.assertTrue(os.path.exists("txt_int_arr")) + f = open("txt_int_arr") lines = f.readlines() for i in range(len(sint)): self.assertEqual(int(lines[i]), sint[i]) - self._remove_single_file('txt_int_arr') - + self._remove_single_file("txt_int_arr") + def test_read_json(self): # boolean type will be read in as int - data_pairs = [('int_data', int), ('bool_data', int), ('float_data', float), - ('string_data', str), ('list_data', list), ('dict_json_data', dict)] + data_pairs = [ + ("int_data", int), + ("bool_data", int), + ("float_data", float), + ("string_data", str), + ("list_data", list), + ("dict_json_data", dict), + ] for attr, data_type in data_pairs: - filename = attr + '.json' + filename = attr + ".json" self._remove_single_file(filename) data = getattr(self, attr) - with open(filename, 'w') as f: + with open(filename, "w") as f: json.dump(data, f) read_sarray = SArray.read_json(filename) @@ -425,17 +505,16 @@ def test_read_json(self): def test_read_json_infer_type(self): data = [None, 1, 2, None, 3.0, 4, 5.0, 6, None] converted_data = [float(i) if i is not None else i for i in data] - filename = 'read_json_infer_type.json' + filename = "read_json_infer_type.json" self._remove_single_file(filename) - with open(filename, 'w') as f: + with open(filename, "w") as f: json.dump(data, f) read_sarray = SArray.read_json(filename) self.__test_equal(read_sarray, converted_data, float) self._remove_single_file(filename) - def _remove_single_file(self, filename): try: os.remove(filename) @@ -443,7 +522,7 @@ def _remove_single_file(self, filename): pass def _remove_sarray_files(self, prefix): - filelist = [ f for f in os.listdir(".") if f.startswith(prefix) ] + filelist = [f for f in os.listdir(".") if f.startswith(prefix)] for f in filelist: shutil.rmtree(f) @@ -451,47 +530,55 @@ def test_transform(self): sa_char = SArray(self.url, str) sa_int = sa_char.apply(lambda char: ord(char), int) - expected_output = [x for x in range(ord('a'), ord('a') + 26)] + expected_output = [x for x in range(ord("a"), ord("a") + 26)] self.__test_equal(sa_int, expected_output, int) # Test randomness across segments, randomized sarray should have different elements. - sa_random = SArray(range(0, 16), int).apply(lambda x: random.randint(0, 1000), int) + sa_random = SArray(range(0, 16), int).apply( + lambda x: random.randint(0, 1000), int + ) vec = list(sa_random.head(len(sa_random))) self.assertFalse(all([x == vec[0] for x in vec])) # test transform with missing values - sa = SArray([1,2,3,None,4,5]) - sa1 = sa.apply(lambda x : x + 1) - self.__test_equal(sa1, [2,3,4,None,5,6], int) + sa = SArray([1, 2, 3, None, 4, 5]) + sa1 = sa.apply(lambda x: x + 1) + self.__test_equal(sa1, [2, 3, 4, None, 5, 6], int) def test_transform_with_multiple_lambda(self): sa_char = SArray(self.url, str) sa_int = sa_char.apply(lambda char: ord(char), int) sa2_int = sa_int.apply(lambda val: val + 1, int) - expected_output = [x for x in range(ord('a') + 1, ord('a') + 26 + 1)] + expected_output = [x for x in range(ord("a") + 1, ord("a") + 26 + 1)] self.__test_equal(sa2_int, expected_output, int) def test_transform_with_exception(self): - sa_char = SArray(['a' for i in range(10000)], str) + sa_char = SArray(["a" for i in range(10000)], str) # # type mismatch exception - self.assertRaises(TypeError, lambda: sa_char.apply(lambda char: char, int).head(1)) + self.assertRaises( + TypeError, lambda: sa_char.apply(lambda char: char, int).head(1) + ) # # divide by 0 exception - self.assertRaises(ZeroDivisionError, lambda: sa_char.apply(lambda char: ord(char) / 0, float)) + self.assertRaises( + ZeroDivisionError, lambda: sa_char.apply(lambda char: ord(char) / 0, float) + ) def test_transform_with_type_inference(self): sa_char = SArray(self.url, str) sa_int = sa_char.apply(lambda char: ord(char)) - expected_output = [x for x in range(ord('a'), ord('a') + 26)] + expected_output = [x for x in range(ord("a"), ord("a") + 26)] self.__test_equal(sa_int, expected_output, int) - sa_bool = sa_char.apply(lambda char: ord(char) > ord('c')) - expected_output = [int(x > ord('c')) for x in range(ord('a'), ord('a') + 26)] + sa_bool = sa_char.apply(lambda char: ord(char) > ord("c")) + expected_output = [int(x > ord("c")) for x in range(ord("a"), ord("a") + 26)] self.__test_equal(sa_bool, expected_output, int) # # divide by 0 exception - self.assertRaises(ZeroDivisionError, lambda: sa_char.apply(lambda char: ord(char) / 0)) + self.assertRaises( + ZeroDivisionError, lambda: sa_char.apply(lambda char: ord(char) / 0) + ) # Test randomness across segments, randomized sarray should have different elements. sa_random = SArray(range(0, 16), int).apply(lambda x: random.randint(0, 1000)) @@ -499,8 +586,8 @@ def test_transform_with_type_inference(self): self.assertFalse(all([x == vec[0] for x in vec])) def test_transform_on_lists(self): - sa_int = SArray(self.int_data, int) - sa_vec2 = sa_int.apply(lambda x: [x, x+1, str(x)]) + sa_int = SArray(self.int_data, int) + sa_vec2 = sa_int.apply(lambda x: [x, x + 1, str(x)]) expected = [[i, i + 1, str(i)] for i in self.int_data] self.__test_equal(sa_vec2, expected, list) sa_int_again = sa_vec2.apply(lambda x: int(x[0])) @@ -519,31 +606,35 @@ def test_transform_on_lists(self): # transform dict to list sa_dict = SArray(self.dict_data, dict) # Python 3 doesn't return keys in same order from identical dictionaries. - sort_by_type = lambda x : str(type(x)) - sa_list = sa_dict.apply(lambda x: sorted(list(x), key = sort_by_type)) - self.__test_equal(sa_list, [sorted(list(x), key = sort_by_type) for x in self.dict_data], list) + sort_by_type = lambda x: str(type(x)) + sa_list = sa_dict.apply(lambda x: sorted(list(x), key=sort_by_type)) + self.__test_equal( + sa_list, [sorted(list(x), key=sort_by_type) for x in self.dict_data], list + ) def test_transform_dict(self): # lambda accesses dict - sa_dict = SArray([{'a':1}, {1:2}, {'c': 'a'}, None], dict) - sa_bool_r = sa_dict.apply(lambda x: 'a' in x if x is not None else None, skip_na=False) + sa_dict = SArray([{"a": 1}, {1: 2}, {"c": "a"}, None], dict) + sa_bool_r = sa_dict.apply( + lambda x: "a" in x if x is not None else None, skip_na=False + ) expected_output = [1, 0, 0, None] self.__test_equal(sa_bool_r, expected_output, int) # lambda returns dict - expected_output = [{'a':1}, {1:2}, None, {'c': 'a'}] + expected_output = [{"a": 1}, {1: 2}, None, {"c": "a"}] sa_dict = SArray(expected_output, dict) lambda_out = sa_dict.apply(lambda x: x) self.__test_equal(lambda_out, expected_output, dict) def test_filter_dict(self): - expected_output = [{'a':1}] + expected_output = [{"a": 1}] sa_dict = SArray(expected_output, dict) - ret = sa_dict.filter(lambda x: 'a' in x) + ret = sa_dict.filter(lambda x: "a" in x) self.__test_equal(ret, expected_output, dict) # try second time to make sure the lambda system still works - expected_output = [{1:2}] + expected_output = [{1: 2}] sa_dict = SArray(expected_output, dict) lambda_out = sa_dict.filter(lambda x: 1 in x) self.__test_equal(lambda_out, expected_output, dict) @@ -551,28 +642,30 @@ def test_filter_dict(self): def test_filter(self): # test empty s = SArray([], float) - no_change = s.filter(lambda x : x == 0) + no_change = s.filter(lambda x: x == 0) self.assertEqual(len(no_change), 0) # test normal case s = SArray(self.int_data, int) middle_of_array = s.filter(lambda x: x > 3 and x < 8) - self.assertEqual(list(middle_of_array.head(10)), [x for x in range(4,8)]) + self.assertEqual(list(middle_of_array.head(10)), [x for x in range(4, 8)]) # test normal string case s = SArray(self.string_data, str) - exp_val_list = [x for x in self.string_data if x != 'world'] + exp_val_list = [x for x in self.string_data if x != "world"] # Remove all words whose second letter is not in the first half of the alphabet - second_letter = s.filter(lambda x: len(x) > 1 and (ord(x[1]) > ord('a')) and (ord(x[1]) < ord('n'))) + second_letter = s.filter( + lambda x: len(x) > 1 and (ord(x[1]) > ord("a")) and (ord(x[1]) < ord("n")) + ) self.assertEqual(list(second_letter.head(10)), exp_val_list) # test not-a-lambda def a_filter_func(x): - return ((x > 4.4) and (x < 6.8)) + return (x > 4.4) and (x < 6.8) s = SArray(self.int_data, float) another = s.filter(a_filter_func) - self.assertEqual(list(another.head(10)), [5.,6.]) + self.assertEqual(list(another.head(10)), [5.0, 6.0]) sa = SArray(self.float_data) @@ -581,21 +674,20 @@ def a_filter_func(x): self.assertEqual(list(sa.head(10)), list(sa2.head(10))) # filter by zeros - sa_filter = SArray([0,0,0,0,0,0,0,0,0,0]) + sa_filter = SArray([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) sa2 = sa[sa_filter] self.assertEqual(len(sa2), 0) # filter by wrong size - sa_filter = SArray([0,2,5]) + sa_filter = SArray([0, 2, 5]) with self.assertRaises(IndexError): sa2 = sa[sa_filter] - def test_any_all(self): - s = SArray([0,1,2,3,4,5,6,7,8,9], int) + s = SArray([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], int) self.assertEqual(s.any(), True) self.assertEqual(s.all(), False) - s = SArray([0,0,0,0,0], int) + s = SArray([0, 0, 0, 0, 0], int) self.assertEqual(s.any(), False) self.assertEqual(s.all(), False) @@ -616,7 +708,7 @@ def test_any_all(self): self.assertEqual(s.any(), False) self.assertEqual(s.all(), False) - s = SArray([[],[1.0]], array.array) + s = SArray([[], [1.0]], array.array) self.assertEqual(s.any(), True) self.assertEqual(s.all(), False) @@ -627,14 +719,16 @@ def test_astype(self): self.assertEqual(as_out.dtype, float) # test float -> int - s = SArray(list(map(lambda x: x+0.2, self.float_data)), float) + s = SArray(list(map(lambda x: x + 0.2, self.float_data)), float) as_out = s.astype(int) self.assertEqual(list(as_out.head(10)), self.int_data) # test int->string s = SArray(self.int_data, int) as_out = s.astype(str) - self.assertEqual(list(as_out.head(10)), list(map(lambda x: str(x), self.int_data))) + self.assertEqual( + list(as_out.head(10)), list(map(lambda x: str(x), self.int_data)) + ) i_out = as_out.astype(int) self.assertEqual(list(i_out.head(10)), list(s.head(10))) @@ -646,23 +740,23 @@ def test_astype(self): with self.assertRaises(RuntimeError): s.astype(float) - s = SArray(["a","1","2","3"]) + s = SArray(["a", "1", "2", "3"]) with self.assertRaises(RuntimeError): s.astype(int) - self.assertEqual(list(s.astype(int,True).head(4)), [None,1,2,3]) + self.assertEqual(list(s.astype(int, True).head(4)), [None, 1, 2, 3]) - s = SArray(["[1 2 3]","[4;5]"]) + s = SArray(["[1 2 3]", "[4;5]"]) ret = list(s.astype(array.array).head(2)) - self.assertEqual(ret, [array.array('d',[1,2,3]),array.array('d',[4,5])]) + self.assertEqual(ret, [array.array("d", [1, 2, 3]), array.array("d", [4, 5])]) - s = SArray(["[1,\"b\",3]","[4,5]"]) + s = SArray(['[1,"b",3]', "[4,5]"]) ret = list(s.astype(list).head(2)) - self.assertEqual(ret, [[1,"b",3],[4,5]]) + self.assertEqual(ret, [[1, "b", 3], [4, 5]]) - s = SArray(["{\"a\":2,\"b\":3}","{}"]) + s = SArray(['{"a":2,"b":3}', "{}"]) ret = list(s.astype(dict).head(2)) - self.assertEqual(ret, [{"a":2,"b":3},{}]) + self.assertEqual(ret, [{"a": 2, "b": 3}, {}]) s = SArray(["[1abc]"]) ret = list(s.astype(list).head(1)) @@ -670,18 +764,22 @@ def test_astype(self): s = SArray(["{1xyz:1a,2b:2}"]) ret = list(s.astype(dict).head(1)) - self.assertEqual(ret, [{"1xyz":"1a","2b":2}]) + self.assertEqual(ret, [{"1xyz": "1a", "2b": 2}]) # astype between list and array - s = SArray([array.array('d',[1.0,2.0]), array.array('d',[2.0,3.0])]) + s = SArray([array.array("d", [1.0, 2.0]), array.array("d", [2.0, 3.0])]) ret = list(s.astype(list)) - self.assertEqual(ret, [[1.0, 2.0], [2.0,3.0]]) + self.assertEqual(ret, [[1.0, 2.0], [2.0, 3.0]]) ret = list(s.astype(list).astype(array.array)) self.assertEqual(list(s), list(ret)) with self.assertRaises(RuntimeError): - ret = list(SArray([["a",1.0],["b",2.0]]).astype(array.array)) + ret = list(SArray([["a", 1.0], ["b", 2.0]]).astype(array.array)) - badcast = list(SArray([["a",1.0],["b",2.0]]).astype(array.array, undefined_on_failure=True)) + badcast = list( + SArray([["a", 1.0], ["b", 2.0]]).astype( + array.array, undefined_on_failure=True + ) + ) self.assertEqual(badcast, [None, None]) with self.assertRaises(TypeError): @@ -691,7 +789,7 @@ def test_clip(self): # invalid types s = SArray(self.string_data, str) with self.assertRaises(RuntimeError): - s.clip(25,26) + s.clip(25, 26) with self.assertRaises(RuntimeError): s.clip_lower(25) with self.assertRaises(RuntimeError): @@ -700,7 +798,7 @@ def test_clip(self): # int w/ int, test lower and upper functions too # int w/float, no change s = SArray(self.int_data, int) - clip_out = s.clip(3,7).head(10) + clip_out = s.clip(3, 7).head(10) # test that our list isn't cast to float if nothing happened clip_out_nc = s.clip(0.2, 10.2).head(10) lclip_out = s.clip_lower(3).head(10) @@ -708,7 +806,7 @@ def test_clip(self): self.assertEqual(len(clip_out), len(self.int_data)) self.assertEqual(len(lclip_out), len(self.int_data)) self.assertEqual(len(rclip_out), len(self.int_data)) - for i in range(0,len(clip_out)): + for i in range(0, len(clip_out)): if i < 2: self.assertEqual(clip_out[i], 3) self.assertEqual(lclip_out[i], 3) @@ -730,15 +828,15 @@ def test_clip(self): fs = SArray(self.float_data, float) ficlip_out = fs.clip(3, 7).head(10) ffclip_out = fs.clip(2.8, 7.2).head(10) - for i in range(0,len(clip_out)): + for i in range(0, len(clip_out)): if i < 2: self.assertAlmostEqual(clip_out[i], 2.8) self.assertAlmostEqual(ffclip_out[i], 2.8) - self.assertAlmostEqual(ficlip_out[i], 3.) + self.assertAlmostEqual(ficlip_out[i], 3.0) elif i > 6: self.assertAlmostEqual(clip_out[i], 7.2) self.assertAlmostEqual(ffclip_out[i], 7.2) - self.assertAlmostEqual(ficlip_out[i], 7.) + self.assertAlmostEqual(ficlip_out[i], 7.0) else: self.assertAlmostEqual(clip_out[i], self.float_data[i]) self.assertAlmostEqual(ffclip_out[i], self.float_data[i]) @@ -759,30 +857,29 @@ def test_clip(self): self.assertEqual(a, b) def test_missing(self): - s=SArray(self.int_data, int) + s = SArray(self.int_data, int) self.assertEqual(s.countna(), 0) - s=SArray(self.int_data + [None], int) + s = SArray(self.int_data + [None], int) self.assertEqual(s.countna(), 1) - s=SArray(self.float_data, float) + s = SArray(self.float_data, float) self.assertEqual(s.countna(), 0) - s=SArray(self.float_data + [None], float) + s = SArray(self.float_data + [None], float) self.assertEqual(s.countna(), 1) - s=SArray(self.string_data, str) + s = SArray(self.string_data, str) self.assertEqual(s.countna(), 0) - s=SArray(self.string_data + [None], str) + s = SArray(self.string_data + [None], str) self.assertEqual(s.countna(), 1) - s=SArray(self.vec_data, array.array) + s = SArray(self.vec_data, array.array) self.assertEqual(s.countna(), 0) - s=SArray(self.vec_data + [None], array.array) + s = SArray(self.vec_data + [None], array.array) self.assertEqual(s.countna(), 1) - def test_nonzero(self): # test empty - s = SArray([],int) + s = SArray([], int) nz_out = s.nnz() self.assertEqual(nz_out, 0) @@ -792,7 +889,7 @@ def test_nonzero(self): self.assertEqual(nz_out, len(self.float_data)) # test all zero - s = SArray([0 for x in range(0,10)], int) + s = SArray([0 for x in range(0, 10)], int) nz_out = s.nnz() self.assertEqual(nz_out, 0) @@ -851,25 +948,25 @@ def test_tail(self): self.assertEqual(len(s.tail()), 0) # test standard tail - s = SArray([x for x in range(0,40)], int) - self.assertEqual(list(s.tail()), [x for x in range(30,40)]) + s = SArray([x for x in range(0, 40)], int) + self.assertEqual(list(s.tail()), [x for x in range(30, 40)]) # smaller amount - self.assertEqual(list(s.tail(3)), [x for x in range(37,40)]) + self.assertEqual(list(s.tail(3)), [x for x in range(37, 40)]) # larger amount - self.assertEqual(list(s.tail(40)), [x for x in range(0,40)]) + self.assertEqual(list(s.tail(40)), [x for x in range(0, 40)]) # too large - self.assertEqual(list(s.tail(81)), [x for x in range(0,40)]) + self.assertEqual(list(s.tail(81)), [x for x in range(0, 40)]) def test_max_min_sum_mean(self): # negative and positive - s = SArray([-2,-1,0,1,2], int) + s = SArray([-2, -1, 0, 1, 2], int) self.assertEqual(s.max(), 2) self.assertEqual(s.min(), -2) self.assertEqual(s.sum(), 0) - self.assertAlmostEqual(s.mean(), 0.) + self.assertAlmostEqual(s.mean(), 0.0) # test valid and invalid types s = SArray(self.string_data, str) @@ -889,13 +986,13 @@ def test_max_min_sum_mean(self): self.assertAlmostEqual(s.mean(), 5.5) s = SArray(self.float_data, float) - self.assertEqual(s.max(), 10.) - self.assertEqual(s.min(), 1.) - self.assertEqual(s.sum(), 55.) + self.assertEqual(s.max(), 10.0) + self.assertEqual(s.min(), 1.0) + self.assertEqual(s.sum(), 55.0) self.assertAlmostEqual(s.mean(), 5.5) # test all negative - s = SArray(list(map(lambda x: x*-1, self.int_data)), int) + s = SArray(list(map(lambda x: x * -1, self.int_data)), int) self.assertEqual(s.max(), -1) self.assertEqual(s.min(), -10) self.assertEqual(s.sum(), -55) @@ -914,7 +1011,7 @@ def test_max_min_sum_mean(self): t = SArray([], int).sum() self.assertTrue(type(t) == int or type(t) == long) self.assertTrue(t == 0) - self.assertTrue(SArray([], array.array).sum() == array.array('d',[])) + self.assertTrue(SArray([], array.array).sum() == array.array("d", [])) # test big ints huge_int = 9223372036854775807 @@ -922,19 +1019,19 @@ def test_max_min_sum_mean(self): self.assertEqual(s.max(), huge_int) self.assertEqual(s.min(), 1) # yes, we overflow - self.assertEqual(s.sum(), (huge_int+1)*-1) + self.assertEqual(s.sum(), (huge_int + 1) * -1) # ...but not here - self.assertAlmostEqual(s.mean(), 4611686018427387904.) + self.assertAlmostEqual(s.mean(), 4611686018427387904.0) - a = SArray([[1,2],[1,2],[1,2]], array.array) - self.assertEqual(a.sum(), array.array('d', [3,6])) - self.assertEqual(a.mean(), array.array('d', [1,2])) + a = SArray([[1, 2], [1, 2], [1, 2]], array.array) + self.assertEqual(a.sum(), array.array("d", [3, 6])) + self.assertEqual(a.mean(), array.array("d", [1, 2])) with self.assertRaises(RuntimeError): a.max() with self.assertRaises(RuntimeError): a.min() - a = SArray([[1,2],[1,2],[1,2,3]], array.array) + a = SArray([[1, 2], [1, 2], [1, 2, 3]], array.array) with self.assertRaises(RuntimeError): a.sum() with self.assertRaises(RuntimeError): @@ -942,13 +1039,13 @@ def test_max_min_sum_mean(self): def test_max_min_sum_mean_missing(self): # negative and positive - s = SArray([-2,0,None,None,None], int) + s = SArray([-2, 0, None, None, None], int) self.assertEqual(s.max(), 0) self.assertEqual(s.min(), -2) self.assertEqual(s.sum(), -2) self.assertAlmostEqual(s.mean(), -1) - s = SArray([None,None,None], int) + s = SArray([None, None, None], int) self.assertEqual(s.max(), None) self.assertEqual(s.min(), None) self.assertEqual(s.sum(), 0) @@ -957,7 +1054,7 @@ def test_max_min_sum_mean_missing(self): def test_python_special_functions(self): s = SArray([], int) self.assertEqual(len(s), 0) - self.assertEqual(str(s), '[]') + self.assertEqual(str(s), "[]") self.assertRaises(ValueError, lambda: bool(s)) # increasing ints @@ -969,14 +1066,14 @@ def test_python_special_functions(self): realsum = sum(self.int_data) sum1 = sum([x for x in s]) sum2 = s.sum() - sum3 = s.apply(lambda x:x, int).sum() + sum3 = s.apply(lambda x: x, int).sum() self.assertEqual(sum1, realsum) self.assertEqual(sum2, realsum) self.assertEqual(sum3, realsum) # abs - s=np.array(range(-10, 10)) + s = np.array(range(-10, 10)) t = SArray(s, int) self.__test_equal(abs(t), list(abs(s)), int) t = SArray(s, float) @@ -985,7 +1082,7 @@ def test_python_special_functions(self): self.__test_equal(SArray(abs(t)[0]), list(abs(s)), float) def test_scalar_operators(self): - s=np.array([1,2,3,4,5,6,7,8,9,10]) + s = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) t = SArray(s, int) self.__test_equal(t + 1, list(s + 1), int) self.__test_equal(t - 1, list(s - 1), int) @@ -1010,41 +1107,57 @@ def test_scalar_operators(self): self.__test_equal(2.0 / t, list(2.0 / s), float) self.__test_equal(2 / t, list(2.0 / s), float) self.__test_equal(2.5 * t, list(2.5 * s), float) - self.__test_equal(2**t, list(2**s), float) + self.__test_equal(2 ** t, list(2 ** s), float) - s_neg = np.array([-1,-2,-3,5,6,7,8,9,10]) + s_neg = np.array([-1, -2, -3, 5, 6, 7, 8, 9, 10]) t_neg = SArray(s_neg, int) self.__test_equal(t_neg // 5, list(s_neg // 5), int) self.__test_equal(t_neg % 5, list(s_neg % 5), int) - s=["a","b","c"] + s = ["a", "b", "c"] t = SArray(s, str) self.__test_equal(t + "x", [i + "x" for i in s], str) with self.assertRaises(RuntimeError): - t - 'x' + t - "x" with self.assertRaises(RuntimeError): - t * 'x' + t * "x" with self.assertRaises(RuntimeError): - t / 'x' + t / "x" s = SArray(self.vec_data, array.array) - self.__test_equal(s + 1, [array.array('d', [float(j) + 1 for j in i]) for i in self.vec_data], array.array) - self.__test_equal(s - 1, [array.array('d', [float(j) - 1 for j in i]) for i in self.vec_data], array.array) - self.__test_equal(s * 2, [array.array('d', [float(j) * 2 for j in i]) for i in self.vec_data], array.array) - self.__test_equal(s / 2, [array.array('d', [float(j) / 2 for j in i]) for i in self.vec_data], array.array) - s = SArray([1,2,3,4,None]) + self.__test_equal( + s + 1, + [array.array("d", [float(j) + 1 for j in i]) for i in self.vec_data], + array.array, + ) + self.__test_equal( + s - 1, + [array.array("d", [float(j) - 1 for j in i]) for i in self.vec_data], + array.array, + ) + self.__test_equal( + s * 2, + [array.array("d", [float(j) * 2 for j in i]) for i in self.vec_data], + array.array, + ) + self.__test_equal( + s / 2, + [array.array("d", [float(j) / 2 for j in i]) for i in self.vec_data], + array.array, + ) + s = SArray([1, 2, 3, 4, None]) self.__test_equal(s == None, [0, 0, 0, 0, 1], int) self.__test_equal(s != None, [1, 1, 1, 1, 0], int) def test_modulus_operator(self): - l = [-5,-4,-3,-2,-1,0,1,2,3,4,5] + l = [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5] t = SArray(l, int) self.__test_equal(t % 2, [i % 2 for i in l], int) self.__test_equal(t % -2, [i % -2 for i in l], int) def test_vector_operators(self): - s=np.array([1,2,3,4,5,6,7,8,9,10]) - s2=np.array([5,4,3,2,1,10,9,8,7,6]) + s = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + s2 = np.array([5, 4, 3, 2, 1, 10, 9, 8, 7, 6]) t = SArray(s, int) t2 = SArray(s2, int) self.__test_equal(t + t2, list(s + s2), int) @@ -1061,57 +1174,187 @@ def test_vector_operators(self): self.__test_equal(t != t2, list(s != s2), int) s = SArray(self.vec_data, array.array) - self.__test_almost_equal(s + s, [array.array('d', [float(j) + float(j) for j in i]) for i in self.vec_data], array.array) - self.__test_almost_equal(s - s, [array.array('d', [float(j) - float(j) for j in i]) for i in self.vec_data], array.array) - self.__test_almost_equal(s * s, [array.array('d', [float(j) * float(j) for j in i]) for i in self.vec_data], array.array) - self.__test_almost_equal(s / s, [array.array('d', [float(j) / float(j) for j in i]) for i in self.vec_data], array.array) - self.__test_almost_equal(s ** s, [array.array('d', [float(j) ** float(j) for j in i]) for i in self.vec_data], array.array) - self.__test_almost_equal(s // s, [array.array('d', [float(j) // float(j) for j in i]) for i in self.vec_data], array.array) + self.__test_almost_equal( + s + s, + [array.array("d", [float(j) + float(j) for j in i]) for i in self.vec_data], + array.array, + ) + self.__test_almost_equal( + s - s, + [array.array("d", [float(j) - float(j) for j in i]) for i in self.vec_data], + array.array, + ) + self.__test_almost_equal( + s * s, + [array.array("d", [float(j) * float(j) for j in i]) for i in self.vec_data], + array.array, + ) + self.__test_almost_equal( + s / s, + [array.array("d", [float(j) / float(j) for j in i]) for i in self.vec_data], + array.array, + ) + self.__test_almost_equal( + s ** s, + [ + array.array("d", [float(j) ** float(j) for j in i]) + for i in self.vec_data + ], + array.array, + ) + self.__test_almost_equal( + s // s, + [ + array.array("d", [float(j) // float(j) for j in i]) + for i in self.vec_data + ], + array.array, + ) t = SArray(self.float_data, float) - self.__test_almost_equal(s + t, [array.array('d', [float(j) + i[1] for j in i[0]]) for i in zip(self.vec_data, self.float_data)], array.array) - self.__test_almost_equal(s - t, [array.array('d', [float(j) - i[1] for j in i[0]]) for i in zip(self.vec_data, self.float_data)], array.array) - self.__test_almost_equal(s * t, [array.array('d', [float(j) * i[1] for j in i[0]]) for i in zip(self.vec_data, self.float_data)], array.array) - self.__test_almost_equal(s / t, [array.array('d', [float(j) / i[1] for j in i[0]]) for i in zip(self.vec_data, self.float_data)], array.array) - self.__test_almost_equal(s ** t, [array.array('d', [float(j) ** i[1] for j in i[0]]) for i in zip(self.vec_data, self.float_data)], array.array) - self.__test_almost_equal(s // t, [array.array('d', [float(j) // i[1] for j in i[0]]) for i in zip(self.vec_data, self.float_data)], array.array) - self.__test_almost_equal(+s, [array.array('d', [float(j) for j in i]) for i in self.vec_data], array.array) - self.__test_almost_equal(-s, [array.array('d', [-float(j) for j in i]) for i in self.vec_data], array.array) + self.__test_almost_equal( + s + t, + [ + array.array("d", [float(j) + i[1] for j in i[0]]) + for i in zip(self.vec_data, self.float_data) + ], + array.array, + ) + self.__test_almost_equal( + s - t, + [ + array.array("d", [float(j) - i[1] for j in i[0]]) + for i in zip(self.vec_data, self.float_data) + ], + array.array, + ) + self.__test_almost_equal( + s * t, + [ + array.array("d", [float(j) * i[1] for j in i[0]]) + for i in zip(self.vec_data, self.float_data) + ], + array.array, + ) + self.__test_almost_equal( + s / t, + [ + array.array("d", [float(j) / i[1] for j in i[0]]) + for i in zip(self.vec_data, self.float_data) + ], + array.array, + ) + self.__test_almost_equal( + s ** t, + [ + array.array("d", [float(j) ** i[1] for j in i[0]]) + for i in zip(self.vec_data, self.float_data) + ], + array.array, + ) + self.__test_almost_equal( + s // t, + [ + array.array("d", [float(j) // i[1] for j in i[0]]) + for i in zip(self.vec_data, self.float_data) + ], + array.array, + ) + self.__test_almost_equal( + +s, + [array.array("d", [float(j) for j in i]) for i in self.vec_data], + array.array, + ) + self.__test_almost_equal( + -s, + [array.array("d", [-float(j) for j in i]) for i in self.vec_data], + array.array, + ) neg_float_data = [-v for v in self.float_data] t = SArray(neg_float_data, float) - self.__test_almost_equal(s + t, [array.array('d', [float(j) + i[1] for j in i[0]]) for i in zip(self.vec_data, neg_float_data)], array.array) - self.__test_almost_equal(s - t, [array.array('d', [float(j) - i[1] for j in i[0]]) for i in zip(self.vec_data, neg_float_data)], array.array) - self.__test_almost_equal(s * t, [array.array('d', [float(j) * i[1] for j in i[0]]) for i in zip(self.vec_data, neg_float_data)], array.array) - self.__test_almost_equal(s / t, [array.array('d', [float(j) / i[1] for j in i[0]]) for i in zip(self.vec_data, neg_float_data)], array.array) - self.__test_almost_equal(s ** t, [array.array('d', [float(j) ** i[1] for j in i[0]]) for i in zip(self.vec_data, neg_float_data)], array.array) - self.__test_almost_equal(s // t, [array.array('d', [float(j) // i[1] for j in i[0]]) for i in zip(self.vec_data, neg_float_data)], array.array) - self.__test_almost_equal(t // s, [array.array('d', [i[1] // float(j) for j in i[0]]) for i in zip(self.vec_data, neg_float_data)], array.array) - - s = SArray([1,2,3,4,None]) - self.assertTrue((s==s).all()) - s = SArray([1,2,3,4,None]) - self.assertFalse((s!=s).any()) + self.__test_almost_equal( + s + t, + [ + array.array("d", [float(j) + i[1] for j in i[0]]) + for i in zip(self.vec_data, neg_float_data) + ], + array.array, + ) + self.__test_almost_equal( + s - t, + [ + array.array("d", [float(j) - i[1] for j in i[0]]) + for i in zip(self.vec_data, neg_float_data) + ], + array.array, + ) + self.__test_almost_equal( + s * t, + [ + array.array("d", [float(j) * i[1] for j in i[0]]) + for i in zip(self.vec_data, neg_float_data) + ], + array.array, + ) + self.__test_almost_equal( + s / t, + [ + array.array("d", [float(j) / i[1] for j in i[0]]) + for i in zip(self.vec_data, neg_float_data) + ], + array.array, + ) + self.__test_almost_equal( + s ** t, + [ + array.array("d", [float(j) ** i[1] for j in i[0]]) + for i in zip(self.vec_data, neg_float_data) + ], + array.array, + ) + self.__test_almost_equal( + s // t, + [ + array.array("d", [float(j) // i[1] for j in i[0]]) + for i in zip(self.vec_data, neg_float_data) + ], + array.array, + ) + self.__test_almost_equal( + t // s, + [ + array.array("d", [i[1] // float(j) for j in i[0]]) + for i in zip(self.vec_data, neg_float_data) + ], + array.array, + ) - def test_div_corner(self): + s = SArray([1, 2, 3, 4, None]) + self.assertTrue((s == s).all()) + s = SArray([1, 2, 3, 4, None]) + self.assertFalse((s != s).any()) + def test_div_corner(self): def try_eq_sa_val(left_val, right_val): if type(left_val) is list: - left_val = array.array('d', left_val) + left_val = array.array("d", left_val) if type(right_val) is list: - right_val = array.array('d', right_val) + right_val = array.array("d", right_val) left_type = type(left_val) v1 = (SArray([left_val], left_type) // right_val)[0] if type(right_val) is array.array: if type(left_val) is array.array: - v2 = array.array('d', [lv // rv for lv, rv in zip(left_val, right_val)]) + v2 = array.array( + "d", [lv // rv for lv, rv in zip(left_val, right_val)] + ) else: - v2 = array.array('d', [left_val // rv for rv in right_val]) + v2 = array.array("d", [left_val // rv for rv in right_val]) else: if type(left_val) is array.array: - v2 = array.array('d', [lv // right_val for lv in left_val]) + v2 = array.array("d", [lv // right_val for lv in left_val]) else: v2 = left_val // right_val @@ -1134,29 +1377,29 @@ def try_eq_sa_val(left_val, right_val): try_eq_sa_val([1, -1], 2) try_eq_sa_val([1, -1], 2.0) - try_eq_sa_val(2,[3, -3]) - try_eq_sa_val(2.0,[3, -3]) - + try_eq_sa_val(2, [3, -3]) + try_eq_sa_val(2.0, [3, -3]) def test_floodiv_corner(self): - def try_eq_sa_val(left_val, right_val): if type(left_val) is list: - left_val = array.array('d', left_val) + left_val = array.array("d", left_val) if type(right_val) is list: - right_val = array.array('d', right_val) + right_val = array.array("d", right_val) left_type = type(left_val) v1 = (SArray([left_val], left_type) // right_val)[0] if type(right_val) is array.array: if type(left_val) is array.array: - v2 = array.array('d', [lv // rv for lv, rv in zip(left_val, right_val)]) + v2 = array.array( + "d", [lv // rv for lv, rv in zip(left_val, right_val)] + ) else: - v2 = array.array('d', [left_val // rv for rv in right_val]) + v2 = array.array("d", [left_val // rv for rv in right_val]) else: if type(left_val) is array.array: - v2 = array.array('d', [lv // right_val for lv in left_val]) + v2 = array.array("d", [lv // right_val for lv in left_val]) else: v2 = left_val // right_val @@ -1180,16 +1423,16 @@ def try_eq_sa_val(left_val, right_val): try_eq_sa_val([1, -1], 2) try_eq_sa_val([1, -1], 2.0) - try_eq_sa_val(2,[3, -3]) - try_eq_sa_val(2.0,[3, -3]) + try_eq_sa_val(2, [3, -3]) + try_eq_sa_val(2.0, [3, -3]) from math import isnan def try_eq_sa_correct(left_val, right_val, correct): if type(left_val) is list: - left_val = array.array('d', left_val) + left_val = array.array("d", left_val) if type(right_val) is list: - right_val = array.array('d', right_val) + right_val = array.array("d", right_val) left_type = type(left_val) v1 = (SArray([left_val], left_type) // right_val)[0] @@ -1209,100 +1452,104 @@ def try_eq_sa_correct(left_val, right_val, correct): try_eq_sa_correct(0, 0, None) try_eq_sa_correct(-1, 0, None) - try_eq_sa_correct(1.0, 0, float('inf')) - try_eq_sa_correct(0.0, 0, float('nan')) - try_eq_sa_correct(-1.0, 0, float('-inf')) + try_eq_sa_correct(1.0, 0, float("inf")) + try_eq_sa_correct(0.0, 0, float("nan")) + try_eq_sa_correct(-1.0, 0, float("-inf")) - try_eq_sa_correct([1.0,0,-1], 0, [float('inf'), float('nan'), float('-inf')]) - try_eq_sa_correct(1, [1.0, 0], [1., float('inf')]) - try_eq_sa_correct(-1, [1.0, 0], [-1., float('-inf')]) - try_eq_sa_correct(0, [1.0, 0], [0., float('nan')]) + try_eq_sa_correct([1.0, 0, -1], 0, [float("inf"), float("nan"), float("-inf")]) + try_eq_sa_correct(1, [1.0, 0], [1.0, float("inf")]) + try_eq_sa_correct(-1, [1.0, 0], [-1.0, float("-inf")]) + try_eq_sa_correct(0, [1.0, 0], [0.0, float("nan")]) def test_logical_ops(self): - s=np.array([0,0,0,0,1,1,1,1]) - s2=np.array([0,1,0,1,0,1,0,1]) + s = np.array([0, 0, 0, 0, 1, 1, 1, 1]) + s2 = np.array([0, 1, 0, 1, 0, 1, 0, 1]) t = SArray(s, int) t2 = SArray(s2, int) self.__test_equal(t & t2, list(((s & s2) > 0).astype(int)), int) self.__test_equal(t | t2, list(((s | s2) > 0).astype(int)), int) def test_logical_ops_missing_value_propagation(self): - s=[0, 0,0,None, None, None,1,1, 1] - s2=[0,None,1,0, None, 1, 0,None,1] + s = [0, 0, 0, None, None, None, 1, 1, 1] + s2 = [0, None, 1, 0, None, 1, 0, None, 1] t = SArray(s, int) t2 = SArray(s2, int) - and_result = [0,0,0,0,None,None,0,None,1] - or_result = [0,None,1,None,None,1,1,1,1] + and_result = [0, 0, 0, 0, None, None, 0, None, 1] + or_result = [0, None, 1, None, None, 1, 1, 1, 1] self.__test_equal(t & t2, and_result, int) self.__test_equal(t | t2, or_result, int) def test_string_operators(self): - s=["a","b","c","d","e","f","g","h","i","j"] - s2=["e","d","c","b","a","j","i","h","g","f"] + s = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"] + s2 = ["e", "d", "c", "b", "a", "j", "i", "h", "g", "f"] t = SArray(s, str) t2 = SArray(s2, str) - self.__test_equal(t + t2, ["".join(x) for x in zip(s,s2)], str) + self.__test_equal(t + t2, ["".join(x) for x in zip(s, s2)], str) self.__test_equal(t + "x", [x + "x" for x in s], str) - self.__test_equal(t < t2, [x < y for (x,y) in zip(s,s2)], int) - self.__test_equal(t > t2, [x > y for (x,y) in zip(s,s2)], int) - self.__test_equal(t == t2, [x == y for (x,y) in zip(s,s2)], int) - self.__test_equal(t != t2, [x != y for (x,y) in zip(s,s2)], int) - self.__test_equal(t <= t2, [x <= y for (x,y) in zip(s,s2)], int) - self.__test_equal(t >= t2, [x >= y for (x,y) in zip(s,s2)], int) - + self.__test_equal(t < t2, [x < y for (x, y) in zip(s, s2)], int) + self.__test_equal(t > t2, [x > y for (x, y) in zip(s, s2)], int) + self.__test_equal(t == t2, [x == y for (x, y) in zip(s, s2)], int) + self.__test_equal(t != t2, [x != y for (x, y) in zip(s, s2)], int) + self.__test_equal(t <= t2, [x <= y for (x, y) in zip(s, s2)], int) + self.__test_equal(t >= t2, [x >= y for (x, y) in zip(s, s2)], int) def test_vector_operator_missing_propagation(self): - t = SArray([1,2,3,4,None,6,7,8,9,None], float) # missing 4th and 9th - t2 = SArray([None,4,3,2,np.nan,10,9,8,7,6], float) # missing 0th and 4th + t = SArray([1, 2, 3, 4, None, 6, 7, 8, 9, None], float) # missing 4th and 9th + t2 = SArray( + [None, 4, 3, 2, np.nan, 10, 9, 8, 7, 6], float + ) # missing 0th and 4th self.assertEqual(len((t + t2).dropna()), 7) self.assertEqual(len((t - t2).dropna()), 7) self.assertEqual(len((t * t2).dropna()), 7) def test_sarray_image_equality(self): current_file_dir = os.path.dirname(os.path.realpath(__file__)) - image_url_1 = current_file_dir + '/images/sample.png' - image_url_2 = current_file_dir + '/images/sample.jpg' - i = load_images(image_url_1)['image'] - j = load_images(image_url_2)['image'] + image_url_1 = current_file_dir + "/images/sample.png" + image_url_2 = current_file_dir + "/images/sample.jpg" + i = load_images(image_url_1)["image"] + j = load_images(image_url_2)["image"] - self.__test_equal(i == i, [x == y for (x,y) in zip(i, i)], int) - self.__test_equal(j == j, [x == y for (x,y) in zip(j, j)], int) - self.__test_equal(i == j, [x == y for (x,y) in zip(i, j)], int) + self.__test_equal(i == i, [x == y for (x, y) in zip(i, i)], int) + self.__test_equal(j == j, [x == y for (x, y) in zip(j, j)], int) + self.__test_equal(i == j, [x == y for (x, y) in zip(i, j)], int) def test_dropna(self): - no_nas = ['strings', 'yeah', 'nan', 'NaN', 'NA', 'None'] + no_nas = ["strings", "yeah", "nan", "NaN", "NA", "None"] t = SArray(no_nas) self.assertEqual(len(t.dropna()), 6) self.assertEqual(list(t.dropna()), no_nas) - t2 = SArray([None,np.nan]) + t2 = SArray([None, np.nan]) self.assertEqual(len(t2.dropna()), 0) self.assertEqual(list(SArray(self.int_data).dropna()), self.int_data) self.assertEqual(list(SArray(self.float_data).dropna()), self.float_data) def test_fillna(self): # fillna shouldn't fill anything - no_nas = ['strings', 'yeah', 'nan', 'NaN', 'NA', 'None'] + no_nas = ["strings", "yeah", "nan", "NaN", "NA", "None"] t = SArray(no_nas) - out = t.fillna('hello') + out = t.fillna("hello") self.assertEqual(list(out), no_nas) # Normal integer case (float auto casted to int) - t = SArray([53,23,None,np.nan,5]) - self.assertEqual(list(t.fillna(-1.0)), [53,23,-1,-1,5]) + t = SArray([53, 23, None, np.nan, 5]) + self.assertEqual(list(t.fillna(-1.0)), [53, 23, -1, -1, 5]) # dict type - t = SArray(self.dict_data+[None]) - self.assertEqual(list(t.fillna({1:'1'})), self.dict_data+[{1:'1'}]) + t = SArray(self.dict_data + [None]) + self.assertEqual(list(t.fillna({1: "1"})), self.dict_data + [{1: "1"}]) # list type - t = SArray(self.list_data+[None]) - self.assertEqual(list(t.fillna([0,0,0])), self.list_data+[[0,0,0]]) + t = SArray(self.list_data + [None]) + self.assertEqual(list(t.fillna([0, 0, 0])), self.list_data + [[0, 0, 0]]) # vec type - t = SArray(self.vec_data+[None]) - self.assertEqual(list(t.fillna(array.array('f',[0.0,0.0]))), self.vec_data+[array.array('f',[0.0,0.0])]) + t = SArray(self.vec_data + [None]) + self.assertEqual( + list(t.fillna(array.array("f", [0.0, 0.0]))), + self.vec_data + [array.array("f", [0.0, 0.0])], + ) # empty sarray t = SArray() @@ -1310,8 +1557,8 @@ def test_fillna(self): def test_sample(self): sa = SArray(data=self.int_data) - sa_sample = sa.sample(.5, 9) - sa_sample2 = sa.sample(.5, 9) + sa_sample = sa.sample(0.5, 9) + sa_sample2 = sa.sample(0.5, 9) self.assertEqual(list(sa_sample.head()), list(sa_sample2.head())) @@ -1321,19 +1568,18 @@ def test_sample(self): with self.assertRaises(ValueError): sa.sample(3) - sa_sample = SArray().sample(.5, 9) + sa_sample = SArray().sample(0.5, 9) self.assertEqual(len(sa_sample), 0) self.assertEqual(len(SArray.from_sequence(100).sample(0.5, 1, exact=True)), 50) self.assertEqual(len(SArray.from_sequence(100).sample(0.5, 2, exact=True)), 50) - def test_hash(self): - a = SArray([0,1,0,1,0,1,0,1], int) + a = SArray([0, 1, 0, 1, 0, 1, 0, 1], int) b = a.hash() zero_hash = b[0] one_hash = b[1] self.assertTrue((b[a] == one_hash).all()) - self.assertTrue((b[1-a] == zero_hash).all()) + self.assertTrue((b[1 - a] == zero_hash).all()) # I can hash other stuff too # does not throw @@ -1349,7 +1595,6 @@ def test_hash(self): # different seeds give different hash values self.assertTrue((a.hash(seed=0) != a.hash(seed=1)).all()) - def test_random_integers(self): a = SArray.random_integers(0) self.assertEqual(len(a), 0) @@ -1357,13 +1602,18 @@ def test_random_integers(self): self.assertEqual(len(a), 1000) def test_vector_slice(self): - d=[[1],[1,2],[1,2,3]] - g=SArray(d, array.array) - self.assertEqual(list(g.vector_slice(0).head()), [1,1,1]) - self.assertEqual(list(g.vector_slice(0,2).head()), [None,array.array('d', [1,2]),array.array('d', [1,2])]) - self.assertEqual(list(g.vector_slice(0,3).head()), [None,None,array.array('d', [1,2,3])]) + d = [[1], [1, 2], [1, 2, 3]] + g = SArray(d, array.array) + self.assertEqual(list(g.vector_slice(0).head()), [1, 1, 1]) + self.assertEqual( + list(g.vector_slice(0, 2).head()), + [None, array.array("d", [1, 2]), array.array("d", [1, 2])], + ) + self.assertEqual( + list(g.vector_slice(0, 3).head()), [None, None, array.array("d", [1, 2, 3])] + ) - g=SArray(self.vec_data, array.array) + g = SArray(self.vec_data, array.array) self.__test_equal(g.vector_slice(0), self.float_data, float) self.__test_equal(g.vector_slice(0, 2), self.vec_data, array.array) @@ -1372,12 +1622,13 @@ def _my_element_slice(self, arr, start=None, stop=None, step=1): def _slice_equality_test(self, arr, start=None, stop=None, step=1): self.assertEqual( - list(arr.element_slice(start, stop, step)), - list(self._my_element_slice(arr,start,stop,step))) + list(arr.element_slice(start, stop, step)), + list(self._my_element_slice(arr, start, stop, step)), + ) def test_element_slice(self): - #string slicing - g=SArray(range(1,1000, 10)).astype(str) + # string slicing + g = SArray(range(1, 1000, 10)).astype(str) self._slice_equality_test(g, 0, 2) self._slice_equality_test(g, 0, -1, 2) self._slice_equality_test(g, -1, -3) @@ -1385,8 +1636,8 @@ def test_element_slice(self): self._slice_equality_test(g, None, None, -1) self._slice_equality_test(g, -100, -1) - #list slicing - g=SArray(range(1,10)).apply(lambda x: list(range(x)), list) + # list slicing + g = SArray(range(1, 10)).apply(lambda x: list(range(x)), list) self._slice_equality_test(g, 0, 2) self._slice_equality_test(g, 0, -1, 2) self._slice_equality_test(g, -1, -3) @@ -1394,9 +1645,10 @@ def test_element_slice(self): self._slice_equality_test(g, None, None, -1) self._slice_equality_test(g, -100, -1) - #array slicing + # array slicing import array - g=SArray(range(1,10)).apply(lambda x: array.array('d', range(x))) + + g = SArray(range(1, 10)).apply(lambda x: array.array("d", range(x))) self._slice_equality_test(g, 0, 2) self._slice_equality_test(g, 0, -1, 2) self._slice_equality_test(g, -1, -3) @@ -1404,12 +1656,12 @@ def test_element_slice(self): self._slice_equality_test(g, None, None, -1) self._slice_equality_test(g, -100, -1) - #this should fail + # this should fail with self.assertRaises(TypeError): - g=SArray(range(1,1000)).element_slice(1) + g = SArray(range(1, 1000)).element_slice(1) with self.assertRaises(TypeError): - g=SArray(range(1,1000)).astype(float).element_slice(1) + g = SArray(range(1, 1000)).astype(float).element_slice(1) def test_lazy_eval(self): sa = SArray(range(-10, 10)) @@ -1452,15 +1704,17 @@ def test_dict_keys(self): # self.dict_data = [{str(i): i, i : float(i)} for i in self.int_data] sa = SArray(self.dict_data) sa_keys = sa.dict_keys() - self.assertEqual([set(i) for i in sa_keys], [{str(i), i} for i in self.int_data]) + self.assertEqual( + [set(i) for i in sa_keys], [{str(i), i} for i in self.int_data] + ) # na value - d = [{'a': 1}, {None: 2}, {"b": None}, None] + d = [{"a": 1}, {None: 2}, {"b": None}, None] sa = SArray(d) sa_keys = sa.dict_keys() - self.assertEqual(list(sa_keys), [['a'], [None], ['b'], None]) + self.assertEqual(list(sa_keys), [["a"], [None], ["b"], None]) - #empty SArray + # empty SArray sa = SArray() with self.assertRaises(RuntimeError): sa.dict_keys() @@ -1476,12 +1730,12 @@ def test_dict_values(self): self.assertEqual(list(sa_values), [[i, float(i)] for i in self.int_data]) # na value - d = [{'a': 1}, {None: 'str'}, {"b": None}, None] + d = [{"a": 1}, {None: "str"}, {"b": None}, None] sa = SArray(d) sa_values = sa.dict_values() - self.assertEqual(list(sa_values), [[1], ['str'], [None], None]) + self.assertEqual(list(sa_values), [[1], ["str"], [None], None]) - #empty SArray + # empty SArray sa = SArray() with self.assertRaises(RuntimeError): sa.dict_values() @@ -1492,12 +1746,12 @@ def test_dict_values(self): def test_dict_trim_by_keys(self): # self.dict_data = [{str(i): i, i : float(i)} for i in self.int_data] - d = [{'a':1, 'b': [1,2]}, {None: 'str'}, {"b": None, "c": 1}, None] + d = [{"a": 1, "b": [1, 2]}, {None: "str"}, {"b": None, "c": 1}, None] sa = SArray(d) - sa_values = sa.dict_trim_by_keys(['a', 'b']) - self.assertEqual(list(sa_values), [{}, {None: 'str'}, {"c": 1}, None]) + sa_values = sa.dict_trim_by_keys(["a", "b"]) + self.assertEqual(list(sa_values), [{}, {None: "str"}, {"c": 1}, None]) - #empty SArray + # empty SArray sa = SArray() with self.assertRaises(RuntimeError): sa.dict_trim_by_keys([]) @@ -1507,24 +1761,30 @@ def test_dict_trim_by_keys(self): def test_dict_trim_by_values(self): # self.dict_data = [{str(i): i, i : float(i)} for i in self.int_data] - d = [{'a':1, 'b': 20, 'c':None}, {"b": 4, None: 5}, None] + d = [{"a": 1, "b": 20, "c": None}, {"b": 4, None: 5}, None] sa = SArray(d) - sa_values = sa.dict_trim_by_values(5,10) - self.assertEqual(list(sa_values), [{'c':None}, {None:5}, None]) + sa_values = sa.dict_trim_by_values(5, 10) + self.assertEqual(list(sa_values), [{"c": None}, {None: 5}, None]) # no upper key sa_values = sa.dict_trim_by_values(2) - self.assertEqual(list(sa_values), [{'b': 20, 'c':None}, {"b": 4, None:5}, None]) + self.assertEqual( + list(sa_values), [{"b": 20, "c": None}, {"b": 4, None: 5}, None] + ) # no param sa_values = sa.dict_trim_by_values() - self.assertEqual(list(sa_values), [{'a':1, 'b': 20, 'c':None}, {"b": 4, None: 5}, None]) + self.assertEqual( + list(sa_values), [{"a": 1, "b": 20, "c": None}, {"b": 4, None: 5}, None] + ) # no lower key sa_values = sa.dict_trim_by_values(upper=7) - self.assertEqual(list(sa_values), [{'a':1, 'c':None}, {"b": 4, None: 5}, None]) + self.assertEqual( + list(sa_values), [{"a": 1, "c": None}, {"b": 4, None: 5}, None] + ) - #empty SArray + # empty SArray sa = SArray() with self.assertRaises(RuntimeError): sa.dict_trim_by_values() @@ -1533,25 +1793,25 @@ def test_dict_trim_by_values(self): self.assertEqual(list(sa.dict_trim_by_values().head(10)), [], list) def test_dict_has_any_keys(self): - d = [{'a':1, 'b': 20, 'c':None}, {"b": 4, None: 5}, None, {'a':0}] + d = [{"a": 1, "b": 20, "c": None}, {"b": 4, None: 5}, None, {"a": 0}] sa = SArray(d) sa_values = sa.dict_has_any_keys([]) - self.assertEqual(list(sa_values), [0,0,None,0]) + self.assertEqual(list(sa_values), [0, 0, None, 0]) - sa_values = sa.dict_has_any_keys(['a']) - self.assertEqual(list(sa_values), [1,0,None,1]) + sa_values = sa.dict_has_any_keys(["a"]) + self.assertEqual(list(sa_values), [1, 0, None, 1]) # one value is auto convert to list sa_values = sa.dict_has_any_keys("a") - self.assertEqual(list(sa_values), [1,0,None,1]) + self.assertEqual(list(sa_values), [1, 0, None, 1]) - sa_values = sa.dict_has_any_keys(['a', 'b']) - self.assertEqual(list(sa_values), [1,1,None,1]) + sa_values = sa.dict_has_any_keys(["a", "b"]) + self.assertEqual(list(sa_values), [1, 1, None, 1]) with self.assertRaises(TypeError): sa.dict_has_any_keys() - #empty SArray + # empty SArray sa = SArray() with self.assertRaises(TypeError): sa.dict_has_any_keys() @@ -1560,28 +1820,28 @@ def test_dict_has_any_keys(self): self.assertEqual(list(sa.dict_has_any_keys([]).head(10)), [], list) def test_dict_has_all_keys(self): - d = [{'a':1, 'b': 20, 'c':None}, {"b": 4, None: 5}, None, {'a':0}] + d = [{"a": 1, "b": 20, "c": None}, {"b": 4, None: 5}, None, {"a": 0}] sa = SArray(d) sa_values = sa.dict_has_all_keys([]) - self.assertEqual(list(sa_values), [1,1,None,1]) + self.assertEqual(list(sa_values), [1, 1, None, 1]) - sa_values = sa.dict_has_all_keys(['a']) - self.assertEqual(list(sa_values), [1,0,None,1]) + sa_values = sa.dict_has_all_keys(["a"]) + self.assertEqual(list(sa_values), [1, 0, None, 1]) # one value is auto convert to list sa_values = sa.dict_has_all_keys("a") - self.assertEqual(list(sa_values), [1,0,None,1]) + self.assertEqual(list(sa_values), [1, 0, None, 1]) - sa_values = sa.dict_has_all_keys(['a', 'b']) - self.assertEqual(list(sa_values), [1,0,None,0]) + sa_values = sa.dict_has_all_keys(["a", "b"]) + self.assertEqual(list(sa_values), [1, 0, None, 0]) sa_values = sa.dict_has_all_keys([None, "b"]) - self.assertEqual(list(sa_values), [0,1,None,0]) + self.assertEqual(list(sa_values), [0, 1, None, 0]) with self.assertRaises(TypeError): sa.dict_has_all_keys() - #empty SArray + # empty SArray sa = SArray() with self.assertRaises(TypeError): sa.dict_has_all_keys() @@ -1592,7 +1852,7 @@ def test_dict_has_all_keys(self): def test_save_load_cleanup_file(self): # similarly for SArray with util.TempDirectory() as f: - sa = SArray(range(1,1000000)) + sa = SArray(range(1, 1000000)) sa.save(f) # 17 for each sarray, 1 object.bin, 1 ini @@ -1603,7 +1863,7 @@ def test_save_load_cleanup_file(self): sa1 = SArray(f) # create another SFrame and save to the same location - sa2 = SArray([str(i) for i in range(1,100000)]) + sa2 = SArray([str(i) for i in range(1, 100000)]) sa2.save(f) file_count = len(os.listdir(f)) @@ -1637,7 +1897,7 @@ def test_unique(self): self.assertEqual(list(test.unique()), [1]) # Test many of one value - test = SArray([1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]) + test = SArray([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) self.assertEqual(list(test.unique()), [1]) # Test all unique values @@ -1645,7 +1905,30 @@ def test_unique(self): self.assertEqual(sorted(list(test.unique())), self.int_data) # Test an interesting sequence - interesting_ints = [4654,4352436,5453,7556,45435,4654,5453,4654,5453,1,1,1,5,5,5,8,66,7,7,77,90,-34] + interesting_ints = [ + 4654, + 4352436, + 5453, + 7556, + 45435, + 4654, + 5453, + 4654, + 5453, + 1, + 1, + 1, + 5, + 5, + 5, + 8, + 66, + 7, + 7, + 77, + 90, + -34, + ] test = SArray(interesting_ints) u = test.unique() self.assertEqual(len(u), 13) @@ -1673,29 +1956,29 @@ def test_item_len(self): self.assertEqual(test.item_length()) # wrong type - test = SArray([1,2,3]) + test = SArray([1, 2, 3]) with self.assertRaises(TypeError): self.assertEqual(test.item_length()) - test = SArray(['1','2','3']) + test = SArray(["1", "2", "3"]) with self.assertRaises(TypeError): self.assertEqual(test.item_length()) # vector type - test = SArray([[], [1], [1,2], [1,2,3], None]) + test = SArray([[], [1], [1, 2], [1, 2, 3], None]) item_length = test.item_length() - self.assertEqual(list(item_length), list([0, 1,2,3,None])) + self.assertEqual(list(item_length), list([0, 1, 2, 3, None])) # dict type - test = SArray([{}, {'key1': 1}, {'key2':1, 'key1':2}, None]) - self.assertEqual(list(test.item_length()), list([0, 1,2,None])) + test = SArray([{}, {"key1": 1}, {"key2": 1, "key1": 2}, None]) + self.assertEqual(list(test.item_length()), list([0, 1, 2, None])) # list type - test = SArray([[], [1,2], ['str', 'str2'], None]) - self.assertEqual(list(test.item_length()), list([0, 2,2,None])) + test = SArray([[], [1, 2], ["str", "str2"], None]) + self.assertEqual(list(test.item_length()), list([0, 2, 2, None])) def test_random_access(self): - t = list(range(0,100000)) + t = list(range(0, 100000)) s = SArray(t) # simple slices self.__test_equal(s[1:10000], t[1:10000], int) @@ -1715,15 +1998,15 @@ def test_random_access(self): self.assertEqual(s[-10], t[-10]) # A cache boundary - self.assertEqual(s[32*1024-1], t[32*1024-1]) - self.assertEqual(s[32*1024], t[32*1024]) + self.assertEqual(s[32 * 1024 - 1], t[32 * 1024 - 1]) + self.assertEqual(s[32 * 1024], t[32 * 1024]) # totally different self.assertEqual(s[19312], t[19312]) # edge case oddities self.__test_equal(s[10:100:100], t[10:100:100], int) - self.__test_equal(s[-100:len(s):10], t[-100:len(t):10], int) + self.__test_equal(s[-100 : len(s) : 10], t[-100 : len(t) : 10], int) self.__test_equal(s[-1:-2], t[-1:-2], int) self.__test_equal(s[-1:-1000:2], t[-1:-1000:2], int) with self.assertRaises(IndexError): @@ -1737,30 +2020,29 @@ def test_random_access(self): self.assertEqual(s[-i], t[-i]) def test_sort(self): - test = SArray([1,2,3,5,1,4]) - ascending = SArray([1,1,2,3,4,5]) - descending = SArray([5,4,3,2,1,1]) + test = SArray([1, 2, 3, 5, 1, 4]) + ascending = SArray([1, 1, 2, 3, 4, 5]) + descending = SArray([5, 4, 3, 2, 1, 1]) result = test.sort() self.assertEqual(list(result), list(ascending)) - result = test.sort(ascending = False) + result = test.sort(ascending=False) self.assertEqual(list(result), list(descending)) with self.assertRaises(TypeError): - SArray([[1,2], [2,3]]).sort() + SArray([[1, 2], [2, 3]]).sort() def test_unicode_encode_should_not_fail(self): - g=SArray([{'a':u'\u2019'}]) - g=SArray([u'123',u'\u2019']) - g=SArray(['123',u'\u2019']) - + g = SArray([{"a": u"\u2019"}]) + g = SArray([u"123", u"\u2019"]) + g = SArray(["123", u"\u2019"]) def test_from_const(self): - g = SArray.from_const('a', 100) + g = SArray.from_const("a", 100) self.assertEqual(len(g), 100) - self.assertEqual(list(g), ['a']*100) - g = SArray.from_const(dt.datetime(2013, 5, 7, 10, 4, 10),10) + self.assertEqual(list(g), ["a"] * 100) + g = SArray.from_const(dt.datetime(2013, 5, 7, 10, 4, 10), 10) self.assertEqual(len(g), 10) - self.assertEqual(list(g), [dt.datetime(2013, 5, 7, 10, 4, 10)]*10) + self.assertEqual(list(g), [dt.datetime(2013, 5, 7, 10, 4, 10)] * 10) g = SArray.from_const(0, 0) self.assertEqual(len(g), 0) @@ -1808,29 +2090,44 @@ def test_from_sequence(self): def test_datetime(self): sa = SArray(self.datetime_data) - self.__test_equal(sa ,self.datetime_data,dt.datetime) + self.__test_equal(sa, self.datetime_data, dt.datetime) sa = SArray(self.datetime_data2) - self.__test_equal(sa ,self.datetime_data2,dt.datetime) - - ret = sa.split_datetime(limit=['year','month','day','hour','minute', - 'second','us','weekday', 'isoweekday','tmweekday']) + self.__test_equal(sa, self.datetime_data2, dt.datetime) + + ret = sa.split_datetime( + limit=[ + "year", + "month", + "day", + "hour", + "minute", + "second", + "us", + "weekday", + "isoweekday", + "tmweekday", + ] + ) self.assertEqual(ret.num_columns(), 10) - self.__test_equal(ret['X.year'] , [2013, 1902, None], int) - self.__test_equal(ret['X.month'] , [5, 10, None], int) - self.__test_equal(ret['X.day'] , [7, 21, None], int) - self.__test_equal(ret['X.hour'] , [10, 10, None], int) - self.__test_equal(ret['X.minute'] , [4, 34, None], int) - self.__test_equal(ret['X.second'] , [10, 10, None], int) - self.__test_equal(ret['X.us'] , [109321, 991111, None], int) - self.__test_equal(ret['X.weekday'] , [1, 1, None], int) - self.__test_equal(ret['X.isoweekday'] , [2, 2, None], int) - self.__test_equal(ret['X.tmweekday'] , [2, 2, None], int) + self.__test_equal(ret["X.year"], [2013, 1902, None], int) + self.__test_equal(ret["X.month"], [5, 10, None], int) + self.__test_equal(ret["X.day"], [7, 21, None], int) + self.__test_equal(ret["X.hour"], [10, 10, None], int) + self.__test_equal(ret["X.minute"], [4, 34, None], int) + self.__test_equal(ret["X.second"], [10, 10, None], int) + self.__test_equal(ret["X.us"], [109321, 991111, None], int) + self.__test_equal(ret["X.weekday"], [1, 1, None], int) + self.__test_equal(ret["X.isoweekday"], [2, 2, None], int) + self.__test_equal(ret["X.tmweekday"], [2, 2, None], int) def test_datetime_difference(self): sa = SArray(self.datetime_data) sa2 = SArray(self.datetime_data2) res = sa2 - sa - expected = [float(x.microsecond) / 1000000.0 if x is not None else x for x in self.datetime_data2] + expected = [ + float(x.microsecond) / 1000000.0 if x is not None else x + for x in self.datetime_data2 + ] self.assertEqual(len(res), len(expected)) for i in range(len(res)): if res[i] is None: @@ -1839,127 +2136,124 @@ def test_datetime_difference(self): self.assertAlmostEqual(res[i], expected[i], places=6) def test_datetime_lambda(self): - data = [dt.datetime(2013, 5, 7, 10, 4, 10, 109321), - dt.datetime(1902, 10, 21, 10, 34, 10, 991111, - tzinfo=GMT(1))] - g=SArray(data) - gstr=g.apply(lambda x:str(x)) + data = [ + dt.datetime(2013, 5, 7, 10, 4, 10, 109321), + dt.datetime(1902, 10, 21, 10, 34, 10, 991111, tzinfo=GMT(1)), + ] + g = SArray(data) + gstr = g.apply(lambda x: str(x)) self.__test_equal(gstr, [str(x) for x in g], str) - gident=g.apply(lambda x:x) + gident = g.apply(lambda x: x) self.__test_equal(gident, list(g), dt.datetime) def test_datetime_to_str(self): sa = SArray(self.datetime_data) sa_string_back = sa.datetime_to_str() - self.__test_equal(sa_string_back,['2013-05-07T10:04:10', '1902-10-21T10:34:10GMT+00', None],str) + self.__test_equal( + sa_string_back, + ["2013-05-07T10:04:10", "1902-10-21T10:34:10GMT+00", None], + str, + ) - sa = SArray([None,None,None],dtype=dt.datetime) + sa = SArray([None, None, None], dtype=dt.datetime) sa_string_back = sa.datetime_to_str() - self.__test_equal(sa_string_back,[None,None,None],str) + self.__test_equal(sa_string_back, [None, None, None], str) sa = SArray(dtype=dt.datetime) sa_string_back = sa.datetime_to_str() - self.__test_equal(sa_string_back,[],str) + self.__test_equal(sa_string_back, [], str) - sa = SArray([None,None,None]) - self.assertRaises(TypeError,sa.datetime_to_str) + sa = SArray([None, None, None]) + self.assertRaises(TypeError, sa.datetime_to_str) sa = SArray() - self.assertRaises(TypeError,sa.datetime_to_str) - - + self.assertRaises(TypeError, sa.datetime_to_str) def test_str_to_datetime(self): - sa_string = SArray(['2013-05-07T10:04:10', '1902-10-21T10:34:10GMT+00', None]) + sa_string = SArray(["2013-05-07T10:04:10", "1902-10-21T10:34:10GMT+00", None]) sa_datetime_back = sa_string.str_to_datetime() expected = self.datetime_data - self.__test_equal(sa_datetime_back,expected,dt.datetime) + self.__test_equal(sa_datetime_back, expected, dt.datetime) - sa_string = SArray([None,None,None],str) + sa_string = SArray([None, None, None], str) sa_datetime_back = sa_string.str_to_datetime() - self.__test_equal(sa_datetime_back,[None,None,None],dt.datetime) + self.__test_equal(sa_datetime_back, [None, None, None], dt.datetime) sa_string = SArray(dtype=str) sa_datetime_back = sa_string.str_to_datetime() - self.__test_equal(sa_datetime_back,[],dt.datetime) - - sa = SArray([None,None,None]) - self.assertRaises(TypeError,sa.str_to_datetime) + self.__test_equal(sa_datetime_back, [], dt.datetime) + sa = SArray([None, None, None]) + self.assertRaises(TypeError, sa.str_to_datetime) sa = SArray() - self.assertRaises(TypeError,sa.str_to_datetime) + self.assertRaises(TypeError, sa.str_to_datetime) # hour without leading zero - sa = SArray(['10/30/2014 9:01']) - sa = sa.str_to_datetime('%m/%d/%Y %H:%M') + sa = SArray(["10/30/2014 9:01"]) + sa = sa.str_to_datetime("%m/%d/%Y %H:%M") expected = [dt.datetime(2014, 10, 30, 9, 1)] - self.__test_equal(sa,expected,dt.datetime) + self.__test_equal(sa, expected, dt.datetime) # without delimiters - sa = SArray(['10302014 0901', '10302014 2001']) - sa = sa.str_to_datetime('%m%d%Y %H%M') - expected = [dt.datetime(2014, 10, 30, 9, 1), - dt.datetime(2014, 10, 30, 20, 1)] - self.__test_equal(sa,expected,dt.datetime) + sa = SArray(["10302014 0901", "10302014 2001"]) + sa = sa.str_to_datetime("%m%d%Y %H%M") + expected = [dt.datetime(2014, 10, 30, 9, 1), dt.datetime(2014, 10, 30, 20, 1)] + self.__test_equal(sa, expected, dt.datetime) # another without delimiter test - sa = SArray(['20110623T191001']) + sa = SArray(["20110623T191001"]) sa = sa.str_to_datetime("%Y%m%dT%H%M%S%F%q") expected = [dt.datetime(2011, 6, 23, 19, 10, 1)] - self.__test_equal(sa,expected,dt.datetime) + self.__test_equal(sa, expected, dt.datetime) # am pm - sa = SArray(['10/30/2014 9:01am', '10/30/2014 9:01pm']) - sa = sa.str_to_datetime('%m/%d/%Y %H:%M%p') - expected = [dt.datetime(2014, 10, 30, 9, 1), - dt.datetime(2014, 10, 30, 21, 1)] - self.__test_equal(sa,expected,dt.datetime) - - sa = SArray(['10/30/2014 9:01AM', '10/30/2014 9:01PM']) - sa = sa.str_to_datetime('%m/%d/%Y %H:%M%P') - expected = [dt.datetime(2014, 10, 30, 9, 1), - dt.datetime(2014, 10, 30, 21, 1)] - self.__test_equal(sa,expected,dt.datetime) + sa = SArray(["10/30/2014 9:01am", "10/30/2014 9:01pm"]) + sa = sa.str_to_datetime("%m/%d/%Y %H:%M%p") + expected = [dt.datetime(2014, 10, 30, 9, 1), dt.datetime(2014, 10, 30, 21, 1)] + self.__test_equal(sa, expected, dt.datetime) + + sa = SArray(["10/30/2014 9:01AM", "10/30/2014 9:01PM"]) + sa = sa.str_to_datetime("%m/%d/%Y %H:%M%P") + expected = [dt.datetime(2014, 10, 30, 9, 1), dt.datetime(2014, 10, 30, 21, 1)] + self.__test_equal(sa, expected, dt.datetime) # failure 13pm - sa = SArray(['10/30/2014 13:01pm']) + sa = SArray(["10/30/2014 13:01pm"]) with self.assertRaises(RuntimeError): - sa.str_to_datetime('%m/%d/%Y %H:%M%p') + sa.str_to_datetime("%m/%d/%Y %H:%M%p") # failure hour 13 when %l should only have up to hour 12 - sa = SArray(['10/30/2014 13:01']) + sa = SArray(["10/30/2014 13:01"]) with self.assertRaises(RuntimeError): - sa.str_to_datetime('%m/%d/%Y %l:%M') + sa.str_to_datetime("%m/%d/%Y %l:%M") with self.assertRaises(RuntimeError): - sa.str_to_datetime('%m/%d/%Y %L:%M') - - sa = SArray(['2013-05-07T10:04:10', - '1902-10-21T10:34:10UTC+05:45']) - expected = [dt.datetime(2013, 5, 7, 10, 4, 10), - dt.datetime(1902, 10, 21, 10, 34, 10).replace(tzinfo=GMT(5.75))] - self.__test_equal(sa.str_to_datetime() ,expected,dt.datetime) - - + sa.str_to_datetime("%m/%d/%Y %L:%M") + sa = SArray(["2013-05-07T10:04:10", "1902-10-21T10:34:10UTC+05:45"]) + expected = [ + dt.datetime(2013, 5, 7, 10, 4, 10), + dt.datetime(1902, 10, 21, 10, 34, 10).replace(tzinfo=GMT(5.75)), + ] + self.__test_equal(sa.str_to_datetime(), expected, dt.datetime) def test_apply_with_partial(self): sa = SArray([1, 2, 3, 4, 5]) def concat_fn(character, number): - return '%s%d' % (character, number) + return "%s%d" % (character, number) - my_partial_fn = functools.partial(concat_fn, 'x') + my_partial_fn = functools.partial(concat_fn, "x") sa_transformed = sa.apply(my_partial_fn) - self.assertEqual(list(sa_transformed), ['x1', 'x2', 'x3', 'x4', 'x5']) + self.assertEqual(list(sa_transformed), ["x1", "x2", "x3", "x4", "x5"]) def test_apply_with_functor(self): sa = SArray([1, 2, 3, 4, 5]) @@ -1969,31 +2263,31 @@ def __init__(self, character): self.character = character def __call__(self, number): - return '%s%d' % (self.character, number) + return "%s%d" % (self.character, number) - concatenator = Concatenator('x') + concatenator = Concatenator("x") sa_transformed = sa.apply(concatenator) - self.assertEqual(list(sa_transformed), ['x1', 'x2', 'x3', 'x4', 'x5']) + self.assertEqual(list(sa_transformed), ["x1", "x2", "x3", "x4", "x5"]) def test_argmax_argmin(self): - sa = SArray([1,4,-1,10,3,5,8]) - index = [sa.argmax(),sa.argmin()] - expected = [3,2] - self.assertEqual(index,expected) + sa = SArray([1, 4, -1, 10, 3, 5, 8]) + index = [sa.argmax(), sa.argmin()] + expected = [3, 2] + self.assertEqual(index, expected) - sa = SArray([1,4.3,-1.4,0,3,5.6,8.9]) - index = [sa.argmax(),sa.argmin()] - expected = [6,2] - self.assertEqual(index,expected) + sa = SArray([1, 4.3, -1.4, 0, 3, 5.6, 8.9]) + index = [sa.argmax(), sa.argmin()] + expected = [6, 2] + self.assertEqual(index, expected) - #empty case + # empty case sa = SArray([]) - index = [sa.argmax(),sa.argmin()] - expected = [None,None] - self.assertEqual(index,expected) + index = [sa.argmax(), sa.argmin()] + expected = [None, None] + self.assertEqual(index, expected) # non-numeric type - sa = SArray(["434","43"]) + sa = SArray(["434", "43"]) with self.assertRaises(TypeError): sa.argmax() @@ -2003,15 +2297,15 @@ def test_argmax_argmin(self): def test_apply_with_recursion(self): sa = SArray(range(1000)) sastr = sa.astype(str) - rets = sa.apply(lambda x:sastr[x]) + rets = sa.apply(lambda x: sastr[x]) self.assertEqual(list(rets), list(sastr)) def test_save_sarray(self): - '''save lazily evaluated SArray should not materialize to target folder - ''' + """save lazily evaluated SArray should not materialize to target folder + """ data = SArray(range(1000)) data = data[data > 50] - #lazy and good + # lazy and good tmp_dir = tempfile.mkdtemp() data.save(tmp_dir) shutil.rmtree(tmp_dir) @@ -2021,6 +2315,7 @@ def test_to_numpy(self): X = SArray(range(100)) import numpy as np import numpy.testing as nptest + Y = np.array(range(100)) nptest.assert_array_equal(X.to_numpy(), Y) @@ -2030,615 +2325,652 @@ def test_to_numpy(self): def test_rolling_mean(self): data = SArray(range(1000)) - neg_data = SArray(range(-100,100,2)) + neg_data = SArray(range(-100, 100, 2)) ### Small backward window including current - res = data.rolling_mean(-3,0) - expected = [None for i in range(3)] + [i + .5 for i in range(1,998)] - self.__test_equal(res,expected,float) + res = data.rolling_mean(-3, 0) + expected = [None for i in range(3)] + [i + 0.5 for i in range(1, 998)] + self.__test_equal(res, expected, float) # Test float inputs as well - res = data.astype(float).rolling_mean(-3,0) - self.__test_equal(res,expected,float) + res = data.astype(float).rolling_mean(-3, 0) + self.__test_equal(res, expected, float) # Test min observations res = data.rolling_mean(-3, 0, min_observations=5) - self.__test_equal(res,expected,float) + self.__test_equal(res, expected, float) res = data.rolling_mean(-3, 0, min_observations=4) - self.__test_equal(res,expected,float) + self.__test_equal(res, expected, float) res = data.rolling_mean(-3, 0, min_observations=3) expected[2] = 1.0 - self.__test_equal(res,expected,float) + self.__test_equal(res, expected, float) res = data.rolling_mean(-3, 0, min_observations=2) expected[1] = 0.5 - self.__test_equal(res,expected,float) + self.__test_equal(res, expected, float) res = data.rolling_mean(-3, 0, min_observations=1) expected[0] = 0.0 - self.__test_equal(res,expected,float) + self.__test_equal(res, expected, float) res = data.rolling_mean(-3, 0, min_observations=0) - self.__test_equal(res,expected,float) + self.__test_equal(res, expected, float) with self.assertRaises(ValueError): - res = data.rolling_mean(-3,0,min_observations=-1) + res = data.rolling_mean(-3, 0, min_observations=-1) - res = neg_data.rolling_mean(-3,0) - expected = [None for i in range(3)] + [float(i) for i in range(-97,96,2)] - self.__test_equal(res,expected,float) + res = neg_data.rolling_mean(-3, 0) + expected = [None for i in range(3)] + [float(i) for i in range(-97, 96, 2)] + self.__test_equal(res, expected, float) # Test float inputs as well - res = neg_data.astype(float).rolling_mean(-3,0) - self.__test_equal(res,expected,float) + res = neg_data.astype(float).rolling_mean(-3, 0) + self.__test_equal(res, expected, float) # Test vector input - res = SArray(self.vec_data).rolling_mean(-3,0) - expected = [None for i in range(3)] + [array.array('d',[i+.5, i+1.5]) for i in range(2,9)] - self.__test_equal(res,expected,array.array) + res = SArray(self.vec_data).rolling_mean(-3, 0) + expected = [None for i in range(3)] + [ + array.array("d", [i + 0.5, i + 1.5]) for i in range(2, 9) + ] + self.__test_equal(res, expected, array.array) ### Small forward window including current - res = data.rolling_mean(0,4) - expected = [float(i) for i in range(2,998)] + [None for i in range(4)] - self.__test_equal(res,expected,float) + res = data.rolling_mean(0, 4) + expected = [float(i) for i in range(2, 998)] + [None for i in range(4)] + self.__test_equal(res, expected, float) - res = neg_data.rolling_mean(0,4) - expected = [float(i) for i in range(-96,95,2)] + [None for i in range(4)] - self.__test_equal(res,expected,float) + res = neg_data.rolling_mean(0, 4) + expected = [float(i) for i in range(-96, 95, 2)] + [None for i in range(4)] + self.__test_equal(res, expected, float) ### Small backward window not including current - res = data.rolling_mean(-5,-1) - expected = [None for i in range(5)] + [float(i) for i in range(2,997)] - self.__test_equal(res,expected,float) + res = data.rolling_mean(-5, -1) + expected = [None for i in range(5)] + [float(i) for i in range(2, 997)] + self.__test_equal(res, expected, float) - res = neg_data.rolling_mean(-5,-1) - expected = [None for i in range(5)] + [float(i) for i in range(-96,94,2)] - self.__test_equal(res,expected,float) + res = neg_data.rolling_mean(-5, -1) + expected = [None for i in range(5)] + [float(i) for i in range(-96, 94, 2)] + self.__test_equal(res, expected, float) ### Small forward window not including current - res = data.rolling_mean(1,5) - expected = [float(i) for i in range(3,998)] + [None for i in range(5)] - self.__test_equal(res,expected,float) + res = data.rolling_mean(1, 5) + expected = [float(i) for i in range(3, 998)] + [None for i in range(5)] + self.__test_equal(res, expected, float) - res = neg_data.rolling_mean(1,5) - expected = [float(i) for i in range(-94,96,2)] + [None for i in range(5)] - self.__test_equal(res,expected,float) + res = neg_data.rolling_mean(1, 5) + expected = [float(i) for i in range(-94, 96, 2)] + [None for i in range(5)] + self.__test_equal(res, expected, float) ### "Centered" rolling aggregate - res = data.rolling_mean(-2,2) - expected = [None for i in range(2)] + [float(i) for i in range(2,998)] + [None for i in range(2)] - self.__test_equal(res,expected,float) + res = data.rolling_mean(-2, 2) + expected = ( + [None for i in range(2)] + + [float(i) for i in range(2, 998)] + + [None for i in range(2)] + ) + self.__test_equal(res, expected, float) - res = neg_data.rolling_mean(-2,2) - expected = [None for i in range(2)] + [float(i) for i in range(-96,96,2)] + [None for i in range(2)] - self.__test_equal(res,expected,float) + res = neg_data.rolling_mean(-2, 2) + expected = ( + [None for i in range(2)] + + [float(i) for i in range(-96, 96, 2)] + + [None for i in range(2)] + ) + self.__test_equal(res, expected, float) ### Lopsided rolling aggregate - res = data.rolling_mean(-2,1) - expected = [None for i in range(2)] + [i + .5 for i in range(1,998)] + [None for i in range(1)] - self.__test_equal(res,expected,float) + res = data.rolling_mean(-2, 1) + expected = ( + [None for i in range(2)] + + [i + 0.5 for i in range(1, 998)] + + [None for i in range(1)] + ) + self.__test_equal(res, expected, float) - res = neg_data.rolling_mean(-2,1) - expected = [None for i in range(2)] + [float(i) for i in range(-97,97,2)] + [None for i in range(1)] - self.__test_equal(res,expected,float) + res = neg_data.rolling_mean(-2, 1) + expected = ( + [None for i in range(2)] + + [float(i) for i in range(-97, 97, 2)] + + [None for i in range(1)] + ) + self.__test_equal(res, expected, float) ### A very forward window - res = data.rolling_mean(500,502) - expected = [float(i) for i in range(501,999)] + [None for i in range(502)] - self.__test_equal(res,expected,float) + res = data.rolling_mean(500, 502) + expected = [float(i) for i in range(501, 999)] + [None for i in range(502)] + self.__test_equal(res, expected, float) - res = neg_data.rolling_mean(50,52) - expected = [float(i) for i in range(2,98,2)] + [None for i in range(52)] - self.__test_equal(res,expected,float) + res = neg_data.rolling_mean(50, 52) + expected = [float(i) for i in range(2, 98, 2)] + [None for i in range(52)] + self.__test_equal(res, expected, float) ### A very backward window - res = data.rolling_mean(-502,-500) - expected = [None for i in range(502)] + [float(i) for i in range(1,499)] - self.__test_equal(res,expected,float) + res = data.rolling_mean(-502, -500) + expected = [None for i in range(502)] + [float(i) for i in range(1, 499)] + self.__test_equal(res, expected, float) - res = neg_data.rolling_mean(-52,-50) - expected = [None for i in range(52)] + [float(i) for i in range(-98,-2,2)] - self.__test_equal(res,expected,float) + res = neg_data.rolling_mean(-52, -50) + expected = [None for i in range(52)] + [float(i) for i in range(-98, -2, 2)] + self.__test_equal(res, expected, float) ### A window size much larger than anticipated segment size - res = data.rolling_mean(0,749) - expected = [i + .5 for i in range(374,625)] + [None for i in range(749)] - self.__test_equal(res,expected,float) + res = data.rolling_mean(0, 749) + expected = [i + 0.5 for i in range(374, 625)] + [None for i in range(749)] + self.__test_equal(res, expected, float) ### A window size larger than the array - res = data.rolling_mean(0,1000) + res = data.rolling_mean(0, 1000) expected = [None for i in range(1000)] - self.__test_equal(res,expected,type(None)) + self.__test_equal(res, expected, type(None)) ### A window size of 1 - res = data.rolling_mean(0,0) + res = data.rolling_mean(0, 0) self.__test_equal(res, list(data), float) - res = data.rolling_mean(-2,-2) + res = data.rolling_mean(-2, -2) expected = [None for i in range(2)] + list(data[0:998]) self.__test_equal(res, expected, float) - res = data.rolling_mean(3,3) + res = data.rolling_mean(3, 3) expected = list(data[3:1000]) + [None for i in range(3)] self.__test_equal(res, expected, float) ### A negative window size with self.assertRaises(RuntimeError): - res = data.rolling_mean(4,2) + res = data.rolling_mean(4, 2) ### Non-numeric - with self.assertRaisesRegexp(RuntimeError, '.*support.*type.*'): - res = SArray(self.string_data).rolling_mean(0,1) + with self.assertRaisesRegexp(RuntimeError, ".*support.*type.*"): + res = SArray(self.string_data).rolling_mean(0, 1) ### Empty SArray sa = SArray() - res = sa.rolling_mean(0,1) + res = sa.rolling_mean(0, 1) self.__test_equal(res, [], type(None)) ### Small SArray - sa = SArray([1,2,3]) - res = sa.rolling_mean(0,1) - self.__test_equal(res, [1.5,2.5,None], float) + sa = SArray([1, 2, 3]) + res = sa.rolling_mean(0, 1) + self.__test_equal(res, [1.5, 2.5, None], float) def test_rolling_sum(self): data = SArray(range(1000)) - neg_data = SArray(range(-100,100,2)) + neg_data = SArray(range(-100, 100, 2)) ### Small backward window including current - res = data.rolling_sum(-3,0) - expected = [None for i in range(3)] + [i for i in range(6,3994,4)] - self.__test_equal(res,expected,int) + res = data.rolling_sum(-3, 0) + expected = [None for i in range(3)] + [i for i in range(6, 3994, 4)] + self.__test_equal(res, expected, int) # Test float inputs as well - res = data.astype(float).rolling_sum(-3,0) - self.__test_equal(res,expected,float) + res = data.astype(float).rolling_sum(-3, 0) + self.__test_equal(res, expected, float) # Test min observations res = data.rolling_sum(-3, 0, min_observations=5) - self.__test_equal(res,expected,int) + self.__test_equal(res, expected, int) res = data.rolling_sum(-3, 0, min_observations=4) - self.__test_equal(res,expected,int) + self.__test_equal(res, expected, int) res = data.rolling_sum(-3, 0, min_observations=3) expected[2] = 3 - self.__test_equal(res,expected,int) + self.__test_equal(res, expected, int) res = data.rolling_sum(-3, 0, min_observations=2) expected[1] = 1 - self.__test_equal(res,expected,int) + self.__test_equal(res, expected, int) res = data.rolling_sum(-3, 0, min_observations=1) expected[0] = 0 - self.__test_equal(res,expected,int) + self.__test_equal(res, expected, int) res = data.rolling_sum(-3, 0, min_observations=0) - self.__test_equal(res,expected,int) + self.__test_equal(res, expected, int) with self.assertRaises(ValueError): - res = data.rolling_sum(-3,0,min_observations=-1) + res = data.rolling_sum(-3, 0, min_observations=-1) - res = neg_data.rolling_sum(-3,0) - expected = [None for i in range(3)] + [i for i in range(-388,388,8)] - self.__test_equal(res,expected,int) + res = neg_data.rolling_sum(-3, 0) + expected = [None for i in range(3)] + [i for i in range(-388, 388, 8)] + self.__test_equal(res, expected, int) # Test float inputs as well - res = neg_data.astype(float).rolling_sum(-3,0) - self.__test_equal(res,expected,float) + res = neg_data.astype(float).rolling_sum(-3, 0) + self.__test_equal(res, expected, float) # Test vector input - res = SArray(self.vec_data).rolling_sum(-3,0) - expected = [None for i in range(3)] + [array.array('d',[i, i+4]) for i in range(10,38,4)] - self.__test_equal(res,expected,array.array) + res = SArray(self.vec_data).rolling_sum(-3, 0) + expected = [None for i in range(3)] + [ + array.array("d", [i, i + 4]) for i in range(10, 38, 4) + ] + self.__test_equal(res, expected, array.array) ### Small forward window including current - res = data.rolling_sum(0,4) - expected = [i for i in range(10,4990,5)] + [None for i in range(4)] - self.__test_equal(res,expected,int) + res = data.rolling_sum(0, 4) + expected = [i for i in range(10, 4990, 5)] + [None for i in range(4)] + self.__test_equal(res, expected, int) - res = neg_data.rolling_sum(0,4) - expected = [i for i in range(-480,480,10)] + [None for i in range(4)] - self.__test_equal(res,expected,int) + res = neg_data.rolling_sum(0, 4) + expected = [i for i in range(-480, 480, 10)] + [None for i in range(4)] + self.__test_equal(res, expected, int) ### Small backward window not including current - res = data.rolling_sum(-5,-1) - expected = [None for i in range(5)] + [i for i in range(10,4985,5)] - self.__test_equal(res,expected,int) + res = data.rolling_sum(-5, -1) + expected = [None for i in range(5)] + [i for i in range(10, 4985, 5)] + self.__test_equal(res, expected, int) - res = neg_data.rolling_sum(-5,-1) - expected = [None for i in range(5)] + [i for i in range(-480,470,10)] - self.__test_equal(res,expected,int) + res = neg_data.rolling_sum(-5, -1) + expected = [None for i in range(5)] + [i for i in range(-480, 470, 10)] + self.__test_equal(res, expected, int) ### Small forward window not including current - res = data.rolling_sum(1,5) - expected = [i for i in range(15,4990,5)] + [None for i in range(5)] - self.__test_equal(res,expected,int) + res = data.rolling_sum(1, 5) + expected = [i for i in range(15, 4990, 5)] + [None for i in range(5)] + self.__test_equal(res, expected, int) - res = neg_data.rolling_sum(1,5) - expected = [i for i in range(-470,480,10)] + [None for i in range(5)] - self.__test_equal(res,expected,int) + res = neg_data.rolling_sum(1, 5) + expected = [i for i in range(-470, 480, 10)] + [None for i in range(5)] + self.__test_equal(res, expected, int) ### "Centered" rolling aggregate - res = data.rolling_sum(-2,2) - expected = [None for i in range(2)] + [i for i in range(10,4990,5)] + [None for i in range(2)] - self.__test_equal(res,expected,int) + res = data.rolling_sum(-2, 2) + expected = ( + [None for i in range(2)] + + [i for i in range(10, 4990, 5)] + + [None for i in range(2)] + ) + self.__test_equal(res, expected, int) - res = neg_data.rolling_sum(-2,2) - expected = [None for i in range(2)] + [i for i in range(-480,480,10)] + [None for i in range(2)] - self.__test_equal(res,expected,int) + res = neg_data.rolling_sum(-2, 2) + expected = ( + [None for i in range(2)] + + [i for i in range(-480, 480, 10)] + + [None for i in range(2)] + ) + self.__test_equal(res, expected, int) ### Lopsided rolling aggregate - res = data.rolling_sum(-2,1) - expected = [None for i in range(2)] + [i for i in range(6,3994,4)] + [None for i in range(1)] - self.__test_equal(res,expected,int) + res = data.rolling_sum(-2, 1) + expected = ( + [None for i in range(2)] + + [i for i in range(6, 3994, 4)] + + [None for i in range(1)] + ) + self.__test_equal(res, expected, int) - res = neg_data.rolling_sum(-2,1) - expected = [None for i in range(2)] + [i for i in range(-388,388,8)] + [None for i in range(1)] - self.__test_equal(res,expected,int) + res = neg_data.rolling_sum(-2, 1) + expected = ( + [None for i in range(2)] + + [i for i in range(-388, 388, 8)] + + [None for i in range(1)] + ) + self.__test_equal(res, expected, int) ### A very forward window - res = data.rolling_sum(500,502) - expected = [i for i in range(1503,2997,3)] + [None for i in range(502)] - self.__test_equal(res,expected,int) + res = data.rolling_sum(500, 502) + expected = [i for i in range(1503, 2997, 3)] + [None for i in range(502)] + self.__test_equal(res, expected, int) - res = neg_data.rolling_sum(50,52) - expected = [i for i in range(6,294,6)] + [None for i in range(52)] - self.__test_equal(res,expected,int) + res = neg_data.rolling_sum(50, 52) + expected = [i for i in range(6, 294, 6)] + [None for i in range(52)] + self.__test_equal(res, expected, int) ### A very backward window - res = data.rolling_sum(-502,-500) - expected = [None for i in range(502)] + [i for i in range(3,1497,3)] - self.__test_equal(res,expected,int) + res = data.rolling_sum(-502, -500) + expected = [None for i in range(502)] + [i for i in range(3, 1497, 3)] + self.__test_equal(res, expected, int) - res = neg_data.rolling_sum(-52,-50) - expected = [None for i in range(52)] + [i for i in range(-294,-6,6)] - self.__test_equal(res,expected,int) + res = neg_data.rolling_sum(-52, -50) + expected = [None for i in range(52)] + [i for i in range(-294, -6, 6)] + self.__test_equal(res, expected, int) ### A window size much larger than anticipated segment size - res = data.rolling_sum(0,749) - expected = [i for i in range(280875,469125,750)] + [None for i in range(749)] - self.__test_equal(res,expected,int) + res = data.rolling_sum(0, 749) + expected = [i for i in range(280875, 469125, 750)] + [None for i in range(749)] + self.__test_equal(res, expected, int) ### A window size larger than the array - res = data.rolling_sum(0,1000) + res = data.rolling_sum(0, 1000) expected = [None for i in range(1000)] - self.__test_equal(res,expected,type(None)) + self.__test_equal(res, expected, type(None)) ### A window size of 1 - res = data.rolling_sum(0,0) + res = data.rolling_sum(0, 0) self.__test_equal(res, list(data), int) - res = data.rolling_sum(-2,-2) + res = data.rolling_sum(-2, -2) expected = [None for i in range(2)] + list(data[0:998]) self.__test_equal(res, expected, int) - res = data.rolling_sum(3,3) + res = data.rolling_sum(3, 3) expected = list(data[3:1000]) + [None for i in range(3)] self.__test_equal(res, expected, int) ### A negative window size with self.assertRaises(RuntimeError): - res = data.rolling_sum(4,2) + res = data.rolling_sum(4, 2) ### Non-numeric - with self.assertRaisesRegexp(RuntimeError, '.*support.*type.*'): - res = SArray(self.string_data).rolling_sum(0,1) + with self.assertRaisesRegexp(RuntimeError, ".*support.*type.*"): + res = SArray(self.string_data).rolling_sum(0, 1) ### Empty SArray sa = SArray() - res = sa.rolling_sum(0,1) + res = sa.rolling_sum(0, 1) self.__test_equal(res, [], type(None)) ### Small SArray - sa = SArray([1,2,3]) - res = sa.rolling_sum(0,1) - self.__test_equal(res, [3,5,None], int) + sa = SArray([1, 2, 3]) + res = sa.rolling_sum(0, 1) + self.__test_equal(res, [3, 5, None], int) def test_rolling_max(self): data = SArray(range(1000)) ### Small backward window including current - res = data.rolling_max(-3,0) - expected = [None for i in range(3)] + [i for i in range(3,1000)] - self.__test_equal(res,expected,int) + res = data.rolling_max(-3, 0) + expected = [None for i in range(3)] + [i for i in range(3, 1000)] + self.__test_equal(res, expected, int) # Test float inputs as well - res = data.astype(float).rolling_max(-3,0) - self.__test_equal(res,expected,float) + res = data.astype(float).rolling_max(-3, 0) + self.__test_equal(res, expected, float) # Test min observations res = data.rolling_max(-3, 0, min_observations=5) - self.__test_equal(res,expected,int) + self.__test_equal(res, expected, int) res = data.rolling_max(-3, 0, min_observations=4) - self.__test_equal(res,expected,int) + self.__test_equal(res, expected, int) res = data.rolling_max(-3, 0, min_observations=3) expected[2] = 2 - self.__test_equal(res,expected,int) + self.__test_equal(res, expected, int) with self.assertRaises(ValueError): - res = data.rolling_max(-3,0,min_observations=-1) + res = data.rolling_max(-3, 0, min_observations=-1) # Test vector input - with self.assertRaisesRegexp(RuntimeError, '.*support.*type.*'): - res = SArray(self.vec_data).rolling_max(-3,0) + with self.assertRaisesRegexp(RuntimeError, ".*support.*type.*"): + res = SArray(self.vec_data).rolling_max(-3, 0) ### Small forward window including current - res = data.rolling_max(0,4) - expected = [float(i) for i in range(4,1000)] + [None for i in range(4)] - self.__test_equal(res,expected,int) + res = data.rolling_max(0, 4) + expected = [float(i) for i in range(4, 1000)] + [None for i in range(4)] + self.__test_equal(res, expected, int) ### A window size of 1 - res = data.rolling_max(0,0) + res = data.rolling_max(0, 0) self.__test_equal(res, list(data), int) - res = data.rolling_max(-2,-2) + res = data.rolling_max(-2, -2) expected = [None for i in range(2)] + list(data[0:998]) self.__test_equal(res, expected, int) - res = data.rolling_max(3,3) + res = data.rolling_max(3, 3) expected = list(data[3:1000]) + [None for i in range(3)] self.__test_equal(res, expected, int) ### A negative window size with self.assertRaises(RuntimeError): - res = data.rolling_max(4,2) + res = data.rolling_max(4, 2) ### Non-numeric - with self.assertRaisesRegexp(RuntimeError, '.*support.*type.*'): - res = SArray(self.string_data).rolling_max(0,1) + with self.assertRaisesRegexp(RuntimeError, ".*support.*type.*"): + res = SArray(self.string_data).rolling_max(0, 1) ### Empty SArray sa = SArray() - res = sa.rolling_max(0,1) + res = sa.rolling_max(0, 1) self.__test_equal(res, [], type(None)) ### Small SArray - sa = SArray([1,2,3]) - res = sa.rolling_max(0,1) - self.__test_equal(res, [2,3,None], int) + sa = SArray([1, 2, 3]) + res = sa.rolling_max(0, 1) + self.__test_equal(res, [2, 3, None], int) def test_rolling_min(self): data = SArray(range(1000)) ### Small backward window including current - res = data.rolling_min(-3,0) - expected = [None for i in range(3)] + [i for i in range(0,997)] - self.__test_equal(res,expected,int) + res = data.rolling_min(-3, 0) + expected = [None for i in range(3)] + [i for i in range(0, 997)] + self.__test_equal(res, expected, int) # Test float inputs as well - res = data.astype(float).rolling_min(-3,0) - self.__test_equal(res,expected,float) + res = data.astype(float).rolling_min(-3, 0) + self.__test_equal(res, expected, float) # Test min observations res = data.rolling_min(-3, 0, min_observations=5) - self.__test_equal(res,expected,int) + self.__test_equal(res, expected, int) res = data.rolling_min(-3, 0, min_observations=4) - self.__test_equal(res,expected,int) + self.__test_equal(res, expected, int) res = data.rolling_min(-3, 0, min_observations=3) expected[2] = 0 - self.__test_equal(res,expected,int) + self.__test_equal(res, expected, int) with self.assertRaises(ValueError): - res = data.rolling_min(-3,0,min_observations=-1) + res = data.rolling_min(-3, 0, min_observations=-1) # Test vector input - with self.assertRaisesRegexp(RuntimeError, '.*support.*type.*'): - res = SArray(self.vec_data).rolling_min(-3,0) + with self.assertRaisesRegexp(RuntimeError, ".*support.*type.*"): + res = SArray(self.vec_data).rolling_min(-3, 0) ### Small forward window including current - res = data.rolling_min(0,4) - expected = [float(i) for i in range(0,996)] + [None for i in range(4)] - self.__test_equal(res,expected,int) + res = data.rolling_min(0, 4) + expected = [float(i) for i in range(0, 996)] + [None for i in range(4)] + self.__test_equal(res, expected, int) ### A window size of 1 - res = data.rolling_min(0,0) + res = data.rolling_min(0, 0) self.__test_equal(res, list(data), int) - res = data.rolling_min(-2,-2) + res = data.rolling_min(-2, -2) expected = [None for i in range(2)] + list(data[0:998]) self.__test_equal(res, expected, int) - res = data.rolling_min(3,3) + res = data.rolling_min(3, 3) expected = list(data[3:1000]) + [None for i in range(3)] self.__test_equal(res, expected, int) ### A negative window size with self.assertRaises(RuntimeError): - res = data.rolling_min(4,2) + res = data.rolling_min(4, 2) ### Non-numeric - with self.assertRaisesRegexp(RuntimeError, '.*support.*type.*'): - res = SArray(self.string_data).rolling_min(0,1) + with self.assertRaisesRegexp(RuntimeError, ".*support.*type.*"): + res = SArray(self.string_data).rolling_min(0, 1) ### Empty SArray sa = SArray() - res = sa.rolling_min(0,1) + res = sa.rolling_min(0, 1) self.__test_equal(res, [], type(None)) ### Small SArray - sa = SArray([1,2,3]) - res = sa.rolling_min(0,1) - self.__test_equal(res, [1,2,None], int) + sa = SArray([1, 2, 3]) + res = sa.rolling_min(0, 1) + self.__test_equal(res, [1, 2, None], int) def test_rolling_var(self): data = SArray(range(1000)) ### Small backward window including current - res = data.rolling_var(-3,0) + res = data.rolling_var(-3, 0) expected = [None for i in range(3)] + [1.25 for i in range(997)] - self.__test_equal(res,expected,float) + self.__test_equal(res, expected, float) # Test float inputs as well - res = data.astype(float).rolling_var(-3,0) - self.__test_equal(res,expected,float) + res = data.astype(float).rolling_var(-3, 0) + self.__test_equal(res, expected, float) # Test min observations res = data.rolling_var(-3, 0, min_observations=5) - self.__test_equal(res,expected,float) + self.__test_equal(res, expected, float) res = data.rolling_var(-3, 0, min_observations=4) - self.__test_equal(res,expected,float) + self.__test_equal(res, expected, float) res = data.rolling_var(-3, 0, min_observations=3) - expected[2] = (2.0/3.0) - self.__test_equal(res,expected,float) + expected[2] = 2.0 / 3.0 + self.__test_equal(res, expected, float) with self.assertRaises(ValueError): - res = data.rolling_var(-3,0,min_observations=-1) + res = data.rolling_var(-3, 0, min_observations=-1) # Test vector input - with self.assertRaisesRegexp(RuntimeError, '.*support.*type.*'): - res = SArray(self.vec_data).rolling_var(-3,0) + with self.assertRaisesRegexp(RuntimeError, ".*support.*type.*"): + res = SArray(self.vec_data).rolling_var(-3, 0) ### Small forward window including current - res = data.rolling_var(0,4) + res = data.rolling_var(0, 4) expected = [2 for i in range(996)] + [None for i in range(4)] - self.__test_equal(res,expected,float) + self.__test_equal(res, expected, float) ### A window size of 1 - res = data.rolling_var(0,0) + res = data.rolling_var(0, 0) self.__test_equal(res, [0 for i in range(1000)], float) - res = data.rolling_var(-2,-2) - self.__test_equal(res, [None,None] + [0 for i in range(998)], float) + res = data.rolling_var(-2, -2) + self.__test_equal(res, [None, None] + [0 for i in range(998)], float) ### A negative window size with self.assertRaises(RuntimeError): - res = data.rolling_var(4,2) + res = data.rolling_var(4, 2) ### Non-numeric - with self.assertRaisesRegexp(RuntimeError, '.*support.*type.*'): - res = SArray(self.string_data).rolling_var(0,1) + with self.assertRaisesRegexp(RuntimeError, ".*support.*type.*"): + res = SArray(self.string_data).rolling_var(0, 1) ### Empty SArray sa = SArray() - res = sa.rolling_var(0,1) + res = sa.rolling_var(0, 1) self.__test_equal(res, [], type(None)) ### Small SArray - sa = SArray([1,2,3]) - res = sa.rolling_var(0,1) - self.__test_equal(res, [.25,.25,None], float) + sa = SArray([1, 2, 3]) + res = sa.rolling_var(0, 1) + self.__test_equal(res, [0.25, 0.25, None], float) def test_rolling_stdv(self): data = SArray(range(1000)) ### Small backward window including current - res = data.rolling_stdv(-3,0) + res = data.rolling_stdv(-3, 0) expected = [None for i in range(3)] + [1.118033988749895 for i in range(997)] - self.__test_equal(res,expected,float) + self.__test_equal(res, expected, float) # Test float inputs as well - res = data.astype(float).rolling_stdv(-3,0) - self.__test_equal(res,expected,float) + res = data.astype(float).rolling_stdv(-3, 0) + self.__test_equal(res, expected, float) # Test min observations res = data.rolling_stdv(-3, 0, min_observations=5) - self.__test_equal(res,expected,float) + self.__test_equal(res, expected, float) res = data.rolling_stdv(-3, 0, min_observations=4) - self.__test_equal(res,expected,float) + self.__test_equal(res, expected, float) res = data.rolling_stdv(-3, 0, min_observations=3) - expected[2] = math.sqrt(2.0/3.0) - self.__test_equal(res,expected,float) + expected[2] = math.sqrt(2.0 / 3.0) + self.__test_equal(res, expected, float) with self.assertRaises(ValueError): - res = data.rolling_stdv(-3,0,min_observations=-1) + res = data.rolling_stdv(-3, 0, min_observations=-1) # Test vector input - with self.assertRaisesRegexp(RuntimeError, '.*support.*type.*'): - res = SArray(self.vec_data).rolling_stdv(-3,0) + with self.assertRaisesRegexp(RuntimeError, ".*support.*type.*"): + res = SArray(self.vec_data).rolling_stdv(-3, 0) ### Small forward window including current - res = data.rolling_stdv(0,4) + res = data.rolling_stdv(0, 4) expected = [math.sqrt(2) for i in range(996)] + [None for i in range(4)] - self.__test_equal(res,expected,float) + self.__test_equal(res, expected, float) ### A window size of 1 - res = data.rolling_stdv(0,0) + res = data.rolling_stdv(0, 0) self.__test_equal(res, [0 for i in range(1000)], float) - res = data.rolling_stdv(-2,-2) - self.__test_equal(res, [None,None] + [0 for i in range(998)], float) + res = data.rolling_stdv(-2, -2) + self.__test_equal(res, [None, None] + [0 for i in range(998)], float) ### A negative window size with self.assertRaises(RuntimeError): - res = data.rolling_stdv(4,2) + res = data.rolling_stdv(4, 2) ### Non-numeric - with self.assertRaisesRegexp(RuntimeError, '.*support.*type.*'): - res = SArray(self.string_data).rolling_stdv(0,1) + with self.assertRaisesRegexp(RuntimeError, ".*support.*type.*"): + res = SArray(self.string_data).rolling_stdv(0, 1) ### Empty SArray sa = SArray() - res = sa.rolling_stdv(0,1) + res = sa.rolling_stdv(0, 1) self.__test_equal(res, [], type(None)) ### Small SArray - sa = SArray([1,2,3]) - res = sa.rolling_stdv(0,1) - self.__test_equal(res, [.5,.5,None], float) + sa = SArray([1, 2, 3]) + res = sa.rolling_stdv(0, 1) + self.__test_equal(res, [0.5, 0.5, None], float) def test_rolling_count(self): data = SArray(range(100)) ### Small backward window including current - res = data.rolling_count(-3,0) - expected = [1,2,3] + [4 for i in range(97)] - self.__test_equal(res,expected,int) + res = data.rolling_count(-3, 0) + expected = [1, 2, 3] + [4 for i in range(97)] + self.__test_equal(res, expected, int) # Test float inputs - res = data.astype(float).rolling_count(-3,0) - self.__test_equal(res,expected,int) + res = data.astype(float).rolling_count(-3, 0) + self.__test_equal(res, expected, int) # Test vector input - res = SArray(self.vec_data).rolling_count(-3,0) - expected = [1,2,3] + [4 for i in range(7)] - self.__test_equal(res,expected,int) + res = SArray(self.vec_data).rolling_count(-3, 0) + expected = [1, 2, 3] + [4 for i in range(7)] + self.__test_equal(res, expected, int) ### Test string input - res = SArray(self.string_data).rolling_count(-3,0) - self.__test_equal(res,expected[0:8],int) + res = SArray(self.string_data).rolling_count(-3, 0) + self.__test_equal(res, expected[0:8], int) ### Small forward window including current - res = data.rolling_count(0,4) - expected = [5 for i in range(0,96)] + [4,3,2,1] - self.__test_equal(res,expected,int) + res = data.rolling_count(0, 4) + expected = [5 for i in range(0, 96)] + [4, 3, 2, 1] + self.__test_equal(res, expected, int) ### A window size of 1 - res = data.rolling_count(0,0) + res = data.rolling_count(0, 0) self.__test_equal(res, [1 for i in range(100)], int) - res = data.rolling_count(-2,-2) - self.__test_equal(res, [0,0] + [1 for i in range(98)], int) + res = data.rolling_count(-2, -2) + self.__test_equal(res, [0, 0] + [1 for i in range(98)], int) ### A negative window size with self.assertRaises(RuntimeError): - res = data.rolling_count(4,2) + res = data.rolling_count(4, 2) ### Empty SArray sa = SArray() - res = sa.rolling_count(0,1) + res = sa.rolling_count(0, 1) self.__test_equal(res, [], type(None)) ### Small SArray - sa = SArray([1,2,3]) - res = sa.rolling_count(0,1) - self.__test_equal(res, [2,2,1], int) + sa = SArray([1, 2, 3]) + res = sa.rolling_count(0, 1) + self.__test_equal(res, [2, 2, 1], int) - sa = SArray([1,2,None]) - res = sa.rolling_count(0,1) - self.__test_equal(res, [2,1,0], int) + sa = SArray([1, 2, None]) + res = sa.rolling_count(0, 1) + self.__test_equal(res, [2, 1, 0], int) def cumulative_aggregate_comparison(self, out, ans): import array + self.assertEqual(out.dtype, ans.dtype) self.assertEqual(len(out), len(ans)) for i in range(len(out)): @@ -2648,16 +2980,15 @@ def cumulative_aggregate_comparison(self, out, ans): self.assertTrue(out[i] is None) if type(out[i]) != array.array: - self.assertAlmostEqual(out[i], ans[i]) + self.assertAlmostEqual(out[i], ans[i]) else: - self.assertEqual(len(out[i]), len(ans[i])) - oi = out[i] - ansi = ans[i] - for j in range(len(oi)): - self.assertAlmostEqual(oi, ansi) + self.assertEqual(len(out[i]), len(ans[i])) + oi = out[i] + ansi = ans[i] + for j in range(len(oi)): + self.assertAlmostEqual(oi, ansi) def test_cumulative_sum(self): - def single_test(src, ans): out = src.cumulative_sum() self.cumulative_aggregate_comparison(out, ans) @@ -2669,39 +3000,35 @@ def single_test(src, ans): with self.assertRaises(RuntimeError): sa = SArray([{"bar": 1}]).cumulative_sum() with self.assertRaises(RuntimeError): - sa = SArray([[1], [1,1], [1], [1]]).cumulative_sum() + sa = SArray([[1], [1, 1], [1], [1]]).cumulative_sum() single_test( - SArray([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), - SArray([0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55]) + SArray([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), + SArray([0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55]), ) single_test( SArray([0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1]), - SArray([0.1, 1.2, 3.3, 6.4, 10.5, 15.6, 21.7, 28.8]) + SArray([0.1, 1.2, 3.3, 6.4, 10.5, 15.6, 21.7, 28.8]), ) single_test( SArray([[11.0, 2.0], [22.0, 1.0], [3.0, 4.0], [4.0, 4.0]]), - SArray([[11.0, 2.0], [33.0, 3.0], [36.0, 7.0], [40.0, 11.0]]) + SArray([[11.0, 2.0], [33.0, 3.0], [36.0, 7.0], [40.0, 11.0]]), ) single_test( SArray([None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), - SArray([None, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55]) - ) - single_test( - SArray([None, 1, None, 3, None, 5]), - SArray([None, 1, 1, 4, 4, 9]) + SArray([None, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55]), ) + single_test(SArray([None, 1, None, 3, None, 5]), SArray([None, 1, 1, 4, 4, 9])) single_test( SArray([None, [33.0, 3.0], [3.0, 4.0], [4.0, 4.0]]), - SArray([None, [33.0, 3.0], [36.0, 7.0], [40.0, 11.0]]) + SArray([None, [33.0, 3.0], [36.0, 7.0], [40.0, 11.0]]), ) single_test( SArray([None, [33.0, 3.0], None, [4.0, 4.0]]), - SArray([None, [33.0, 3.0], [33.0, 3.0], [37.0, 7.0]]) + SArray([None, [33.0, 3.0], [33.0, 3.0], [37.0, 7.0]]), ) def test_cumulative_mean(self): - def single_test(src, ans): out = src.cumulative_mean() self.cumulative_aggregate_comparison(out, ans) @@ -2713,40 +3040,37 @@ def single_test(src, ans): with self.assertRaises(RuntimeError): sa = SArray([{"bar": 1}]).cumulative_mean() with self.assertRaises(RuntimeError): - sa = SArray([[1], [1,1], [1], [1]]).cumulative_mean() + sa = SArray([[1], [1, 1], [1], [1]]).cumulative_mean() single_test( - SArray([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), - SArray([0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]) + SArray([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), + SArray([0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]), ) single_test( SArray([0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1]), - SArray([0.1, 0.6, 1.1, 1.6, 2.1, 2.6, 3.1, 3.6]) + SArray([0.1, 0.6, 1.1, 1.6, 2.1, 2.6, 3.1, 3.6]), ) single_test( - SArray([[11.0, 22.0], [33.0, 66.0], [4.0, 2.0], [4.0, 2.0]]), - SArray([[11.0, 22.0], [22.0, 44.0], [16.0, 30.0], [13.0, 23.0]]) + SArray([[11.0, 22.0], [33.0, 66.0], [4.0, 2.0], [4.0, 2.0]]), + SArray([[11.0, 22.0], [22.0, 44.0], [16.0, 30.0], [13.0, 23.0]]), ) single_test( SArray([None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), - SArray([None, 0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]) + SArray([None, 0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]), ) single_test( - SArray([None, 1, None, 3, None, 5]), - SArray([None, 1, 1.0, 2.0, 2.0, 3.0]) + SArray([None, 1, None, 3, None, 5]), SArray([None, 1, 1.0, 2.0, 2.0, 3.0]) ) single_test( - SArray([None, [11.0, 22.0], [33.0, 66.0], [4.0, 2.0]]), - SArray([None, [11.0, 22.0], [22.0, 44.0], [16.0, 30.0]]) + SArray([None, [11.0, 22.0], [33.0, 66.0], [4.0, 2.0]]), + SArray([None, [11.0, 22.0], [22.0, 44.0], [16.0, 30.0]]), ) single_test( SArray([None, [11.0, 22.0], None, [33.0, 66.0], [4.0, 2.0]]), - SArray([None, [11.0, 22.0], [11.0, 22.0], [22.0, 44.0], [16.0, 30.0]]) + SArray([None, [11.0, 22.0], [11.0, 22.0], [22.0, 44.0], [16.0, 30.0]]), ) - def test_cumulative_min(self): - def single_test(src, ans): out = src.cumulative_min() self.cumulative_aggregate_comparison(out, ans) @@ -2758,29 +3082,25 @@ def single_test(src, ans): with self.assertRaises(RuntimeError): sa = SArray([{"bar": 1}]).cumulative_min() with self.assertRaises(RuntimeError): - sa = SArray([[1], [1,1], [1], [1]]).cumulative_min() + sa = SArray([[1], [1, 1], [1], [1]]).cumulative_min() with self.assertRaises(RuntimeError): sa = SArray([[1], [1], [1], [1]]).cumulative_min() single_test( - SArray([0, 1, 2, 3, 4, 5, -1, 7, 8, -2, 10]), - SArray([0, 0, 0, 0, 0, 0, -1, -1, -1, -2, -2]) + SArray([0, 1, 2, 3, 4, 5, -1, 7, 8, -2, 10]), + SArray([0, 0, 0, 0, 0, 0, -1, -1, -1, -2, -2]), ) single_test( SArray([7.1, 6.1, 3.1, 3.9, 4.1, 2.1, 2.9, 0.1]), - SArray([7.1, 6.1, 3.1, 3.1, 3.1, 2.1, 2.1, 0.1]) + SArray([7.1, 6.1, 3.1, 3.1, 3.1, 2.1, 2.1, 0.1]), ) single_test( SArray([None, 8, 6, 3, 4, None, 6, 2, 8, 9, 1]), - SArray([None, 8, 6, 3, 3, 3, 3, 2, 2, 2, 1]) - ) - single_test( - SArray([None, 5, None, 3, None, 10]), - SArray([None, 5, 5, 3, 3, 3]) + SArray([None, 8, 6, 3, 3, 3, 3, 2, 2, 2, 1]), ) + single_test(SArray([None, 5, None, 3, None, 10]), SArray([None, 5, 5, 3, 3, 3])) def test_cumulative_max(self): - def single_test(src, ans): out = src.cumulative_max() self.cumulative_aggregate_comparison(out, ans) @@ -2792,29 +3112,27 @@ def single_test(src, ans): with self.assertRaises(RuntimeError): sa = SArray([{"bar": 1}]).cumulative_max() with self.assertRaises(RuntimeError): - sa = SArray([[1], [1,1], [1], [1]]).cumulative_max() + sa = SArray([[1], [1, 1], [1], [1]]).cumulative_max() with self.assertRaises(RuntimeError): sa = SArray([[1], [1], [1], [1]]).cumulative_max() single_test( - SArray([0, 1, 0, 3, 5, 4, 1, 7, 6, 2, 10]), - SArray([0, 1, 1, 3, 5, 5, 5, 7, 7, 7, 10]) + SArray([0, 1, 0, 3, 5, 4, 1, 7, 6, 2, 10]), + SArray([0, 1, 1, 3, 5, 5, 5, 7, 7, 7, 10]), ) single_test( SArray([2.1, 6.1, 3.1, 3.9, 2.1, 8.1, 8.9, 10.1]), - SArray([2.1, 6.1, 6.1, 6.1, 6.1, 8.1, 8.9, 10.1]) + SArray([2.1, 6.1, 6.1, 6.1, 6.1, 8.1, 8.9, 10.1]), ) single_test( SArray([None, 1, 6, 3, 4, None, 4, 2, 8, 9, 1]), - SArray([None, 1, 6, 6, 6, 6, 6, 6, 8, 9, 9]) + SArray([None, 1, 6, 6, 6, 6, 6, 6, 8, 9, 9]), ) single_test( - SArray([None, 2, None, 3, None, 10]), - SArray([None, 2, 2, 3, 3, 10]) + SArray([None, 2, None, 3, None, 10]), SArray([None, 2, 2, 3, 3, 10]) ) def test_cumulative_std(self): - def single_test(src, ans): out = src.cumulative_std() self.cumulative_aggregate_comparison(out, ans) @@ -2826,35 +3144,68 @@ def single_test(src, ans): with self.assertRaises(RuntimeError): sa = SArray([{"bar": 1}]).cumulative_std() with self.assertRaises(RuntimeError): - sa = SArray([[1], [1,1], [1], [1]]).cumulative_std() + sa = SArray([[1], [1, 1], [1], [1]]).cumulative_std() with self.assertRaises(RuntimeError): sa = SArray([[1], [1], [1], [1]]).cumulative_std() single_test( - SArray([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), - SArray([0.0, 0.5, 0.816496580927726, 1.118033988749895, - 1.4142135623730951, 1.707825127659933, 2.0, 2.29128784747792, - 2.581988897471611, 2.8722813232690143, 3.1622776601683795]) + SArray([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), + SArray( + [ + 0.0, + 0.5, + 0.816496580927726, + 1.118033988749895, + 1.4142135623730951, + 1.707825127659933, + 2.0, + 2.29128784747792, + 2.581988897471611, + 2.8722813232690143, + 3.1622776601683795, + ] + ), ) single_test( SArray([0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1]), - SArray([0.0, 0.5, 0.81649658092772603, 1.1180339887498949, - 1.4142135623730949, 1.707825127659933, 1.9999999999999998, - 2.2912878474779195]) + SArray( + [ + 0.0, + 0.5, + 0.81649658092772603, + 1.1180339887498949, + 1.4142135623730949, + 1.707825127659933, + 1.9999999999999998, + 2.2912878474779195, + ] + ), ) single_test( SArray([None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), - SArray([None, 0.0, 0.5, 0.816496580927726, 1.118033988749895, - 1.4142135623730951, 1.707825127659933, 2.0, 2.29128784747792, - 2.581988897471611, 2.8722813232690143, 3.1622776601683795]) + SArray( + [ + None, + 0.0, + 0.5, + 0.816496580927726, + 1.118033988749895, + 1.4142135623730951, + 1.707825127659933, + 2.0, + 2.29128784747792, + 2.581988897471611, + 2.8722813232690143, + 3.1622776601683795, + ] + ), ) single_test( - SArray([None, 1, None, 3, None, 5]), - SArray([None, 0.0, 0.0, 1.0, 1.0, 1.6329931618554521]) + SArray([None, 1, None, 3, None, 5]), + SArray([None, 0.0, 0.0, 1.0, 1.0, 1.6329931618554521]), ) def test_cumulative_var(self): - def single_test(src, ans): out = src.cumulative_var() self.cumulative_aggregate_comparison(out, ans) @@ -2866,120 +3217,167 @@ def single_test(src, ans): with self.assertRaises(RuntimeError): sa = SArray([{"bar": 1}]).cumulative_var() with self.assertRaises(RuntimeError): - sa = SArray([[1], [1,1], [1], [1]]).cumulative_var() + sa = SArray([[1], [1, 1], [1], [1]]).cumulative_var() with self.assertRaises(RuntimeError): sa = SArray([[1], [1], [1], [1]]).cumulative_var() single_test( - SArray([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), - SArray([0.0, 0.25, 0.6666666666666666, 1.25, 2.0, 2.9166666666666665, - 4.0, 5.25, 6.666666666666667, 8.25, 10.0]) + SArray([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), + SArray( + [ + 0.0, + 0.25, + 0.6666666666666666, + 1.25, + 2.0, + 2.9166666666666665, + 4.0, + 5.25, + 6.666666666666667, + 8.25, + 10.0, + ] + ), ) single_test( SArray([0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1]), - SArray( [0.0, 0.25000000000000006, 0.6666666666666666, 1.25, - 1.9999999999999996, 2.916666666666666, 3.999999999999999, - 5.249999999999998]) + SArray( + [ + 0.0, + 0.25000000000000006, + 0.6666666666666666, + 1.25, + 1.9999999999999996, + 2.916666666666666, + 3.999999999999999, + 5.249999999999998, + ] + ), ) single_test( SArray([None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), - SArray([None, 0.0, 0.25, 0.6666666666666666, 1.25, 2.0, 2.9166666666666665, - 4.0, 5.25, 6.666666666666667, 8.25, 10.0]) + SArray( + [ + None, + 0.0, + 0.25, + 0.6666666666666666, + 1.25, + 2.0, + 2.9166666666666665, + 4.0, + 5.25, + 6.666666666666667, + 8.25, + 10.0, + ] + ), ) single_test( - SArray([None, 1, None, 3, None, 5]), - SArray([None, 0.0, 0.0, 1.0, 1.0, 2.6666666666666665]) + SArray([None, 1, None, 3, None, 5]), + SArray([None, 0.0, 0.0, 1.0, 1.0, 2.6666666666666665]), ) def test_numpy_datetime64(self): # Make all datetimes naive - expected = [i.replace(tzinfo=GMT(0.0)) \ - if i is not None and i.tzinfo is None else i for i in self.datetime_data] + expected = [ + i.replace(tzinfo=GMT(0.0)) if i is not None and i.tzinfo is None else i + for i in self.datetime_data + ] # A regular list - iso_str_list = [np.datetime64('2013-05-07T10:04:10Z'), - np.datetime64('1902-10-21T10:34:10Z'), - None] + iso_str_list = [ + np.datetime64("2013-05-07T10:04:10Z"), + np.datetime64("1902-10-21T10:34:10Z"), + None, + ] sa = SArray(iso_str_list) - self.__test_equal(sa,expected,dt.datetime) + self.__test_equal(sa, expected, dt.datetime) - iso_str_list[2] = np.datetime64('NaT') + iso_str_list[2] = np.datetime64("NaT") sa = SArray(iso_str_list) - self.__test_equal(sa,expected,dt.datetime) + self.__test_equal(sa, expected, dt.datetime) # A numpy array np_ary = np.array(iso_str_list) sa = SArray(np_ary) - self.__test_equal(sa,expected,dt.datetime) + self.__test_equal(sa, expected, dt.datetime) ### Every possible type of datetime64 - test_str = '1969-12-31T23:59:56Z' - available_time_units = ['h','m','s','ms','us','ns','ps','fs','as'] - expected = [dt.datetime(1969,12,31,23,59,56,tzinfo=GMT(0.0)) for i in range(7)] - expected.insert(0,dt.datetime(1969,12,31,23,59,0,tzinfo=GMT(0.0))) - expected.insert(0,dt.datetime(1969,12,31,23,0,0,tzinfo=GMT(0.0))) + test_str = "1969-12-31T23:59:56Z" + available_time_units = ["h", "m", "s", "ms", "us", "ns", "ps", "fs", "as"] + expected = [ + dt.datetime(1969, 12, 31, 23, 59, 56, tzinfo=GMT(0.0)) for i in range(7) + ] + expected.insert(0, dt.datetime(1969, 12, 31, 23, 59, 0, tzinfo=GMT(0.0))) + expected.insert(0, dt.datetime(1969, 12, 31, 23, 0, 0, tzinfo=GMT(0.0))) for i in range(len(available_time_units)): - sa = SArray([np.datetime64(test_str,available_time_units[i])]) - self.__test_equal(sa,[expected[i]],dt.datetime) - - test_str = '1908-06-01' - available_date_units = ['Y','M','W','D'] - expected = [dt.datetime(1908,6,1,0,0,0,tzinfo=GMT(0.0)) for i in range(4)] - expected[2] = dt.datetime(1908,5,28,0,0,0,tzinfo=GMT(0.0)) # weeks start on Thursday? - expected[0] = dt.datetime(1908,1,1,0,0,0,tzinfo=GMT(0.0)) + sa = SArray([np.datetime64(test_str, available_time_units[i])]) + self.__test_equal(sa, [expected[i]], dt.datetime) + + test_str = "1908-06-01" + available_date_units = ["Y", "M", "W", "D"] + expected = [dt.datetime(1908, 6, 1, 0, 0, 0, tzinfo=GMT(0.0)) for i in range(4)] + expected[2] = dt.datetime( + 1908, 5, 28, 0, 0, 0, tzinfo=GMT(0.0) + ) # weeks start on Thursday? + expected[0] = dt.datetime(1908, 1, 1, 0, 0, 0, tzinfo=GMT(0.0)) for i in range(len(available_date_units)): - sa = SArray([np.datetime64(test_str,available_date_units[i])]) - self.__test_equal(sa,[expected[i]],dt.datetime) + sa = SArray([np.datetime64(test_str, available_date_units[i])]) + self.__test_equal(sa, [expected[i]], dt.datetime) # Daylight savings time (Just to be safe. datetime64 deals in UTC, and # we store times in UTC by default, so this shouldn't affect anything) - sa = SArray([np.datetime64('2015-03-08T02:38:00-08')]) - expected = [dt.datetime(2015,3,8,10,38,tzinfo=GMT(0.0))] + sa = SArray([np.datetime64("2015-03-08T02:38:00-08")]) + expected = [dt.datetime(2015, 3, 8, 10, 38, tzinfo=GMT(0.0))] self.__test_equal(sa, expected, dt.datetime) # timezone considerations - sa = SArray([np.datetime64('2016-01-01T05:45:00+0545')]) - expected = [dt.datetime(2016,1,1,0,0,0,tzinfo=GMT(0.0))] + sa = SArray([np.datetime64("2016-01-01T05:45:00+0545")]) + expected = [dt.datetime(2016, 1, 1, 0, 0, 0, tzinfo=GMT(0.0))] self.__test_equal(sa, expected, dt.datetime) ### Out of our datetime range with self.assertRaises(TypeError): - sa = SArray([np.datetime64('1066-10-14T09:00:00Z')]) + sa = SArray([np.datetime64("1066-10-14T09:00:00Z")]) def test_pandas_timestamp(self): - iso_str_list = [pd.Timestamp('2013-05-07T10:04:10'), - pd.Timestamp('1902-10-21T10:34:10Z'), - None] + iso_str_list = [ + pd.Timestamp("2013-05-07T10:04:10"), + pd.Timestamp("1902-10-21T10:34:10Z"), + None, + ] sa = SArray(iso_str_list) - self.__test_equal(sa,self.datetime_data,dt.datetime) + self.__test_equal(sa, self.datetime_data, dt.datetime) iso_str_list[2] = pd.NaT sa = SArray(iso_str_list) - self.__test_equal(sa,self.datetime_data,dt.datetime) + self.__test_equal(sa, self.datetime_data, dt.datetime) - sa = SArray([pd.Timestamp('2015-03-08T02:38:00-08')]) - expected = [dt.datetime(2015,3,8,2,38,tzinfo=GMT(-8.0))] + sa = SArray([pd.Timestamp("2015-03-08T02:38:00-08")]) + expected = [dt.datetime(2015, 3, 8, 2, 38, tzinfo=GMT(-8.0))] self.__test_equal(sa, expected, dt.datetime) - sa = SArray([pd.Timestamp('2016-01-01 05:45:00', tz=GMT(5.75))]) - expected = [dt.datetime(2016,1,1,5,45,0,tzinfo=GMT(5.75))] + sa = SArray([pd.Timestamp("2016-01-01 05:45:00", tz=GMT(5.75))]) + expected = [dt.datetime(2016, 1, 1, 5, 45, 0, tzinfo=GMT(5.75))] self.__test_equal(sa, expected, dt.datetime) def test_decimal(self): import decimal + test_val = decimal.Decimal(3.0) sa = SArray([test_val]) expected = [3.0] self.__test_equal(sa, expected, float) def test_timedelta(self): - test_val = dt.timedelta(1,1) + test_val = dt.timedelta(1, 1) sa = SArray([test_val]) expected = [86401.0] self.__test_equal(sa, expected, float) def test_materialize(self): - sa= SArray(range(100)) + sa = SArray(range(100)) sa = sa[sa > 10] self.assertFalse(sa.is_materialized()) sa.materialize() @@ -2993,52 +3391,66 @@ def test_ternary(self): self.__test_equal(SArray.where(a > 10, a, a), lista, int) # clip lower - self.__test_equal(SArray.where(a > 10, a, 10), - [i if i > 10 else 10 for i in lista], int) + self.__test_equal( + SArray.where(a > 10, a, 10), [i if i > 10 else 10 for i in lista], int + ) # clip upper - self.__test_equal(SArray.where(a > 10, 10, a), - [10 if i > 10 else i for i in lista], int) + self.__test_equal( + SArray.where(a > 10, 10, a), [10 if i > 10 else i for i in lista], int + ) # constants - self.__test_equal(SArray.where(a > 10, 10, 9), - [10 if i > 10 else 9 for i in lista], int) + self.__test_equal( + SArray.where(a > 10, 10, 9), [10 if i > 10 else 9 for i in lista], int + ) # constant float - self.__test_equal(SArray.where(a > 10, 10.0, 9.0), - [10.0 if i > 10 else 9.0 for i in lista], float) + self.__test_equal( + SArray.where(a > 10, 10.0, 9.0), + [10.0 if i > 10 else 9.0 for i in lista], + float, + ) # constant str - self.__test_equal(SArray.where(a > 10, "10", "9"), - ["10" if i > 10 else "9" for i in lista], str) + self.__test_equal( + SArray.where(a > 10, "10", "9"), + ["10" if i > 10 else "9" for i in lista], + str, + ) - #inconsistent types + # inconsistent types with self.assertRaises(TypeError): - SArray.where(a > 10, 10, "9") # 10 and "9" different types + SArray.where(a > 10, 10, "9") # 10 and "9" different types - #inconsistent types + # inconsistent types with self.assertRaises(TypeError): - SArray.where(a > 10, a, "9") # expecting an integer for "a" + SArray.where(a > 10, a, "9") # expecting an integer for "a" # technically different types but type coercion happened - self.__test_equal(SArray.where(a > 10, a, 10.0), - [i if i > 10 else 10 for i in lista], int) + self.__test_equal( + SArray.where(a > 10, a, 10.0), [i if i > 10 else 10 for i in lista], int + ) # list types - self.__test_equal(SArray.where(a > 10, [], [1], list), - [[] if i > 10 else [1] for i in lista], list) + self.__test_equal( + SArray.where(a > 10, [], [1], list), + [[] if i > 10 else [1] for i in lista], + list, + ) # really the same as the above, but using an SArray in place # of a constant in istrue. And hoping the type coercion # will take care of [1] b = SArray([[] for i in range(1000)]) - self.__test_equal(SArray.where(a > 10, b, [1]), - [[] if i > 10 else [1] for i in lista], list) + self.__test_equal( + SArray.where(a > 10, b, [1]), [[] if i > 10 else [1] for i in lista], list + ) def test_shape(self): sa = SArray() self.assertEqual(sa.shape, (0,)) - for i in [0,1,2,10,345]: + for i in [0, 1, 2, 10, 345]: sa = SArray(range(i)) self.assertEqual(sa.shape, (i,)) @@ -3046,10 +3458,11 @@ def test_random_split(self): sa = SArray(range(10)) (train, test) = sa.random_split(0.8, seed=12423) self.assertEqual(list(train), [0, 1, 2, 3, 5, 7, 8, 9]) - self.assertEqual(list(test), [4,6]) + self.assertEqual(list(test), [4, 6]) def test_copy(self): from copy import copy + sa = SArray(range(1000)) sa_copy = copy(sa) @@ -3059,6 +3472,7 @@ def test_copy(self): def test_deepcopy(self): from copy import deepcopy + sa = SArray(range(1000)) sa_copy = deepcopy(sa) @@ -3067,43 +3481,43 @@ def test_deepcopy(self): assert (sa == sa_copy).all() def test_value_counts(self): - sa = SArray([1,1,2,2,2,2,3,3,3,3,3,3,3]) + sa = SArray([1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3]) c = sa.value_counts() - self.assertEqual(c.column_names(), ['value','count']) - self.__test_equal(c['value'], [3,2,1], int) - self.__test_equal(c['count'], [7,4,2], int) + self.assertEqual(c.column_names(), ["value", "count"]) + self.__test_equal(c["value"], [3, 2, 1], int) + self.__test_equal(c["count"], [7, 4, 2], int) sa = SArray() c = sa.value_counts() self.assertEqual(len(c), 0) def test_ndarray_shape(self): - a1 = np.array([[1,2,3,4],[5,6,7,8]], 'd') - a2 = a1.reshape(4,2) + a1 = np.array([[1, 2, 3, 4], [5, 6, 7, 8]], "d") + a2 = a1.reshape(4, 2) a3 = a1.transpose() - a4 = a3.reshape(2,4) + a4 = a3.reshape(2, 4) - b1 = a1[:2,:2] - b2 = a2[:2,:2] - b3 = a3[:2,:2] - b4 = a4[:2,:2] + b1 = a1[:2, :2] + b2 = a2[:2, :2] + b3 = a3[:2, :2] + b4 = a4[:2, :2] c1 = b1.transpose() c2 = b2.transpose() c3 = b3.transpose() c4 = b4.transpose() - d1 = a1[:2,2:4] - d2 = a2[2:4,:2] - d3 = a3[2:4,:2] - d4 = a4[:2,2:4] + d1 = a1[:2, 2:4] + d2 = a2[2:4, :2] + d3 = a3[2:4, :2] + d4 = a4[:2, 2:4] - originals = [a1,a2,a3,a4,b1,b2,b3,b4,c1,c2,c3,c4,d1,d2,d3,d4] + originals = [a1, a2, a3, a4, b1, b2, b3, b4, c1, c2, c3, c4, d1, d2, d3, d4] sa = SArray(originals) l = list(sa) # check roundtriping of SArray ndarray type for i in range(len(l)): - self.assertTrue(np.array_equal(l[i], originals[i])) + self.assertTrue(np.array_equal(l[i], originals[i])) # check roundtriping again because the ndarray type SArray # returned is slightly odd (it uses a custom bufferprotocol @@ -3116,105 +3530,117 @@ def test_ndarray_shape(self): # test slicing slice_true = [x[1:] for x in originals] - slice_test = list(sa.apply(lambda x:x[1:])) + slice_test = list(sa.apply(lambda x: x[1:])) for i in range(len(l)): self.assertTrue(np.array_equal(slice_test[i], slice_true[i])) # test slice round tripping # test SArray(slice_true) def test_ndarray_ops(self): - a1 = np.array([[1,2,3,4],[5,6,7,8]], 'd') - a2 = a1.reshape(4,2) - sa = SArray([a1,a2]) + a1 = np.array([[1, 2, 3, 4], [5, 6, 7, 8]], "d") + a2 = a1.reshape(4, 2) + sa = SArray([a1, a2]) - b1 = np.array([[2,1,4,3],[6,5,8,7]], 'd') - b2 = a1.reshape(4,2) - sb = SArray([b1,b2]) + b1 = np.array([[2, 1, 4, 3], [6, 5, 8, 7]], "d") + b2 = a1.reshape(4, 2) + sb = SArray([b1, b2]) res = sa + sb - self.assertTrue(np.array_equal(res[0], a1+b1)) - self.assertTrue(np.array_equal(res[1], a2+b2)) + self.assertTrue(np.array_equal(res[0], a1 + b1)) + self.assertTrue(np.array_equal(res[1], a2 + b2)) res = sa + 1 - self.assertTrue(np.array_equal(res[0], a1+1)) - self.assertTrue(np.array_equal(res[1], a2+1)) + self.assertTrue(np.array_equal(res[0], a1 + 1)) + self.assertTrue(np.array_equal(res[1], a2 + 1)) res = 1 + sa - self.assertTrue(np.array_equal(res[0], 1+a1)) - self.assertTrue(np.array_equal(res[1], 1+a2)) + self.assertTrue(np.array_equal(res[0], 1 + a1)) + self.assertTrue(np.array_equal(res[1], 1 + a2)) res = sa - sb - self.assertTrue(np.array_equal(res[0], a1-b1)) - self.assertTrue(np.array_equal(res[1], a2-b2)) + self.assertTrue(np.array_equal(res[0], a1 - b1)) + self.assertTrue(np.array_equal(res[1], a2 - b2)) res = sa - 1 - self.assertTrue(np.array_equal(res[0], a1-1)) - self.assertTrue(np.array_equal(res[1], a2-1)) + self.assertTrue(np.array_equal(res[0], a1 - 1)) + self.assertTrue(np.array_equal(res[1], a2 - 1)) res = 1 - sa - self.assertTrue(np.array_equal(res[0], 1-a1)) - self.assertTrue(np.array_equal(res[1], 1-a2)) + self.assertTrue(np.array_equal(res[0], 1 - a1)) + self.assertTrue(np.array_equal(res[1], 1 - a2)) res = sa * sb - self.assertTrue(np.array_equal(res[0], a1*b1)) - self.assertTrue(np.array_equal(res[1], a2*b2)) + self.assertTrue(np.array_equal(res[0], a1 * b1)) + self.assertTrue(np.array_equal(res[1], a2 * b2)) res = sa * 2 - self.assertTrue(np.array_equal(res[0], a1*2)) - self.assertTrue(np.array_equal(res[1], a2*2)) + self.assertTrue(np.array_equal(res[0], a1 * 2)) + self.assertTrue(np.array_equal(res[1], a2 * 2)) res = 2 * sa - self.assertTrue(np.array_equal(res[0], 2*a1)) - self.assertTrue(np.array_equal(res[1], 2*a2)) + self.assertTrue(np.array_equal(res[0], 2 * a1)) + self.assertTrue(np.array_equal(res[1], 2 * a2)) res = sa / sb - self.assertTrue(np.array_equal(res[0], a1/b1)) - self.assertTrue(np.array_equal(res[1], a2/b2)) + self.assertTrue(np.array_equal(res[0], a1 / b1)) + self.assertTrue(np.array_equal(res[1], a2 / b2)) res = sa / 2 - self.assertTrue(np.array_equal(res[0], a1/2.0)) - self.assertTrue(np.array_equal(res[1], a2/2.0)) + self.assertTrue(np.array_equal(res[0], a1 / 2.0)) + self.assertTrue(np.array_equal(res[1], a2 / 2.0)) res = sa / 2.0 - self.assertTrue(np.array_equal(res[0], a1/2.0)) - self.assertTrue(np.array_equal(res[1], a2/2.0)) + self.assertTrue(np.array_equal(res[0], a1 / 2.0)) + self.assertTrue(np.array_equal(res[1], a2 / 2.0)) res = 2.0 / sa - self.assertTrue(np.array_equal(res[0], 2.0/a1)) - self.assertTrue(np.array_equal(res[1], 2.0/a2)) + self.assertTrue(np.array_equal(res[0], 2.0 / a1)) + self.assertTrue(np.array_equal(res[1], 2.0 / a2)) # misshappen with self.assertRaises(RuntimeError): res.sum() - self.assertTrue(np.array_equal(SArray([a1,b1]).sum(), a1+b1)) + self.assertTrue(np.array_equal(SArray([a1, b1]).sum(), a1 + b1)) def test_type_casting(self): - x = SFrame({'a': [[1,2], None, [3,4], None]}) - x['a'] = SArray(x['a'], list) - self.assertTrue(x['a'].dtype == list) + x = SFrame({"a": [[1, 2], None, [3, 4], None]}) + x["a"] = SArray(x["a"], list) + self.assertTrue(x["a"].dtype == list) def test_filter_by(self): - #integer example - x = SArray([1,2,3,4,5,6,7]) - self.assertEqual(sorted(x.filter_by([11,7,2,8,4])), [2,4,7]) - self.assertEqual(sorted(x.filter_by([11,7,2,8,4,3], exclude=True)), [1,5,6]) - - #empty SArray - self.assertEqual(sorted(x.filter_by([77,22,18,42])), []) - self.assertEqual(sorted(x.filter_by([77,22,18,42], exclude=True)), list(x)) - - #duplicates - self.assertEqual(sorted(x.filter_by([2,2,3,44])), [2,3]) - x = SArray([1,2,2,3,4,5,6,7]) - self.assertEqual(sorted(x.filter_by([2,2,3,44])), [2,2,3]) - - #strings - x = SArray(['dog', 'cat', 'cow', 'horse']) - self.assertEqual(sorted(x.filter_by(['cat', 'hamster', 'dog', 'fish', 'bird', 'snake'])), ['cat', 'dog']) - self.assertEqual(sorted(x.filter_by(['cat', 'hamster', 'dog', 'fish', 'bird', 'snake'], exclude=True)), ['cow', 'horse']) - self.assertEqual(sorted(x.filter_by('dog')), ['dog']) + # integer example + x = SArray([1, 2, 3, 4, 5, 6, 7]) + self.assertEqual(sorted(x.filter_by([11, 7, 2, 8, 4])), [2, 4, 7]) + self.assertEqual( + sorted(x.filter_by([11, 7, 2, 8, 4, 3], exclude=True)), [1, 5, 6] + ) + + # empty SArray + self.assertEqual(sorted(x.filter_by([77, 22, 18, 42])), []) + self.assertEqual(sorted(x.filter_by([77, 22, 18, 42], exclude=True)), list(x)) + + # duplicates + self.assertEqual(sorted(x.filter_by([2, 2, 3, 44])), [2, 3]) + x = SArray([1, 2, 2, 3, 4, 5, 6, 7]) + self.assertEqual(sorted(x.filter_by([2, 2, 3, 44])), [2, 2, 3]) + + # strings + x = SArray(["dog", "cat", "cow", "horse"]) + self.assertEqual( + sorted(x.filter_by(["cat", "hamster", "dog", "fish", "bird", "snake"])), + ["cat", "dog"], + ) + self.assertEqual( + sorted( + x.filter_by( + ["cat", "hamster", "dog", "fish", "bird", "snake"], exclude=True + ) + ), + ["cow", "horse"], + ) + self.assertEqual(sorted(x.filter_by("dog")), ["dog"]) def test_abs(self): sa = SArray([-1.10, 2, -3.33, 4]) diff --git a/src/python/turicreate/test/test_sarray_builder.py b/src/python/turicreate/test/test_sarray_builder.py index c261425f1e..170c0df352 100644 --- a/src/python/turicreate/test/test_sarray_builder.py +++ b/src/python/turicreate/test/test_sarray_builder.py @@ -13,6 +13,7 @@ import datetime as dt from .._cython.cy_flexible_type import GMT + class SArrayBuilderTest(unittest.TestCase): def __test_equal(self, _sarray, _data, _type): self.assertEqual(_sarray.dtype, _type) @@ -33,16 +34,26 @@ def __test_append_multiple(self, sb, data, dtype): self.__test_equal(sa, data, dtype) def test_basic(self): - data_to_test = [([1,-1,None,2],int), - ([i for i in range(20000)], int), - ([None, 1.0, -1.0, 2.3],float), - (["hi", None, "hello", "None"],str), - ([dt.datetime(2013, 5, 7, 10, 4, 10), - dt.datetime(1902, 10, 21, 10, 34, 10).replace(tzinfo=GMT(0.0)),None],dt.datetime), - ([["hi",1],None,["hi",2,3],["hello"]],list), - ([array.array('d',[1.0,2.0]),array.array('d',[3.0,4.0]),None],array.array), - ([{'a':1,'b':2},{'c':3,'d':4},None],dict), - ] + data_to_test = [ + ([1, -1, None, 2], int), + ([i for i in range(20000)], int), + ([None, 1.0, -1.0, 2.3], float), + (["hi", None, "hello", "None"], str), + ( + [ + dt.datetime(2013, 5, 7, 10, 4, 10), + dt.datetime(1902, 10, 21, 10, 34, 10).replace(tzinfo=GMT(0.0)), + None, + ], + dt.datetime, + ), + ([["hi", 1], None, ["hi", 2, 3], ["hello"]], list), + ( + [array.array("d", [1.0, 2.0]), array.array("d", [3.0, 4.0]), None], + array.array, + ), + ([{"a": 1, "b": 2}, {"c": 3, "d": 4}, None], dict), + ] for i in data_to_test: sb = SArrayBuilder(i[1]) self.__test_append(sb, i[0], i[1]) @@ -54,7 +65,7 @@ def test_history(self): sb = SArrayBuilder(int, history_size=10) sb.append_multiple((i for i in range(8))) hist = sb.read_history(3) - self.assertEqual(hist,[5,6,7]) + self.assertEqual(hist, [5, 6, 7]) hist = sb.read_history(20) self.assertEqual(hist, [i for i in range(8)]) @@ -63,11 +74,11 @@ def test_history(self): sb.append_multiple((i for i in range(5))) hist = sb.read_history(10) - self.assertEqual(hist, [3,4,5,6,7,0,1,2,3,4]) + self.assertEqual(hist, [3, 4, 5, 6, 7, 0, 1, 2, 3, 4]) sb.append(50) hist = sb.read_history(10) - self.assertEqual(hist, [4,5,6,7,0,1,2,3,4,50]) + self.assertEqual(hist, [4, 5, 6, 7, 0, 1, 2, 3, 4, 50]) hist = sb.read_history(-1) self.assertEqual(hist, []) @@ -75,24 +86,24 @@ def test_history(self): self.assertEqual(hist, []) sa = sb.close() - self.__test_equal(sa,[i for i in range(8)] + [i for i in range(5)] + [50],int) + self.__test_equal(sa, [i for i in range(8)] + [i for i in range(5)] + [50], int) def test_segments(self): sb = SArrayBuilder(int, num_segments=4) - sb.append_multiple((i for i in range(20,30)), segment=2) - sb.append_multiple((i for i in range(10,20)), segment=1) - sb.append_multiple((i for i in range(30,40)), segment=3) - sb.append_multiple((i for i in range(0,10)), segment=0) + sb.append_multiple((i for i in range(20, 30)), segment=2) + sb.append_multiple((i for i in range(10, 20)), segment=1) + sb.append_multiple((i for i in range(30, 40)), segment=3) + sb.append_multiple((i for i in range(0, 10)), segment=0) hist = sb.read_history(3, segment=0) - self.assertSequenceEqual(hist, [7,8,9]) + self.assertSequenceEqual(hist, [7, 8, 9]) hist = sb.read_history(3, segment=1) - self.assertSequenceEqual(hist, [17,18,19]) + self.assertSequenceEqual(hist, [17, 18, 19]) hist = sb.read_history(3, segment=2) - self.assertSequenceEqual(hist, [27,28,29]) + self.assertSequenceEqual(hist, [27, 28, 29]) hist = sb.read_history(3, segment=3) - self.assertSequenceEqual(hist, [37,38,39]) + self.assertSequenceEqual(hist, [37, 38, 39]) with self.assertRaises(RuntimeError): sb.read_history(3, segment=99) diff --git a/src/python/turicreate/test/test_sarray_sketch.py b/src/python/turicreate/test/test_sarray_sketch.py index b51002e0e8..3472eb6536 100644 --- a/src/python/turicreate/test/test_sarray_sketch.py +++ b/src/python/turicreate/test/test_sarray_sketch.py @@ -22,12 +22,11 @@ class SArraySketchTest(unittest.TestCase): - - def __validate_sketch_result(self, sketch, sa, delta = 1E-7): + def __validate_sketch_result(self, sketch, sa, delta=1e-7): df = pd.DataFrame(list(sa.dropna())) pds = pd.Series(list(sa.dropna())) - if (sa.dtype == int or sa.dtype == float): - if (len(sa) == 0): + if sa.dtype == int or sa.dtype == float: + if len(sa) == 0: self.assertTrue(math.isnan(sketch.min())) self.assertTrue(math.isnan(sketch.min())) self.assertEqual(sketch.sum(), 0.0) @@ -41,20 +40,25 @@ def __validate_sketch_result(self, sketch, sa, delta = 1E-7): self.assertAlmostEqual(sketch.mean(), sa.dropna().mean(), delta=delta) self.assertAlmostEqual(sketch.var(), sa.dropna().var(), delta=delta) self.assertAlmostEqual(sketch.std(), sa.dropna().std(), delta=delta) - self.assertAlmostEqual(sketch.quantile(0.5), df.quantile(0.5)[0], delta=1) + self.assertAlmostEqual( + sketch.quantile(0.5), df.quantile(0.5)[0], delta=1 + ) self.assertEqual(sketch.quantile(0), df.quantile(0)[0]) self.assertEqual(sketch.quantile(1), df.quantile(1)[0]) - self.assertEqual(sketch.frequent_items(), SArray(pds).summary().frequent_items()) + self.assertEqual( + sketch.frequent_items(), SArray(pds).summary().frequent_items() + ) for item in pds.value_counts().index: - self.assertEqual(sketch.frequency_count(item), pds.value_counts()[item]) + self.assertEqual( + sketch.frequency_count(item), pds.value_counts()[item] + ) self.assertAlmostEqual(sketch.num_unique(), len(sa.unique()), delta=3) else: with self.assertRaises(RuntimeError): sketch.quantile((0.5)) - self.assertEqual(sketch.num_missing(), sa.countna()) self.assertEqual(sketch.size(), len(sa)) self.assertEqual(sketch.sketch_ready(), True) @@ -75,46 +79,48 @@ def test_sketch_int(self): self.__validate_sketch_result(sa.summary(), sa) def test_sketch_float(self): - int_data = [1.2, 3,.4, 6.789, None] + int_data = [1.2, 3, 0.4, 6.789, None] sa = SArray(data=int_data) self.__validate_sketch_result(sa.summary(), sa) def test_vector_sketch(self): - vector_data = [[], [1,2], [3], [4,5,6,7], [8,9,10], None] + vector_data = [[], [1, 2], [3], [4, 5, 6, 7], [8, 9, 10], None] sa = SArray(data=vector_data) sketch = sa.summary() self.__validate_sketch_result(sketch, sa) - self.__validate_sketch_result(sketch.element_length_summary(), sa.dropna().item_length()) + self.__validate_sketch_result( + sketch.element_length_summary(), sa.dropna().item_length() + ) flattened = list(itertools.chain.from_iterable(list(sa.dropna()))) self.__validate_sketch_result(sketch.element_summary(), SArray(flattened)) fi = sketch.frequent_items() self.assertEqual(len(fi), 5) - self.assertEqual((fi['[1 2]']), 1) - self.assertEqual((fi['[4 5 6 7]']), 1) + self.assertEqual((fi["[1 2]"]), 1) + self.assertEqual((fi["[4 5 6 7]"]), 1) # sub sketch with one key - s = sa.summary(sub_sketch_keys = 1).element_sub_sketch(1) + s = sa.summary(sub_sketch_keys=1).element_sub_sketch(1) expected = sa.vector_slice(1) self.__validate_sketch_result(s, expected) # sub sketch with multiple keys - keys = [1,3] - s = sa.summary(sub_sketch_keys = keys).element_sub_sketch(keys) + keys = [1, 3] + s = sa.summary(sub_sketch_keys=keys).element_sub_sketch(keys) self.assertEqual(len(s), len(keys)) for key in keys: self.assertTrue(key in s) expected = sa.vector_slice(key) self.__validate_sketch_result(s[key], expected) - indexes = range(0,10) - s = sa.summary(sub_sketch_keys = indexes).element_sub_sketch() + indexes = range(0, 10) + s = sa.summary(sub_sketch_keys=indexes).element_sub_sketch() self.assertEqual(len(s), len(indexes)) def test_list_sketch(self): - list_data = [[], [1,2],[1,2], ['a', 'a', 'a', 'b'], [ 1 ,1 , 2], None] + list_data = [[], [1, 2], [1, 2], ["a", "a", "a", "b"], [1, 1, 2], None] sa = SArray(list_data) self.__validate_nested_sketch_result(sa) sketch = sa.summary() @@ -126,11 +132,18 @@ def test_list_sketch(self): fi = sketch.frequent_items() self.assertEqual(len(fi), 4) - self.assertEqual((fi['[1,2]']), 2) + self.assertEqual((fi["[1,2]"]), 2) self.assertEqual((fi['["a","a","a","b"]']), 1) def test_dict_sketch_int_value(self): - dict_data = [{}, {'a':1, 'b':2}, {'a':1, 'b':2}, {'a':3, 'c':1}, {'a': 1, 'b': 2, 'c': 3}, None] + dict_data = [ + {}, + {"a": 1, "b": 2}, + {"a": 1, "b": 2}, + {"a": 3, "c": 1}, + {"a": 1, "b": 2, "c": 3}, + None, + ] sa = SArray(data=dict_data) self.__validate_nested_sketch_result(sa) @@ -154,20 +167,22 @@ def test_dict_sketch_int_value(self): # Get dict value sketch value_summary = sketch.dict_value_summary() - another_rep = list(itertools.chain.from_iterable(list(sa.dict_values().dropna()))) + another_rep = list( + itertools.chain.from_iterable(list(sa.dict_values().dropna())) + ) self.__validate_sketch_result(value_summary, SArray(another_rep)) # sub sketch with one key - s = sa.summary(sub_sketch_keys ='a').element_sub_sketch('a') - expected = sa.unpack(column_name_prefix="")['a'] + s = sa.summary(sub_sketch_keys="a").element_sub_sketch("a") + expected = sa.unpack(column_name_prefix="")["a"] self.__validate_sketch_result(s, expected) - s = sa.summary(sub_sketch_keys ='Nonexist').element_sub_sketch('Nonexist') + s = sa.summary(sub_sketch_keys="Nonexist").element_sub_sketch("Nonexist") self.assertEqual(s.num_missing(), len(sa)) # sub sketch with multiple keys - keys = ['a', 'b'] - s = sa.summary(sub_sketch_keys =keys).element_sub_sketch(keys) + keys = ["a", "b"] + s = sa.summary(sub_sketch_keys=keys).element_sub_sketch(keys) self.assertEqual(len(s), len(keys)) for key in keys: self.assertTrue(key in s) @@ -176,7 +191,12 @@ def test_dict_sketch_int_value(self): def test_dict_sketch_str_value(self): # Dict value sketch type should be auto inferred - dict_data = [{'a':'b', 'b':'c'}, {'a':'b', 'b':'c'}, {'a':'d', 'b':'4'}, None] + dict_data = [ + {"a": "b", "b": "c"}, + {"a": "b", "b": "c"}, + {"a": "d", "b": "4"}, + None, + ] sa = SArray(data=dict_data) self.__validate_nested_sketch_result(sa) @@ -187,9 +207,17 @@ def test_dict_sketch_str_value(self): # The order in which keys are reported is different in python2 vs python3. # So when the dictionary is converted to a string, it results in different # strings. Try both possible combinations for dictionary. - v = fi['{"b":"c", "a":"b"}'] if '{"b":"c", "a":"b"}' in fi else fi['{"a":"b", "b":"c"}'] + v = ( + fi['{"b":"c", "a":"b"}'] + if '{"b":"c", "a":"b"}' in fi + else fi['{"a":"b", "b":"c"}'] + ) self.assertEqual(v, 2) - v = fi['{"a":"d", "b":"4"}'] if '{"a":"d", "b":"4"}' in fi else fi['{"b":"4", "a":"d"}'] + v = ( + fi['{"a":"d", "b":"4"}'] + if '{"a":"d", "b":"4"}' in fi + else fi['{"b":"4", "a":"d"}'] + ) self.assertEqual(v, 1) # Get dict key sketch @@ -199,20 +227,22 @@ def test_dict_sketch_str_value(self): # Get dict value sketch value_summary = sketch.dict_value_summary() - another_rep = list(itertools.chain.from_iterable(list(sa.dict_values().dropna()))) + another_rep = list( + itertools.chain.from_iterable(list(sa.dict_values().dropna())) + ) self.__validate_sketch_result(value_summary, SArray(another_rep)) # sub sketch with one key - s = sa.summary(sub_sketch_keys ='a').element_sub_sketch('a') - expected = sa.unpack(column_name_prefix="")['a'] + s = sa.summary(sub_sketch_keys="a").element_sub_sketch("a") + expected = sa.unpack(column_name_prefix="")["a"] self.__validate_sketch_result(s, expected) - s = sa.summary(sub_sketch_keys ='Nonexist').element_sub_sketch('Nonexist') + s = sa.summary(sub_sketch_keys="Nonexist").element_sub_sketch("Nonexist") self.assertEqual(s.num_missing(), len(sa)) # sub sketch with multiple keys - keys = ['a', 'b'] - s = sa.summary(sub_sketch_keys =keys).element_sub_sketch(keys) + keys = ["a", "b"] + s = sa.summary(sub_sketch_keys=keys).element_sub_sketch(keys) self.assertEqual(len(s), len(keys)) for key in keys: self.assertTrue(key in s) @@ -220,7 +250,7 @@ def test_dict_sketch_str_value(self): self.__validate_sketch_result(s[key], expected) # allow pass in empty keys, which will retrieve all keys - s = sa.summary(sub_sketch_keys =keys).element_sub_sketch() + s = sa.summary(sub_sketch_keys=keys).element_sub_sketch() self.assertEqual(len(s), len(keys)) for key in keys: self.assertTrue(key in s) @@ -228,7 +258,7 @@ def test_dict_sketch_str_value(self): self.__validate_sketch_result(s[key], expected) def test_dict_many_nones(self): - sa = SArray([None] * 200 + [{'a':'b'}]) + sa = SArray([None] * 200 + [{"a": "b"}]) self.assertEqual(sa.summary().num_elements_processed(), 201) def test_str_sketch(self): @@ -236,24 +266,24 @@ def test_str_sketch(self): sa = SArray(data=str_data) sketch = sa.summary() with self.assertRaises(RuntimeError): - sketch.min() + sketch.min() with self.assertRaises(RuntimeError): - sketch.max() + sketch.max() with self.assertRaises(RuntimeError): - sketch.sum() + sketch.sum() with self.assertRaises(RuntimeError): - sketch.mean() + sketch.mean() with self.assertRaises(RuntimeError): - sketch.var() + sketch.var() with self.assertRaises(RuntimeError): - sketch.std() + sketch.std() self.assertAlmostEqual(sketch.num_unique(), 10, delta=3) self.assertEqual(sketch.num_missing(), 1) self.assertEqual(sketch.size(), len(str_data)) with self.assertRaises(RuntimeError): - sketch.quantile(0.5) + sketch.quantile(0.5) self.assertEqual(sketch.frequency_count("1"), 1) self.assertEqual(sketch.frequency_count("2"), 1) t = sketch.frequent_items() @@ -270,10 +300,10 @@ def test_empty_sketch(self): self.assertEqual(sketch.var(), 0) self.assertEqual(sketch.std(), 0) self.assertEqual(sketch.num_unique(), 0) - self.assertEqual(sketch.num_missing(),0) + self.assertEqual(sketch.num_missing(), 0) self.assertEqual(sketch.size(), 0) with self.assertRaises(RuntimeError): - sketch.quantile(0.5) + sketch.quantile(0.5) t = sketch.frequent_items() self.assertEqual(len(t), 0) @@ -281,10 +311,10 @@ def test_empty_sketch(self): def test_large_value_sketch(self): sa = SArray([1234567890 for i in range(100)]) sk = sa.summary() - self.__validate_sketch_result(sa.summary(), sa, 1E-5) + self.__validate_sketch_result(sa.summary(), sa, 1e-5) def test_cancellation(self): - sa = SArray(range(1,10000)) + sa = SArray(range(1, 10000)) s = sa.summary(background=True) s.cancel() # this can be rather non-deterministic, so there is very little diff --git a/src/python/turicreate/test/test_sframe.py b/src/python/turicreate/test/test_sframe.py index b1c3471635..96d172d183 100644 --- a/src/python/turicreate/test/test_sframe.py +++ b/src/python/turicreate/test/test_sframe.py @@ -39,39 +39,66 @@ class SFrameTest(unittest.TestCase): def setUp(self): self.int_data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] - self.float_data = [1., 2., 3., 4., 5., 6., 7., 8., 9., 10.] + self.float_data = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0] self.string_data = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"] self.a_to_z = [str(chr(97 + i)) for i in range(0, 26)] - self.dataframe = pd.DataFrame({'int_data': self.int_data, 'float_data': self.float_data, 'string_data': self.string_data}) - - self.int_data2 = range(50,60) - self.float_data2 = [1.0 * i for i in range(50,60)] - self.string_data2 = [str(i) for i in range(50,60)] - self.dataframe2 = pd.DataFrame({'int_data': self.int_data2, 'float_data': self.float_data2, 'string_data': self.string_data2}) - self.vec_data = [array.array('d', [i, i+1]) for i in self.int_data] + self.dataframe = pd.DataFrame( + { + "int_data": self.int_data, + "float_data": self.float_data, + "string_data": self.string_data, + } + ) + + self.int_data2 = range(50, 60) + self.float_data2 = [1.0 * i for i in range(50, 60)] + self.string_data2 = [str(i) for i in range(50, 60)] + self.dataframe2 = pd.DataFrame( + { + "int_data": self.int_data2, + "float_data": self.float_data2, + "string_data": self.string_data2, + } + ) + self.vec_data = [array.array("d", [i, i + 1]) for i in self.int_data] self.list_data = [[i, str(i), i * 1.0] for i in self.int_data] - self.dict_data = [{str(i): i, i : float(i)} for i in self.int_data] - self.datetime_data = [dt.datetime(2013, 5, 7, 10, 4, 10), - dt.datetime(1902, 10, 21, 10, 34, 10).replace(tzinfo=GMT(0.0))] - self.all_type_cols = [self.int_data, - self.float_data, - self.string_data, - self.vec_data, - self.list_data, - self.dict_data, - self.datetime_data*5] - self.sf_all_types = SFrame({"X"+str(i[0]):i[1] for i in zip(range(1,8), - self.all_type_cols)}) + self.dict_data = [{str(i): i, i: float(i)} for i in self.int_data] + self.datetime_data = [ + dt.datetime(2013, 5, 7, 10, 4, 10), + dt.datetime(1902, 10, 21, 10, 34, 10).replace(tzinfo=GMT(0.0)), + ] + self.all_type_cols = [ + self.int_data, + self.float_data, + self.string_data, + self.vec_data, + self.list_data, + self.dict_data, + self.datetime_data * 5, + ] + self.sf_all_types = SFrame( + {"X" + str(i[0]): i[1] for i in zip(range(1, 8), self.all_type_cols)} + ) # Taken from http://en.wikipedia.org/wiki/Join_(SQL) for fun. self.employees_sf = SFrame() - self.employees_sf.add_column(SArray(['Rafferty','Jones','Heisenberg','Robinson','Smith','John']), 'last_name', inplace=True) - self.employees_sf.add_column(SArray([31,33,33,34,34,None]), 'dep_id', inplace=True) + self.employees_sf.add_column( + SArray(["Rafferty", "Jones", "Heisenberg", "Robinson", "Smith", "John"]), + "last_name", + inplace=True, + ) + self.employees_sf.add_column( + SArray([31, 33, 33, 34, 34, None]), "dep_id", inplace=True + ) # XXX: below are only used by one test! self.departments_sf = SFrame() - self.departments_sf.add_column(SArray([31,33,34,35]), 'dep_id', inplace=True) - self.departments_sf.add_column(SArray(['Sales','Engineering','Clerical','Marketing']), 'dep_name', inplace=True) + self.departments_sf.add_column(SArray([31, 33, 34, 35]), "dep_id", inplace=True) + self.departments_sf.add_column( + SArray(["Sales", "Engineering", "Clerical", "Marketing"]), + "dep_name", + inplace=True, + ) def __assert_sarray_equal(self, sa1, sa2): l1 = list(sa1) @@ -89,12 +116,13 @@ def __assert_sarray_equal(self, sa1, sa2): self.assertTrue(key in v1) self.assertEqual(v1[key], v2[key]) - elif (hasattr(v1, "__iter__")): + elif hasattr(v1, "__iter__"): self.assertEqual(len(v1), len(v2)) for j in range(len(v1)): - t1 = v1[j]; t2 = v2[j] - if (type(t1) == float): - if (math.isnan(t1)): + t1 = v1[j] + t2 = v2[j] + if type(t1) == float: + if math.isnan(t1): self.assertTrue(math.isnan(t2)) else: self.assertEqual(t1, t2) @@ -106,39 +134,46 @@ def __assert_sarray_equal(self, sa1, sa2): def test_split_datetime(self): from_zone = GMT(0) to_zone = GMT(4.5) - utc = dt.datetime.strptime('2011-01-21 02:37:21', '%Y-%m-%d %H:%M:%S') + utc = dt.datetime.strptime("2011-01-21 02:37:21", "%Y-%m-%d %H:%M:%S") utc = utc.replace(tzinfo=from_zone) central = utc.astimezone(to_zone) - sa = SArray([utc,central]) + sa = SArray([utc, central]) expected = SFrame() - expected ['X.year'] = [2011,2011] - expected ['X.month'] = [1,1] - expected ['X.day'] = [21,21] - expected ['X.hour'] = [2,7] - expected ['X.minute'] = [37,7] - expected ['X.second'] = [21,21] - expected ['X.timezone'] = [0.0,4.5] + expected["X.year"] = [2011, 2011] + expected["X.month"] = [1, 1] + expected["X.day"] = [21, 21] + expected["X.hour"] = [2, 7] + expected["X.minute"] = [37, 7] + expected["X.second"] = [21, 21] + expected["X.timezone"] = [0.0, 4.5] result = sa.split_datetime(timezone=True) assert_frame_equal(result.to_dataframe(), expected.to_dataframe()) # column names expected = SFrame() - expected ['ttt.year'] = [2011,2011] - expected ['ttt.minute'] = [37,7] - expected ['ttt.second'] = [21,21] - - result = sa.split_datetime(column_name_prefix='ttt',limit=['year','minute','second']) - self.assertEqual(result.column_names(), ['ttt.year', 'ttt.minute', 'ttt.second']) + expected["ttt.year"] = [2011, 2011] + expected["ttt.minute"] = [37, 7] + expected["ttt.second"] = [21, 21] + + result = sa.split_datetime( + column_name_prefix="ttt", limit=["year", "minute", "second"] + ) + self.assertEqual( + result.column_names(), ["ttt.year", "ttt.minute", "ttt.second"] + ) assert_frame_equal(result.to_dataframe(), expected.to_dataframe()) - sf = SFrame({'datetime': sa}) - result = sf.split_datetime('datetime', column_name_prefix='ttt',limit=['year','minute','second']) - self.assertEqual(result.column_names(), ['ttt.year', 'ttt.minute', 'ttt.second']) + sf = SFrame({"datetime": sa}) + result = sf.split_datetime( + "datetime", column_name_prefix="ttt", limit=["year", "minute", "second"] + ) + self.assertEqual( + result.column_names(), ["ttt.year", "ttt.minute", "ttt.second"] + ) assert_frame_equal(result.to_dataframe(), expected.to_dataframe()) - def __test_equal(self, sf, df): # asserts two frames are equal, ignoring column ordering. self.assertEqual(sf.num_rows(), df.shape[0]) @@ -149,14 +184,14 @@ def __create_test_df(self, size): int_data = [] float_data = [] string_data = [] - for i in range(0,size): + for i in range(0, size): int_data.append(i) float_data.append(float(i)) string_data.append(str(i)) - return pd.DataFrame({'int_data': int_data, - 'float_data': float_data, - 'string_data': string_data}) + return pd.DataFrame( + {"int_data": int_data, "float_data": float_data, "string_data": string_data} + ) # Test if the rows are all the same...row order does not matter. # (I do expect column order to be the same) @@ -168,33 +203,38 @@ def test_creation_from_dataframe(self): sf_empty = SFrame(data=pd.DataFrame()) self.__test_equal(sf_empty, pd.DataFrame()) - sf = SFrame(data=self.dataframe, format='dataframe') + sf = SFrame(data=self.dataframe, format="dataframe") self.__test_equal(sf, self.dataframe) - sf = SFrame(data=self.dataframe, format='auto') + sf = SFrame(data=self.dataframe, format="auto") self.__test_equal(sf, self.dataframe) - original_p = pd.DataFrame({'a':[1.0, float('nan')]}) - effective_p = pd.DataFrame({'a':[1.0, None]}) + original_p = pd.DataFrame({"a": [1.0, float("nan")]}) + effective_p = pd.DataFrame({"a": [1.0, None]}) sf = SFrame(data=original_p) self.__test_equal(sf, effective_p) - original_p = pd.DataFrame({'a':['a',None,'b']}) + original_p = pd.DataFrame({"a": ["a", None, "b"]}) sf = SFrame(data=original_p) self.__test_equal(sf, original_p) def test_auto_parse_csv_with_bom(self): - with tempfile.NamedTemporaryFile(mode='w', delete=False) as csvfile: - df = pd.DataFrame({'float_data': self.float_data, - 'int_data': self.int_data, - 'string_data': self.a_to_z[:len(self.int_data)]}) + with tempfile.NamedTemporaryFile(mode="w", delete=False) as csvfile: + df = pd.DataFrame( + { + "float_data": self.float_data, + "int_data": self.int_data, + "string_data": self.a_to_z[: len(self.int_data)], + } + ) df.to_csv(csvfile, index=False) csvfile.close() import codecs - with open(csvfile.name, 'rb') as f: + + with open(csvfile.name, "rb") as f: content = f.read() - with open(csvfile.name, 'wb') as f: + with open(csvfile.name, "wb") as f: f.write(codecs.BOM_UTF8) f.write(content) @@ -203,10 +243,14 @@ def test_auto_parse_csv_with_bom(self): self.__test_equal(sf, df) def test_auto_parse_csv(self): - with tempfile.NamedTemporaryFile(mode='w', delete=False) as csvfile: - df = pd.DataFrame({'float_data': self.float_data, - 'int_data': self.int_data, - 'string_data': self.a_to_z[:len(self.int_data)]}) + with tempfile.NamedTemporaryFile(mode="w", delete=False) as csvfile: + df = pd.DataFrame( + { + "float_data": self.float_data, + "int_data": self.int_data, + "string_data": self.a_to_z[: len(self.int_data)], + } + ) df.to_csv(csvfile, index=False) csvfile.close() @@ -216,91 +260,107 @@ def test_auto_parse_csv(self): self.__test_equal(sf, df) def test_drop_duplicate(self): - sf = SFrame({'A': ['a', 'b', 'a','C'], 'B': ['b', 'a', 'b','D'], 'C': [1, 2, 1,8]}) - df = pd.DataFrame({'A': ['a', 'b', 'a','C'], 'B': ['b', 'a', 'b','D'], 'C': [1, 2, 1,8]}) - sf1=sf.drop_duplicates(subset=["A","B"]) - sf1=sf1.topk("C",reverse=True) - df1=df.drop_duplicates(subset=["A","B"]).reset_index(drop=True) - self.__test_equal(sf1,df1) - + sf = SFrame( + {"A": ["a", "b", "a", "C"], "B": ["b", "a", "b", "D"], "C": [1, 2, 1, 8]} + ) + df = pd.DataFrame( + {"A": ["a", "b", "a", "C"], "B": ["b", "a", "b", "D"], "C": [1, 2, 1, 8]} + ) + sf1 = sf.drop_duplicates(subset=["A", "B"]) + sf1 = sf1.topk("C", reverse=True) + df1 = df.drop_duplicates(subset=["A", "B"]).reset_index(drop=True) + self.__test_equal(sf1, df1) + def test_parse_csv(self): - with tempfile.NamedTemporaryFile(mode='w', delete=False) as csvfile: + with tempfile.NamedTemporaryFile(mode="w", delete=False) as csvfile: self.dataframe.to_csv(csvfile, index=False) csvfile.close() # list type hints - sf = SFrame.read_csv(csvfile.name, - column_type_hints=[int, int, str]) + sf = SFrame.read_csv(csvfile.name, column_type_hints=[int, int, str]) self.assertEqual(sf.dtype, [int, int, str]) - sf['int_data'] = sf['int_data'].astype(int) - sf['float_data'] = sf['float_data'].astype(float) - sf['string_data'] = sf['string_data'].astype(str) + sf["int_data"] = sf["int_data"].astype(int) + sf["float_data"] = sf["float_data"].astype(float) + sf["string_data"] = sf["string_data"].astype(str) self.__test_equal(sf, self.dataframe) # list type hints, incorrect number of columns - self.assertRaises(RuntimeError, - lambda: SFrame.read_csv(csvfile.name, - column_type_hints=[int, float])) + self.assertRaises( + RuntimeError, + lambda: SFrame.read_csv(csvfile.name, column_type_hints=[int, float]), + ) # dictionary type hints - sf = SFrame.read_csv(csvfile.name, - column_type_hints={'int_data': int, - 'float_data': float, - 'string_data': str}) + sf = SFrame.read_csv( + csvfile.name, + column_type_hints={ + "int_data": int, + "float_data": float, + "string_data": str, + }, + ) self.__test_equal(sf, self.dataframe) # partial dictionary type hints - sf = SFrame.read_csv(csvfile.name, - column_type_hints={'float_data': float, - 'string_data': str}) + sf = SFrame.read_csv( + csvfile.name, + column_type_hints={"float_data": float, "string_data": str}, + ) self.__test_equal(sf, self.dataframe) # single value type hints sf = SFrame.read_csv(csvfile.name, column_type_hints=str) self.assertEqual(sf.dtype, [str, str, str]) - all_string_column_df = self.dataframe.apply(lambda x: [str(ele) for ele in x]) + all_string_column_df = self.dataframe.apply( + lambda x: [str(ele) for ele in x] + ) self.__test_equal(sf, all_string_column_df) # single value type hints row limit sf = SFrame.read_csv(csvfile.name, column_type_hints=str, nrows=5) self.assertEqual(sf.dtype, [str, str, str]) - all_string_column_df = self.dataframe.apply(lambda x: [str(ele) for ele in x]) + all_string_column_df = self.dataframe.apply( + lambda x: [str(ele) for ele in x] + ) self.assertEqual(len(sf), 5) - self.__test_equal(sf, all_string_column_df[0:len(sf)]) - + self.__test_equal(sf, all_string_column_df[0 : len(sf)]) sf = SFrame.read_csv(csvfile.name) - sf2 = SFrame(csvfile.name, format='csv') + sf2 = SFrame(csvfile.name, format="csv") self.__test_equal(sf2, sf.to_dataframe()) f = open(csvfile.name, "w") - f.write('a,b,c\n') - f.write('NA,PIKA,CHU\n') - f.write('1.0,2,3\n') + f.write("a,b,c\n") + f.write("NA,PIKA,CHU\n") + f.write("1.0,2,3\n") f.close() - sf = SFrame.read_csv(csvfile.name, - na_values=['NA','PIKA','CHU'], - column_type_hints={'a':float,'b':int,'c':str}) - t = list(sf['a']) + sf = SFrame.read_csv( + csvfile.name, + na_values=["NA", "PIKA", "CHU"], + column_type_hints={"a": float, "b": int, "c": str}, + ) + t = list(sf["a"]) self.assertEqual(t[0], None) self.assertEqual(t[1], 1.0) - t = list(sf['b']) + t = list(sf["b"]) self.assertEqual(t[0], None) self.assertEqual(t[1], 2) - t = list(sf['c']) + t = list(sf["c"]) self.assertEqual(t[0], None) self.assertEqual(t[1], "3") def test_parse_csv_non_multi_line_unmatched_quotation(self): - data = [{'type': 'foo', 'text_string': 'foo foo.'}, - {'type': 'bar', 'text_string': 'bar " bar.'}, - {'type': 'foo', 'text_string': 'foo".'}] + data = [ + {"type": "foo", "text_string": "foo foo."}, + {"type": "bar", "text_string": 'bar " bar.'}, + {"type": "foo", "text_string": 'foo".'}, + ] - with tempfile.NamedTemporaryFile(mode='w', delete=False) as csvfile: - with open(csvfile.name, 'w') as f: - f.write("type,text_string\n") # header + with tempfile.NamedTemporaryFile(mode="w", delete=False) as csvfile: + with open(csvfile.name, "w") as f: + f.write("type,text_string\n") # header for l in data: - f.write(l['type'] + ',' + l['text_string'] + '\n') + f.write(l["type"] + "," + l["text_string"] + "\n") sf = SFrame.read_csv(csvfile.name, quote_char=None) self.assertEqual(len(sf), len(data)) @@ -311,7 +371,7 @@ def test_save_load_file_cleanup(self): # when some file is in use, file should not be deleted with util.TempDirectory() as f: sf = SFrame() - sf['a'] = SArray(range(1,1000000)) + sf["a"] = SArray(range(1, 1000000)) sf.save(f) # many for each sarray, 1 sframe_idx, 1 object.bin, 1 ini @@ -323,8 +383,8 @@ def test_save_load_file_cleanup(self): # create another SFrame and save to the same location sf2 = SFrame() - sf2['b'] = SArray([str(i) for i in range(1,100000)]) - sf2['c'] = SArray(range(1, 100000)) + sf2["b"] = SArray([str(i) for i in range(1, 100000)]) + sf2["c"] = SArray(range(1, 100000)) sf2.save(f) file_count = len(os.listdir(f)) @@ -347,57 +407,58 @@ def test_save_load(self): # Check top level load function, with no suffix with util.TempDirectory() as f: - sf = SFrame(data=self.dataframe, format='dataframe') + sf = SFrame(data=self.dataframe, format="dataframe") sf.save(f) sf2 = load_sframe(f) self.__test_equal(sf2, self.dataframe) # Check individual formats with the SFrame constructor - formats = ['.csv'] + formats = [".csv"] for suffix in formats: f = tempfile.NamedTemporaryFile(suffix=suffix, delete=False) - sf = SFrame(data=self.dataframe, format='dataframe') + sf = SFrame(data=self.dataframe, format="dataframe") sf.save(f.name) sf2 = SFrame(f.name) - sf2['int_data'] = sf2['int_data'].astype(int) - sf2['float_data'] = sf2['float_data'].astype(float) - sf2['string_data'] = sf2['string_data'].astype(str) + sf2["int_data"] = sf2["int_data"].astype(int) + sf2["float_data"] = sf2["float_data"].astype(float) + sf2["string_data"] = sf2["string_data"].astype(str) self.__test_equal(sf2, self.dataframe) - g=SArray([['a','b',3],[{'a':'b'}],[1,2,3]]) - g2=SFrame() - g2['x']=g + g = SArray([["a", "b", 3], [{"a": "b"}], [1, 2, 3]]) + g2 = SFrame() + g2["x"] = g g2.save(f.name) - g3=SFrame.read_csv(f.name,column_type_hints=list) + g3 = SFrame.read_csv(f.name, column_type_hints=list) self.__test_equal(g2, g3.to_dataframe()) f.close() os.unlink(f.name) # Make sure this file don't exist before testing - self.assertRaises(IOError, lambda: SFrame(data='__no_such_file__.frame_idx', format='sframe')) + self.assertRaises( + IOError, lambda: SFrame(data="__no_such_file__.frame_idx", format="sframe") + ) del sf2 - def test_save_load_reference(self): # Check top level load function, with no suffix with util.TempDirectory() as f: - sf = SFrame(data=self.dataframe, format='dataframe') + sf = SFrame(data=self.dataframe, format="dataframe") originallen = len(sf) sf.save(f) del sf sf = SFrame(f) # make a new column of "1s and save it back - int_data2 = sf['int_data'] + 1 + int_data2 = sf["int_data"] + 1 int_data2.materialize() - sf['int_data2'] = int_data2 + sf["int_data2"] = int_data2 sf._save_reference(f) del sf sf = SFrame(f) - self.assertTrue(((sf['int_data2'] - sf['int_data']) == 1).all()) + self.assertTrue(((sf["int_data2"] - sf["int_data"]) == 1).all()) # try to append and save reference expected = sf.to_dataframe() @@ -405,111 +466,234 @@ def test_save_load_reference(self): sf._save_reference(f) sf = SFrame(f) - self.assertTrue(((sf['int_data2'] - sf['int_data']) == 1).all()) + self.assertTrue(((sf["int_data2"] - sf["int_data"]) == 1).all()) self.assertEqual(2 * originallen, len(sf)) assert_frame_equal(sf[originallen:].to_dataframe(), expected) assert_frame_equal(sf[:originallen].to_dataframe(), expected) def test_save_to_csv(self): - f = tempfile.NamedTemporaryFile(suffix='.csv', delete=False) - sf = SFrame(data=self.dataframe, format='dataframe') - sf.save(f.name, format='csv') - sf2 = SFrame.read_csv(f.name, column_type_hints={'int_data': int, 'float_data': float, 'string_data': str}) + f = tempfile.NamedTemporaryFile(suffix=".csv", delete=False) + sf = SFrame(data=self.dataframe, format="dataframe") + sf.save(f.name, format="csv") + sf2 = SFrame.read_csv( + f.name, + column_type_hints={ + "int_data": int, + "float_data": float, + "string_data": str, + }, + ) self.__test_equal(sf2, self.dataframe) - sf.export_csv(f.name, delimiter=':') - sf2 = SFrame.read_csv(f.name, column_type_hints={'int_data': int, 'float_data': float, 'string_data': str}, delimiter=':') + sf.export_csv(f.name, delimiter=":") + sf2 = SFrame.read_csv( + f.name, + column_type_hints={ + "int_data": int, + "float_data": float, + "string_data": str, + }, + delimiter=":", + ) self.__test_equal(sf2, self.dataframe) - sf.export_csv(f.name, delimiter=':', line_terminator='\r\n') - sf2 = SFrame.read_csv(f.name, column_type_hints={'int_data': int, 'float_data': float, 'string_data': str}, delimiter=':', line_terminator='\r\n') + sf.export_csv(f.name, delimiter=":", line_terminator="\r\n") + sf2 = SFrame.read_csv( + f.name, + column_type_hints={ + "int_data": int, + "float_data": float, + "string_data": str, + }, + delimiter=":", + line_terminator="\r\n", + ) self.__test_equal(sf2, self.dataframe) - sf.export_csv(f.name, delimiter=':', line_terminator='\r\n', double_quote=False) - sf2 = SFrame.read_csv(f.name, column_type_hints={'int_data': int, 'float_data': float, 'string_data': str}, delimiter=':', line_terminator='\r\n', double_quote=False) + sf.export_csv(f.name, delimiter=":", line_terminator="\r\n", double_quote=False) + sf2 = SFrame.read_csv( + f.name, + column_type_hints={ + "int_data": int, + "float_data": float, + "string_data": str, + }, + delimiter=":", + line_terminator="\r\n", + double_quote=False, + ) self.__test_equal(sf2, self.dataframe) - sf.export_csv(f.name, delimiter=':', line_terminator='\r\n', double_quote=False, quote_char='\'') - sf2 = SFrame.read_csv(f.name, column_type_hints={'int_data': int, 'float_data': float, 'string_data': str}, delimiter=':', line_terminator='\r\n', double_quote=False, quote_char='\'') + sf.export_csv( + f.name, + delimiter=":", + line_terminator="\r\n", + double_quote=False, + quote_char="'", + ) + sf2 = SFrame.read_csv( + f.name, + column_type_hints={ + "int_data": int, + "float_data": float, + "string_data": str, + }, + delimiter=":", + line_terminator="\r\n", + double_quote=False, + quote_char="'", + ) self.__test_equal(sf2, self.dataframe) import csv - sf.export_csv(f.name, delimiter=':', line_terminator='\r\n', double_quote=False, quote_char='\'', quote_level=csv.QUOTE_MINIMAL) - sf2 = SFrame.read_csv(f.name, column_type_hints={'int_data': int, 'float_data': float, 'string_data': str}, delimiter=':', line_terminator='\r\n', double_quote=False, quote_char='\'') - self.__test_equal(sf2, self.dataframe) - sf.export_csv(f.name, delimiter=':', line_terminator='\r\n', double_quote=False, quote_char='\'', quote_level=csv.QUOTE_ALL) - sf2 = SFrame.read_csv(f.name, column_type_hints={'int_data': int, 'float_data': float, 'string_data': str}, delimiter=':', line_terminator='\r\n', double_quote=False, quote_char='\'') + sf.export_csv( + f.name, + delimiter=":", + line_terminator="\r\n", + double_quote=False, + quote_char="'", + quote_level=csv.QUOTE_MINIMAL, + ) + sf2 = SFrame.read_csv( + f.name, + column_type_hints={ + "int_data": int, + "float_data": float, + "string_data": str, + }, + delimiter=":", + line_terminator="\r\n", + double_quote=False, + quote_char="'", + ) self.__test_equal(sf2, self.dataframe) + sf.export_csv( + f.name, + delimiter=":", + line_terminator="\r\n", + double_quote=False, + quote_char="'", + quote_level=csv.QUOTE_ALL, + ) + sf2 = SFrame.read_csv( + f.name, + column_type_hints={ + "int_data": int, + "float_data": float, + "string_data": str, + }, + delimiter=":", + line_terminator="\r\n", + double_quote=False, + quote_char="'", + ) + self.__test_equal(sf2, self.dataframe) - sf.export_csv(f.name, delimiter=':', line_terminator='\r\n', double_quote=False, quote_char='\'', quote_level=csv.QUOTE_NONE) - sf2 = SFrame.read_csv(f.name, column_type_hints={'int_data': int, 'float_data': float, 'string_data': str}, delimiter=':', line_terminator='\r\n', double_quote=False, quote_char='\'') + sf.export_csv( + f.name, + delimiter=":", + line_terminator="\r\n", + double_quote=False, + quote_char="'", + quote_level=csv.QUOTE_NONE, + ) + sf2 = SFrame.read_csv( + f.name, + column_type_hints={ + "int_data": int, + "float_data": float, + "string_data": str, + }, + delimiter=":", + line_terminator="\r\n", + double_quote=False, + quote_char="'", + ) self.__test_equal(sf2, self.dataframe) # Pandas compatibility options - sf.export_csv(f.name, sep=':', lineterminator='\r\n', doublequote=False, quotechar='\'', quote_level=csv.QUOTE_NONE) - sf2 = SFrame.read_csv(f.name, column_type_hints={'int_data': int, 'float_data': float, 'string_data': str}, sep=':', lineterminator='\r\n', doublequote=False, quotechar='\'') + sf.export_csv( + f.name, + sep=":", + lineterminator="\r\n", + doublequote=False, + quotechar="'", + quote_level=csv.QUOTE_NONE, + ) + sf2 = SFrame.read_csv( + f.name, + column_type_hints={ + "int_data": int, + "float_data": float, + "string_data": str, + }, + sep=":", + lineterminator="\r\n", + doublequote=False, + quotechar="'", + ) self.__test_equal(sf2, self.dataframe) f.close() os.unlink(f.name) def test_save_to_json(self): - f = tempfile.NamedTemporaryFile(suffix='.json', delete=False) - sf = SFrame(data=self.dataframe, format='dataframe') - sf.save(f.name, format='json') + f = tempfile.NamedTemporaryFile(suffix=".json", delete=False) + sf = SFrame(data=self.dataframe, format="dataframe") + sf.save(f.name, format="json") sf2 = SFrame.read_json(f.name) # the float column will be parsed as integer - sf2['float_data'] = sf2['float_data'].astype(float) + sf2["float_data"] = sf2["float_data"].astype(float) self.__test_equal(sf2, self.dataframe) - sf = SFrame(data=self.dataframe, format='dataframe') + sf = SFrame(data=self.dataframe, format="dataframe") sf.export_json(f.name) sf2 = SFrame.read_json(f.name) - sf2['float_data'] = sf2['float_data'].astype(float) + sf2["float_data"] = sf2["float_data"].astype(float) self.__test_equal(sf2, self.dataframe) - with open(f.name, 'w') as out: - out.write('[\n]') + with open(f.name, "w") as out: + out.write("[\n]") sf = SFrame.read_json(f.name) self.__test_equal(SFrame(), sf.to_dataframe()) - with open(f.name, 'w') as out: - out.write('') - sf = SFrame.read_json(f.name, orient='lines') + with open(f.name, "w") as out: + out.write("") + sf = SFrame.read_json(f.name, orient="lines") self.__test_equal(SFrame(), sf.to_dataframe()) - sf = SFrame(data=self.dataframe, format='dataframe') - sf.export_json(f.name, orient='lines') - sf2 = SFrame.read_json(f.name, orient='lines') - sf2['float_data'] = sf2['float_data'].astype(float) + sf = SFrame(data=self.dataframe, format="dataframe") + sf.export_json(f.name, orient="lines") + sf2 = SFrame.read_json(f.name, orient="lines") + sf2["float_data"] = sf2["float_data"].astype(float) self.__test_equal(sf2, self.dataframe) f.close() os.unlink(f.name) def _remove_sframe_files(self, prefix): - filelist = [ f for f in os.listdir(".") if f.startswith(prefix) ] + filelist = [f for f in os.listdir(".") if f.startswith(prefix)] for f in filelist: os.remove(f) def test_creation_from_txt(self): - f = tempfile.NamedTemporaryFile(suffix='.txt', delete=False) - df = self.dataframe[['string_data']] + f = tempfile.NamedTemporaryFile(suffix=".txt", delete=False) + df = self.dataframe[["string_data"]] df.to_csv(f.name, index=False) sf = SFrame(f.name) - self.assertEqual(sf['string_data'].dtype, int) - sf['string_data'] = sf['string_data'].astype(str) + self.assertEqual(sf["string_data"].dtype, int) + sf["string_data"] = sf["string_data"].astype(str) self.__test_equal(sf, df) - fgzip = tempfile.NamedTemporaryFile(suffix='.txt.gz', delete=False) - f_in = open(f.name, 'rb') - f_out = gzip.open(fgzip.name, 'wb') + fgzip = tempfile.NamedTemporaryFile(suffix=".txt.gz", delete=False) + f_in = open(f.name, "rb") + f_out = gzip.open(fgzip.name, "wb") f_out.writelines(f_in) f_out.close() f_in.close() sf = SFrame(fgzip.name) - self.assertEqual(sf['string_data'].dtype, int) - sf['string_data'] = sf['string_data'].astype(str) + self.assertEqual(sf["string_data"].dtype, int) + sf["string_data"] = sf["string_data"].astype(str) self.__test_equal(sf, df) fgzip.close() @@ -518,46 +702,50 @@ def test_creation_from_txt(self): os.unlink(f.name) def test_creation_from_csv_on_local(self): - if os.path.exists('./foo.csv'): - os.remove('./foo.csv') - with open('./foo.csv', 'w') as f: + if os.path.exists("./foo.csv"): + os.remove("./foo.csv") + with open("./foo.csv", "w") as f: url = f.name basesf = SFrame(self.dataframe) basesf.save(url, format="csv") f.close() - sf = SFrame('./foo.csv') - self.assertEqual(sf['float_data'].dtype, int) - sf['float_data'] = sf['float_data'].astype(float) - self.assertEqual(sf['string_data'].dtype, int) - sf['string_data'] = sf['string_data'].astype(str) + sf = SFrame("./foo.csv") + self.assertEqual(sf["float_data"].dtype, int) + sf["float_data"] = sf["float_data"].astype(float) + self.assertEqual(sf["string_data"].dtype, int) + sf["string_data"] = sf["string_data"].astype(str) self.__test_equal(sf, self.dataframe) sf = SFrame(url) - self.assertEqual(sf['float_data'].dtype, int) - sf['float_data'] = sf['float_data'].astype(float) - self.assertEqual(sf['string_data'].dtype, int) - sf['string_data'] = sf['string_data'].astype(str) + self.assertEqual(sf["float_data"].dtype, int) + sf["float_data"] = sf["float_data"].astype(float) + self.assertEqual(sf["string_data"].dtype, int) + sf["string_data"] = sf["string_data"].astype(str) self.__test_equal(sf, self.dataframe) os.remove(url) def test_alternate_line_endings(self): # test Windows line endings - if os.path.exists('./windows_lines.csv'): - os.remove('./windows_lines.csv') + if os.path.exists("./windows_lines.csv"): + os.remove("./windows_lines.csv") windows_file_url = None - with open('./windows_lines.csv', 'w') as f: + with open("./windows_lines.csv", "w") as f: windows_file_url = f.name - def_writer = csv.writer(f, dialect='excel') - column_list = ['numbers'] + def_writer = csv.writer(f, dialect="excel") + column_list = ["numbers"] def_writer.writerow(column_list) for i in self.int_data: def_writer.writerow([i]) - sf = SFrame.read_csv('./windows_lines.csv', column_type_hints={'numbers':int}) + sf = SFrame.read_csv("./windows_lines.csv", column_type_hints={"numbers": int}) self.assertEqual(sf.column_names(), column_list) self.assertEqual(sf.column_types(), [int]) - self.assertEqual(list(sf['numbers'].head()), self.int_data) + self.assertEqual(list(sf["numbers"].head()), self.int_data) - sf = SFrame.read_csv('./windows_lines.csv', column_type_hints={'numbers':list}, error_bad_lines=False) + sf = SFrame.read_csv( + "./windows_lines.csv", + column_type_hints={"numbers": list}, + error_bad_lines=False, + ) self.assertEqual(sf.column_names(), column_list) self.assertEqual(sf.num_rows(), 0) @@ -565,25 +753,32 @@ def test_alternate_line_endings(self): def test_skip_rows(self): # test line skipping - if os.path.exists('./skip_lines.csv'): - os.remove('./skip_lines.csv') + if os.path.exists("./skip_lines.csv"): + os.remove("./skip_lines.csv") skip_file_url = None - with open('./skip_lines.csv', 'w') as f: + with open("./skip_lines.csv", "w") as f: f.write("trash\n") f.write("junk\n") skip_file_url = f.name - def_writer = csv.writer(f, dialect='excel') - column_list = ['numbers'] + def_writer = csv.writer(f, dialect="excel") + column_list = ["numbers"] def_writer.writerow(column_list) for i in self.int_data: def_writer.writerow([i]) - sf = SFrame.read_csv('./skip_lines.csv', skiprows=2, column_type_hints={'numbers':int}) + sf = SFrame.read_csv( + "./skip_lines.csv", skiprows=2, column_type_hints={"numbers": int} + ) self.assertEqual(sf.column_names(), column_list) self.assertEqual(sf.column_types(), [int]) - self.assertEqual(list(sf['numbers'].head()), self.int_data) - - sf = SFrame.read_csv('./skip_lines.csv', skiprows=2, column_type_hints={'numbers':list}, error_bad_lines=False) + self.assertEqual(list(sf["numbers"].head()), self.int_data) + + sf = SFrame.read_csv( + "./skip_lines.csv", + skiprows=2, + column_type_hints={"numbers": list}, + error_bad_lines=False, + ) self.assertEqual(sf.column_names(), column_list) self.assertEqual(sf.num_rows(), 0) @@ -597,7 +792,7 @@ def test_creation_from_csv_dir_local(self): os.mkdir(csv_dir) for i in range(0, 100): - with open(os.path.join(csv_dir, 'foo.%d.csv' % i), 'w') as f: + with open(os.path.join(csv_dir, "foo.%d.csv" % i), "w") as f: url = f.name self.dataframe.to_csv(url, index=False) f.close() @@ -621,50 +816,60 @@ def test_creation_from_csv_dir_local(self): def test_creation_from_iterable(self): # Normal dict of lists - the_dict = {'ints':self.int_data,'floats':self.float_data,'strings':self.string_data} + the_dict = { + "ints": self.int_data, + "floats": self.float_data, + "strings": self.string_data, + } sf = SFrame(the_dict) df = pd.DataFrame(the_dict) self.__test_equal(sf, df) # Test that a missing value does not change the data type - the_dict['ints'][0] = None + the_dict["ints"][0] = None sf = SFrame(the_dict) - self.assertEqual(sf['ints'].dtype, int) + self.assertEqual(sf["ints"].dtype, int) # numpy.nan is actually a float, so it should cast the column to float - the_dict['ints'][0] = np.nan + the_dict["ints"][0] = np.nan sf = SFrame(the_dict) - self.assertEqual(sf['ints'].dtype, float) + self.assertEqual(sf["ints"].dtype, float) # Just a single list sf = SFrame(self.int_data) df = pd.DataFrame(self.int_data) - df.columns = ['X1'] + df.columns = ["X1"] self.__test_equal(sf, df) # Normal list of lists - list_of_lists = [[1.0,2.0,3.0],[4.0,5.0,6.0],[7.0,8.0,9.0]] + list_of_lists = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]] sf = SFrame(list_of_lists) cntr = 0 for i in sf: - self.assertEqual(list_of_lists[cntr], list(i['X1'])) + self.assertEqual(list_of_lists[cntr], list(i["X1"])) cntr += 1 self.assertEqual(sf.num_columns(), 1) - the_dict = {'ints':self.int_data,'floats':self.float_data,'strings':self.string_data} + the_dict = { + "ints": self.int_data, + "floats": self.float_data, + "strings": self.string_data, + } sf = SFrame(the_dict) - sf2 = SFrame({'ints':sf['ints'],'floats':sf['floats'],'strings':sf['strings']}) + sf2 = SFrame( + {"ints": sf["ints"], "floats": sf["floats"], "strings": sf["strings"]} + ) df = pd.DataFrame(the_dict) self.__test_equal(sf2, df) - sf2 = SFrame([sf['ints'],sf['floats'],sf['strings']]) - self.assertEqual(['X1','X2','X3'],sf2.column_names()) - sf2.rename({'X1':'ints','X2':'floats','X3':'strings'}, inplace=True) - sf2=sf2[['floats','ints','strings']] + sf2 = SFrame([sf["ints"], sf["floats"], sf["strings"]]) + self.assertEqual(["X1", "X2", "X3"], sf2.column_names()) + sf2.rename({"X1": "ints", "X2": "floats", "X3": "strings"}, inplace=True) + sf2 = sf2[["floats", "ints", "strings"]] self.__test_equal(sf2, df) - sf = SFrame({'text': ('foo', 'bar', 'biz')}) - df = pd.DataFrame({'text': ['foo', 'bar', 'biz']}) + sf = SFrame({"text": ("foo", "bar", "biz")}) + df = pd.DataFrame({"text": ["foo", "bar", "biz"]}) self.__test_equal(sf, df) def test_head_tail(self): @@ -673,9 +878,13 @@ def test_head_tail(self): # Cannot test for equality the same way because of dataframe indices taildf = sf.tail(4) for i in range(0, 4): - self.assertEqual(taildf['int_data'][i], self.dataframe['int_data'][i+6]) - self.assertEqual(taildf['float_data'][i], self.dataframe['float_data'][i+6]) - self.assertEqual(taildf['string_data'][i], self.dataframe['string_data'][i+6]) + self.assertEqual(taildf["int_data"][i], self.dataframe["int_data"][i + 6]) + self.assertEqual( + taildf["float_data"][i], self.dataframe["float_data"][i + 6] + ) + self.assertEqual( + taildf["string_data"][i], self.dataframe["string_data"][i + 6] + ) def test_head_tail_edge_case(self): sf = SFrame() @@ -684,7 +893,7 @@ def test_head_tail_edge_case(self): self.assertEqual(sf.head().num_rows(), 0) self.assertEqual(sf.tail().num_rows(), 0) sf = SFrame() - sf['a'] = [] + sf["a"] = [] self.assertEqual(sf.head().num_columns(), 1) self.assertEqual(sf.tail().num_columns(), 1) self.assertEqual(sf.head().num_rows(), 0) @@ -697,21 +906,21 @@ def test_transform(self): sa = sf.apply(lambda x: x[colname], sf.column_types()[i]) self.__assert_sarray_equal(sa, sf[sf.column_names()[i]]) - sa = sf.apply(lambda x: x['int_data'] + x['float_data'], float) - self.__assert_sarray_equal(sf['int_data'] + sf['float_data'], sa) - + sa = sf.apply(lambda x: x["int_data"] + x["float_data"], float) + self.__assert_sarray_equal(sf["int_data"] + sf["float_data"], sa) + def test_add(self): - sf1 = SFrame({'a': [1,2,3]}) - sf2 = SFrame({'a': [6,7]}) - sf1=sf1+sf2 - expected=SFrame({'a': [1,2,3,6,7]}) + sf1 = SFrame({"a": [1, 2, 3]}) + sf2 = SFrame({"a": [6, 7]}) + sf1 = sf1 + sf2 + expected = SFrame({"a": [1, 2, 3, 6, 7]}) _assert_sframe_equal(sf1, expected) def test_transform_with_recursion(self): - sf = SFrame(data={'a':[0,1,2,3,4], 'b':['0','1','2','3','4']}) + sf = SFrame(data={"a": [0, 1, 2, 3, 4], "b": ["0", "1", "2", "3", "4"]}) # this should be the equivalent to sf.apply(lambda x:x since a is # equivalent to range(4) - sa = sf.apply(lambda x: sf[x['a']]) + sa = sf.apply(lambda x: sf[x["a"]]) sb = sf.apply(lambda x: x) self.__assert_sarray_equal(sa, sb) @@ -722,135 +931,145 @@ def test_transform_with_type_inference(self): sa = sf.apply(lambda x: x[colname]) self.__assert_sarray_equal(sa, sf[sf.column_names()[i]]) - sa = sf.apply(lambda x: x['int_data'] + x['float_data']) - self.__assert_sarray_equal(sf['int_data'] + sf['float_data'], sa) + sa = sf.apply(lambda x: x["int_data"] + x["float_data"]) + self.__assert_sarray_equal(sf["int_data"] + sf["float_data"], sa) # SFrame apply returns list of vector of numeric should be vector, not list - sa = sf.apply(lambda x: [x['int_data'], x['float_data']]) + sa = sf.apply(lambda x: [x["int_data"], x["float_data"]]) self.assertEqual(sa.dtype, array.array) def test_transform_with_exception(self): sf = SFrame(data=self.dataframe) - self.assertRaises(KeyError, lambda: sf.apply(lambda x: x['some random key'])) # cannot find the key - self.assertRaises(TypeError, lambda: sf.apply(lambda x: sum(x.values()))) # lambda cannot sum int and str - self.assertRaises(ZeroDivisionError, lambda: sf.apply(lambda x: x['int_data'] / 0)) # divide by 0 error - self.assertRaises(IndexError, lambda: sf.apply(lambda x: list(x.values())[10])) # index out of bound error + self.assertRaises( + KeyError, lambda: sf.apply(lambda x: x["some random key"]) + ) # cannot find the key + self.assertRaises( + TypeError, lambda: sf.apply(lambda x: sum(x.values())) + ) # lambda cannot sum int and str + self.assertRaises( + ZeroDivisionError, lambda: sf.apply(lambda x: x["int_data"] / 0) + ) # divide by 0 error + self.assertRaises( + IndexError, lambda: sf.apply(lambda x: list(x.values())[10]) + ) # index out of bound error def test_empty_transform(self): sf = SFrame() - b = sf.apply(lambda x:x) + b = sf.apply(lambda x: x) self.assertEqual(len(b.head()), 0) def test_flatmap(self): # Correctness of typical usage n = 10 - sf = SFrame({'id': range(n)}) - new_sf = sf.flat_map(["id_range"], lambda x: [[str(i)] for i in range(x['id'])]) + sf = SFrame({"id": range(n)}) + new_sf = sf.flat_map(["id_range"], lambda x: [[str(i)] for i in range(x["id"])]) self.assertEqual(new_sf.column_names(), ["id_range"]) self.assertEqual(new_sf.column_types(), [str]) expected_col = [str(x) for i in range(n) for x in range(i)] - self.assertListEqual(list(new_sf['id_range']), expected_col) + self.assertListEqual(list(new_sf["id_range"]), expected_col) # Empty SFrame, without explicit column types sf = SFrame() with self.assertRaises(TypeError): - new_sf = sf.flat_map(['id_range'], - lambda x: [[i] for i in range(x['id'])]) + new_sf = sf.flat_map(["id_range"], lambda x: [[i] for i in range(x["id"])]) # Empty rows successfully removed - sf = SFrame({'id': range(15)}) - new_sf = sf.flat_map(['id'], - lambda x: [[x['id']]] if x['id'] > 8 else []) + sf = SFrame({"id": range(15)}) + new_sf = sf.flat_map(["id"], lambda x: [[x["id"]]] if x["id"] > 8 else []) self.assertEqual(new_sf.num_rows(), 6) # First ten rows are empty raises error with self.assertRaises(TypeError): - new_sf = sf.flat_map(['id'], - lambda x: [[x['id']]] if x['id'] > 9 else []) - - + new_sf = sf.flat_map(["id"], lambda x: [[x["id"]]] if x["id"] > 9 else []) def test_select_column(self): sf = SFrame(data=self.dataframe) - sub_sf = sf.select_columns(['int_data', 'string_data']) - exp_df = pd.DataFrame({'int_data': self.int_data, 'string_data': self.string_data}) + sub_sf = sf.select_columns(["int_data", "string_data"]) + exp_df = pd.DataFrame( + {"int_data": self.int_data, "string_data": self.string_data} + ) self.__test_equal(sub_sf, exp_df) with self.assertRaises(ValueError): - sf.select_columns(['int_data', 'string_data', 'int_data']) + sf.select_columns(["int_data", "string_data", "int_data"]) # test indexing - sub_col = sf['float_data'] + sub_col = sf["float_data"] self.assertEqual(list(sub_col.head(10)), self.float_data) with self.assertRaises(TypeError): - sub_sf = sf.select_columns(['duh',1]) + sub_sf = sf.select_columns(["duh", 1]) with self.assertRaises(TypeError): sub_sf = sf.select_columns(0) with self.assertRaises(RuntimeError): - sub_sf = sf.select_columns(['not_a_column']) - - self.assertEqual(sf.select_columns([int]).column_names(), ['int_data']) - self.assertEqual(sf.select_columns([int, str]).column_names(), ['int_data', 'string_data']) - - self.assertEqual(sf[int].column_names(), ['int_data']) - self.assertEqual(sf[[int, str]].column_names(), ['int_data', 'string_data']) - self.assertEqual(sf[int, str].column_names(), ['int_data', 'string_data']) - self.assertEqual(sf['int_data', 'string_data'].column_names(), ['int_data', 'string_data']) - self.assertEqual(sf['string_data', 'int_data'].column_names(), ['string_data', 'int_data']) + sub_sf = sf.select_columns(["not_a_column"]) + + self.assertEqual(sf.select_columns([int]).column_names(), ["int_data"]) + self.assertEqual( + sf.select_columns([int, str]).column_names(), ["int_data", "string_data"] + ) + + self.assertEqual(sf[int].column_names(), ["int_data"]) + self.assertEqual(sf[[int, str]].column_names(), ["int_data", "string_data"]) + self.assertEqual(sf[int, str].column_names(), ["int_data", "string_data"]) + self.assertEqual( + sf["int_data", "string_data"].column_names(), ["int_data", "string_data"] + ) + self.assertEqual( + sf["string_data", "int_data"].column_names(), ["string_data", "int_data"] + ) sf = SFrame() with self.assertRaises(RuntimeError): - sf.select_column('x') + sf.select_column("x") with self.assertRaises(RuntimeError): - sf.select_columns(['x']) + sf.select_columns(["x"]) - sf.add_column(SArray(), 'x', inplace=True) + sf.add_column(SArray(), "x", inplace=True) # does not throw - sf.select_column('x') - sf.select_columns(['x']) + sf.select_column("x") + sf.select_columns(["x"]) with self.assertRaises(RuntimeError): - sf.select_column('y') + sf.select_column("y") with self.assertRaises(RuntimeError): - sf.select_columns(['y']) + sf.select_columns(["y"]) def test_topk(self): sf = SFrame(data=self.dataframe) # Test that order is preserved - df2 = sf.topk('int_data').to_dataframe() - df2_expected = self.dataframe.sort_values('int_data', ascending=False) + df2 = sf.topk("int_data").to_dataframe() + df2_expected = self.dataframe.sort_values("int_data", ascending=False) df2_expected.index = range(df2.shape[0]) assert_frame_equal(df2, df2_expected) - df2 = sf.topk('float_data', 3).to_dataframe() - df2_expected = self.dataframe.sort_values('float_data', ascending=False).head(3) + df2 = sf.topk("float_data", 3).to_dataframe() + df2_expected = self.dataframe.sort_values("float_data", ascending=False).head(3) df2_expected.index = range(3) assert_frame_equal(df2, df2_expected) - df2 = sf.topk('string_data', 3).to_dataframe() + df2 = sf.topk("string_data", 3).to_dataframe() for i in range(0, 3): - self.assertEqual(df2['int_data'][2-i], i + 7) + self.assertEqual(df2["int_data"][2 - i], i + 7) with self.assertRaises(TypeError): - sf.topk(2,3) + sf.topk(2, 3) sf = SFrame() - sf.add_column(SArray([1,2,3,4,5]), 'a', inplace=True) - sf.add_column(SArray([1,2,3,4,5]), 'b', inplace=True) - - sf.topk('a', 1) # should not fail + sf.add_column(SArray([1, 2, 3, 4, 5]), "a", inplace=True) + sf.add_column(SArray([1, 2, 3, 4, 5]), "b", inplace=True) + sf.topk("a", 1) # should not fail def test_filter(self): sf = SFrame(data=self.dataframe) - filter_sa = SArray([1,1,1,0,0,0,0,1,1,1]) + filter_sa = SArray([1, 1, 1, 0, 0, 0, 0, 1, 1, 1]) sf2 = sf[filter_sa] exp_df = sf.head(3).append(sf.tail(3)) @@ -862,41 +1081,40 @@ def test_filter(self): self.__test_equal(sf2, exp_df) # filter by 0s - sf2 = sf[SArray([0,0,0,0,0,0,0,0,0,0])] + sf2 = sf[SArray([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])] exp_df = sf.head(0).to_dataframe() self.__test_equal(sf2, exp_df) # wrong size with self.assertRaises(IndexError): - sf2 = sf[SArray([0,1,205])] + sf2 = sf[SArray([0, 1, 205])] # slightly bigger size sf = SFrame() n = 1000000 - sf['a'] = range(n) - result = sf[sf['a'] == -1] + sf["a"] = range(n) + result = sf[sf["a"] == -1] self.assertEqual(len(result), 0) - result = sf[sf['a'] > n - 123] + result = sf[sf["a"] > n - 123] self.assertEqual(len(result), 122) - l = list(result['a']) + l = list(result["a"]) for i in range(len(result)): self.assertEqual(i + n - 122, l[i]) - result = sf[sf['a'] < 2000] + result = sf[sf["a"] < 2000] self.assertEqual(len(result), 2000) - l = list(result['a']) + l = list(result["a"]) for i in range(len(result)): self.assertEqual(i, l[i]) # map input type - toy_data = SFrame({'a': range(100)}) - map_result = map(lambda x: x+1, [1, 30]) - result = toy_data.filter_by(map_result, 'a') + toy_data = SFrame({"a": range(100)}) + map_result = map(lambda x: x + 1, [1, 30]) + result = toy_data.filter_by(map_result, "a") self.assertEqual(len(result), 2) - self.assertEqual(result[0]['a'], 2) - self.assertEqual(result[1]['a'], 31) - + self.assertEqual(result[0]["a"], 2) + self.assertEqual(result[1]["a"], 31) def test_sample_split(self): sf = SFrame(data=self.__create_test_df(100)) @@ -904,13 +1122,14 @@ def test_sample_split(self): for i in sf: entry_list.add(str(i)) - - sample_sf = sf.sample(.12, 9) - sample_sf2 = sf.sample(.12, 9) + sample_sf = sf.sample(0.12, 9) + sample_sf2 = sf.sample(0.12, 9) self.assertEqual(len(sample_sf), len(sample_sf2)) - assert_frame_equal(sample_sf.head().to_dataframe(), sample_sf2.head().to_dataframe()) - self.assertEqual(len(sf.sample(0.5,1,exact=True)), 50) - self.assertEqual(len(sf.sample(0.5,2,exact=True)), 50) + assert_frame_equal( + sample_sf.head().to_dataframe(), sample_sf2.head().to_dataframe() + ) + self.assertEqual(len(sf.sample(0.5, 1, exact=True)), 50) + self.assertEqual(len(sf.sample(0.5, 2, exact=True)), 50) for i in sample_sf: self.assertTrue(str(i) in entry_list) @@ -918,10 +1137,10 @@ def test_sample_split(self): with self.assertRaises(ValueError): sf.sample(3) - sample_sf = SFrame().sample(.12, 9) + sample_sf = SFrame().sample(0.12, 9) self.assertEqual(len(sample_sf), 0) - a_split = sf.random_split(.12, 9) + a_split = sf.random_split(0.12, 9) first_split_entries = set() for i in a_split[0]: @@ -934,11 +1153,11 @@ def test_sample_split(self): with self.assertRaises(ValueError): sf.random_split(3) - self.assertEqual(len(SFrame().random_split(.4)[0]), 0) - self.assertEqual(len(SFrame().random_split(.4)[1]), 0) + self.assertEqual(len(SFrame().random_split(0.4)[0]), 0) + self.assertEqual(len(SFrame().random_split(0.4)[1]), 0) - self.assertEqual(len(sf.random_split(0.5,1,exact=True)[0]), 50) - self.assertEqual(len(sf.random_split(0.5,2,exact=True)[0]), 50) + self.assertEqual(len(sf.random_split(0.5, 1, exact=True)[0]), 50) + self.assertEqual(len(sf.random_split(0.5, 2, exact=True)[0]), 50) # tests add_column, rename def test_edit_column_ops(self): @@ -953,21 +1172,21 @@ def test_edit_column_ops(self): names = sf.column_names() cntr = 1 for i in names: - self.assertEqual("X"+str(cntr), i) + self.assertEqual("X" + str(cntr), i) cntr = cntr + 1 # Remove a column - del sf['X2'] + del sf["X2"] # names names = sf.column_names() self.assertEqual(len(names), 2) - self.assertEqual('X1', names[0]) - self.assertEqual('X3', names[1]) + self.assertEqual("X1", names[0]) + self.assertEqual("X3", names[1]) # check content - self.assertEqual(list(sf['X1'].head(10)), self.int_data) - self.assertEqual(list(sf['X3'].head(10)), self.string_data) + self.assertEqual(list(sf["X1"].head(10)), self.int_data) + self.assertEqual(list(sf["X3"].head(10)), self.string_data) # check that a new automatically named column will not conflict sf.add_column(SArray(self.string_data), inplace=True) @@ -990,7 +1209,7 @@ def test_edit_column_ops(self): self.assertEqual(sf.column_names(), names) # do it again! - del sf['X1'] + del sf["X1"] sf.add_column(SArray(self.string_data), inplace=True) names = sf.column_names() @@ -1002,21 +1221,21 @@ def test_edit_column_ops(self): self.assertEqual(len(uniq_set), len(names)) # standard rename - rename_dict = {'X3':'data','X3.1':'more_data','X3.2':'even_more'} + rename_dict = {"X3": "data", "X3.1": "more_data", "X3.2": "even_more"} sf.rename(rename_dict, inplace=True) - self.assertEqual(sf.column_names(), ['data','more_data','even_more']) + self.assertEqual(sf.column_names(), ["data", "more_data", "even_more"]) # rename a column to a name that's already taken with self.assertRaises(RuntimeError): - sf.rename({'data':'more_data'}, inplace=True) + sf.rename({"data": "more_data"}, inplace=True) # try to rename a column that doesn't exist with self.assertRaises(ValueError): - sf.rename({'foo':'bar'}, inplace=True) + sf.rename({"foo": "bar"}, inplace=True) # pass something other than a dict with self.assertRaises(TypeError): - sf.rename('foo', inplace=True) + sf.rename("foo", inplace=True) # Setting a column to const preserves order names = sf.column_names() @@ -1042,27 +1261,28 @@ def test_remove_column(self): sf.add_column(SArray(self.float_data), inplace=True) sf.add_column(SArray(self.string_data), inplace=True) - self.assertEqual(sf.column_names(), ['X1', 'X2', 'X3', 'X4', 'X5']) + self.assertEqual(sf.column_names(), ["X1", "X2", "X3", "X4", "X5"]) - sf2 = sf.remove_column('X3', inplace=True) + sf2 = sf.remove_column("X3", inplace=True) assert sf is sf2 - self.assertEqual(sf.column_names(), ['X1', 'X2', 'X4', 'X5']) + self.assertEqual(sf.column_names(), ["X1", "X2", "X4", "X5"]) - sf2 = sf.remove_columns(['X2', 'X5'], inplace=True) + sf2 = sf.remove_columns(["X2", "X5"], inplace=True) assert sf is sf2 - self.assertEqual(sf.column_names(), ['X1', 'X4']) + self.assertEqual(sf.column_names(), ["X1", "X4"]) # with a generator expression - sf2 = sf.remove_columns((n for n in ['X1', 'X5'] if n in sf.column_names()), inplace=True) + sf2 = sf.remove_columns( + (n for n in ["X1", "X5"] if n in sf.column_names()), inplace=True + ) assert sf is sf2 - self.assertEqual(sf.column_names(), ['X4']) - + self.assertEqual(sf.column_names(), ["X4"]) def test_remove_bad_column(self): sf = SFrame() @@ -1074,16 +1294,18 @@ def test_remove_bad_column(self): sf.add_column(SArray(self.float_data), inplace=True) sf.add_column(SArray(self.string_data), inplace=True) - self.assertEqual(sf.column_names(), ['X1', 'X2', 'X3', 'X4', 'X5']) - - self.assertRaises(KeyError, lambda: sf.remove_column('bad', inplace=True)) + self.assertEqual(sf.column_names(), ["X1", "X2", "X3", "X4", "X5"]) - self.assertEqual(sf.column_names(), ['X1', 'X2', 'X3', 'X4', 'X5']) + self.assertRaises(KeyError, lambda: sf.remove_column("bad", inplace=True)) - self.assertRaises(KeyError, lambda: sf.remove_columns(['X1', 'X2', 'X3', 'bad', 'X4'], inplace=True)) + self.assertEqual(sf.column_names(), ["X1", "X2", "X3", "X4", "X5"]) - self.assertEqual(sf.column_names(), ['X1', 'X2', 'X3', 'X4', 'X5']) + self.assertRaises( + KeyError, + lambda: sf.remove_columns(["X1", "X2", "X3", "bad", "X4"], inplace=True), + ) + self.assertEqual(sf.column_names(), ["X1", "X2", "X3", "X4", "X5"]) def __generate_synthetic_sframe__(self, num_users): """ @@ -1106,10 +1328,10 @@ def __generate_synthetic_sframe__(self, num_users): ratings += [x[1] for x in sparse_matrix[u]] length_of_watching += [x[2] for x in sparse_matrix[u]] # typical add column stuff - sf['user_id'] = (SArray(user_ids, int)) - sf['movie_id'] = (SArray(movie_ids, str)) - sf['rating'] = (SArray(ratings, float)) - sf['length'] = (SArray(length_of_watching, int)) + sf["user_id"] = SArray(user_ids, int) + sf["movie_id"] = SArray(movie_ids, str) + sf["rating"] = SArray(ratings, float) + sf["length"] = SArray(length_of_watching, int) return sf def test_aggregate_ops(self): @@ -1118,79 +1340,103 @@ def test_aggregate_ops(self): """ for m in [1, 10, 20, 50, 100]: values = range(m) - vector_values = [[random.randint(1,100) for num in range(10)] \ - for y in range(m)] - nd_values = [np.array([float(random.randint(1,100)) for num in range(10)]).reshape(2,5) \ - for y in range(m)] + vector_values = [ + [random.randint(1, 100) for num in range(10)] for y in range(m) + ] + nd_values = [ + np.array([float(random.randint(1, 100)) for num in range(10)]).reshape( + 2, 5 + ) + for y in range(m) + ] sf = SFrame() - sf['key'] = [1] * m - sf['value'] = values - sf['vector_values'] = vector_values - sf['nd_values'] = nd_values + sf["key"] = [1] * m + sf["value"] = values + sf["vector_values"] = vector_values + sf["nd_values"] = nd_values sf.materialize() - built_ins = [aggregate.COUNT(), aggregate.SUM('value'), - aggregate.AVG('value'), aggregate.MIN('value'), - aggregate.MAX('value'), aggregate.VAR('value'), - aggregate.STDV('value'), aggregate.SUM('vector_values'), - aggregate.MEAN('vector_values'), - aggregate.COUNT_DISTINCT('value'), - aggregate.DISTINCT('value'), - aggregate.FREQ_COUNT('value'), - aggregate.SUM('nd_values'), - aggregate.MEAN('nd_values')] - sf2 = sf.groupby('key', built_ins) + built_ins = [ + aggregate.COUNT(), + aggregate.SUM("value"), + aggregate.AVG("value"), + aggregate.MIN("value"), + aggregate.MAX("value"), + aggregate.VAR("value"), + aggregate.STDV("value"), + aggregate.SUM("vector_values"), + aggregate.MEAN("vector_values"), + aggregate.COUNT_DISTINCT("value"), + aggregate.DISTINCT("value"), + aggregate.FREQ_COUNT("value"), + aggregate.SUM("nd_values"), + aggregate.MEAN("nd_values"), + ] + sf2 = sf.groupby("key", built_ins) self.assertEqual(len(sf2), 1) - self.assertEqual(sf2['Count'][0], m) - self.assertEqual(sf2['Sum of value'][0], sum(values)) - self.assertAlmostEqual(sf2['Avg of value'][0], np.mean(values)) - self.assertEqual(sf2['Min of value'][0], min(values)) - self.assertEqual(sf2['Max of value'][0], max(values)) - self.assertAlmostEqual(sf2['Var of value'][0], np.var(values)) - self.assertAlmostEqual(sf2['Stdv of value'][0], np.std(values)) - np.testing.assert_almost_equal(list(sf2['Vector Sum of vector_values'][0]), - list(np.sum(vector_values, axis=0))) - np.testing.assert_almost_equal(list(sf2['Vector Avg of vector_values'][0]), - list(np.mean(vector_values, axis=0))) - np.testing.assert_almost_equal(list(sf2['Vector Sum of nd_values'][0]), - list(np.sum(nd_values, axis=0))) - np.testing.assert_almost_equal(list(sf2['Vector Avg of nd_values'][0]), - list(np.mean(nd_values, axis=0))) - self.assertEqual(sf2['Count Distinct of value'][0], - len(np.unique(values))) - self.assertEqual(sorted(sf2['Distinct of value'][0]), - sorted(list(np.unique(values)))) - self.assertEqual(sf2['Frequency Count of value'][0], - {k:1 for k in np.unique(values)}) + self.assertEqual(sf2["Count"][0], m) + self.assertEqual(sf2["Sum of value"][0], sum(values)) + self.assertAlmostEqual(sf2["Avg of value"][0], np.mean(values)) + self.assertEqual(sf2["Min of value"][0], min(values)) + self.assertEqual(sf2["Max of value"][0], max(values)) + self.assertAlmostEqual(sf2["Var of value"][0], np.var(values)) + self.assertAlmostEqual(sf2["Stdv of value"][0], np.std(values)) + np.testing.assert_almost_equal( + list(sf2["Vector Sum of vector_values"][0]), + list(np.sum(vector_values, axis=0)), + ) + np.testing.assert_almost_equal( + list(sf2["Vector Avg of vector_values"][0]), + list(np.mean(vector_values, axis=0)), + ) + np.testing.assert_almost_equal( + list(sf2["Vector Sum of nd_values"][0]), list(np.sum(nd_values, axis=0)) + ) + np.testing.assert_almost_equal( + list(sf2["Vector Avg of nd_values"][0]), + list(np.mean(nd_values, axis=0)), + ) + self.assertEqual(sf2["Count Distinct of value"][0], len(np.unique(values))) + self.assertEqual( + sorted(sf2["Distinct of value"][0]), sorted(list(np.unique(values))) + ) + self.assertEqual( + sf2["Frequency Count of value"][0], {k: 1 for k in np.unique(values)} + ) # For vectors - def test_min_max_with_missing_values(self): """ Test builtin groupby aggregators """ sf = SFrame() - sf['key'] = [1,1,1,1,1,1,2,2,2,2] - sf['value'] = [1,None,None,None,None,None, None,None,None,None] - built_ins = [aggregate.COUNT(), aggregate.SUM('value'), - aggregate.AVG('value'), aggregate.MIN('value'), - aggregate.MAX('value'), aggregate.VAR('value'), - aggregate.STDV('value'), aggregate.COUNT_DISTINCT('value'), - aggregate.DISTINCT('value'), aggregate.FREQ_COUNT('value')] - sf2 = sf.groupby('key', built_ins).sort('key') - self.assertEqual(list(sf2['Count']), [6,4]) - self.assertEqual(list(sf2['Sum of value']), [1, 0]) - self.assertEqual(list(sf2['Avg of value']), [1, None]) - self.assertEqual(list(sf2['Min of value']), [1, None]) - self.assertEqual(list(sf2['Max of value']), [1, None]) - self.assertEqual(list(sf2['Var of value']), [0, 0]) - self.assertEqual(list(sf2['Stdv of value']), [0, 0]) - self.assertEqual(list(sf2['Count Distinct of value']), [2, 1]) - self.assertEqual(set(sf2['Distinct of value'][0]), set([1, None])) - self.assertEqual(set(sf2['Distinct of value'][1]), set([None])) - self.assertEqual(sf2['Frequency Count of value'][0], {1:1, None:5}) - self.assertEqual(sf2['Frequency Count of value'][1], {None:4}) - + sf["key"] = [1, 1, 1, 1, 1, 1, 2, 2, 2, 2] + sf["value"] = [1, None, None, None, None, None, None, None, None, None] + built_ins = [ + aggregate.COUNT(), + aggregate.SUM("value"), + aggregate.AVG("value"), + aggregate.MIN("value"), + aggregate.MAX("value"), + aggregate.VAR("value"), + aggregate.STDV("value"), + aggregate.COUNT_DISTINCT("value"), + aggregate.DISTINCT("value"), + aggregate.FREQ_COUNT("value"), + ] + sf2 = sf.groupby("key", built_ins).sort("key") + self.assertEqual(list(sf2["Count"]), [6, 4]) + self.assertEqual(list(sf2["Sum of value"]), [1, 0]) + self.assertEqual(list(sf2["Avg of value"]), [1, None]) + self.assertEqual(list(sf2["Min of value"]), [1, None]) + self.assertEqual(list(sf2["Max of value"]), [1, None]) + self.assertEqual(list(sf2["Var of value"]), [0, 0]) + self.assertEqual(list(sf2["Stdv of value"]), [0, 0]) + self.assertEqual(list(sf2["Count Distinct of value"]), [2, 1]) + self.assertEqual(set(sf2["Distinct of value"][0]), set([1, None])) + self.assertEqual(set(sf2["Distinct of value"][1]), set([None])) + self.assertEqual(sf2["Frequency Count of value"][0], {1: 1, None: 5}) + self.assertEqual(sf2["Frequency Count of value"][1], {None: 4}) def test_aggregate_ops_on_lazy_frame(self): """ @@ -1198,37 +1444,48 @@ def test_aggregate_ops_on_lazy_frame(self): """ for m in [1, 10, 20, 50, 100]: values = range(m) - vector_values = [[random.randint(1,100) for num in range(10)] \ - for y in range(m)] + vector_values = [ + [random.randint(1, 100) for num in range(10)] for y in range(m) + ] sf = SFrame() - sf['key'] = [1] * m - sf['value'] = values - sf['vector_values'] = vector_values - sf['value'] = sf['value'] + 0 - built_ins = [aggregate.COUNT(), aggregate.SUM('value'), - aggregate.AVG('value'), aggregate.MIN('value'), - aggregate.MAX('value'), aggregate.VAR('value'), - aggregate.STDV('value'), aggregate.SUM('vector_values'), - aggregate.MEAN('vector_values'), - aggregate.COUNT_DISTINCT('value'), - aggregate.DISTINCT('value')] - sf2 = sf.groupby('key', built_ins) + sf["key"] = [1] * m + sf["value"] = values + sf["vector_values"] = vector_values + sf["value"] = sf["value"] + 0 + built_ins = [ + aggregate.COUNT(), + aggregate.SUM("value"), + aggregate.AVG("value"), + aggregate.MIN("value"), + aggregate.MAX("value"), + aggregate.VAR("value"), + aggregate.STDV("value"), + aggregate.SUM("vector_values"), + aggregate.MEAN("vector_values"), + aggregate.COUNT_DISTINCT("value"), + aggregate.DISTINCT("value"), + ] + sf2 = sf.groupby("key", built_ins) self.assertEqual(len(sf2), 1) - self.assertEqual(sf2['Count'][0], m) - self.assertEqual(sf2['Sum of value'][0], sum(values)) - self.assertAlmostEqual(sf2['Avg of value'][0], np.mean(values)) - self.assertEqual(sf2['Min of value'][0], min(values)) - self.assertEqual(sf2['Max of value'][0], max(values)) - self.assertAlmostEqual(sf2['Var of value'][0], np.var(values)) - self.assertAlmostEqual(sf2['Stdv of value'][0], np.std(values)) - np.testing.assert_almost_equal(list(sf2['Vector Sum of vector_values'][0]), - list(np.sum(vector_values, axis=0))) - np.testing.assert_almost_equal(list(sf2['Vector Avg of vector_values'][0]), - list(np.mean(vector_values, axis=0))) - self.assertEqual(sf2['Count Distinct of value'][0], - len(np.unique(values))) - self.assertEqual(sorted(sf2['Distinct of value'][0]), - sorted(np.unique(values))) + self.assertEqual(sf2["Count"][0], m) + self.assertEqual(sf2["Sum of value"][0], sum(values)) + self.assertAlmostEqual(sf2["Avg of value"][0], np.mean(values)) + self.assertEqual(sf2["Min of value"][0], min(values)) + self.assertEqual(sf2["Max of value"][0], max(values)) + self.assertAlmostEqual(sf2["Var of value"][0], np.var(values)) + self.assertAlmostEqual(sf2["Stdv of value"][0], np.std(values)) + np.testing.assert_almost_equal( + list(sf2["Vector Sum of vector_values"][0]), + list(np.sum(vector_values, axis=0)), + ) + np.testing.assert_almost_equal( + list(sf2["Vector Avg of vector_values"][0]), + list(np.mean(vector_values, axis=0)), + ) + self.assertEqual(sf2["Count Distinct of value"][0], len(np.unique(values))) + self.assertEqual( + sorted(sf2["Distinct of value"][0]), sorted(np.unique(values)) + ) def test_aggregate_ops2(self): """ @@ -1236,46 +1493,51 @@ def test_aggregate_ops2(self): """ for m in [1, 10, 20, 50, 100]: values = range(m) - vector_values = [[random.randint(1,100) for num in range(10)] \ - for y in range(m)] + vector_values = [ + [random.randint(1, 100) for num in range(10)] for y in range(m) + ] sf = SFrame() - sf['key'] = [1] * m - sf['value'] = values - sf['vector_values'] = vector_values - built_ins = {'count':aggregate.COUNT, - 'sum':aggregate.SUM('value'), - 'avg':aggregate.AVG('value'), - 'avg2':aggregate.MEAN('value'), - 'min':aggregate.MIN('value'), - 'max':aggregate.MAX('value'), - 'var':aggregate.VAR('value'), - 'var2':aggregate.VARIANCE('value'), - 'stdv':aggregate.STD('value'), - 'stdv2':aggregate.STDV('value'), - 'vector_sum': aggregate.SUM('vector_values'), - 'vector_mean': aggregate.MEAN('vector_values'), - 'count_unique':aggregate.COUNT_DISTINCT('value'), - 'unique':aggregate.DISTINCT('value'), - 'frequency':aggregate.FREQ_COUNT('value')} - sf2 = sf.groupby('key', built_ins) + sf["key"] = [1] * m + sf["value"] = values + sf["vector_values"] = vector_values + built_ins = { + "count": aggregate.COUNT, + "sum": aggregate.SUM("value"), + "avg": aggregate.AVG("value"), + "avg2": aggregate.MEAN("value"), + "min": aggregate.MIN("value"), + "max": aggregate.MAX("value"), + "var": aggregate.VAR("value"), + "var2": aggregate.VARIANCE("value"), + "stdv": aggregate.STD("value"), + "stdv2": aggregate.STDV("value"), + "vector_sum": aggregate.SUM("vector_values"), + "vector_mean": aggregate.MEAN("vector_values"), + "count_unique": aggregate.COUNT_DISTINCT("value"), + "unique": aggregate.DISTINCT("value"), + "frequency": aggregate.FREQ_COUNT("value"), + } + sf2 = sf.groupby("key", built_ins) self.assertEqual(len(sf2), 1) - self.assertEqual(sf2['count'][0], m) - self.assertEqual(sf2['sum'][0], sum(values)) - self.assertAlmostEqual(sf2['avg'][0], np.mean(values)) - self.assertAlmostEqual(sf2['avg2'][0], np.mean(values)) - self.assertEqual(sf2['min'][0], min(values)) - self.assertEqual(sf2['max'][0], max(values)) - self.assertAlmostEqual(sf2['var'][0], np.var(values)) - self.assertAlmostEqual(sf2['var2'][0], np.var(values)) - self.assertAlmostEqual(sf2['stdv'][0], np.std(values)) - self.assertAlmostEqual(sf2['stdv2'][0], np.std(values)) - np.testing.assert_almost_equal(sf2['vector_sum'][0], list(np.sum(vector_values, axis=0))) - np.testing.assert_almost_equal(sf2['vector_mean'][0], list(np.mean(vector_values, axis=0))) - self.assertEqual(sf2['count_unique'][0], len(np.unique(values))) - self.assertEqual(sorted(sf2['unique'][0]), - sorted(np.unique(values))) - self.assertEqual(sf2['frequency'][0], - {k:1 for k in np.unique(values)}) + self.assertEqual(sf2["count"][0], m) + self.assertEqual(sf2["sum"][0], sum(values)) + self.assertAlmostEqual(sf2["avg"][0], np.mean(values)) + self.assertAlmostEqual(sf2["avg2"][0], np.mean(values)) + self.assertEqual(sf2["min"][0], min(values)) + self.assertEqual(sf2["max"][0], max(values)) + self.assertAlmostEqual(sf2["var"][0], np.var(values)) + self.assertAlmostEqual(sf2["var2"][0], np.var(values)) + self.assertAlmostEqual(sf2["stdv"][0], np.std(values)) + self.assertAlmostEqual(sf2["stdv2"][0], np.std(values)) + np.testing.assert_almost_equal( + sf2["vector_sum"][0], list(np.sum(vector_values, axis=0)) + ) + np.testing.assert_almost_equal( + sf2["vector_mean"][0], list(np.mean(vector_values, axis=0)) + ) + self.assertEqual(sf2["count_unique"][0], len(np.unique(values))) + self.assertEqual(sorted(sf2["unique"][0]), sorted(np.unique(values))) + self.assertEqual(sf2["frequency"][0], {k: 1 for k in np.unique(values)}) def test_groupby(self): """ @@ -1284,84 +1546,120 @@ def test_groupby(self): num_users = 500 sf = self.__generate_synthetic_sframe__(num_users=num_users) - built_ins = [aggregate.COUNT(), aggregate.SUM('rating'), - aggregate.AVG('rating'), aggregate.MIN('rating'), - aggregate.MAX('rating'), aggregate.VAR('rating'), - aggregate.STDV('rating')] + built_ins = [ + aggregate.COUNT(), + aggregate.SUM("rating"), + aggregate.AVG("rating"), + aggregate.MIN("rating"), + aggregate.MAX("rating"), + aggregate.VAR("rating"), + aggregate.STDV("rating"), + ] - built_in_names = ['Sum', 'Avg', 'Min', 'Max', 'Var', 'Stdv'] + built_in_names = ["Sum", "Avg", "Min", "Max", "Var", "Stdv"] """ Test groupby user_id and aggregate on rating """ - sf_user_rating = sf.groupby('user_id', built_ins) + sf_user_rating = sf.groupby("user_id", built_ins) actual = sf_user_rating.column_names() - expected = ['%s of rating' % v for v in built_in_names] \ - + ['user_id'] + ['Count'] + expected = ( + ["%s of rating" % v for v in built_in_names] + ["user_id"] + ["Count"] + ) self.assertSetEqual(set(actual), set(expected)) for row in sf_user_rating: - uid = row['user_id'] + uid = row["user_id"] mids = range(1, uid + 1) ratings = [uid + i for i in mids] - expected = [len(ratings), sum(ratings), np.mean(ratings), - min(ratings), max(ratings), np.var(ratings), - np.sqrt(np.var(ratings))] - actual = [row['Count']] + [row['%s of rating' % op] \ - for op in built_in_names] + expected = [ + len(ratings), + sum(ratings), + np.mean(ratings), + min(ratings), + max(ratings), + np.var(ratings), + np.sqrt(np.var(ratings)), + ] + actual = [row["Count"]] + [ + row["%s of rating" % op] for op in built_in_names + ] for i in range(len(actual)): self.assertAlmostEqual(actual[i], expected[i]) """ Test that count can be applied on empty aggregate column. """ - sf_user_rating = sf.groupby("user_id", {'counter': aggregate.COUNT()}) - actual = {x['user_id']: x['counter'] for x in sf_user_rating} + sf_user_rating = sf.groupby("user_id", {"counter": aggregate.COUNT()}) + actual = {x["user_id"]: x["counter"] for x in sf_user_rating} expected = {i: i for i in range(1, num_users + 1)} self.assertDictEqual(actual, expected) """ Test groupby movie_id and aggregate on length_of_watching """ - built_ins = [aggregate.COUNT(), aggregate.SUM('length'), - aggregate.AVG('length'), aggregate.MIN('length'), - aggregate.MAX('length'), aggregate.VAR('length'), - aggregate.STDV('length')] - sf_movie_length = sf.groupby('movie_id', built_ins) + built_ins = [ + aggregate.COUNT(), + aggregate.SUM("length"), + aggregate.AVG("length"), + aggregate.MIN("length"), + aggregate.MAX("length"), + aggregate.VAR("length"), + aggregate.STDV("length"), + ] + sf_movie_length = sf.groupby("movie_id", built_ins) actual = sf_movie_length.column_names() - expected = ['%s of length' % v for v in built_in_names] \ - + ['movie_id'] + ['Count'] + expected = ( + ["%s of length" % v for v in built_in_names] + ["movie_id"] + ["Count"] + ) self.assertSetEqual(set(actual), set(expected)) for row in sf_movie_length: - mid = row['movie_id'] + mid = row["movie_id"] uids = range(int(mid), num_users + 1) values = [i - int(mid) for i in uids] - expected = [len(values), sum(values), np.mean(values), min(values), - max(values), np.var(values), np.std(values)] - actual = [row['Count']] + [row['%s of length' % op] \ - for op in built_in_names] + expected = [ + len(values), + sum(values), + np.mean(values), + min(values), + max(values), + np.var(values), + np.std(values), + ] + actual = [row["Count"]] + [ + row["%s of length" % op] for op in built_in_names + ] for i in range(len(actual)): self.assertAlmostEqual(actual[i], expected[i]) def test_quantile_groupby(self): sf = self.__generate_synthetic_sframe__(num_users=500) # max and min rating for each user - g = sf.groupby('user_id', [aggregate.MIN('rating'), - aggregate.MAX('rating'), - aggregate.QUANTILE('rating', 0, 1)]) + g = sf.groupby( + "user_id", + [ + aggregate.MIN("rating"), + aggregate.MAX("rating"), + aggregate.QUANTILE("rating", 0, 1), + ], + ) self.assertEqual(len(g), 500) for row in g: - minrating = row['Min of rating'] - maxrating = row['Max of rating'] - arr = list(row['Quantiles of rating']) + minrating = row["Min of rating"] + maxrating = row["Max of rating"] + arr = list(row["Quantiles of rating"]) self.assertEqual(len(arr), 2) self.assertEqual(arr[0], minrating) self.assertEqual(arr[1], maxrating) def test_argmax_argmin_groupby(self): sf = self.__generate_synthetic_sframe__(num_users=500) - sf_ret = sf.groupby('user_id', - {'movie with max rating' : aggregate.ARGMAX('rating','movie_id'), - 'movie with min rating' : aggregate.ARGMIN('rating','movie_id')}) + sf_ret = sf.groupby( + "user_id", + { + "movie with max rating": aggregate.ARGMAX("rating", "movie_id"), + "movie with min rating": aggregate.ARGMIN("rating", "movie_id"), + }, + ) self.assertEqual(len(sf_ret), 500) self.assertEqual(sf_ret["movie with max rating"].dtype, str) self.assertEqual(sf_ret["movie with min rating"].dtype, str) @@ -1370,19 +1668,19 @@ def test_argmax_argmin_groupby(self): max_d = {} min_d = {} for i in sf: - key = i['user_id'] + key = i["user_id"] if key not in max_d: - max_d[key] = (i['movie_id'],i['rating']) - min_d[key] = (i['movie_id'],i['rating']) + max_d[key] = (i["movie_id"], i["rating"]) + min_d[key] = (i["movie_id"], i["rating"]) else: - if max_d[key][1] < i['rating']: - max_d[key] = (i['movie_id'],i['rating']) - if min_d[key][1] > i['rating']: - min_d[key] = (i['movie_id'],i['rating']) + if max_d[key][1] < i["rating"]: + max_d[key] = (i["movie_id"], i["rating"]) + if min_d[key][1] > i["rating"]: + min_d[key] = (i["movie_id"], i["rating"]) for i in sf_ret: - key = i['user_id'] - self.assertEqual(i["movie with max rating"],max_d[key][0]) - self.assertEqual(i["movie with min rating"],min_d[key][0]) + key = i["user_id"] + self.assertEqual(i["movie with max rating"], max_d[key][0]) + self.assertEqual(i["movie with min rating"], min_d[key][0]) def test_multicolumn_groupby(self): sf = self.__generate_synthetic_sframe__(num_users=500) @@ -1394,15 +1692,15 @@ def test_multicolumn_groupby(self): # make sure we have counted correctly d = {} for i in sf: - key = str(i['user_id']) + "," + i["movie_id"] + key = str(i["user_id"]) + "," + i["movie_id"] if key not in d: d[key] = 0 d[key] = d[key] + 1 for i in sf_um: - key = str(i['user_id']) + "," + i["movie_id"] + key = str(i["user_id"]) + "," + i["movie_id"] self.assertTrue(key in d) - self.assertEqual(i['Count'], d[key]) + self.assertEqual(i["Count"], d[key]) sf_um = sf.groupby(["movie_id", "user_id"], aggregate.COUNT()) # I can query it @@ -1413,15 +1711,15 @@ def test_multicolumn_groupby(self): # make sure we have counted correctly d = {} for i in sf: - key = str(i['user_id']) + "," + i["movie_id"] + key = str(i["user_id"]) + "," + i["movie_id"] if key not in d: d[key] = 0 d[key] = d[key] + 1 for i in sf_um: - key = str(i['user_id']) + "," + i["movie_id"] + key = str(i["user_id"]) + "," + i["movie_id"] self.assertTrue(key in d) - self.assertEqual(i['Count'], d[key]) + self.assertEqual(i["Count"], d[key]) def __assert_concat_result_equal(self, result, expected, list_columns): self.assertEqual(result.num_columns(), expected.num_columns()) @@ -1430,116 +1728,155 @@ def __assert_concat_result_equal(self, result, expected, list_columns): c2 = expected[column] self.assertEqual(c1.dtype, c2.dtype) self.assertEqual(len(c1), len(c2)) - if (column in list_columns): + if column in list_columns: for i in range(len(c1)): - if (c1[i] is None): + if c1[i] is None: self.assertTrue(c2[i] is None) continue - if (c1.dtype == dict): + if c1.dtype == dict: for k in c1[i]: self.assertEqual(c2[i][k], c1[i][k]) else: s1 = list(c1[i]) - if s1 is not None: s1.sort() + if s1 is not None: + s1.sort() s2 = list(c2[i]) - if s2 is not None: s2.sort() + if s2 is not None: + s2.sort() self.assertEqual(s1, s2) else: - self.assertEqual(list(c1),list(c2)) + self.assertEqual(list(c1), list(c2)) def test_groupby_dict_key(self): - t = SFrame({'a':[{1:2},{3:4}]}) + t = SFrame({"a": [{1: 2}, {3: 4}]}) with self.assertRaises(TypeError): - t.groupby('a', {}) + t.groupby("a", {}) def test_concat(self): sf = SFrame() - sf['a'] = [1,1,1,1, 2,2,2, 3, 4,4, 5] - sf['b'] = [1,2,1,2, 3,3,1, 4, None, 2, None] - sf['c'] = ['a','b','a','b', 'e','e', None, 'h', 'i','j', 'k'] - sf['d'] = [1.0,2.0,1.0,2.0, 3.0,3.0,1.0, 4.0, None, 2.0, None] - sf['e'] = [{'x': 1}] * len(sf['a']) - - print(sf['b'].dtype) - - result = sf.groupby('a', aggregate.CONCAT('b')) - expected_result = SFrame({ - 'a': [1,2,3,4, 5], - 'List of b': [[1.,1.,2.,2.],[1.,3.,3.],[4.],[2.], []] - }) - expected_result['List of b'] = expected_result['List of b'].astype(list) - self.__assert_concat_result_equal(result.sort('a'), expected_result.sort('a'), ['List of b']) - - - result = sf.groupby('a', aggregate.CONCAT('d')) - - expected_result = SFrame({ - 'a': [1,2,3,4, 5], - 'List of d': [[1,1,2,2],[1,3,3],[4],[2], []] - }) - self.__assert_concat_result_equal(result.sort('a'), expected_result.sort('a'), ['List of d']) - - - result = sf.groupby('a', {'c_c' :aggregate.CONCAT('c')}) - expected_result = SFrame({ - 'a': [1,2,3,4, 5], - 'c_c': [['a','b','a','b'],['e','e'],['h'],['i','j'], ['k']] - }) - - self.__assert_concat_result_equal(result.sort('a'), expected_result.sort('a'), ['c_c']) - - result = sf.groupby('a', aggregate.CONCAT('b','c')) - expected_result = SFrame({ - 'a': [1,2,3,4,5], - 'Dict of b_c': [{1:'a',2:'b'},{3:'e', 1: None},{4:'h'},{2:'j'}, {}] - }) - - self.__assert_concat_result_equal(result.sort('a'), expected_result.sort('a'), ['Dict of b_c']) - - result = sf.groupby('a', {'c_b':aggregate.CONCAT('c','b')}) - expected_result = SFrame({ - 'a': [1,2,3,4,5], - 'c_b': [{'a':1, 'b':2},{'e':3},{'h':4},{'i':None, 'j':2},{'k':None}] - }) - - self.__assert_concat_result_equal(result.sort('a'), expected_result.sort('a'), ['c_b']) - - result = sf.groupby('a', {'cs':aggregate.CONCAT('c'), 'bs':aggregate.CONCAT('b')}) - expected_result = SFrame({ - 'a': [1,2,3,4,5], - 'bs': [[1,1,2,2],[1,3,3],[4],[2], []], - 'cs': [['a','b','a','b'],['e','e'],['h'],['i','j'], ['k']] - }) - expected_result['bs'] = expected_result['bs'].astype(list) - self.__assert_concat_result_equal(result.sort('a'), expected_result.sort('a'), ['bs','cs']) - - #exception fail if there is not column + sf["a"] = [1, 1, 1, 1, 2, 2, 2, 3, 4, 4, 5] + sf["b"] = [1, 2, 1, 2, 3, 3, 1, 4, None, 2, None] + sf["c"] = ["a", "b", "a", "b", "e", "e", None, "h", "i", "j", "k"] + sf["d"] = [1.0, 2.0, 1.0, 2.0, 3.0, 3.0, 1.0, 4.0, None, 2.0, None] + sf["e"] = [{"x": 1}] * len(sf["a"]) + + print(sf["b"].dtype) + + result = sf.groupby("a", aggregate.CONCAT("b")) + expected_result = SFrame( + { + "a": [1, 2, 3, 4, 5], + "List of b": [[1.0, 1.0, 2.0, 2.0], [1.0, 3.0, 3.0], [4.0], [2.0], []], + } + ) + expected_result["List of b"] = expected_result["List of b"].astype(list) + self.__assert_concat_result_equal( + result.sort("a"), expected_result.sort("a"), ["List of b"] + ) + + result = sf.groupby("a", aggregate.CONCAT("d")) + + expected_result = SFrame( + {"a": [1, 2, 3, 4, 5], "List of d": [[1, 1, 2, 2], [1, 3, 3], [4], [2], []]} + ) + self.__assert_concat_result_equal( + result.sort("a"), expected_result.sort("a"), ["List of d"] + ) + + result = sf.groupby("a", {"c_c": aggregate.CONCAT("c")}) + expected_result = SFrame( + { + "a": [1, 2, 3, 4, 5], + "c_c": [["a", "b", "a", "b"], ["e", "e"], ["h"], ["i", "j"], ["k"]], + } + ) + + self.__assert_concat_result_equal( + result.sort("a"), expected_result.sort("a"), ["c_c"] + ) + + result = sf.groupby("a", aggregate.CONCAT("b", "c")) + expected_result = SFrame( + { + "a": [1, 2, 3, 4, 5], + "Dict of b_c": [ + {1: "a", 2: "b"}, + {3: "e", 1: None}, + {4: "h"}, + {2: "j"}, + {}, + ], + } + ) + + self.__assert_concat_result_equal( + result.sort("a"), expected_result.sort("a"), ["Dict of b_c"] + ) + + result = sf.groupby("a", {"c_b": aggregate.CONCAT("c", "b")}) + expected_result = SFrame( + { + "a": [1, 2, 3, 4, 5], + "c_b": [ + {"a": 1, "b": 2}, + {"e": 3}, + {"h": 4}, + {"i": None, "j": 2}, + {"k": None}, + ], + } + ) + + self.__assert_concat_result_equal( + result.sort("a"), expected_result.sort("a"), ["c_b"] + ) + + result = sf.groupby( + "a", {"cs": aggregate.CONCAT("c"), "bs": aggregate.CONCAT("b")} + ) + expected_result = SFrame( + { + "a": [1, 2, 3, 4, 5], + "bs": [[1, 1, 2, 2], [1, 3, 3], [4], [2], []], + "cs": [["a", "b", "a", "b"], ["e", "e"], ["h"], ["i", "j"], ["k"]], + } + ) + expected_result["bs"] = expected_result["bs"].astype(list) + self.__assert_concat_result_equal( + result.sort("a"), expected_result.sort("a"), ["bs", "cs"] + ) + + # exception fail if there is not column with self.assertRaises(TypeError): - sf.groupby('a', aggregate.CONCAT()) + sf.groupby("a", aggregate.CONCAT()) with self.assertRaises(KeyError): - sf.groupby('a', aggregate.CONCAT('nonexist')) + sf.groupby("a", aggregate.CONCAT("nonexist")) with self.assertRaises(TypeError): - sf.groupby('a', aggregate.CONCAT('e', 'a')) + sf.groupby("a", aggregate.CONCAT("e", "a")) def test_select_one(self): - sf = SFrame({'a':[1,1,2,2,3,3,4,4,5,5],'b':[1,2,3,4,5,6,7,8,9,10]}) - res = list(sf.groupby('a', {'b':aggregate.SELECT_ONE('b')})) + sf = SFrame( + {"a": [1, 1, 2, 2, 3, 3, 4, 4, 5, 5], "b": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]} + ) + res = list(sf.groupby("a", {"b": aggregate.SELECT_ONE("b")})) self.assertEqual(len(res), 5) for i in res: - self.assertTrue(i['b'] == 2 * i['a'] or i['b'] == 2 * i['a'] - 1) + self.assertTrue(i["b"] == 2 * i["a"] or i["b"] == 2 * i["a"] - 1) def test_unique(self): - sf = SFrame({'a':[1,1,2,2,3,3,4,4,5,5],'b':[1,2,3,4,5,6,7,8,9,10]}) + sf = SFrame( + {"a": [1, 1, 2, 2, 3, 3, 4, 4, 5, 5], "b": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]} + ) self.assertEqual(len(sf.unique()), 10) - vals = [1,1,2,2,3,3,4,4, None, None] - sf = SFrame({'a':vals,'b':vals}) + vals = [1, 1, 2, 2, 3, 3, 4, 4, None, None] + sf = SFrame({"a": vals, "b": vals}) res = sf.unique() self.assertEqual(len(res), 5) - self.assertEqual(set(res['a']), set([1,2,3,4,None])) - self.assertEqual(set(res['b']), set([1,2,3,4,None])) + self.assertEqual(set(res["a"]), set([1, 2, 3, 4, None])) + self.assertEqual(set(res["b"]), set([1, 2, 3, 4, None])) def test_append_empty(self): sf_with_data = SFrame(data=self.dataframe) @@ -1553,7 +1890,10 @@ def test_append_all_match(self): sf2 = SFrame(data=self.dataframe2) new_sf = sf1.append(sf2) - assert_frame_equal(self.dataframe.append(self.dataframe2, ignore_index=True), new_sf.to_dataframe()) + assert_frame_equal( + self.dataframe.append(self.dataframe2, ignore_index=True), + new_sf.to_dataframe(), + ) def test_append_lazy(self): sf1 = SFrame(data=self.dataframe) @@ -1562,11 +1902,18 @@ def test_append_lazy(self): new_sf = sf1.append(sf2) self.assertTrue(new_sf.__is_materialized__()) - filter_sf1 = SArray([1 for i in range(sf1.num_rows())] + [0 for i in range(sf2.num_rows())]) - filter_sf2 = SArray([0 for i in range(sf1.num_rows())] + [1 for i in range(sf2.num_rows())]) + filter_sf1 = SArray( + [1 for i in range(sf1.num_rows())] + [0 for i in range(sf2.num_rows())] + ) + filter_sf2 = SArray( + [0 for i in range(sf1.num_rows())] + [1 for i in range(sf2.num_rows())] + ) new_sf1 = new_sf[filter_sf1] new_sf2 = new_sf[filter_sf2] - assert_frame_equal(self.dataframe.append(self.dataframe2, ignore_index=True), new_sf.to_dataframe()) + assert_frame_equal( + self.dataframe.append(self.dataframe2, ignore_index=True), + new_sf.to_dataframe(), + ) assert_frame_equal(sf1.to_dataframe(), new_sf1.to_dataframe()) assert_frame_equal(sf2.to_dataframe(), new_sf2.to_dataframe()) @@ -1576,14 +1923,16 @@ def test_append_lazy(self): sf = sf.append(row) df = sf.to_dataframe() for i in range(10): - self.assertEqual(list(df.iloc[[i]]), list(sf.head(1).to_dataframe().iloc[[0]])) + self.assertEqual( + list(df.iloc[[i]]), list(sf.head(1).to_dataframe().iloc[[0]]) + ) def test_recursive_append(self): sf = SFrame() for i in range(200): - sf = sf.append(SFrame(data = self.dataframe)) + sf = sf.append(SFrame(data=self.dataframe)) - #consume + # consume sf.materialize() def test_print_sframe(self): @@ -1600,25 +1949,25 @@ def _test_print(): sf.print_rows(output_file=output) n = 20 - sf['int'] = [i for i in range(n)] - sf['float'] = [float(i) for i in range(n)] - sf['str'] = [str(i) for i in range(n)] - uc = '\xe5\xa4\xa7\xe5\xa4\xb4' - sf['unicode'] = [uc for i in range(n)] - sf['array'] = [array.array('d', [i]) for i in range(n)] - sf['list'] = [[i, float(i), [i]] for i in range(n)] - utc = dt.datetime.strptime('2011-01-21 02:37:21', '%Y-%m-%d %H:%M:%S') - sf['dt'] = [utc for i in range(n)] - sf['img'] = [Image() for i in range(n)] - sf['long_str'] = ["".join([str(i)] * 50) for i in range(n)] - sf['long_unicode'] = ["".join([uc] * 50) for i in range(n)] - sf['bad_unicode'] = ['\x9d' + uc for i in range(n)] + sf["int"] = [i for i in range(n)] + sf["float"] = [float(i) for i in range(n)] + sf["str"] = [str(i) for i in range(n)] + uc = "\xe5\xa4\xa7\xe5\xa4\xb4" + sf["unicode"] = [uc for i in range(n)] + sf["array"] = [array.array("d", [i]) for i in range(n)] + sf["list"] = [[i, float(i), [i]] for i in range(n)] + utc = dt.datetime.strptime("2011-01-21 02:37:21", "%Y-%m-%d %H:%M:%S") + sf["dt"] = [utc for i in range(n)] + sf["img"] = [Image() for i in range(n)] + sf["long_str"] = ["".join([str(i)] * 50) for i in range(n)] + sf["long_unicode"] = ["".join([uc] * 50) for i in range(n)] + sf["bad_unicode"] = ["\x9d" + uc for i in range(n)] _test_print() def test_print_lazy_sframe(self): sf1 = SFrame(data=self.dataframe) self.assertTrue(sf1.__is_materialized__()) - sf2 = sf1[sf1['int_data'] > 3] + sf2 = sf1[sf1["int_data"] > 3] sf2.__repr__() sf2.__str__() self.assertFalse(sf2.__is_materialized__()) @@ -1629,10 +1978,13 @@ def test_append_order_diff(self): # name match but column order not match sf1 = SFrame(data=self.dataframe) sf2 = SFrame(data=self.dataframe2) - sf2.swap_columns('int_data', 'string_data', inplace=True) + sf2.swap_columns("int_data", "string_data", inplace=True) new_sf = sf1.append(sf2) - assert_frame_equal(self.dataframe.append(self.dataframe2, ignore_index=True), new_sf.to_dataframe()) + assert_frame_equal( + self.dataframe.append(self.dataframe2, ignore_index=True), + new_sf.to_dataframe(), + ) def test_append_empty_sframe(self): sf = SFrame(data=self.dataframe) @@ -1644,7 +1996,7 @@ def test_append_empty_sframe(self): # empty append non empty assert_frame_equal(other.append(sf).to_dataframe(), self.dataframe) - #empty append empty + # empty append empty assert_frame_equal(other.append(other).to_dataframe(), pd.DataFrame()) def test_append_exception(self): @@ -1660,219 +2012,316 @@ def test_append_exception(self): names = sf.column_names() for name in sf.column_names(): other.add_column(SArray(), name, inplace=True) - names[0] = 'some name not match' + names[0] = "some name not match" self.assertRaises(RuntimeError, lambda: sf.append(other)) # name match but column type order not match sf1 = SFrame(data=self.dataframe) sf2 = SFrame(data=self.dataframe2) - #change one column type + # change one column type sf1["int_data"] = sf2.select_column("int_data").astype(float) self.assertRaises(RuntimeError, lambda: sf.append(other)) def test_simple_joins(self): inner_expected = SFrame() - inner_expected.add_column(SArray(['Robinson','Jones','Smith','Heisenberg','Rafferty']), 'last_name', inplace=True) - inner_expected.add_column(SArray([34,33,34,33,31]), 'dep_id', inplace=True) - inner_expected.add_column(SArray(['Clerical','Engineering','Clerical','Engineering','Sales']), 'dep_name', inplace=True) + inner_expected.add_column( + SArray(["Robinson", "Jones", "Smith", "Heisenberg", "Rafferty"]), + "last_name", + inplace=True, + ) + inner_expected.add_column(SArray([34, 33, 34, 33, 31]), "dep_id", inplace=True) + inner_expected.add_column( + SArray(["Clerical", "Engineering", "Clerical", "Engineering", "Sales"]), + "dep_name", + inplace=True, + ) # Tests the "natural join" case beg = time.time() res = self.employees_sf.join(self.departments_sf) end = time.time() - print("Really small join: " + str(end-beg) + " s") + print("Really small join: " + str(end - beg) + " s") self.__assert_join_results_equal(res, inner_expected) left_join_row = SFrame() - left_join_row.add_column(SArray(['John']), 'last_name', inplace=True) - left_join_row.add_column(SArray([None], int), 'dep_id', inplace=True) - left_join_row.add_column(SArray([None], str), 'dep_name', inplace=True) + left_join_row.add_column(SArray(["John"]), "last_name", inplace=True) + left_join_row.add_column(SArray([None], int), "dep_id", inplace=True) + left_join_row.add_column(SArray([None], str), "dep_name", inplace=True) left_expected = inner_expected.append(left_join_row) # Left outer join, passing string to 'on' - res = self.employees_sf.join(self.departments_sf, how='left', on='dep_id') + res = self.employees_sf.join(self.departments_sf, how="left", on="dep_id") self.__assert_join_results_equal(res, left_expected) right_join_row = SFrame() - right_join_row.add_column(SArray([None], str), 'last_name', inplace=True) - right_join_row.add_column(SArray([35]), 'dep_id', inplace=True) - right_join_row.add_column(SArray(['Marketing']), 'dep_name', inplace=True) + right_join_row.add_column(SArray([None], str), "last_name", inplace=True) + right_join_row.add_column(SArray([35]), "dep_id", inplace=True) + right_join_row.add_column(SArray(["Marketing"]), "dep_name", inplace=True) right_expected = inner_expected.append(right_join_row) # Right outer join, passing list to 'on' - res = self.employees_sf.join(self.departments_sf, how='right', on=['dep_id']) + res = self.employees_sf.join(self.departments_sf, how="right", on=["dep_id"]) self.__assert_join_results_equal(res, right_expected) outer_expected = left_expected.append(right_join_row) # Full outer join, passing dict to 'on' - res = self.employees_sf.join(self.departments_sf, how='outer', on={'dep_id':'dep_id'}) + res = self.employees_sf.join( + self.departments_sf, how="outer", on={"dep_id": "dep_id"} + ) self.__assert_join_results_equal(res, outer_expected) # Test a join on non-matching key - res = self.employees_sf.join(self.departments_sf, on={'last_name':'dep_name'}) + res = self.employees_sf.join(self.departments_sf, on={"last_name": "dep_name"}) self.assertEqual(res.num_rows(), 0) self.assertEqual(res.num_columns(), 3) - self.assertEqual(res.column_names(), ['last_name', 'dep_id', 'dep_id.1']) + self.assertEqual(res.column_names(), ["last_name", "dep_id", "dep_id.1"]) # Test a join on a non-unique key bad_departments = SFrame() - bad_departments['dep_id'] = SArray([33,33,31,31]) - bad_departments['dep_name'] = self.departments_sf['dep_name'] + bad_departments["dep_id"] = SArray([33, 33, 31, 31]) + bad_departments["dep_name"] = self.departments_sf["dep_name"] no_pk_expected = SFrame() - no_pk_expected['last_name'] = SArray(['Rafferty','Rafferty','Heisenberg','Jones','Heisenberg','Jones']) - no_pk_expected['dep_id'] = SArray([31,31,33,33,33,33]) - no_pk_expected['dep_name'] = SArray(['Clerical','Marketing','Sales','Sales','Engineering','Engineering']) - res = self.employees_sf.join(bad_departments, on='dep_id') + no_pk_expected["last_name"] = SArray( + ["Rafferty", "Rafferty", "Heisenberg", "Jones", "Heisenberg", "Jones"] + ) + no_pk_expected["dep_id"] = SArray([31, 31, 33, 33, 33, 33]) + no_pk_expected["dep_name"] = SArray( + ["Clerical", "Marketing", "Sales", "Sales", "Engineering", "Engineering"] + ) + res = self.employees_sf.join(bad_departments, on="dep_id") self.__assert_join_results_equal(res, no_pk_expected) # Left join on non-unique key - bad_departments = bad_departments.append(right_join_row[['dep_id', 'dep_name']]) - bad_departments = bad_departments.append(right_join_row[['dep_id', 'dep_name']]) + bad_departments = bad_departments.append(right_join_row[["dep_id", "dep_name"]]) + bad_departments = bad_departments.append(right_join_row[["dep_id", "dep_name"]]) no_pk_expected = no_pk_expected.append(right_join_row) no_pk_expected = no_pk_expected.append(right_join_row) - no_pk_expected = no_pk_expected[['dep_id', 'dep_name', 'last_name']] - res = bad_departments.join(self.employees_sf, on='dep_id', how='left') + no_pk_expected = no_pk_expected[["dep_id", "dep_name", "last_name"]] + res = bad_departments.join(self.employees_sf, on="dep_id", how="left") self.__assert_join_results_equal(res, no_pk_expected) def test_simple_joins_with_customized_name(self): # redundant name conflict resolution with self.assertRaises(KeyError): - self.employees_sf.join(self.departments_sf, alter_name={'non_existing_name': 'random_name'}) + self.employees_sf.join( + self.departments_sf, alter_name={"non_existing_name": "random_name"} + ) with self.assertRaises(KeyError): - self.employees_sf.join(self.departments_sf, alter_name={'dep_id': 'random_name'}) + self.employees_sf.join( + self.departments_sf, alter_name={"dep_id": "random_name"} + ) with self.assertRaises(ValueError): - self.employees_sf.join(self.departments_sf, alter_name={'dep_name': 'last_name'}) + self.employees_sf.join( + self.departments_sf, alter_name={"dep_name": "last_name"} + ) # nothing should happen # Tests the "natural join" case inner_expected = SFrame() - inner_expected.add_column(SArray(['Robinson','Jones','Smith','Heisenberg','Rafferty']), 'last_name', inplace=True) - inner_expected.add_column(SArray([34,33,34,33,31]), 'dep_id', inplace=True) - inner_expected.add_column(SArray(['Marketing','Engineering','Cooking','Clerical','Sales']), 'dep_name', inplace=True) + inner_expected.add_column( + SArray(["Robinson", "Jones", "Smith", "Heisenberg", "Rafferty"]), + "last_name", + inplace=True, + ) + inner_expected.add_column(SArray([34, 33, 34, 33, 31]), "dep_id", inplace=True) + inner_expected.add_column( + SArray(["Marketing", "Engineering", "Cooking", "Clerical", "Sales"]), + "dep_name", + inplace=True, + ) # add extra column for employee table - employees_sf_extra = self.employees_sf.add_column(SArray( - ['Sales', 'Engineering', 'Clerical', 'Marketing', 'Cooking', 'Basketball']), 'dep_name'); - - res = employees_sf_extra.join(self.departments_sf, on='dep_id') - inner_expected_tmp = inner_expected.add_column(SArray(['Clerical', 'Engineering', 'Clerical', 'Engineering', 'Sales']), 'dep_name.1') + employees_sf_extra = self.employees_sf.add_column( + SArray( + [ + "Sales", + "Engineering", + "Clerical", + "Marketing", + "Cooking", + "Basketball", + ] + ), + "dep_name", + ) + + res = employees_sf_extra.join(self.departments_sf, on="dep_id") + inner_expected_tmp = inner_expected.add_column( + SArray(["Clerical", "Engineering", "Clerical", "Engineering", "Sales"]), + "dep_name.1", + ) self.__assert_join_results_equal(res, inner_expected_tmp) - inner_expected_tmp = inner_expected.add_column(SArray(['Clerical', 'Engineering', 'Clerical', 'Engineering', 'Sales']), 'X') - res = employees_sf_extra.join(self.departments_sf, on='dep_id', alter_name={'dep_name': 'X'}) + inner_expected_tmp = inner_expected.add_column( + SArray(["Clerical", "Engineering", "Clerical", "Engineering", "Sales"]), "X" + ) + res = employees_sf_extra.join( + self.departments_sf, on="dep_id", alter_name={"dep_name": "X"} + ) self.__assert_join_results_equal(res, inner_expected_tmp) ###### A simple and navive test start ###### employees_ = SFrame() - employees_.add_column(SArray(['A','B','C','D']), 'last_name', inplace=True) - employees_.add_column(SArray([31,32,33,None]), 'dep_id', inplace=True) - employees_.add_column(SArray([1,2,3,4]), 'org_id', inplace=True) - employees_.add_column(SArray([1,2,3,4]), 'bed_id', inplace=True) + employees_.add_column(SArray(["A", "B", "C", "D"]), "last_name", inplace=True) + employees_.add_column(SArray([31, 32, 33, None]), "dep_id", inplace=True) + employees_.add_column(SArray([1, 2, 3, 4]), "org_id", inplace=True) + employees_.add_column(SArray([1, 2, 3, 4]), "bed_id", inplace=True) departments_ = SFrame() - departments_.add_column(SArray([31,33,34]), 'dep_id', inplace=True) - departments_.add_column(SArray(['A','C','F']), 'last_name', inplace=True) - departments_.add_column(SArray(['Sales','Engineering', None]), 'dep_name', inplace=True) + departments_.add_column(SArray([31, 33, 34]), "dep_id", inplace=True) + departments_.add_column(SArray(["A", "C", "F"]), "last_name", inplace=True) + departments_.add_column( + SArray(["Sales", "Engineering", None]), "dep_name", inplace=True + ) # intentionally dup at the second last - departments_.add_column(SArray([1,3,5]), 'bed_id', inplace=True) - departments_.add_column(SArray([1,3,None]), 'car_id', inplace=True) + departments_.add_column(SArray([1, 3, 5]), "bed_id", inplace=True) + departments_.add_column(SArray([1, 3, None]), "car_id", inplace=True) - join_keys_ = ['dep_id', 'last_name'] + join_keys_ = ["dep_id", "last_name"] ## left expected_ = SFrame() - expected_.add_column(SArray(['A','B','C','D']), 'last_name', inplace=True) - expected_.add_column(SArray([31,32,33,None]), 'dep_id', inplace=True) - expected_.add_column(SArray([1,2,3,4]), 'org_id', inplace=True) - expected_.add_column(SArray([1,2,3,4]), 'bed_id', inplace=True) - expected_.add_column(SArray(['Sales', None, 'Engineering', None]), 'dep_name', inplace=True) - expected_.add_column(SArray([1,None,3,None]), 'bed_id.1', inplace=True) - expected_.add_column(SArray([1,None,3,None]), 'car_id', inplace=True) - - res = employees_.join(departments_, on=join_keys_, how='left') + expected_.add_column(SArray(["A", "B", "C", "D"]), "last_name", inplace=True) + expected_.add_column(SArray([31, 32, 33, None]), "dep_id", inplace=True) + expected_.add_column(SArray([1, 2, 3, 4]), "org_id", inplace=True) + expected_.add_column(SArray([1, 2, 3, 4]), "bed_id", inplace=True) + expected_.add_column( + SArray(["Sales", None, "Engineering", None]), "dep_name", inplace=True + ) + expected_.add_column(SArray([1, None, 3, None]), "bed_id.1", inplace=True) + expected_.add_column(SArray([1, None, 3, None]), "car_id", inplace=True) + + res = employees_.join(departments_, on=join_keys_, how="left") self.__assert_join_results_equal(res, expected_) expected_ = SFrame() - expected_.add_column(SArray(['A','B','C','D']), 'last_name', inplace=True) - expected_.add_column(SArray([31,32,33,None]), 'dep_id', inplace=True) - expected_.add_column(SArray([1,2,3,4]), 'org_id', inplace=True) - expected_.add_column(SArray([1,2,3,4]), 'bed_id', inplace=True) - expected_.add_column(SArray(['Sales', None, 'Engineering', None]), 'dep_name', inplace=True) - expected_.add_column(SArray([1,None,3,None]), 'Y', inplace=True) - expected_.add_column(SArray([1,None,3,None]), 'car_id', inplace=True) - - res = employees_.join(departments_, on=join_keys_, how='left', alter_name={'car_id': 'X', 'bed_id': 'Y'}) + expected_.add_column(SArray(["A", "B", "C", "D"]), "last_name", inplace=True) + expected_.add_column(SArray([31, 32, 33, None]), "dep_id", inplace=True) + expected_.add_column(SArray([1, 2, 3, 4]), "org_id", inplace=True) + expected_.add_column(SArray([1, 2, 3, 4]), "bed_id", inplace=True) + expected_.add_column( + SArray(["Sales", None, "Engineering", None]), "dep_name", inplace=True + ) + expected_.add_column(SArray([1, None, 3, None]), "Y", inplace=True) + expected_.add_column(SArray([1, None, 3, None]), "car_id", inplace=True) + + res = employees_.join( + departments_, + on=join_keys_, + how="left", + alter_name={"car_id": "X", "bed_id": "Y"}, + ) self.__assert_join_results_equal(res, expected_) ## left size is smaller than right expected_ = SFrame() - expected_.add_column(SArray([31,33,34]), 'dep_id', inplace=True) - expected_.add_column(SArray(['A','C','F']), 'last_name', inplace=True) - expected_.add_column(SArray(['Sales','Engineering', None]), 'dep_name', inplace=True) - expected_.add_column(SArray([1,3,5]), 'bed_id', inplace=True) - expected_.add_column(SArray([1,3,None]), 'car_id', inplace=True) - expected_.add_column(SArray([1,3,None]), 'org_id', inplace=True) - expected_.add_column(SArray([1,3,None]), 'Y', inplace=True) - - res = departments_.join(employees_, on=join_keys_, how='left', alter_name={'bed_id': 'Y'}) + expected_.add_column(SArray([31, 33, 34]), "dep_id", inplace=True) + expected_.add_column(SArray(["A", "C", "F"]), "last_name", inplace=True) + expected_.add_column( + SArray(["Sales", "Engineering", None]), "dep_name", inplace=True + ) + expected_.add_column(SArray([1, 3, 5]), "bed_id", inplace=True) + expected_.add_column(SArray([1, 3, None]), "car_id", inplace=True) + expected_.add_column(SArray([1, 3, None]), "org_id", inplace=True) + expected_.add_column(SArray([1, 3, None]), "Y", inplace=True) + + res = departments_.join( + employees_, on=join_keys_, how="left", alter_name={"bed_id": "Y"} + ) self.__assert_join_results_equal(res, expected_) ## right expected_ = SFrame() - expected_.add_column(SArray(['A','C','F']), 'last_name', inplace=True) - expected_.add_column(SArray([31,33,34]), 'dep_id', inplace=True) - expected_.add_column(SArray([1,3,None]), 'org_id', inplace=True) - expected_.add_column(SArray([1,3,None]), 'bed_id', inplace=True) - expected_.add_column(SArray(['Sales','Engineering', None]), 'dep_name', inplace=True) - expected_.add_column(SArray([1,3,5]), 'Y', inplace=True) - expected_.add_column(SArray([1,3,None]), 'car_id', inplace=True) - - res = employees_.join(departments_, on=join_keys_, how='right', alter_name={'car_id': 'X', 'bed_id': 'Y'}) + expected_.add_column(SArray(["A", "C", "F"]), "last_name", inplace=True) + expected_.add_column(SArray([31, 33, 34]), "dep_id", inplace=True) + expected_.add_column(SArray([1, 3, None]), "org_id", inplace=True) + expected_.add_column(SArray([1, 3, None]), "bed_id", inplace=True) + expected_.add_column( + SArray(["Sales", "Engineering", None]), "dep_name", inplace=True + ) + expected_.add_column(SArray([1, 3, 5]), "Y", inplace=True) + expected_.add_column(SArray([1, 3, None]), "car_id", inplace=True) + + res = employees_.join( + departments_, + on=join_keys_, + how="right", + alter_name={"car_id": "X", "bed_id": "Y"}, + ) self.__assert_join_results_equal(res, expected_) ## outer expected_ = SFrame() - expected_.add_column(SArray(['A','B','C','D','F']), 'last_name', inplace=True) - expected_.add_column(SArray([31,32,33,None,34]), 'dep_id', inplace=True) - expected_.add_column(SArray([1,2,3,4,None]), 'org_id', inplace=True) - expected_.add_column(SArray([1,2,3,4,None]), 'bed_id', inplace=True) - expected_.add_column(SArray(['Sales', None, 'Engineering', None, None]), 'dep_name', inplace=True) - expected_.add_column(SArray([1,None,3,None,5]), 'Y', inplace=True) - expected_.add_column(SArray([1,None,3,None,None]), 'car_id', inplace=True) - - res = employees_.join(departments_, on=join_keys_, how='outer', alter_name={'car_id': 'X', 'bed_id': 'Y'}) + expected_.add_column( + SArray(["A", "B", "C", "D", "F"]), "last_name", inplace=True + ) + expected_.add_column(SArray([31, 32, 33, None, 34]), "dep_id", inplace=True) + expected_.add_column(SArray([1, 2, 3, 4, None]), "org_id", inplace=True) + expected_.add_column(SArray([1, 2, 3, 4, None]), "bed_id", inplace=True) + expected_.add_column( + SArray(["Sales", None, "Engineering", None, None]), "dep_name", inplace=True + ) + expected_.add_column(SArray([1, None, 3, None, 5]), "Y", inplace=True) + expected_.add_column(SArray([1, None, 3, None, None]), "car_id", inplace=True) + + res = employees_.join( + departments_, + on=join_keys_, + how="outer", + alter_name={"car_id": "X", "bed_id": "Y"}, + ) self.__assert_join_results_equal(res, expected_) ## error cases with self.assertRaises(KeyError): - res = employees_.join(departments_, on=join_keys_, how='right', alter_name={ - 'some_id': 'car_id', 'bed_id': 'Y'}) + res = employees_.join( + departments_, + on=join_keys_, + how="right", + alter_name={"some_id": "car_id", "bed_id": "Y"}, + ) with self.assertRaises(ValueError): - res = employees_.join(departments_, on=join_keys_, how='right', alter_name={ - 'car_id': 'car_id', 'bed_id': 'car_id'}) + res = employees_.join( + departments_, + on=join_keys_, + how="right", + alter_name={"car_id": "car_id", "bed_id": "car_id"}, + ) ## resolution order is not independent with self.assertRaises(ValueError): - res = employees_.join(departments_, on=join_keys_, how='right', alter_name={ - 'car_id': 'X', 'bed_id': 'car_id'}) + res = employees_.join( + departments_, + on=join_keys_, + how="right", + alter_name={"car_id": "X", "bed_id": "car_id"}, + ) with self.assertRaises(ValueError): - res = employees_.join(departments_, on=join_keys_, how='right', alter_name={ - 'car_id': 'bed_id', 'bed_id': 'car_id'}) + res = employees_.join( + departments_, + on=join_keys_, + how="right", + alter_name={"car_id": "bed_id", "bed_id": "car_id"}, + ) ## duplicate values with self.assertRaises(RuntimeError): - res = employees_.join(departments_, on=join_keys_, how='right', alter_name={ - 'car_id': 'X', 'bed_id': 'X'}) + res = employees_.join( + departments_, + on=join_keys_, + how="right", + alter_name={"car_id": "X", "bed_id": "X"}, + ) def test_big_composite_join(self): # Create a semi large SFrame with composite primary key (letter, number) @@ -1880,7 +2329,7 @@ def test_big_composite_join(self): number_keys = [] data = [] for i in string.ascii_lowercase: - for j in range(0,100): + for j in range(0, 100): letter_keys.append(i) number_keys.append(j) which = j % 3 @@ -1891,90 +2340,92 @@ def test_big_composite_join(self): elif which == 2: data.append(string.hexdigits) pk_gibberish = SFrame() - pk_gibberish['letter'] = SArray(letter_keys, str) - pk_gibberish['number'] = SArray(number_keys, int) - pk_gibberish['data'] = SArray(data, str) + pk_gibberish["letter"] = SArray(letter_keys, str) + pk_gibberish["number"] = SArray(number_keys, int) + pk_gibberish["data"] = SArray(data, str) # Some rows that won't match more_data = [] more_letter_keys = [] more_number_keys = [] - for i in range(0,40000): - more_data.append('fish') - more_letter_keys.append('A') + for i in range(0, 40000): + more_data.append("fish") + more_letter_keys.append("A") more_number_keys.append(200) - for i in range(0,80): - for j in range(100,1000): - more_data.append('waffles') + for i in range(0, 80): + for j in range(100, 1000): + more_data.append("waffles") more_letter_keys.append(letter_keys[j]) more_number_keys.append(number_keys[j]) # Non-matching row in this stretch if j == 147: - more_letter_keys[-1] = 'A' - for i in range(0,5000): - more_data.append('pizza') - more_letter_keys.append('Z') + more_letter_keys[-1] = "A" + for i in range(0, 5000): + more_data.append("pizza") + more_letter_keys.append("Z") more_number_keys.append(400) join_with_gibberish = SFrame() - join_with_gibberish['data'] = SArray(more_data, str) - join_with_gibberish['moredata'] = SArray(more_data, str) - join_with_gibberish['a_number'] = SArray(more_number_keys, int) - join_with_gibberish['a_letter'] = SArray(more_letter_keys, str) + join_with_gibberish["data"] = SArray(more_data, str) + join_with_gibberish["moredata"] = SArray(more_data, str) + join_with_gibberish["a_number"] = SArray(more_number_keys, int) + join_with_gibberish["a_letter"] = SArray(more_letter_keys, str) expected_answer = SFrame() exp_letter = [] exp_number = [] exp_data = [] - for i in range(0,80): + for i in range(0, 80): exp_letter.extend(letter_keys[100:147]) exp_number.extend(number_keys[100:147]) exp_letter.extend(letter_keys[148:1000]) exp_number.extend(number_keys[148:1000]) exp_data.extend(data[100:147]) exp_data.extend(data[148:1000]) - expected_answer['letter'] = SArray(exp_letter, str) - expected_answer['number'] = SArray(exp_number, int) - expected_answer['data'] = SArray(exp_data, str) - expected_answer['data.1'] = 'waffles' - expected_answer['moredata'] = 'waffles' + expected_answer["letter"] = SArray(exp_letter, str) + expected_answer["number"] = SArray(exp_number, int) + expected_answer["data"] = SArray(exp_data, str) + expected_answer["data.1"] = "waffles" + expected_answer["moredata"] = "waffles" beg = time.time() - res = pk_gibberish.join(join_with_gibberish, on={'letter':'a_letter','number':'a_number'}) + res = pk_gibberish.join( + join_with_gibberish, on={"letter": "a_letter", "number": "a_number"} + ) end = time.time() - print("Join took " + str(end-beg) + " seconds") + print("Join took " + str(end - beg) + " seconds") self.__assert_join_results_equal(res, expected_answer) def test_convert_dataframe_empty(self): sf = SFrame() - sf['a'] = SArray([], int) + sf["a"] = SArray([], int) df = sf.to_dataframe() - self.assertEqual(df['a'].dtype, int) + self.assertEqual(df["a"].dtype, int) sf1 = SFrame(df) - self.assertEqual(sf1['a'].dtype, int) + self.assertEqual(sf1["a"].dtype, int) self.assertEqual(sf1.num_rows(), 0) def test_replace_one_column(self): sf = SFrame() - sf['a'] = [1,2,3] - self.assertEqual(list(sf['a']), [1,2,3]) + sf["a"] = [1, 2, 3] + self.assertEqual(list(sf["a"]), [1, 2, 3]) # this should succeed as we are replacing a new column - sf['a'] = [1,2] - self.assertEqual(list(sf['a']), [1,2]) + sf["a"] = [1, 2] + self.assertEqual(list(sf["a"]), [1, 2]) # failed to add new column should revert original sframe with self.assertRaises(TypeError): - sf['a'] = [1,2,'a'] + sf["a"] = [1, 2, "a"] - self.assertEqual(list(sf['a']), [1,2]) + self.assertEqual(list(sf["a"]), [1, 2]) # add a column with different length should fail if there are more than one column sf = SFrame() - sf['a'] = [1,2,3] - sf['b'] = ['a', 'b', 'c'] + sf["a"] = [1, 2, 3] + sf["b"] = ["a", "b", "c"] with self.assertRaises(RuntimeError): - sf['a'] = [1,2] + sf["a"] = [1, 2] def test_filter_by(self): # Set up SFrame to filter by @@ -2004,9 +2455,19 @@ def __build_data_list_with_none(data_lst): return data_lst sf_none = SFrame() - sf_none.add_column(SArray(__build_data_list_with_none(self.int_data[:])), "ints", inplace=True) - sf_none.add_column(SArray(__build_data_list_with_none(self.float_data[:])), "floats", inplace=True) - sf_none.add_column(SArray(__build_data_list_with_none(self.string_data[:])), "strings", inplace=True) + sf_none.add_column( + SArray(__build_data_list_with_none(self.int_data[:])), "ints", inplace=True + ) + sf_none.add_column( + SArray(__build_data_list_with_none(self.float_data[:])), + "floats", + inplace=True, + ) + sf_none.add_column( + SArray(__build_data_list_with_none(self.string_data[:])), + "strings", + inplace=True, + ) res = sf_none.filter_by(None, "ints") self.assertEqual(len(res), 3) @@ -2031,17 +2492,21 @@ def __build_data_list_with_none(data_lst): self.assertEqual(len(res), 1) self.assertEqual(res["ints"][0], 10) - res = sf.filter_by(map(lambda x : x - 5., self.float_data), "floats") + res = sf.filter_by(map(lambda x: x - 5.0, self.float_data), "floats") self.assertEqual(len(res), 5) self.assertEqual(res["floats"][0], self.float_data[0]) - res = sf.filter_by(map(lambda x : x - 5., self.float_data), "floats", exclude=True) + res = sf.filter_by( + map(lambda x: x - 5.0, self.float_data), "floats", exclude=True + ) self.assertEqual(len(res), 5) self.assertEqual(res["floats"][0], self.float_data[5]) - res = sf.filter_by(filter(lambda x : len(x) > 1, self.string_data), "strings") + res = sf.filter_by(filter(lambda x: len(x) > 1, self.string_data), "strings") self.assertEqual(len(res), 1) self.assertEqual(res["strings"][0], self.string_data[-1]) - res = sf.filter_by(filter(lambda x : len(x) > 1, self.string_data), "strings", exclude=True) + res = sf.filter_by( + filter(lambda x: len(x) > 1, self.string_data), "strings", exclude=True + ) self.assertEqual(len(res), 9) self.assertEqual(res["strings"][0], self.string_data[0]) @@ -2051,17 +2516,23 @@ def __build_data_list_with_none(data_lst): res = sf.filter_by(SArray(self.int_data), "ints", exclude=True) self.assertEqual(list(res), []) - res = sf.filter_by([5,6], "ints") + res = sf.filter_by([5, 6], "ints") exp = SFrame() exp.add_column(SArray(self.int_data[4:6]), "ints", inplace=True) exp.add_column(SArray(self.float_data[4:6]), "floats", inplace=True) exp.add_column(SArray(self.string_data[4:6]), "strings", inplace=True) self.__assert_join_results_equal(res, exp) exp_opposite = SFrame() - exp_opposite.add_column(SArray(self.int_data[:4]+self.int_data[6:]), "ints", inplace=True) - exp_opposite.add_column(SArray(self.float_data[:4]+self.float_data[6:]), "floats", inplace=True) - exp_opposite.add_column(SArray(self.string_data[:4]+self.string_data[6:]), "strings", inplace=True) - res = sf.filter_by([5,6], "ints", exclude=True) + exp_opposite.add_column( + SArray(self.int_data[:4] + self.int_data[6:]), "ints", inplace=True + ) + exp_opposite.add_column( + SArray(self.float_data[:4] + self.float_data[6:]), "floats", inplace=True + ) + exp_opposite.add_column( + SArray(self.string_data[:4] + self.string_data[6:]), "strings", inplace=True + ) + res = sf.filter_by([5, 6], "ints", exclude=True) self.__assert_join_results_equal(res, exp_opposite) exp_one = SFrame() @@ -2069,9 +2540,15 @@ def __build_data_list_with_none(data_lst): exp_one.add_column(SArray(self.float_data[4:5]), "floats", inplace=True) exp_one.add_column(SArray(self.string_data[4:5]), "strings", inplace=True) exp_all_but_one = SFrame() - exp_all_but_one.add_column(SArray(self.int_data[:4]+self.int_data[5:]), "ints", inplace=True) - exp_all_but_one.add_column(SArray(self.float_data[:4]+self.float_data[5:]), "floats", inplace=True) - exp_all_but_one.add_column(SArray(self.string_data[:4]+self.string_data[5:]), "strings", inplace=True) + exp_all_but_one.add_column( + SArray(self.int_data[:4] + self.int_data[5:]), "ints", inplace=True + ) + exp_all_but_one.add_column( + SArray(self.float_data[:4] + self.float_data[5:]), "floats", inplace=True + ) + exp_all_but_one.add_column( + SArray(self.string_data[:4] + self.string_data[5:]), "strings", inplace=True + ) res = sf.filter_by(5, "ints") self.__assert_join_results_equal(res, exp_one) @@ -2084,77 +2561,77 @@ def __build_data_list_with_none(data_lst): self.__assert_join_results_equal(res, exp_all_but_one) # Only missing values - res = sf.filter_by([77,77,88,88], "ints") + res = sf.filter_by([77, 77, 88, 88], "ints") # Test against empty SFrame with correct columns/types - self.__assert_join_results_equal(res, exp_one[exp_one['ints'] == 9000]) - res = sf.filter_by([77,77,88,88], "ints", exclude=True) + self.__assert_join_results_equal(res, exp_one[exp_one["ints"] == 9000]) + res = sf.filter_by([77, 77, 88, 88], "ints", exclude=True) self.__assert_join_results_equal(res, sf) - # Duplicate values - res = sf.filter_by([6,6,5,5,6,5,5,6,5,5,5], "ints") + res = sf.filter_by([6, 6, 5, 5, 6, 5, 5, 6, 5, 5, 5], "ints") self.__assert_join_results_equal(res, exp) - res = sf.filter_by([6,6,5,5,6,5,5,6,5,5,5], "ints", exclude=True) + res = sf.filter_by([6, 6, 5, 5, 6, 5, 5, 6, 5, 5, 5], "ints", exclude=True) self.__assert_join_results_equal(res, exp_opposite) # Duplicate and missing - res = sf.filter_by([11,12,46,6,6,55,5,5], "ints") + res = sf.filter_by([11, 12, 46, 6, 6, 55, 5, 5], "ints") self.__assert_join_results_equal(res, exp) - res = sf.filter_by([11,12,46,6,6,55,5,5], "ints", exclude=True) + res = sf.filter_by([11, 12, 46, 6, 6, 55, 5, 5], "ints", exclude=True) self.__assert_join_results_equal(res, exp_opposite) - # Type mismatch with self.assertRaises(TypeError): res = sf.filter_by(["hi"], "ints") # Column doesn't exist with self.assertRaises(KeyError): - res = sf.filter_by([1,2], "intssss") + res = sf.filter_by([1, 2], "intssss") # Something that can't be turned into an SArray with self.assertRaises(Exception): - res = sf.filter_by({1:2,3:4}, "ints") + res = sf.filter_by({1: 2, 3: 4}, "ints") # column_name not given as string with self.assertRaises(TypeError): - res = sf.filter_by(1,2) + res = sf.filter_by(1, 2) # Duplicate column names after join. Should be last because of the # renames. - sf.rename({'ints':'id','floats':'id1','strings':'id11'}, inplace=True) - exp.rename({'ints':'id','floats':'id1','strings':'id11'}, inplace=True) - exp_opposite.rename({'ints':'id','floats':'id1','strings':'id11'}, inplace=True) - res = sf.filter_by([5,6], "id") + sf.rename({"ints": "id", "floats": "id1", "strings": "id11"}, inplace=True) + exp.rename({"ints": "id", "floats": "id1", "strings": "id11"}, inplace=True) + exp_opposite.rename( + {"ints": "id", "floats": "id1", "strings": "id11"}, inplace=True + ) + res = sf.filter_by([5, 6], "id") self.__assert_join_results_equal(res, exp) - res = sf.filter_by([5,6], "id", exclude=True) + res = sf.filter_by([5, 6], "id", exclude=True) self.__assert_join_results_equal(res, exp_opposite) # XXXXXX: should be inner function def __test_to_from_dataframe(self, data, type): sf = SFrame() - sf['a'] = data + sf["a"] = data df = sf.to_dataframe() sf1 = SFrame(df) - self.assertTrue(sf1.dtype[0]== type) + self.assertTrue(sf1.dtype[0] == type) - df = pd.DataFrame({'val': data}) + df = pd.DataFrame({"val": data}) sf1 = SFrame(df) - self.assertTrue(sf1.dtype[0]== type) + self.assertTrue(sf1.dtype[0] == type) def test_to_from_dataframe(self): - self.__test_to_from_dataframe([1,2,3], int) - self.__test_to_from_dataframe(['a', 'b', 'c'], str) + self.__test_to_from_dataframe([1, 2, 3], int) + self.__test_to_from_dataframe(["a", "b", "c"], str) self.__test_to_from_dataframe([1.0, 2.0, 3.0], float) - self.__test_to_from_dataframe([[1, 'b', {'a': 1}], [1,2,3]], list) - self.__test_to_from_dataframe([{'a':1, 1:None}, {'b':2}], dict) - self.__test_to_from_dataframe([[1,2],[1,2],[]], array.array) + self.__test_to_from_dataframe([[1, "b", {"a": 1}], [1, 2, 3]], list) + self.__test_to_from_dataframe([{"a": 1, 1: None}, {"b": 2}], dict) + self.__test_to_from_dataframe([[1, 2], [1, 2], []], array.array) def test_pack_columns_exception(self): sf = SFrame() - sf['a'] = [1, 2, 3, None, None] - sf['b'] = [None, '2', '3', None, '5'] - sf['c'] = [None, 2.0, 3.0, None, 5.0] + sf["a"] = [1, 2, 3, None, None] + sf["b"] = [None, "2", "3", None, "5"] + sf["c"] = [None, 2.0, 3.0, None, 5.0] # cannot pack non array value into array with self.assertRaises(TypeError): @@ -2162,11 +2639,11 @@ def test_pack_columns_exception(self): # cannot given non numeric na vlaue to array with self.assertRaises(ValueError): - sf.pack_columns(dtype=array.array, fill_na='c') + sf.pack_columns(dtype=array.array, fill_na="c") # cannot pack non exist columns with self.assertRaises(ValueError): - sf.pack_columns(['d','a']) + sf.pack_columns(["d", "a"]) # dtype has to be dict/array/list with self.assertRaises(ValueError): @@ -2174,268 +2651,266 @@ def test_pack_columns_exception(self): # pack duplicate columns with self.assertRaises(ValueError): - sf.pack_columns(['a','a']) + sf.pack_columns(["a", "a"]) # pack partial columns to array, should fail if for columns that are not numeric with self.assertRaises(TypeError): - sf.pack_columns(['a','b'], dtype=array.array) + sf.pack_columns(["a", "b"], dtype=array.array) with self.assertRaises(TypeError): - sf.pack_columns(column_name_prefix = 1) + sf.pack_columns(column_name_prefix=1) with self.assertRaises(ValueError): - sf.pack_columns(column_name_prefix = '1') + sf.pack_columns(column_name_prefix="1") with self.assertRaises(ValueError): - sf.pack_columns(column_name_prefix = 'c', column_names=['a', 'b']) + sf.pack_columns(column_name_prefix="c", column_names=["a", "b"]) def test_pack_columns2(self): sf = SFrame() - sf['id'] = [1, 2, 3, 4] - sf['category.a'] = [None, '2', '3', None] - sf['category.b'] = [None, 2.0, None, 4.0] - - expected = SArray([ - [None, None], - ['2', 2.0], - ['3', None], - [None, 4.0]]) - result = sf.pack_columns(column_name_prefix='category') - self.assertEqual(result.column_names(), ['id', 'category']) - self.__assert_sarray_equal(result['id'], sf['id']) - self.__assert_sarray_equal(result['category'], expected) - - result = sf.pack_columns(column_name_prefix='category', new_column_name="new name") - self.assertEqual(result.column_names(), ['id', 'new name']) - self.__assert_sarray_equal(result['id'], sf['id']) - self.__assert_sarray_equal(result['new name'], expected) + sf["id"] = [1, 2, 3, 4] + sf["category.a"] = [None, "2", "3", None] + sf["category.b"] = [None, 2.0, None, 4.0] + + expected = SArray([[None, None], ["2", 2.0], ["3", None], [None, 4.0]]) + result = sf.pack_columns(column_name_prefix="category") + self.assertEqual(result.column_names(), ["id", "category"]) + self.__assert_sarray_equal(result["id"], sf["id"]) + self.__assert_sarray_equal(result["category"], expected) + + result = sf.pack_columns( + column_name_prefix="category", new_column_name="new name" + ) + self.assertEqual(result.column_names(), ["id", "new name"]) + self.__assert_sarray_equal(result["id"], sf["id"]) + self.__assert_sarray_equal(result["new name"], expected) # default dtype is list - result = sf.pack_columns(column_name_prefix='category', dtype=list) - self.assertEqual(result.column_names(), ['id', 'category']) - self.__assert_sarray_equal(result['category'], expected) + result = sf.pack_columns(column_name_prefix="category", dtype=list) + self.assertEqual(result.column_names(), ["id", "category"]) + self.__assert_sarray_equal(result["category"], expected) # remove prefix == True by default - expected = SArray([ - {}, - {'a':'2', 'b':2.0}, - {'a':'3'}, - {'b':4.0} - ]) - result = sf.pack_columns(column_name_prefix='category', dtype=dict) - self.__assert_sarray_equal(result['category'], expected) + expected = SArray([{}, {"a": "2", "b": 2.0}, {"a": "3"}, {"b": 4.0}]) + result = sf.pack_columns(column_name_prefix="category", dtype=dict) + self.__assert_sarray_equal(result["category"], expected) # remove prefix == False - expected = SArray([ - {}, - {'category.a':'2', 'category.b':2.0}, - {'category.a':'3'}, - {'category.b':4.0} - ]) - result = sf.pack_columns(column_name_prefix='category', dtype=dict, remove_prefix=False) - self.assertEqual(result.column_names(), ['id', 'category']) - self.__assert_sarray_equal(result['category'], expected) + expected = SArray( + [ + {}, + {"category.a": "2", "category.b": 2.0}, + {"category.a": "3"}, + {"category.b": 4.0}, + ] + ) + result = sf.pack_columns( + column_name_prefix="category", dtype=dict, remove_prefix=False + ) + self.assertEqual(result.column_names(), ["id", "category"]) + self.__assert_sarray_equal(result["category"], expected) # fill_na - expected = SArray([ - {'a':1, 'b':1}, - {'a':'2', 'b':2.0}, - {'a':'3', 'b':1}, - {'a':1, 'b':4.0} - ]) - result = sf.pack_columns(column_name_prefix='category', dtype=dict, fill_na = 1) - self.__assert_sarray_equal(result['category'], expected) - - expected = SArray([ - [1], - [2], - [3], - [4]], list) - result = sf.pack_columns(['id'], new_column_name='id') - self.assertEqual(sorted(result.column_names()), sorted(['id', 'category.a', 'category.b'])) - self.__assert_sarray_equal(result['id'], expected) + expected = SArray( + [ + {"a": 1, "b": 1}, + {"a": "2", "b": 2.0}, + {"a": "3", "b": 1}, + {"a": 1, "b": 4.0}, + ] + ) + result = sf.pack_columns(column_name_prefix="category", dtype=dict, fill_na=1) + self.__assert_sarray_equal(result["category"], expected) + + expected = SArray([[1], [2], [3], [4]], list) + result = sf.pack_columns(["id"], new_column_name="id") + self.assertEqual( + sorted(result.column_names()), sorted(["id", "category.a", "category.b"]) + ) + self.__assert_sarray_equal(result["id"], expected) def test_pack_columns(self): sf = SFrame() - sf['id'] = [1, 2, 3, 4, 5] - sf['b'] = [None, '2', '3', None, '5'] - sf['c'] = [None, 2.0, 3.0, None, 5.0] - - expected_all_default = SArray([ - [1, None, None], - [2, '2', 2.0], - [3, '3', 3.0], - [4, None, None], - [5, '5', 5.0] - ]) + sf["id"] = [1, 2, 3, 4, 5] + sf["b"] = [None, "2", "3", None, "5"] + sf["c"] = [None, 2.0, 3.0, None, 5.0] + + expected_all_default = SArray( + [ + [1, None, None], + [2, "2", 2.0], + [3, "3", 3.0], + [4, None, None], + [5, "5", 5.0], + ] + ) # pack all columns, all default values - self.__assert_sarray_equal(sf.pack_columns()['X1'], expected_all_default) - - expected_ab_default = SArray([ - [1, None], - [2, '2'], - [3, '3'], - [4, None], - [5, '5'] - ]) - - expected_all_fillna_1 = SArray([ - [1, -1, -1], - [2, '2', 2.0], - [3, '3', 3.0], - [4, -1, -1], - [5, '5', 5.0] - ]) + self.__assert_sarray_equal(sf.pack_columns()["X1"], expected_all_default) + + expected_ab_default = SArray( + [[1, None], [2, "2"], [3, "3"], [4, None], [5, "5"]] + ) + + expected_all_fillna_1 = SArray( + [[1, -1, -1], [2, "2", 2.0], [3, "3", 3.0], [4, -1, -1], [5, "5", 5.0]] + ) # pack all columns do not drop na and also fill with some value result = sf.pack_columns(fill_na=-1) - self.assertEqual(result.column_names(), ['X1']) - self.__assert_sarray_equal(result['X1'], expected_all_fillna_1) + self.assertEqual(result.column_names(), ["X1"]) + self.__assert_sarray_equal(result["X1"], expected_all_fillna_1) # pack partial columns, all default value - result = sf.pack_columns(['id','b']) - self.assertEqual(result.column_names(), ['c','X2']) - self.__assert_sarray_equal(result['c'], sf['c']) - self.__assert_sarray_equal(result['X2'], expected_ab_default) - - expected_sarray_ac_fillna_default = SArray([ - [1, float('NaN')], - [2, 2.0], - [3, 3.0], - [4, float('NaN')], - [5, 5.0] - ]) - - result = sf.pack_columns(['id','c'], dtype=array.array) - self.assertEqual(result.column_names(), ['b', 'X2']) - self.__assert_sarray_equal(result['b'], sf['b']) - self.__assert_sarray_equal(result['X2'], expected_sarray_ac_fillna_default) - - expected_dict_default = SArray([ - {'id': 1}, - {'id': 2, 'b':'2', 'c': 2.0}, - {'id': 3, 'b':'3', 'c': 3.0}, - {'id':4 }, - {'id':5, 'b':'5', 'c': 5.0} - ]) + result = sf.pack_columns(["id", "b"]) + self.assertEqual(result.column_names(), ["c", "X2"]) + self.__assert_sarray_equal(result["c"], sf["c"]) + self.__assert_sarray_equal(result["X2"], expected_ab_default) + + expected_sarray_ac_fillna_default = SArray( + [[1, float("NaN")], [2, 2.0], [3, 3.0], [4, float("NaN")], [5, 5.0]] + ) + + result = sf.pack_columns(["id", "c"], dtype=array.array) + self.assertEqual(result.column_names(), ["b", "X2"]) + self.__assert_sarray_equal(result["b"], sf["b"]) + self.__assert_sarray_equal(result["X2"], expected_sarray_ac_fillna_default) + + expected_dict_default = SArray( + [ + {"id": 1}, + {"id": 2, "b": "2", "c": 2.0}, + {"id": 3, "b": "3", "c": 3.0}, + {"id": 4}, + {"id": 5, "b": "5", "c": 5.0}, + ] + ) result = sf.pack_columns(dtype=dict) - self.__assert_sarray_equal(result['X1'], expected_dict_default) - - expected_dict_fillna = SArray([ - {'id': 1, 'b':-1, 'c': -1}, - {'id': 2, 'b':'2', 'c': 2.0}, - {'id': 3, 'b':'3', 'c': 3.0}, - {'id': 4, 'b':-1, 'c': -1}, - {'id': 5, 'b':'5', 'c': 5.0} - ]) + self.__assert_sarray_equal(result["X1"], expected_dict_default) + + expected_dict_fillna = SArray( + [ + {"id": 1, "b": -1, "c": -1}, + {"id": 2, "b": "2", "c": 2.0}, + {"id": 3, "b": "3", "c": 3.0}, + {"id": 4, "b": -1, "c": -1}, + {"id": 5, "b": "5", "c": 5.0}, + ] + ) result = sf.pack_columns(dtype=dict, fill_na=-1) - self.__assert_sarray_equal(result['X1'], expected_dict_fillna) + self.__assert_sarray_equal(result["X1"], expected_dict_fillna) # pack large number of rows sf = SFrame() num_rows = 100000 - sf['a'] = range(0, num_rows) - sf['b'] = range(0, num_rows) - result = sf.pack_columns(['a', 'b']) + sf["a"] = range(0, num_rows) + sf["b"] = range(0, num_rows) + result = sf.pack_columns(["a", "b"]) self.assertEqual(len(result), num_rows) def test_pack_columns_dtype(self): - a = SFrame({'name':[-140500967,-1405039672],'data':[3,4]}) - b = a.pack_columns(['name','data'],dtype=array.array) - expected = SArray([[-140500967, 3],[-1405039672,4]]) - self.__assert_sarray_equal(b['X1'], expected) + a = SFrame({"name": [-140500967, -1405039672], "data": [3, 4]}) + b = a.pack_columns(["name", "data"], dtype=array.array) + expected = SArray([[-140500967, 3], [-1405039672, 4]]) + self.__assert_sarray_equal(b["X1"], expected) def test_unpack_dict_mixtype(self): - sf = SFrame({'a':[{'a':["haha", "hoho"]}, {'a':array.array('d', [1,2,3])}]}) - sf = sf.unpack('a', column_name_prefix = '') - self.assertEqual(sf['a'].dtype, list) - - sf = SFrame({'a':[{'a':["haha", "hoho"]}, {'a':array.array('d', [1,2,3])}]}) + sf = SFrame( + {"a": [{"a": ["haha", "hoho"]}, {"a": array.array("d", [1, 2, 3])}]} + ) + sf = sf.unpack("a", column_name_prefix="") + self.assertEqual(sf["a"].dtype, list) + + sf = SFrame( + {"a": [{"a": ["haha", "hoho"]}, {"a": array.array("d", [1, 2, 3])}]} + ) sf = sf.unpack() - self.assertEqual(sf['a'].dtype, list) + self.assertEqual(sf["a"].dtype, list) - sf = SFrame({'a':[{'a':["haha", "hoho"]}, {'a':None}]}) - sf = sf.unpack('a', column_name_prefix = '') - self.assertEqual(sf['a'].dtype, list) + sf = SFrame({"a": [{"a": ["haha", "hoho"]}, {"a": None}]}) + sf = sf.unpack("a", column_name_prefix="") + self.assertEqual(sf["a"].dtype, list) - sf = SFrame({'a':[{'a':["haha", "hoho"]}, {'a':None}]}) - sf = sf.unpack('a', column_name_prefix = '') - self.assertEqual(sf['a'].dtype, list) + sf = SFrame({"a": [{"a": ["haha", "hoho"]}, {"a": None}]}) + sf = sf.unpack("a", column_name_prefix="") + self.assertEqual(sf["a"].dtype, list) - sa = SArray([{'a':array.array('d', [1,2,3])}, {'a':None}]) - sf = sa.unpack(column_name_prefix = '') - self.assertEqual(sf['a'].dtype, array.array) + sa = SArray([{"a": array.array("d", [1, 2, 3])}, {"a": None}]) + sf = sa.unpack(column_name_prefix="") + self.assertEqual(sf["a"].dtype, array.array) - sa = SArray([{'a':array.array('d', [1,2,3])}, {'a':{'b':1}}]) - sf = sa.unpack(column_name_prefix = '') - self.assertEqual(sf['a'].dtype, str) - - sa = SArray([{'a': 1, 'b': 0.1}, {'a': 0.1, 'b': 1}]) - sf = sa.unpack(column_name_prefix = '') - self.assertEqual(sf['a'].dtype, float) - self.assertEqual(sf['b'].dtype, float) + sa = SArray([{"a": array.array("d", [1, 2, 3])}, {"a": {"b": 1}}]) + sf = sa.unpack(column_name_prefix="") + self.assertEqual(sf["a"].dtype, str) + sa = SArray([{"a": 1, "b": 0.1}, {"a": 0.1, "b": 1}]) + sf = sa.unpack(column_name_prefix="") + self.assertEqual(sf["a"].dtype, float) + self.assertEqual(sf["b"].dtype, float) def test_unpack_list(self): - sa = SArray([ - [1, None, None], - [2, '2', 2.0], - [3, '3', 3.0], - [4, None, None], - [5, '5', 5.0] - ]) + sa = SArray( + [ + [1, None, None], + [2, "2", 2.0], + [3, "3", 3.0], + [4, None, None], + [5, "5", 5.0], + ] + ) expected = SFrame() - expected ['a'] = [1, 2, 3, 4, 5] - expected ['b'] = [None, '2', '3', None, '5'] - expected ['c'] = [None, 2.0, 3.0, None, 5.0] + expected["a"] = [1, 2, 3, 4, 5] + expected["b"] = [None, "2", "3", None, "5"] + expected["c"] = [None, 2.0, 3.0, None, 5.0] result = sa.unpack() - result.rename(dict(zip(result.column_names(), ['a','b','c'])), inplace=True) + result.rename(dict(zip(result.column_names(), ["a", "b", "c"])), inplace=True) assert_frame_equal(result.to_dataframe(), expected.to_dataframe()) - result = sa.unpack(column_name_prefix='ttt') - self.assertEqual(result.column_names(), ['ttt.0', 'ttt.1', 'ttt.2']) - result.rename(dict(zip(result.column_names(), ['a','b','c'])), inplace=True) + result = sa.unpack(column_name_prefix="ttt") + self.assertEqual(result.column_names(), ["ttt.0", "ttt.1", "ttt.2"]) + result.rename(dict(zip(result.column_names(), ["a", "b", "c"])), inplace=True) assert_frame_equal(result.to_dataframe(), expected.to_dataframe()) # column types result = sa.unpack(column_types=[int, str, float]) - result.rename(dict(zip(result.column_names(), ['a','b','c'])), inplace=True) + result.rename(dict(zip(result.column_names(), ["a", "b", "c"])), inplace=True) assert_frame_equal(result.to_dataframe(), expected.to_dataframe()) # more column types result = sa.unpack(column_types=[int, str, float, int]) - result.rename(dict(zip(result.column_names(), ['a','b','c','d'])), inplace=True) - e = expected.select_columns(['a','b','c']) - e.add_column(SArray([None for i in range(5)], int),'d', inplace=True) + result.rename( + dict(zip(result.column_names(), ["a", "b", "c", "d"])), inplace=True + ) + e = expected.select_columns(["a", "b", "c"]) + e.add_column(SArray([None for i in range(5)], int), "d", inplace=True) assert_frame_equal(result.to_dataframe(), e.to_dataframe()) # less column types result = sa.unpack(column_types=[int, str]) - result.rename(dict(zip(result.column_names(), ['a','b'])), inplace=True) - e = expected.select_columns(['a','b']) + result.rename(dict(zip(result.column_names(), ["a", "b"])), inplace=True) + e = expected.select_columns(["a", "b"]) assert_frame_equal(result.to_dataframe(), e.to_dataframe()) # fill na_value e = SFrame() - e['a'] = [1, 2, None, 4, 5] - e['b'] = [None, '2', '3', None, '5'] - e['c'] = [None, 2.0, None, None, 5.0] + e["a"] = [1, 2, None, 4, 5] + e["b"] = [None, "2", "3", None, "5"] + e["c"] = [None, 2.0, None, None, 5.0] result = sa.unpack(na_value=3) - result.rename(dict(zip(result.column_names(), ['a','b','c'])), inplace=True) + result.rename(dict(zip(result.column_names(), ["a", "b", "c"])), inplace=True) assert_frame_equal(result.to_dataframe(), e.to_dataframe()) # wrong length with self.assertRaises(TypeError): - sa.unpack(column_name_prefix=['a','b']) + sa.unpack(column_name_prefix=["a", "b"]) # wrong type with self.assertRaises(RuntimeError): - sa.unpack(column_types = [str, int, float]) + sa.unpack(column_types=[str, int, float]) # wrong limit types with self.assertRaises(TypeError): @@ -2443,7 +2918,7 @@ def test_unpack_list(self): # int array cannot be unpacked with self.assertRaises(TypeError): - SArray([1,2,3,4]).unpack() + SArray([1, 2, 3, 4]).unpack() # column name must be a string with self.assertRaises(TypeError): @@ -2451,11 +2926,11 @@ def test_unpack_list(self): # invalid column type with self.assertRaises(TypeError): - sa.unpack(column_types = int) + sa.unpack(column_types=int) # invalid column type with self.assertRaises(TypeError): - sa.unpack(column_types = [np.array]) + sa.unpack(column_types=[np.array]) # cannot infer type if no values with self.assertRaises(RuntimeError): @@ -2463,545 +2938,677 @@ def test_unpack_list(self): def test_unpack_array(self): import array - sa = SArray([ - array.array('d', [1, 1, 0]), - array.array('d', [2, -1, 1]), - array.array('d', [3, 3, 2]), - array.array('d', [-1, 2, 3]), - array.array('d', [5, 5, 4]) - ]) + + sa = SArray( + [ + array.array("d", [1, 1, 0]), + array.array("d", [2, -1, 1]), + array.array("d", [3, 3, 2]), + array.array("d", [-1, 2, 3]), + array.array("d", [5, 5, 4]), + ] + ) expected = SFrame() - expected ['a'] = [1.0, 2.0, 3.0, -1.0, 5.0] - expected ['b'] = [1.0, -1.0, 3.0, 2.0, 5.0] - expected ['c'] = [0.0, 1.0, 2.0, 3.0, 4.0] + expected["a"] = [1.0, 2.0, 3.0, -1.0, 5.0] + expected["b"] = [1.0, -1.0, 3.0, 2.0, 5.0] + expected["c"] = [0.0, 1.0, 2.0, 3.0, 4.0] result = sa.unpack() - result.rename(dict(zip(result.column_names(), ['a','b','c'])), inplace=True) + result.rename(dict(zip(result.column_names(), ["a", "b", "c"])), inplace=True) assert_frame_equal(result.to_dataframe(), expected.to_dataframe()) # right amount column names - result = sa.unpack(column_name_prefix = 'unpacked') - result.rename(dict(zip(result.column_names(), ['t.0', 't.1', 't.2'])), inplace=True) - result.rename(dict(zip(result.column_names(), ['a','b','c'])), inplace=True) + result = sa.unpack(column_name_prefix="unpacked") + result.rename( + dict(zip(result.column_names(), ["t.0", "t.1", "t.2"])), inplace=True + ) + result.rename(dict(zip(result.column_names(), ["a", "b", "c"])), inplace=True) assert_frame_equal(result.to_dataframe(), expected.to_dataframe()) # column types result = sa.unpack(column_types=[int, str, float]) - result.rename(dict(zip(result.column_names(), ['a','b','c'])), inplace=True) - expected['a'] = expected['a'].astype(int) - expected['b'] = expected['b'].astype(str) - expected['c'] = expected['c'].astype(float) + result.rename(dict(zip(result.column_names(), ["a", "b", "c"])), inplace=True) + expected["a"] = expected["a"].astype(int) + expected["b"] = expected["b"].astype(str) + expected["c"] = expected["c"].astype(float) assert_frame_equal(result.to_dataframe(), expected.to_dataframe()) # more column types result = sa.unpack(column_types=[int, str, float, int]) - result.rename(dict(zip(result.column_names(), ['a','b','c','d'])), inplace=True) - e = expected.select_columns(['a','b','c']) - e.add_column(SArray([None for i in range(5)], int),'d', inplace=True) + result.rename( + dict(zip(result.column_names(), ["a", "b", "c", "d"])), inplace=True + ) + e = expected.select_columns(["a", "b", "c"]) + e.add_column(SArray([None for i in range(5)], int), "d", inplace=True) assert_frame_equal(result.to_dataframe(), e.to_dataframe()) # less column types result = sa.unpack(column_types=[int, str]) - result.rename(dict(zip(result.column_names(), ['a','b'])), inplace=True) - e = expected.select_columns(['a','b']) + result.rename(dict(zip(result.column_names(), ["a", "b"])), inplace=True) + e = expected.select_columns(["a", "b"]) assert_frame_equal(result.to_dataframe(), e.to_dataframe()) # fill na_value e = SFrame() - e['a'] = SArray([1, 2, 3, None, 5], float) - e['b'] = SArray([1, None, 3, 2, 5], float) - e['c'] = SArray([0, 1, 2, 3, 4], float) + e["a"] = SArray([1, 2, 3, None, 5], float) + e["b"] = SArray([1, None, 3, 2, 5], float) + e["c"] = SArray([0, 1, 2, 3, 4], float) result = sa.unpack(na_value=-1) - result.rename(dict(zip(result.column_names(), ['a','b','c'])), inplace=True) + result.rename(dict(zip(result.column_names(), ["a", "b", "c"])), inplace=True) assert_frame_equal(result.to_dataframe(), e.to_dataframe()) def test_unpack_dict(self): - sf = SFrame([{'a':1,'b':2,'c':3},{'a':4,'b':5,'c':6}]) + sf = SFrame([{"a": 1, "b": 2, "c": 3}, {"a": 4, "b": 5, "c": 6}]) expected_sf = SFrame() - expected_sf["a"] = [1,4] - expected_sf["b"] = [2,5] - expected_sf["c"] = [3,6] + expected_sf["a"] = [1, 4] + expected_sf["b"] = [2, 5] + expected_sf["c"] = [3, 6] unpacked_sf = sf.unpack() assert_frame_equal(unpacked_sf.to_dataframe(), expected_sf.to_dataframe()) expected_sf = SFrame() - expected_sf["xx.a"] = [1,4] - expected_sf["xx.b"] = [2,5] - expected_sf["xx.c"] = [3,6] - unpacked_sf = sf.unpack(column_name_prefix='xx') + expected_sf["xx.a"] = [1, 4] + expected_sf["xx.b"] = [2, 5] + expected_sf["xx.c"] = [3, 6] + unpacked_sf = sf.unpack(column_name_prefix="xx") assert_frame_equal(unpacked_sf.to_dataframe(), expected_sf.to_dataframe()) - packed_sf = SFrame({"X1":{'a':1,'b':2,'c':3},"X2":{'a':4,'b':5,'c':6}}) + packed_sf = SFrame( + {"X1": {"a": 1, "b": 2, "c": 3}, "X2": {"a": 4, "b": 5, "c": 6}} + ) with self.assertRaises(RuntimeError): packed_sf.unpack() sf = SFrame() - sf["user_id"] = [1,2,3,4,5,6,7] - sf["is_restaurant"] = [1, 1,0,0, 1, None, None] - sf["is_retail"] = [None,1,1,None,1, None, None] - sf["is_electronics"] = ["yes", "no","yes",None,"no", None, None] - + sf["user_id"] = [1, 2, 3, 4, 5, 6, 7] + sf["is_restaurant"] = [1, 1, 0, 0, 1, None, None] + sf["is_retail"] = [None, 1, 1, None, 1, None, None] + sf["is_electronics"] = ["yes", "no", "yes", None, "no", None, None] packed_sf = SFrame() - packed_sf['user_id'] = sf['user_id'] + packed_sf["user_id"] = sf["user_id"] packed_sf["category"] = [ - {"is_restaurant": 1, "is_electronics": "yes"}, - {"is_restaurant": 1, "is_retail": 1, "is_electronics": "no"}, - {"is_restaurant": 0, "is_retail": 1, "is_electronics": "yes"}, - {"is_restaurant": 0 }, - {"is_restaurant": 1, "is_retail": 1, "is_electronics": "no"}, - { }, - None] + {"is_restaurant": 1, "is_electronics": "yes"}, + {"is_restaurant": 1, "is_retail": 1, "is_electronics": "no"}, + {"is_restaurant": 0, "is_retail": 1, "is_electronics": "yes"}, + {"is_restaurant": 0}, + {"is_restaurant": 1, "is_retail": 1, "is_electronics": "no"}, + {}, + None, + ] with self.assertRaises(TypeError): - packed_sf['user_id'].unpack() + packed_sf["user_id"].unpack() with self.assertRaises(TypeError): - packed_sf['category'].unpack(1) + packed_sf["category"].unpack(1) with self.assertRaises(TypeError): - packed_sf['category'].unpack(value_types = [int]) + packed_sf["category"].unpack(value_types=[int]) # unpack only one column expected_sf = SFrame() expected_sf["is_retail"] = sf["is_retail"] - unpacked_sf = packed_sf['category'].unpack(limit=["is_retail"], column_types=[int], column_name_prefix=None) + unpacked_sf = packed_sf["category"].unpack( + limit=["is_retail"], column_types=[int], column_name_prefix=None + ) assert_frame_equal(unpacked_sf.to_dataframe(), expected_sf.to_dataframe()) - # unpack all - unpacked_sf = packed_sf['category'].unpack(column_name_prefix=None, column_types=[int, int, str], limit=["is_restaurant", "is_retail", "is_electronics"]) - assert_frame_equal(unpacked_sf.to_dataframe(), sf[["is_restaurant", "is_retail", "is_electronics"]].to_dataframe()) + unpacked_sf = packed_sf["category"].unpack( + column_name_prefix=None, + column_types=[int, int, str], + limit=["is_restaurant", "is_retail", "is_electronics"], + ) + assert_frame_equal( + unpacked_sf.to_dataframe(), + sf[["is_restaurant", "is_retail", "is_electronics"]].to_dataframe(), + ) # auto infer types, the column order may be different, so use order here before comparison unpacked_sf = packed_sf["category"].unpack() - unpacked_sf.rename({ - "X.is_restaurant": "is_restaurant", - "X.is_retail": "is_retail", - "X.is_electronics": "is_electronics" - }, inplace=True) - assert_frame_equal(unpacked_sf.to_dataframe().sort_index(axis=1), sf[["is_restaurant", "is_retail", "is_electronics"]].to_dataframe().sort_index(axis=1)) - - unpacked_sf = packed_sf["category"].unpack(na_value = 0, column_name_prefix="new") + unpacked_sf.rename( + { + "X.is_restaurant": "is_restaurant", + "X.is_retail": "is_retail", + "X.is_electronics": "is_electronics", + }, + inplace=True, + ) + assert_frame_equal( + unpacked_sf.to_dataframe().sort_index(axis=1), + sf[["is_restaurant", "is_retail", "is_electronics"]] + .to_dataframe() + .sort_index(axis=1), + ) + + unpacked_sf = packed_sf["category"].unpack(na_value=0, column_name_prefix="new") expected = SFrame() - expected["new.is_restaurant"] = [1, 1,None,None, 1, None, None] - expected["new.is_retail"] = [None,1,1,None,1, None, None] - expected["new.is_electronics"] = ["yes", "no","yes",None,"no", None, None] - assert_frame_equal(unpacked_sf.to_dataframe().sort_index(axis=1), expected.to_dataframe().sort_index(axis=1)) + expected["new.is_restaurant"] = [1, 1, None, None, 1, None, None] + expected["new.is_retail"] = [None, 1, 1, None, 1, None, None] + expected["new.is_electronics"] = ["yes", "no", "yes", None, "no", None, None] + assert_frame_equal( + unpacked_sf.to_dataframe().sort_index(axis=1), + expected.to_dataframe().sort_index(axis=1), + ) # unpack a dictionary key integer as key - sa = SArray([ - {1: 'a'}, - {2: 'b'} - ]) + sa = SArray([{1: "a"}, {2: "b"}]) result = sa.unpack() - expected = SFrame({'X.1':['a', None], 'X.2':[None, 'b']}) + expected = SFrame({"X.1": ["a", None], "X.2": [None, "b"]}) assert_frame_equal(result.to_dataframe(), expected.to_dataframe()) result = sa.unpack(limit=[2]) - expected = SFrame({'X.2':[None, 'b']}) + expected = SFrame({"X.2": [None, "b"]}) assert_frame_equal(result.to_dataframe(), expected.to_dataframe()) result = sa.unpack(limit=[2], column_name_prefix="expanded") - expected = SFrame({'expanded.2':[None, 'b']}) + expected = SFrame({"expanded.2": [None, "b"]}) assert_frame_equal(result.to_dataframe(), expected.to_dataframe()) - sa = SArray([{i:i} for i in range(500)]) + sa = SArray([{i: i} for i in range(500)]) unpacked_sa = sa.unpack() self.assertEqual(len(unpacked_sa), len(sa)) i = 0 for v in unpacked_sa: for j in range(500): - val = v['X.' + str(j)] - if (j == i): + val = v["X." + str(j)] + if j == i: self.assertEqual(val, i) else: self.assertEqual(val, None) i = i + 1 # if types don't agree, convert to string automatically - sa = SArray([{'a':1},{'a': 'a_3'}]) + sa = SArray([{"a": 1}, {"a": "a_3"}]) sf = sa.unpack() self.assertEqual(sf.column_types(), [str]) - sa = SArray([{'a':None}, {'a': 1}]) + sa = SArray([{"a": None}, {"a": 1}]) sf = sa.unpack() self.assertEqual(sf.column_types(), [int]) - sa = SArray([{'a':1}, {'a': None}]) + sa = SArray([{"a": 1}, {"a": None}]) sf = sa.unpack() self.assertEqual(sf.column_types(), [int]) # type inference is already at server side even if limit is given - sa = SArray([{'c'+str(i): i if i % 2 == 0 else 'v' + str(i)} for i in range(1000)]) - unpacked = sa.unpack(limit=['c'+str(i) for i in range(10)], column_name_prefix="") + sa = SArray( + [{"c" + str(i): i if i % 2 == 0 else "v" + str(i)} for i in range(1000)] + ) + unpacked = sa.unpack( + limit=["c" + str(i) for i in range(10)], column_name_prefix="" + ) for i in range(10): v = unpacked[i] for j in range(10): - if (j != i): - self.assertEqual(v['c'+str(j)], None) + if j != i: + self.assertEqual(v["c" + str(j)], None) elif j % 2 == 0: - self.assertEqual(v['c'+str(j)], j) + self.assertEqual(v["c" + str(j)], j) else: - self.assertEqual(v['c'+str(j)], 'v' + str(j)) - - + self.assertEqual(v["c" + str(j)], "v" + str(j)) def test_unpack_sframe(self): sf = SFrame() - sf['user_id'] = range(7) + sf["user_id"] = range(7) sf["category"] = [ - {"is_restaurant": 1, "is_electronics": "yes"}, - {"is_restaurant": 1, "is_retail": 1, "is_electronics": "no"}, - {"is_restaurant": 0, "is_retail": 1, "is_electronics": "yes"}, - {"is_restaurant": 0 }, - {"is_restaurant": 1, "is_retail": 1, "is_electronics": "no"}, - { }, - None] - sf['list'] = [ - None, - range(1), - range(2), - range(3), - range(1), - range(2), - range(3), + {"is_restaurant": 1, "is_electronics": "yes"}, + {"is_restaurant": 1, "is_retail": 1, "is_electronics": "no"}, + {"is_restaurant": 0, "is_retail": 1, "is_electronics": "yes"}, + {"is_restaurant": 0}, + {"is_restaurant": 1, "is_retail": 1, "is_electronics": "no"}, + {}, + None, + ] + sf["list"] = [ + None, + range(1), + range(2), + range(3), + range(1), + range(2), + range(3), ] with self.assertRaises(TypeError): - sf.unpack('user_id') + sf.unpack("user_id") expected = SFrame() - expected['user_id'] = sf['user_id'] - expected['list'] = sf['list'] - expected["is_restaurant"] = [1, 1,0,0, 1, None, None] - expected["is_retail"] = [None,1,1,None,1, None, None] - expected["is_electronics"] = ["yes", "no","yes",None,"no", None, None] - - result = sf.unpack('category') - result.rename({ - 'category.is_restaurant': 'is_restaurant', - 'category.is_retail': 'is_retail', - 'category.is_electronics': 'is_electronics' - }, inplace=True) - assert_frame_equal(expected.to_dataframe().sort_index(axis=1), result.to_dataframe().sort_index(axis=1)) - - result = sf.unpack(column_name='category', column_name_prefix="") - assert_frame_equal(expected.to_dataframe().sort_index(axis=1), result.to_dataframe().sort_index(axis=1)) - - result = sf.unpack(column_name='category', column_name_prefix="abc") - result.rename({ - 'abc.is_restaurant': 'is_restaurant', - 'abc.is_retail': 'is_retail', - 'abc.is_electronics': 'is_electronics' - }, inplace=True) - assert_frame_equal(expected.to_dataframe().sort_index(axis=1), result.to_dataframe().sort_index(axis=1)) - - result = sf.unpack(column_name='category', column_name_prefix="", column_types=[str], limit=['is_restaurant']) - new_expected = expected[['user_id', 'list', 'is_restaurant']] - new_expected['is_restaurant'] = new_expected['is_restaurant'].astype(str) - assert_frame_equal(new_expected.to_dataframe().sort_index(axis=1), result.to_dataframe().sort_index(axis=1)) - - result = sf.unpack(column_name='category', column_name_prefix="", na_value = None) - assert_frame_equal(expected.to_dataframe().sort_index(axis=1), result.to_dataframe().sort_index(axis=1)) - - result = sf.unpack(column_name='list') + expected["user_id"] = sf["user_id"] + expected["list"] = sf["list"] + expected["is_restaurant"] = [1, 1, 0, 0, 1, None, None] + expected["is_retail"] = [None, 1, 1, None, 1, None, None] + expected["is_electronics"] = ["yes", "no", "yes", None, "no", None, None] + + result = sf.unpack("category") + result.rename( + { + "category.is_restaurant": "is_restaurant", + "category.is_retail": "is_retail", + "category.is_electronics": "is_electronics", + }, + inplace=True, + ) + assert_frame_equal( + expected.to_dataframe().sort_index(axis=1), + result.to_dataframe().sort_index(axis=1), + ) + + result = sf.unpack(column_name="category", column_name_prefix="") + assert_frame_equal( + expected.to_dataframe().sort_index(axis=1), + result.to_dataframe().sort_index(axis=1), + ) + + result = sf.unpack(column_name="category", column_name_prefix="abc") + result.rename( + { + "abc.is_restaurant": "is_restaurant", + "abc.is_retail": "is_retail", + "abc.is_electronics": "is_electronics", + }, + inplace=True, + ) + assert_frame_equal( + expected.to_dataframe().sort_index(axis=1), + result.to_dataframe().sort_index(axis=1), + ) + + result = sf.unpack( + column_name="category", + column_name_prefix="", + column_types=[str], + limit=["is_restaurant"], + ) + new_expected = expected[["user_id", "list", "is_restaurant"]] + new_expected["is_restaurant"] = new_expected["is_restaurant"].astype(str) + assert_frame_equal( + new_expected.to_dataframe().sort_index(axis=1), + result.to_dataframe().sort_index(axis=1), + ) + + result = sf.unpack(column_name="category", column_name_prefix="", na_value=None) + assert_frame_equal( + expected.to_dataframe().sort_index(axis=1), + result.to_dataframe().sort_index(axis=1), + ) + + result = sf.unpack(column_name="list") expected = SFrame() - expected['user_id'] = sf['user_id'] - expected['list.0'] = [None,0,0,0, 0,0,0] - expected['list.1'] = [None,None,1,1, None,1,1] - expected['list.2'] = [None,None,None,2, None, None,2] - expected['category'] = sf['category'] - assert_frame_equal(expected.to_dataframe().sort_index(axis=1), result.to_dataframe().sort_index(axis=1)) - - result = sf.unpack(column_name='list', na_value= 2) + expected["user_id"] = sf["user_id"] + expected["list.0"] = [None, 0, 0, 0, 0, 0, 0] + expected["list.1"] = [None, None, 1, 1, None, 1, 1] + expected["list.2"] = [None, None, None, 2, None, None, 2] + expected["category"] = sf["category"] + assert_frame_equal( + expected.to_dataframe().sort_index(axis=1), + result.to_dataframe().sort_index(axis=1), + ) + + result = sf.unpack(column_name="list", na_value=2) expected = SFrame() - expected['user_id'] = sf['user_id'] - expected['list.0'] = [None,0,0,0, 0,0,0] - expected['list.1'] = [None,None,1,1, None,1,1] - expected['list.2'] = [None,None,None,None, None, None,None] - expected['category'] = sf['category'] - assert_frame_equal(expected.to_dataframe().sort_index(axis=1), result.to_dataframe().sort_index(axis=1)) + expected["user_id"] = sf["user_id"] + expected["list.0"] = [None, 0, 0, 0, 0, 0, 0] + expected["list.1"] = [None, None, 1, 1, None, 1, 1] + expected["list.2"] = [None, None, None, None, None, None, None] + expected["category"] = sf["category"] + assert_frame_equal( + expected.to_dataframe().sort_index(axis=1), + result.to_dataframe().sort_index(axis=1), + ) # auto resolving conflicting names sf = SFrame() - sf['a'] = range(100) - sf['b'] = [range(5) for i in range(100)] - sf['b.0'] = range(100) - sf['b.0.1'] = range(100) - result = sf.unpack('b') - self.assertEqual(result.column_names(), ['a', 'b.0', 'b.0.1', 'b.0.1.1', 'b.1.1.1', 'b.2.1.1', 'b.3.1.1', 'b.4.1.1']) + sf["a"] = range(100) + sf["b"] = [range(5) for i in range(100)] + sf["b.0"] = range(100) + sf["b.0.1"] = range(100) + result = sf.unpack("b") + self.assertEqual( + result.column_names(), + [ + "a", + "b.0", + "b.0.1", + "b.0.1.1", + "b.1.1.1", + "b.2.1.1", + "b.3.1.1", + "b.4.1.1", + ], + ) sf = SFrame() - sf['a'] = range(100) - sf['b'] = [{'str1': i, 'str2':i + 1} for i in range(100)] - sf['b.str1'] = range(100) - result = sf.unpack('b') + sf["a"] = range(100) + sf["b"] = [{"str1": i, "str2": i + 1} for i in range(100)] + sf["b.str1"] = range(100) + result = sf.unpack("b") self.assertEqual(len(result.column_names()), 4) def test_stack_dict(self): sf = SFrame() - sf["user_id"] = [1,2,3,4,5] - sf["user_name"] = ['user' + str(i) for i in list(sf['user_id'])] + sf["user_id"] = [1, 2, 3, 4, 5] + sf["user_name"] = ["user" + str(i) for i in list(sf["user_id"])] sf["category"] = [ - {"is_restaurant": 1, }, - {"is_restaurant": 0, "is_retail": 1 }, - { "is_retail": 0 }, - {}, - None] + {"is_restaurant": 1,}, + {"is_restaurant": 0, "is_retail": 1}, + {"is_retail": 0}, + {}, + None, + ] expected_sf = SFrame() - expected_sf["user_id"] = [1,2, 2, 3,4,5] - expected_sf["user_name"] = ['user' + str(i) for i in list(expected_sf['user_id'])] - expected_sf['category'] = ['is_restaurant', 'is_restaurant', 'is_retail', 'is_retail', None, None] - expected_sf['value'] = [1,0,1,0, None, None] - df_expected = expected_sf.to_dataframe().sort_values(['user_id', 'category']).reset_index(drop=True) + expected_sf["user_id"] = [1, 2, 2, 3, 4, 5] + expected_sf["user_name"] = [ + "user" + str(i) for i in list(expected_sf["user_id"]) + ] + expected_sf["category"] = [ + "is_restaurant", + "is_restaurant", + "is_retail", + "is_retail", + None, + None, + ] + expected_sf["value"] = [1, 0, 1, 0, None, None] + df_expected = ( + expected_sf.to_dataframe() + .sort_values(["user_id", "category"]) + .reset_index(drop=True) + ) with self.assertRaises(TypeError): sf.stack() with self.assertRaises(ValueError): - sf.stack('sss') + sf.stack("sss") with self.assertRaises(ValueError): - sf.stack('category', ['user_id', 'value']) + sf.stack("category", ["user_id", "value"]) # normal case - stacked_sf = sf.stack('category', ['category', 'value']) - assert_frame_equal(stacked_sf.to_dataframe().sort_values(["user_id", "category"]).reset_index(drop=True), df_expected) + stacked_sf = sf.stack("category", ["category", "value"]) + assert_frame_equal( + stacked_sf.to_dataframe() + .sort_values(["user_id", "category"]) + .reset_index(drop=True), + df_expected, + ) # set column types - stacked_sf = sf.stack('category') + stacked_sf = sf.stack("category") self.assertTrue(stacked_sf.column_types()[2] == str) self.assertTrue(stacked_sf.column_types()[3] == int) # auto generate column names - stacked_sf = sf.stack('category') + stacked_sf = sf.stack("category") new_column_names = stacked_sf.column_names() self.assertTrue(len(new_column_names) == 4) - expected_sf.rename({'category':new_column_names[2], 'value':new_column_names[3]}, inplace=True) - df_expected = expected_sf.to_dataframe().sort_values(['user_id', new_column_names[2]]).reset_index(drop=True) - assert_frame_equal(stacked_sf.to_dataframe().sort_values(["user_id", new_column_names[2]]).reset_index(drop=True), df_expected) - - #dropna + expected_sf.rename( + {"category": new_column_names[2], "value": new_column_names[3]}, + inplace=True, + ) + df_expected = ( + expected_sf.to_dataframe() + .sort_values(["user_id", new_column_names[2]]) + .reset_index(drop=True) + ) + assert_frame_equal( + stacked_sf.to_dataframe() + .sort_values(["user_id", new_column_names[2]]) + .reset_index(drop=True), + df_expected, + ) + + # dropna expected_sf = SFrame() - expected_sf["user_id"] = [1,2, 2, 3, 4, 5] - expected_sf["user_name"] = ['user' + str(i) for i in list(expected_sf['user_id'])] - expected_sf['category'] = ['is_restaurant', 'is_restaurant', 'is_retail', 'is_retail', None, None] - expected_sf['value'] = [1,0,1,0, None, None] - df_expected = expected_sf.to_dataframe().sort_values(['user_id', 'category']).reset_index(drop=True) - - stacked_sf = sf.stack('category', ['category','value'], drop_na = False) - assert_frame_equal(stacked_sf.to_dataframe().sort_values(["user_id", "category"]).reset_index(drop=True), df_expected) + expected_sf["user_id"] = [1, 2, 2, 3, 4, 5] + expected_sf["user_name"] = [ + "user" + str(i) for i in list(expected_sf["user_id"]) + ] + expected_sf["category"] = [ + "is_restaurant", + "is_restaurant", + "is_retail", + "is_retail", + None, + None, + ] + expected_sf["value"] = [1, 0, 1, 0, None, None] + df_expected = ( + expected_sf.to_dataframe() + .sort_values(["user_id", "category"]) + .reset_index(drop=True) + ) + + stacked_sf = sf.stack("category", ["category", "value"], drop_na=False) + assert_frame_equal( + stacked_sf.to_dataframe() + .sort_values(["user_id", "category"]) + .reset_index(drop=True), + df_expected, + ) sf = SFrame() - sf['a'] = SArray(([{}] * 100) + [{'a':1}]) + sf["a"] = SArray(([{}] * 100) + [{"a": 1}]) # its a dict need 2 types with self.assertRaises(ValueError): - sf.stack('a',['key', 'value'], new_column_type=[str]) + sf.stack("a", ["key", "value"], new_column_type=[str]) with self.assertRaises(ValueError): - sf.stack('a',['key', 'value'], new_column_type=str) + sf.stack("a", ["key", "value"], new_column_type=str) - sf.stack('a',['key', 'value'], new_column_type=[str, int]) + sf.stack("a", ["key", "value"], new_column_type=[str, int]) expected_sf = SFrame() - expected_sf['key'] = SArray([None] * 100 + ["a"]) - expected_sf['value'] = SArray([None] * 100 + [1]) + expected_sf["key"] = SArray([None] * 100 + ["a"]) + expected_sf["value"] = SArray([None] * 100 + [1]) def test_stack_list(self): sf = SFrame() - sf["a"] = [1,2,3,4,5] - sf["b"] = [['a', 'b'], ['c'], ['d'],['e', None], None] + sf["a"] = [1, 2, 3, 4, 5] + sf["b"] = [["a", "b"], ["c"], ["d"], ["e", None], None] expected_result = SFrame() - expected_result['a'] = [1,1,2,3,4,4,5] - expected_result['X1'] = ['a','b','c','d','e',None, None] + expected_result["a"] = [1, 1, 2, 3, 4, 4, 5] + expected_result["X1"] = ["a", "b", "c", "d", "e", None, None] with self.assertRaises(TypeError): sf.stack() with self.assertRaises(ValueError): - sf.stack('sss') + sf.stack("sss") with self.assertRaises(TypeError): - sf.stack('a') + sf.stack("a") with self.assertRaises(TypeError): - sf.stack('b', ["something"]) + sf.stack("b", ["something"]) - result = sf.stack("b", drop_na = False) + result = sf.stack("b", drop_na=False) stacked_column_name = result.column_names()[1] - expected_result.rename({'X1':stacked_column_name}, inplace=True) + expected_result.rename({"X1": stacked_column_name}, inplace=True) assert_frame_equal(result.to_dataframe(), expected_result.to_dataframe()) # default drop_na=False result = sf.stack("b") assert_frame_equal(result.to_dataframe(), expected_result.to_dataframe()) - result = sf.stack("b", new_column_name = "b", drop_na = False) - expected_result.rename({stacked_column_name: 'b'}, inplace=True) + result = sf.stack("b", new_column_name="b", drop_na=False) + expected_result.rename({stacked_column_name: "b"}, inplace=True) assert_frame_equal(result.to_dataframe(), expected_result.to_dataframe()) - result = sf.stack("b", new_column_name = "b", drop_na = False) + result = sf.stack("b", new_column_name="b", drop_na=False) assert_frame_equal(result.to_dataframe(), expected_result.to_dataframe()) # drop_na=True - result = sf.stack("b", drop_na = True) + result = sf.stack("b", drop_na=True) expected_result = SFrame() - expected_result['a'] = [1,1,2,3,4,4] - expected_result[result.column_names()[1]] = ['a','b','c','d','e',None] + expected_result["a"] = [1, 1, 2, 3, 4, 4] + expected_result[result.column_names()[1]] = ["a", "b", "c", "d", "e", None] assert_frame_equal(result.to_dataframe(), expected_result.to_dataframe()) - sf = SFrame() n = 1000000 - sf['a'] = range(1,n) - sf['b'] = [[str(i), str(i+1)] for i in range(1,n)] - result = sf.stack('b') + sf["a"] = range(1, n) + sf["b"] = [[str(i), str(i + 1)] for i in range(1, n)] + result = sf.stack("b") self.assertTrue(len(result), n * 2) - sf = SFrame() - sf['a'] = SArray(([[]] * 100) + [['a','b']]) + sf["a"] = SArray(([[]] * 100) + [["a", "b"]]) # its a dict need 2 types with self.assertRaises(ValueError): - sf.stack('a', 'a', new_column_type=[str, int]) + sf.stack("a", "a", new_column_type=[str, int]) - sf.stack('a', 'a', new_column_type=str) + sf.stack("a", "a", new_column_type=str) expected_sf = SFrame() - expected_sf['a'] = SArray([None] * 100 + ["a", "b"]) + expected_sf["a"] = SArray([None] * 100 + ["a", "b"]) def test_stack_vector(self): sf = SFrame() - sf["a"] = [1,2,3,4,5] - sf["b"] = [[1],[1,2],[1,2,3],[1,2,3,4],None] + sf["a"] = [1, 2, 3, 4, 5] + sf["b"] = [[1], [1, 2], [1, 2, 3], [1, 2, 3, 4], None] expected_result = SFrame() - expected_result['a'] = [1,2,2,3,3,3,4,4,4,4,5] - expected_result['X1'] = [1,1,2,1,2,3,1,2,3,4,None] + expected_result["a"] = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5] + expected_result["X1"] = [1, 1, 2, 1, 2, 3, 1, 2, 3, 4, None] with self.assertRaises(TypeError): sf.stack() with self.assertRaises(ValueError): - sf.stack('sss') + sf.stack("sss") with self.assertRaises(TypeError): - sf.stack('a') + sf.stack("a") with self.assertRaises(TypeError): - sf.stack('b', ["something"]) + sf.stack("b", ["something"]) - result = sf.stack("b", drop_na = False) + result = sf.stack("b", drop_na=False) stacked_column_name = result.column_names()[1] - expected_result.rename({'X1':stacked_column_name}, inplace=True) + expected_result.rename({"X1": stacked_column_name}, inplace=True) assert_frame_equal(result.to_dataframe(), expected_result.to_dataframe()) # default drop_na=False result = sf.stack("b") assert_frame_equal(result.to_dataframe(), expected_result.to_dataframe()) - - result = sf.stack("b", new_column_name = "b", drop_na = False) - expected_result.rename({stacked_column_name: 'b'}, inplace=True) + result = sf.stack("b", new_column_name="b", drop_na=False) + expected_result.rename({stacked_column_name: "b"}, inplace=True) assert_frame_equal(result.to_dataframe(), expected_result.to_dataframe()) - result = sf.stack("b", new_column_name = "b", drop_na = False) + result = sf.stack("b", new_column_name="b", drop_na=False) assert_frame_equal(result.to_dataframe(), expected_result.to_dataframe()) # drop_na=True - result = sf.stack("b", drop_na = True) + result = sf.stack("b", drop_na=True) expected_result = SFrame() - expected_result['a'] = [1,2,2,3,3,3,4,4,4,4] - expected_result[result.column_names()[1]] = SArray([1,1,2,1,2,3,1,2,3,4], float) + expected_result["a"] = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4] + expected_result[result.column_names()[1]] = SArray( + [1, 1, 2, 1, 2, 3, 1, 2, 3, 4], float + ) assert_frame_equal(result.to_dataframe(), expected_result.to_dataframe()) import array + sf = SFrame() - sf['a'] = SArray(([array.array('d')] * 100) + [array.array('d',[1.0,2.0])]) + sf["a"] = SArray(([array.array("d")] * 100) + [array.array("d", [1.0, 2.0])]) # its a dict need 2 types with self.assertRaises(ValueError): - sf.stack('a', 'a', new_column_type=[str, int]) + sf.stack("a", "a", new_column_type=[str, int]) - sf.stack('a', 'a', new_column_type=int) + sf.stack("a", "a", new_column_type=int) expected_sf = SFrame() - expected_sf['a'] = SArray([None] * 100 + [1, 2]) + expected_sf["a"] = SArray([None] * 100 + [1, 2]) def test_unstack_dict(self): sf = SFrame() - sf["user_id"] = [1,2,3,4] - sf["user_name"] = ['user' + str(i) for i in list(sf['user_id'])] + sf["user_id"] = [1, 2, 3, 4] + sf["user_name"] = ["user" + str(i) for i in list(sf["user_id"])] sf["categories"] = [ - {"is_restaurant": 1, }, - {"is_restaurant": 0, "is_retail": 1 }, - { "is_retail": 0 }, - None] + {"is_restaurant": 1,}, + {"is_restaurant": 0, "is_retail": 1}, + {"is_retail": 0}, + None, + ] - stacked_sf = sf.stack('categories', ['category', 'value'], drop_na=False) + stacked_sf = sf.stack("categories", ["category", "value"], drop_na=False) # normal unstack - unstacked_sf = stacked_sf.unstack(column_names=['category', 'value'], new_column_name = 'categories') + unstacked_sf = stacked_sf.unstack( + column_names=["category", "value"], new_column_name="categories" + ) # these frames are *almost* equal except user4 will be {} instead of None - assert_frame_equal(sf.fillna('categories',{}).to_dataframe(), unstacked_sf.to_dataframe().sort_values("user_id").reset_index(drop=True)) + assert_frame_equal( + sf.fillna("categories", {}).to_dataframe(), + unstacked_sf.to_dataframe().sort_values("user_id").reset_index(drop=True), + ) # missing new column name - unstacked_sf = stacked_sf.unstack(['category', 'value']) + unstacked_sf = stacked_sf.unstack(["category", "value"]) self.assertEqual(len(unstacked_sf.column_names()), 3) - unstacked_sf.rename({unstacked_sf.column_names()[2] : 'categories'}, inplace=True) - assert_frame_equal(sf.fillna('categories',{}).to_dataframe(), unstacked_sf.to_dataframe().sort_values("user_id").reset_index(drop=True)) + unstacked_sf.rename( + {unstacked_sf.column_names()[2]: "categories"}, inplace=True + ) + assert_frame_equal( + sf.fillna("categories", {}).to_dataframe(), + unstacked_sf.to_dataframe().sort_values("user_id").reset_index(drop=True), + ) # missing column names with self.assertRaises(KeyError): - stacked_sf.unstack(['category','value1']) + stacked_sf.unstack(["category", "value1"]) # wrong input with self.assertRaises(TypeError): - stacked_sf.unstack(['category']) + stacked_sf.unstack(["category"]) # duplicate new column name with self.assertRaises(RuntimeError): - unstacked_sf = stacked_sf.unstack(['category', 'value'], 'user_name') + unstacked_sf = stacked_sf.unstack(["category", "value"], "user_name") def test_unstack_list(self): sf = SFrame() - sf['a'] = [1,2,3,4] - sf['b'] = [range(10), range(20), range(30), range(50)] - stacked_sf = sf.stack('b', new_column_name = 'new_b') - unstacked_sf = stacked_sf.unstack('new_b', new_column_name = 'b') - self.__assert_concat_result_equal(sf.sort('a'), unstacked_sf.sort('a'), ['b']) + sf["a"] = [1, 2, 3, 4] + sf["b"] = [range(10), range(20), range(30), range(50)] + stacked_sf = sf.stack("b", new_column_name="new_b") + unstacked_sf = stacked_sf.unstack("new_b", new_column_name="b") + self.__assert_concat_result_equal(sf.sort("a"), unstacked_sf.sort("a"), ["b"]) - unstacked_sf = stacked_sf.unstack('new_b') - unstacked_sf.rename({unstacked_sf.column_names()[1]: 'b'}, inplace=True) - self.__assert_concat_result_equal(sf.sort('a'), unstacked_sf.sort('a'), ['b']) + unstacked_sf = stacked_sf.unstack("new_b") + unstacked_sf.rename({unstacked_sf.column_names()[1]: "b"}, inplace=True) + self.__assert_concat_result_equal(sf.sort("a"), unstacked_sf.sort("a"), ["b"]) - unstacked_sf = stacked_sf.unstack('new_b', new_column_name='b') - unstacked_sf.rename({unstacked_sf.column_names()[1]: 'b'}, inplace=True) - self.__assert_concat_result_equal(sf.sort('a'), unstacked_sf.sort('a'), ['b']) + unstacked_sf = stacked_sf.unstack("new_b", new_column_name="b") + unstacked_sf.rename({unstacked_sf.column_names()[1]: "b"}, inplace=True) + self.__assert_concat_result_equal(sf.sort("a"), unstacked_sf.sort("a"), ["b"]) with self.assertRaises(RuntimeError): - stacked_sf.unstack('new_b', new_column_name='a') + stacked_sf.unstack("new_b", new_column_name="a") with self.assertRaises(TypeError): - stacked_sf.unstack(['new_b']) + stacked_sf.unstack(["new_b"]) with self.assertRaises(KeyError): - stacked_sf.unstack('non exist') + stacked_sf.unstack("non exist") def test_content_identifier(self): - sf = SFrame({"a":[1,2,3,4],"b":["1","2","3","4"]}) - a1 = sf['a'].__get_content_identifier__() - a2 = sf['a'].__get_content_identifier__() + sf = SFrame({"a": [1, 2, 3, 4], "b": ["1", "2", "3", "4"]}) + a1 = sf["a"].__get_content_identifier__() + a2 = sf["a"].__get_content_identifier__() self.assertEqual(a1, a2) def test_random_access(self): - t1 = list(range(0,100000)) + t1 = list(range(0, 100000)) t2 = [str(i) for i in t1] - t = [{'t1':t1[i], 't2':t2[i]} for i in range(len(t1))] - s = SFrame({'t1':t1,'t2':t2}) + t = [{"t1": t1[i], "t2": t2[i]} for i in range(len(t1))] + s = SFrame({"t1": t1, "t2": t2}) # simple slices self.__test_equal(s[1:10000], pd.DataFrame(t[1:10000])) self.__test_equal(s[0:10000:3], pd.DataFrame(t[0:10000:3])) @@ -3015,13 +3622,13 @@ def test_random_access(self): self.__test_equal(s[-100:-10:2], pd.DataFrame(t[-100:-10:2])) # single element reads self.assertEqual(s[511], t[511]) - self.assertEqual(s[1912],t[1912]) + self.assertEqual(s[1912], t[1912]) self.assertEqual(s[-1], t[-1]) - self.assertEqual(s[-10],t[-10]) + self.assertEqual(s[-10], t[-10]) # edge case oddities self.__test_equal(s[10:100:100], pd.DataFrame(t[10:100:100])) - self.__test_equal(s[-100:len(s):10], pd.DataFrame(t[-100:len(t):10])) + self.__test_equal(s[-100 : len(s) : 10], pd.DataFrame(t[-100 : len(t) : 10])) self.assertEqual(len(s[-1:-2]), 0) self.assertEqual(len(s[-1:-1000:2]), 0) with self.assertRaises(IndexError): @@ -3030,16 +3637,16 @@ def test_random_access(self): def sort_n_rows(self, nrows=100): nrows += 1 sf = SFrame() - sf['a'] = range(1, nrows) - sf['b'] = [float(i) for i in range(1,nrows)] - sf['c'] = [str(i) for i in range(1,nrows)] - sf['d'] = [[i, i+1] for i in range(1,nrows)] + sf["a"] = range(1, nrows) + sf["b"] = [float(i) for i in range(1, nrows)] + sf["c"] = [str(i) for i in range(1, nrows)] + sf["d"] = [[i, i + 1] for i in range(1, nrows)] reversed_sf = SFrame() - reversed_sf['a'] = range(nrows-1, 0, -1) - reversed_sf['b'] = [float(i) for i in range(nrows-1, 0, -1)] - reversed_sf['c'] = [str(i) for i in range(nrows-1, 0, -1)] - reversed_sf['d'] = [[i, i+1] for i in range(nrows-1, 0, -1)] + reversed_sf["a"] = range(nrows - 1, 0, -1) + reversed_sf["b"] = [float(i) for i in range(nrows - 1, 0, -1)] + reversed_sf["c"] = [str(i) for i in range(nrows - 1, 0, -1)] + reversed_sf["d"] = [[i, i + 1] for i in range(nrows - 1, 0, -1)] with self.assertRaises(TypeError): sf.sort() @@ -3054,45 +3661,49 @@ def sort_n_rows(self, nrows=100): sf.sort("nonexist") with self.assertRaises(TypeError): - sf.sort({'a':True}) + sf.sort({"a": True}) - result = sf.sort('a') + result = sf.sort("a") assert_frame_equal(sf.to_dataframe(), result.to_dataframe()) # try a lazy input - result = sf[sf['a'] > 10].sort('a') - assert_frame_equal(sf[sf['a'] > 10].to_dataframe(), result.to_dataframe()) + result = sf[sf["a"] > 10].sort("a") + assert_frame_equal(sf[sf["a"] > 10].to_dataframe(), result.to_dataframe()) - result = sf.sort('a', ascending = False) + result = sf.sort("a", ascending=False) assert_frame_equal(reversed_sf.to_dataframe(), result.to_dataframe()) # lazy reversed - result = sf[sf['a'] > 10].sort('a', ascending = False) - assert_frame_equal(reversed_sf[reversed_sf['a'] > 10].to_dataframe(), result.to_dataframe()) + result = sf[sf["a"] > 10].sort("a", ascending=False) + assert_frame_equal( + reversed_sf[reversed_sf["a"] > 10].to_dataframe(), result.to_dataframe() + ) # lazy reversed - result = sf[sf['a'] > 10].sort('a', ascending = False) - assert_frame_equal(reversed_sf[reversed_sf['a'] > 10].to_dataframe(), result.to_dataframe()) + result = sf[sf["a"] > 10].sort("a", ascending=False) + assert_frame_equal( + reversed_sf[reversed_sf["a"] > 10].to_dataframe(), result.to_dataframe() + ) # sort two columns - result = sf.sort(['a', 'b']) + result = sf.sort(["a", "b"]) assert_frame_equal(sf.to_dataframe(), result.to_dataframe()) - result = sf.sort(['a', 'c'], ascending = False) + result = sf.sort(["a", "c"], ascending=False) assert_frame_equal(reversed_sf.to_dataframe(), result.to_dataframe()) - result = sf.sort([('a', True), ('b', False)]) + result = sf.sort([("a", True), ("b", False)]) assert_frame_equal(sf.to_dataframe(), result.to_dataframe()) - result = sf.sort([('a', False), ('b', True)]) + result = sf.sort([("a", False), ("b", True)]) assert_frame_equal(reversed_sf.to_dataframe(), result.to_dataframe()) # empty sort should not throw - sf = SFrame({'x':[]}) - sf.sort('x') + sf = SFrame({"x": []}) + sf.sort("x") def test_sort(self): - #self.sort_n_rows(100) + # self.sort_n_rows(100) for i in range(1, 10): self.sort_n_rows(i) @@ -3102,211 +3713,291 @@ def test_dropna(self): self.assertEqual(len(sf.dropna()), 0) # normal case - self.__test_equal(self.employees_sf.dropna(), self.employees_sf[0:5].to_dataframe()) + self.__test_equal( + self.employees_sf.dropna(), self.employees_sf[0:5].to_dataframe() + ) test_split = self.employees_sf.dropna_split() self.__test_equal(test_split[0], self.employees_sf[0:5].to_dataframe()) self.__test_equal(test_split[1], self.employees_sf[5:6].to_dataframe()) # test recursively removing nan - test_sf = SFrame({'array':SArray([[1,1],[2,np.nan],[3,3],[4,4],[5,5],[6,np.nan],[7,7],[8, np.nan]], np.ndarray), - 'lists':SArray([[1], None, [], [4], [5, 5], [6, np.nan], [7], None], list), - 'dicts':SArray([{1:2},{2:3},{3:4},{},{5:None},{6:7},{7:[7,[7,np.nan]]},None], dict)}) + test_sf = SFrame( + { + "array": SArray( + [ + [1, 1], + [2, np.nan], + [3, 3], + [4, 4], + [5, 5], + [6, np.nan], + [7, 7], + [8, np.nan], + ], + np.ndarray, + ), + "lists": SArray( + [[1], None, [], [4], [5, 5], [6, np.nan], [7], None], list + ), + "dicts": SArray( + [ + {1: 2}, + {2: 3}, + {3: 4}, + {}, + {5: None}, + {6: 7}, + {7: [7, [7, np.nan]]}, + None, + ], + dict, + ), + } + ) # non-recursive dropna - self.__test_equal(test_sf.dropna(how='any'), - test_sf[0:1].append(test_sf[2:7]).to_dataframe()) + self.__test_equal( + test_sf.dropna(how="any"), test_sf[0:1].append(test_sf[2:7]).to_dataframe() + ) test_split = test_sf.dropna_split() - self.__test_equal(test_split[0], test_sf[0:1].append(test_sf[2:7]).to_dataframe()) + self.__test_equal( + test_split[0], test_sf[0:1].append(test_sf[2:7]).to_dataframe() + ) - self.__test_equal(test_sf.dropna(how='all'), test_sf.to_dataframe()); - test_split = test_sf.dropna_split(how='all') + self.__test_equal(test_sf.dropna(how="all"), test_sf.to_dataframe()) + test_split = test_sf.dropna_split(how="all") self.assertEqual(len(test_split[1]), 0) # recursive dropna - self.__test_equal(test_sf.dropna(recursive=True), - pd.DataFrame({'array':[[1,1],[3,3],[4,4]],'lists':[[1],[],[4]],'dicts':[{1:2},{3:4},{}]})) + self.__test_equal( + test_sf.dropna(recursive=True), + pd.DataFrame( + { + "array": [[1, 1], [3, 3], [4, 4]], + "lists": [[1], [], [4]], + "dicts": [{1: 2}, {3: 4}, {}], + } + ), + ) test_split = test_sf.dropna_split(recursive=True) - self.__test_equal(test_split[0], test_sf[0:1].append(test_sf[2:4]).to_dataframe()) + self.__test_equal( + test_split[0], test_sf[0:1].append(test_sf[2:4]).to_dataframe() + ) # nan is not comparable, so we don't check the nan part # self.__test_equal(test_split[1], test_sf[1:2].append(test_sf[4:8]).to_dataframe()) # the 'all' case - self.__test_equal(test_sf.dropna(how='all', recursive=True), test_sf[0:7].to_dataframe()) - test_split = test_sf.dropna_split(how='all', recursive=True) + self.__test_equal( + test_sf.dropna(how="all", recursive=True), test_sf[0:7].to_dataframe() + ) + test_split = test_sf.dropna_split(how="all", recursive=True) self.__test_equal(test_split[0], test_sf[0:7].to_dataframe()) # test 'split' cases - self.__test_equal(test_sf.dropna('array', recursive=True), - test_sf[0:1].append(test_sf[2:5]).append(test_sf[6:7]).to_dataframe()) - test_split = test_sf.dropna_split('array', recursive=True) - self.__test_equal(test_split[0], - test_sf[0:1].append(test_sf[2:5]).append(test_sf[6:7]).to_dataframe()) - - self.__test_equal(test_sf.dropna('lists', recursive=True), - test_sf[0:1].append(test_sf[2:5]).append(test_sf[6:7]).to_dataframe()) - test_split = test_sf.dropna_split('lists', recursive=True) - self.__test_equal(test_split[0], - test_sf[0:1].append(test_sf[2:5]).append(test_sf[6:7]).to_dataframe()) - - self.__test_equal(test_sf.dropna('dicts', recursive=True), - test_sf[0:4].append(test_sf[5:6]).to_dataframe()) - test_split = test_sf.dropna_split('dicts', recursive=True) - self.__test_equal(test_split[0], - test_sf[0:4].append(test_sf[5:6]).to_dataframe()) + self.__test_equal( + test_sf.dropna("array", recursive=True), + test_sf[0:1].append(test_sf[2:5]).append(test_sf[6:7]).to_dataframe(), + ) + test_split = test_sf.dropna_split("array", recursive=True) + self.__test_equal( + test_split[0], + test_sf[0:1].append(test_sf[2:5]).append(test_sf[6:7]).to_dataframe(), + ) + + self.__test_equal( + test_sf.dropna("lists", recursive=True), + test_sf[0:1].append(test_sf[2:5]).append(test_sf[6:7]).to_dataframe(), + ) + test_split = test_sf.dropna_split("lists", recursive=True) + self.__test_equal( + test_split[0], + test_sf[0:1].append(test_sf[2:5]).append(test_sf[6:7]).to_dataframe(), + ) + + self.__test_equal( + test_sf.dropna("dicts", recursive=True), + test_sf[0:4].append(test_sf[5:6]).to_dataframe(), + ) + test_split = test_sf.dropna_split("dicts", recursive=True) + self.__test_equal( + test_split[0], test_sf[0:4].append(test_sf[5:6]).to_dataframe() + ) # create some other test sframe - test_sf = SFrame({'ints':SArray([None,None,3,4,None], int), - 'floats':SArray([np.nan,2.,3.,4.,np.nan],float), - 'strs':SArray(['1',np.nan,'','4',None], str), - 'lists':SArray([[1],None,[],[1,1,1,1],None], list), - 'dicts':SArray([{1:2},{2:3},{},{4:5},None], dict)}) + test_sf = SFrame( + { + "ints": SArray([None, None, 3, 4, None], int), + "floats": SArray([np.nan, 2.0, 3.0, 4.0, np.nan], float), + "strs": SArray(["1", np.nan, "", "4", None], str), + "lists": SArray([[1], None, [], [1, 1, 1, 1], None], list), + "dicts": SArray([{1: 2}, {2: 3}, {}, {4: 5}, None], dict), + } + ) # another normal, but more interesting case - self.__test_equal(test_sf.dropna(), - pd.DataFrame({'ints':[3,4],'floats':[3.,4.],'strs':['','4'],'lists':[[],[1,1,1,1]],'dicts':[{},{4:5}]})) + self.__test_equal( + test_sf.dropna(), + pd.DataFrame( + { + "ints": [3, 4], + "floats": [3.0, 4.0], + "strs": ["", "4"], + "lists": [[], [1, 1, 1, 1]], + "dicts": [{}, {4: 5}], + } + ), + ) test_split = test_sf.dropna_split() self.__test_equal(test_split[0], test_sf[2:4].to_dataframe()) - self.__test_equal(test_split[1], test_sf[0:2].append(test_sf[4:5]).to_dataframe()) + self.__test_equal( + test_split[1], test_sf[0:2].append(test_sf[4:5]).to_dataframe() + ) # the 'all' case - self.__test_equal(test_sf.dropna(how='all'), test_sf[0:4].to_dataframe()) - test_split = test_sf.dropna_split(how='all') + self.__test_equal(test_sf.dropna(how="all"), test_sf[0:4].to_dataframe()) + test_split = test_sf.dropna_split(how="all") self.__test_equal(test_split[0], test_sf[0:4].to_dataframe()) self.__test_equal(test_split[1], test_sf[4:5].to_dataframe()) # select some columns - self.__test_equal(test_sf.dropna(['ints','floats'], how='all'), test_sf[1:4].to_dataframe()) - test_split = test_sf.dropna_split(['ints','floats'], how='all') + self.__test_equal( + test_sf.dropna(["ints", "floats"], how="all"), test_sf[1:4].to_dataframe() + ) + test_split = test_sf.dropna_split(["ints", "floats"], how="all") self.__test_equal(test_split[0], test_sf[1:4].to_dataframe()) - self.__test_equal(test_split[1], test_sf[0:1].append(test_sf[4:5]).to_dataframe()) + self.__test_equal( + test_split[1], test_sf[0:1].append(test_sf[4:5]).to_dataframe() + ) - self.__test_equal(test_sf.dropna('strs'), test_sf[0:4].to_dataframe()) - test_split = test_sf.dropna_split('strs') + self.__test_equal(test_sf.dropna("strs"), test_sf[0:4].to_dataframe()) + test_split = test_sf.dropna_split("strs") self.__test_equal(test_split[0], test_sf[0:4].to_dataframe()) self.__test_equal(test_split[1], test_sf[4:5].to_dataframe()) - self.__test_equal(test_sf.dropna(['strs','dicts']), test_sf[0:4].to_dataframe()) - test_split = test_sf.dropna_split(['strs','dicts']) + self.__test_equal( + test_sf.dropna(["strs", "dicts"]), test_sf[0:4].to_dataframe() + ) + test_split = test_sf.dropna_split(["strs", "dicts"]) self.__test_equal(test_split[0], test_sf[0:4].to_dataframe()) self.__test_equal(test_split[1], test_sf[4:5].to_dataframe()) # bad stuff with self.assertRaises(TypeError): test_sf.dropna(1) - test_sf.dropna([1,2]) - test_sf.dropna('strs', how=1) + test_sf.dropna([1, 2]) + test_sf.dropna("strs", how=1) test_sf.dropna_split(1) - test_sf.dropna_split([1,2]) - test_sf.dropna_split('strs', how=1) + test_sf.dropna_split([1, 2]) + test_sf.dropna_split("strs", how=1) with self.assertRaises(ValueError): - test_sf.dropna('ints', how='blah') - test_sf.dropna_split('ints', how='blah') + test_sf.dropna("ints", how="blah") + test_sf.dropna_split("ints", how="blah") with self.assertRaises(RuntimeError): - test_sf.dropna('dontexist') - test_sf.dropna_split('dontexist') + test_sf.dropna("dontexist") + test_sf.dropna_split("dontexist") def test_add_row_number(self): sf = SFrame(self.__create_test_df(400000)) - sf = sf.add_row_number('id') - self.assertEqual(list(sf['id']), list(range(0,400000))) + sf = sf.add_row_number("id") + self.assertEqual(list(sf["id"]), list(range(0, 400000))) - del sf['id'] + del sf["id"] - sf = sf.add_row_number('id', -20000) - self.assertEqual(list(sf['id']), list(range(-20000,380000))) - del sf['id'] + sf = sf.add_row_number("id", -20000) + self.assertEqual(list(sf["id"]), list(range(-20000, 380000))) + del sf["id"] - sf = sf.add_row_number('id', 40000) - self.assertEqual(list(sf['id']), list(range(40000,440000))) + sf = sf.add_row_number("id", 40000) + self.assertEqual(list(sf["id"]), list(range(40000, 440000))) with self.assertRaises(RuntimeError): - sf.add_row_number('id') + sf.add_row_number("id") with self.assertRaises(TypeError): sf = sf.add_row_number(46) - sf = sf.add_row_number('id2',start='hi') + sf = sf.add_row_number("id2", start="hi") def test_inplace_not_inplace(self): # add row number sf = SFrame(self.__create_test_df(1000)) - sf2 = sf.add_row_number('id', inplace=False) + sf2 = sf.add_row_number("id", inplace=False) self.assertTrue(sf2 is not sf) - self.assertTrue('id' in sf2.column_names()) - self.assertTrue('id' not in sf.column_names()) + self.assertTrue("id" in sf2.column_names()) + self.assertTrue("id" not in sf.column_names()) - sf2 = sf.add_row_number('id', inplace=True) + sf2 = sf.add_row_number("id", inplace=True) self.assertTrue(sf2 is sf) - self.assertTrue('id' in sf2.column_names()) + self.assertTrue("id" in sf2.column_names()) # add column sf = SFrame(self.__create_test_df(1000)) newcol = SArray(range(1000)) - sf2 = sf.add_column(newcol, 'newcol', inplace=False) + sf2 = sf.add_column(newcol, "newcol", inplace=False) self.assertTrue(sf2 is not sf) - self.assertTrue('newcol' in sf2.column_names()) - self.assertTrue('newcol' not in sf.column_names()) - sf2 = sf.add_column(newcol, 'newcol', inplace=True) + self.assertTrue("newcol" in sf2.column_names()) + self.assertTrue("newcol" not in sf.column_names()) + sf2 = sf.add_column(newcol, "newcol", inplace=True) self.assertTrue(sf2 is sf) - self.assertTrue('newcol' in sf2.column_names()) + self.assertTrue("newcol" in sf2.column_names()) # add columns sf = SFrame(self.__create_test_df(1000)) - newcols = SFrame({'newcol':range(1000), 'newcol2':range(1000)}) + newcols = SFrame({"newcol": range(1000), "newcol2": range(1000)}) sf2 = sf.add_columns(newcols, inplace=False) self.assertTrue(sf2 is not sf) - self.assertTrue('newcol' in sf2.column_names()) - self.assertTrue('newcol2' in sf2.column_names()) - self.assertTrue('newcol' not in sf.column_names()) - self.assertTrue('newcol2' not in sf.column_names()) + self.assertTrue("newcol" in sf2.column_names()) + self.assertTrue("newcol2" in sf2.column_names()) + self.assertTrue("newcol" not in sf.column_names()) + self.assertTrue("newcol2" not in sf.column_names()) sf2 = sf.add_columns(newcols, inplace=True) self.assertTrue(sf2 is sf) - self.assertTrue('newcol' in sf2.column_names()) - self.assertTrue('newcol2' in sf2.column_names()) + self.assertTrue("newcol" in sf2.column_names()) + self.assertTrue("newcol2" in sf2.column_names()) # remove column sf = SFrame(self.__create_test_df(1000)) - sf2 = sf.remove_column('int_data', inplace=False) + sf2 = sf.remove_column("int_data", inplace=False) self.assertTrue(sf2 is not sf) - self.assertTrue('int_data' in sf.column_names()) - self.assertTrue('int_data' not in sf2.column_names()) - sf2 = sf.remove_column('int_data', inplace=True) + self.assertTrue("int_data" in sf.column_names()) + self.assertTrue("int_data" not in sf2.column_names()) + sf2 = sf.remove_column("int_data", inplace=True) self.assertTrue(sf2 is sf) - self.assertTrue('int_data' not in sf2.column_names()) + self.assertTrue("int_data" not in sf2.column_names()) # remove columns sf = SFrame(self.__create_test_df(1000)) - sf2 = sf.remove_columns(['int_data', 'float_data'], inplace=False) + sf2 = sf.remove_columns(["int_data", "float_data"], inplace=False) self.assertTrue(sf2 is not sf) - self.assertTrue('int_data' in sf.column_names()) - self.assertTrue('float_data' in sf.column_names()) - self.assertTrue('int_data' not in sf2.column_names()) - self.assertTrue('float_data' not in sf2.column_names()) - sf2 = sf.remove_columns(['int_data', 'float_data'], inplace=True) + self.assertTrue("int_data" in sf.column_names()) + self.assertTrue("float_data" in sf.column_names()) + self.assertTrue("int_data" not in sf2.column_names()) + self.assertTrue("float_data" not in sf2.column_names()) + sf2 = sf.remove_columns(["int_data", "float_data"], inplace=True) self.assertTrue(sf2 is sf) - self.assertTrue('int_data' not in sf2.column_names()) - self.assertTrue('float_data' not in sf2.column_names()) + self.assertTrue("int_data" not in sf2.column_names()) + self.assertTrue("float_data" not in sf2.column_names()) # rename sf = SFrame(self.__create_test_df(1000)) - sf2 = sf.rename({'int_data':'int','float_data':'float'}, inplace=False) + sf2 = sf.rename({"int_data": "int", "float_data": "float"}, inplace=False) self.assertTrue(sf2 is not sf) - self.assertTrue('int_data' in sf.column_names()) - self.assertTrue('float_data' in sf.column_names()) - self.assertTrue('int' not in sf.column_names()) - self.assertTrue('float' not in sf.column_names()) - self.assertTrue('int_data' not in sf2.column_names()) - self.assertTrue('float_data' not in sf2.column_names()) - self.assertTrue('int' in sf2.column_names()) - self.assertTrue('float' in sf2.column_names()) - sf2 = sf.rename({'int_data':'int','float_data':'float'}, inplace=True) + self.assertTrue("int_data" in sf.column_names()) + self.assertTrue("float_data" in sf.column_names()) + self.assertTrue("int" not in sf.column_names()) + self.assertTrue("float" not in sf.column_names()) + self.assertTrue("int_data" not in sf2.column_names()) + self.assertTrue("float_data" not in sf2.column_names()) + self.assertTrue("int" in sf2.column_names()) + self.assertTrue("float" in sf2.column_names()) + sf2 = sf.rename({"int_data": "int", "float_data": "float"}, inplace=True) self.assertTrue(sf2 is sf) - self.assertTrue('int_data' not in sf2.column_names()) - self.assertTrue('float_data' not in sf2.column_names()) - self.assertTrue('int' in sf2.column_names()) - self.assertTrue('float' in sf2.column_names()) + self.assertTrue("int_data" not in sf2.column_names()) + self.assertTrue("float_data" not in sf2.column_names()) + self.assertTrue("int" in sf2.column_names()) + self.assertTrue("float" in sf2.column_names()) # swap sf = SFrame(self.__create_test_df(1000)) @@ -3314,22 +4005,22 @@ def test_inplace_not_inplace(self): # swap int_data and float_data new_cnames = sf.column_names() - int_data_idx = new_cnames.index('int_data') - float_data_idx = new_cnames.index('float_data') - new_cnames[int_data_idx],new_cnames[float_data_idx] = new_cnames[float_data_idx],new_cnames[int_data_idx] - - - - sf2 = sf.swap_columns('int_data', 'float_data', inplace=False) + int_data_idx = new_cnames.index("int_data") + float_data_idx = new_cnames.index("float_data") + new_cnames[int_data_idx], new_cnames[float_data_idx] = ( + new_cnames[float_data_idx], + new_cnames[int_data_idx], + ) + + sf2 = sf.swap_columns("int_data", "float_data", inplace=False) self.assertTrue(sf2 is not sf) self.assertEqual(sf.column_names(), old_cnames) self.assertEqual(sf2.column_names(), new_cnames) - sf2 = sf.swap_columns('int_data', 'float_data', inplace=True) + sf2 = sf.swap_columns("int_data", "float_data", inplace=True) self.assertTrue(sf2 is sf) self.assertEqual(sf2.column_names(), new_cnames) - def test_check_lazy_sframe_size(self): # empty sframe, materialized, has_size sf = SFrame() @@ -3337,49 +4028,48 @@ def test_check_lazy_sframe_size(self): self.assertTrue(sf.__has_size__()) # add one column, not materialized, has_size - sf['a'] = range(1000) + sf["a"] = range(1000) self.assertTrue(sf.__is_materialized__()) self.assertTrue(sf.__has_size__()) # materialize it, materialized, has_size - sf['a'] = range(1000) + sf["a"] = range(1000) sf.materialize() self.assertTrue(sf.__is_materialized__()) self.assertTrue(sf.__has_size__()) # logical filter, not materialized, not has_size - sf = sf[sf['a'] > 5000] + sf = sf[sf["a"] > 5000] self.assertFalse(sf.__is_materialized__()) self.assertFalse(sf.__has_size__()) def test_lazy_logical_filter_sarray(self): - g=SArray(range(10000)) - g2=SArray(range(10000)) - a=g[g>10] - a2=g2[g>10] - z=a[a2>20] + g = SArray(range(10000)) + g2 = SArray(range(10000)) + a = g[g > 10] + a2 = g2[g > 10] + z = a[a2 > 20] self.assertEqual(len(z), 9979) def test_lazy_logical_filter_sframe(self): - g=SFrame({'a':range(10000)}) - g2=SFrame({'a':range(10000)}) - a=g[g['a']>10] - a2=g2[g['a']>10] - z=a[a2['a']>20] + g = SFrame({"a": range(10000)}) + g2 = SFrame({"a": range(10000)}) + a = g[g["a"] > 10] + a2 = g2[g["a"] > 10] + z = a[a2["a"] > 20] self.assertEqual(len(z), 9979) - def test_column_manipulation_of_lazy_sframe(self): - g=SFrame({'a':[1,2,3,4,5],'id':[1,2,3,4,5]}) - g = g[g['id'] > 2] - del g['id'] + g = SFrame({"a": [1, 2, 3, 4, 5], "id": [1, 2, 3, 4, 5]}) + g = g[g["id"] > 2] + del g["id"] # if lazy column deletion is quirky, this will cause an exception - self.assertEqual(list(g[0:2]['a']), [3,4]) - g=SFrame({'a':[1,2,3,4,5],'id':[1,2,3,4,5]}) - g = g[g['id'] > 2] - g.swap_columns('a','id', inplace=True) + self.assertEqual(list(g[0:2]["a"]), [3, 4]) + g = SFrame({"a": [1, 2, 3, 4, 5], "id": [1, 2, 3, 4, 5]}) + g = g[g["id"] > 2] + g.swap_columns("a", "id", inplace=True) # if lazy column swap is quirky, this will cause an exception - self.assertEqual(list(g[0:2]['a']), [3,4]) + self.assertEqual(list(g[0:2]["a"]), [3, 4]) def test_empty_sarray(self): with util.TempDirectory() as f: @@ -3397,10 +4087,10 @@ def test_empty_sframe(self): self.assertEqual(sf2.num_columns(), 0) def test_none_column(self): - sf = SFrame({'a':[1,2,3,4,5]}) - sf['b'] = None - self.assertEqual(sf['b'].dtype, float) - df = pd.DataFrame({'a': [1,2,3,4,5], 'b': [None,None,None,None,None]}) + sf = SFrame({"a": [1, 2, 3, 4, 5]}) + sf["b"] = None + self.assertEqual(sf["b"].dtype, float) + df = pd.DataFrame({"a": [1, 2, 3, 4, 5], "b": [None, None, None, None, None]}) self.__test_equal(sf, df) sa = SArray.from_const(None, 100) @@ -3408,36 +4098,36 @@ def test_none_column(self): self.assertEqual(sa.dtype, float) def test_apply_with_partial(self): - sf = SFrame({'a': [1, 2, 3, 4, 5]}) + sf = SFrame({"a": [1, 2, 3, 4, 5]}) def concat_fn(character, row): - return '%s%d' % (character, row['a']) + return "%s%d" % (character, row["a"]) - my_partial_fn = functools.partial(concat_fn, 'x') + my_partial_fn = functools.partial(concat_fn, "x") sa = sf.apply(my_partial_fn) - self.assertEqual(list(sa), ['x1', 'x2', 'x3', 'x4', 'x5']) + self.assertEqual(list(sa), ["x1", "x2", "x3", "x4", "x5"]) def test_apply_with_functor(self): - sf = SFrame({'a': [1, 2, 3, 4, 5]}) + sf = SFrame({"a": [1, 2, 3, 4, 5]}) class Concatenator(object): def __init__(self, character): self.character = character def __call__(self, row): - return '%s%d' % (self.character, row['a']) + return "%s%d" % (self.character, row["a"]) - concatenator = Concatenator('x') + concatenator = Concatenator("x") sa = sf.apply(concatenator) - self.assertEqual(list(sa), ['x1', 'x2', 'x3', 'x4', 'x5']) + self.assertEqual(list(sa), ["x1", "x2", "x3", "x4", "x5"]) def test_save_sframe(self): - '''save lazily evaluated SFrame should not materialize to target folder - ''' + """save lazily evaluated SFrame should not materialize to target folder + """ data = SFrame() - data['x'] = range(100) - data['x'] = data['x'] > 50 - #lazy and good + data["x"] = range(100) + data["x"] = data["x"] > 50 + # lazy and good tmp_dir = tempfile.mkdtemp() data.save(tmp_dir) shutil.rmtree(tmp_dir) @@ -3445,61 +4135,64 @@ def test_save_sframe(self): def test_empty_argmax_does_not_fail(self): # an empty argmax should not result in a crash - sf = SFrame({'id': [0, 0, 0, 1, 1, 2, 2], - 'value': [3.0, 2.0, 2.3, None, None, 4.3, 1.3], - 'category': ['A', 'B', 'A', 'E', 'A', 'A', 'B']}) - sf.groupby('id', aggregate.ARGMAX('value', 'category')) + sf = SFrame( + { + "id": [0, 0, 0, 1, 1, 2, 2], + "value": [3.0, 2.0, 2.3, None, None, 4.3, 1.3], + "category": ["A", "B", "A", "E", "A", "A", "B"], + } + ) + sf.groupby("id", aggregate.ARGMAX("value", "category")) def test_cache_invalidation(self): # Changes to the SFrame should invalidate the indexing cache. - X = SFrame({'a' : range(4000), - 'b' : range(4000)}) + X = SFrame({"a": range(4000), "b": range(4000)}) for i in range(0, 4000, 20): - self.assertEqual(X[i], {'a' : i, 'b' : i}) + self.assertEqual(X[i], {"a": i, "b": i}) - X['a'] = range(1000, 5000) + X["a"] = range(1000, 5000) for i in range(0, 4000, 20): - self.assertEqual(X[i], {'a' : 1000 + i, 'b' : i}) + self.assertEqual(X[i], {"a": 1000 + i, "b": i}) - del X['b'] + del X["b"] for i in range(0, 4000, 20): - self.assertEqual(X[i], {'a' : 1000 + i}) + self.assertEqual(X[i], {"a": 1000 + i}) - X['b'] = X['a'] + X["b"] = X["a"] for i in range(0, 4000, 20): - self.assertEqual(X[i], {'a' : 1000 + i, 'b' : 1000 + i}) + self.assertEqual(X[i], {"a": 1000 + i, "b": 1000 + i}) - X.rename({'b' : 'c'}, inplace=True) + X.rename({"b": "c"}, inplace=True) for i in range(0, 4000, 20): - self.assertEqual(X[i], {'a' : 1000 + i, 'c' : 1000 + i}) + self.assertEqual(X[i], {"a": 1000 + i, "c": 1000 + i}) def test_to_numpy(self): - X = SFrame({'a' : range(100), - 'b' : range(100)}) + X = SFrame({"a": range(100), "b": range(100)}) import numpy as np import numpy.testing as nptest + Y = np.transpose(np.array([range(100), range(100)])) nptest.assert_array_equal(X.to_numpy(), Y) - X['b'] = X['b'].astype(str) + X["b"] = X["b"].astype(str) s = [str(i) for i in range(100)] Y = np.transpose(np.array([s, s])) nptest.assert_array_equal(X.to_numpy(), Y) - @mock.patch(__name__+'.sqlite3.Cursor', spec=True) - @mock.patch(__name__+'.sqlite3.Connection', spec=True) + @mock.patch(__name__ + ".sqlite3.Cursor", spec=True) + @mock.patch(__name__ + ".sqlite3.Connection", spec=True) def test_from_sql(self, mock_conn, mock_cursor): # Set up mock connection and cursor - conn = mock_conn('example.db') + conn = mock_conn("example.db") curs = mock_cursor() conn.cursor.return_value = curs - sf_type_codes = [44,44,41,22,114,199,43] + sf_type_codes = [44, 44, 41, 22, 114, 199, 43] sf_data = list(zip(*self.all_type_cols)) sf_iter = sf_data.__iter__() @@ -3524,99 +4217,147 @@ def mock_fetchmany(size=1): curs.fetchone.side_effect = mock_fetchone curs.fetchmany.side_effect = mock_fetchmany - curs.description = [['X'+str(i+1),sf_type_codes[i]]+[None for j in range(5)] for i in range(len(sf_data[0]))] + curs.description = [ + ["X" + str(i + 1), sf_type_codes[i]] + [None for j in range(5)] + for i in range(len(sf_data[0])) + ] # bigger than cache, no Nones - sf = SFrame.from_sql(conn, "SELECT * FROM test_table", type_inference_rows=5, dbapi_module=dbapi2_mock()) + sf = SFrame.from_sql( + conn, + "SELECT * FROM test_table", + type_inference_rows=5, + dbapi_module=dbapi2_mock(), + ) _assert_sframe_equal(sf, self.sf_all_types) # smaller than cache, no Nones sf_iter = sf_data.__iter__() - sf = SFrame.from_sql(conn, "SELECT * FROM test_table", type_inference_rows=100, dbapi_module=dbapi2_mock()) + sf = SFrame.from_sql( + conn, + "SELECT * FROM test_table", + type_inference_rows=100, + dbapi_module=dbapi2_mock(), + ) _assert_sframe_equal(sf, self.sf_all_types) none_col = [None for i in range(5)] nones_in_cache = list(zip(*[none_col for i in range(len(sf_data[0]))])) - none_sf = SFrame({'X'+str(i):none_col for i in range(1,len(sf_data[0])+1)}) - test_data = (nones_in_cache+sf_data) + none_sf = SFrame( + {"X" + str(i): none_col for i in range(1, len(sf_data[0]) + 1)} + ) + test_data = nones_in_cache + sf_data sf_iter = test_data.__iter__() # more None rows than cache & types in description - sf = SFrame.from_sql(conn, "SELECT * FROM test_table", type_inference_rows=5, dbapi_module=dbapi2_mock()) + sf = SFrame.from_sql( + conn, + "SELECT * FROM test_table", + type_inference_rows=5, + dbapi_module=dbapi2_mock(), + ) sf_inferred_types = SFrame() - expected_types = [float,float,str,str,str,str,dt.datetime] - for i in zip(self.sf_all_types.column_names(),expected_types): + expected_types = [float, float, str, str, str, str, dt.datetime] + for i in zip(self.sf_all_types.column_names(), expected_types): new_col = SArray(none_col).astype(i[1]) - new_col = new_col.append(self.sf_all_types[i[0]].apply(lambda x: i[1](x) if i[1] is not dt.datetime else x)) + new_col = new_col.append( + self.sf_all_types[i[0]].apply( + lambda x: i[1](x) if i[1] is not dt.datetime else x + ) + ) sf_inferred_types.add_column(new_col, inplace=True) # Don't test the string representation of dict and list; there are # funky consistency issues with the string representations of these - sf.remove_columns(['X5', 'X6'], inplace=True) - sf_inferred_types.remove_columns(['X5', 'X6'], inplace=True) + sf.remove_columns(["X5", "X6"], inplace=True) + sf_inferred_types.remove_columns(["X5", "X6"], inplace=True) _assert_sframe_equal(sf, sf_inferred_types) # more None rows than cache & no type information for i in range(len(curs.description)): curs.description[i][1] = None sf_iter = test_data.__iter__() - sf = SFrame.from_sql(conn, "SELECT * FROM test_table", type_inference_rows=5, dbapi_module=dbapi2_mock()) + sf = SFrame.from_sql( + conn, + "SELECT * FROM test_table", + type_inference_rows=5, + dbapi_module=dbapi2_mock(), + ) sf_inferred_types = SFrame() expected_types = [str for i in range(len(sf_data[0]))] - for i in zip(self.sf_all_types.column_names(),expected_types): + for i in zip(self.sf_all_types.column_names(), expected_types): new_col = SArray(none_col).astype(i[1]) new_col = new_col.append(self.sf_all_types[i[0]].apply(lambda x: str(x))) sf_inferred_types.add_column(new_col, inplace=True) # Don't test the string representation of dict, could be out of order - sf.remove_columns(['X5', 'X6'], inplace=True) - sf_inferred_types.remove_columns(['X5', 'X6'], inplace=True) + sf.remove_columns(["X5", "X6"], inplace=True) + sf_inferred_types.remove_columns(["X5", "X6"], inplace=True) _assert_sframe_equal(sf, sf_inferred_types) ### column_type_hints tests sf_iter = test_data.__iter__() - sf = SFrame.from_sql(conn, "SELECT * FROM test_table", type_inference_rows=5, - dbapi_module=dbapi2_mock(), column_type_hints=str) - sf.remove_columns(['X5', 'X6'], inplace=True) + sf = SFrame.from_sql( + conn, + "SELECT * FROM test_table", + type_inference_rows=5, + dbapi_module=dbapi2_mock(), + column_type_hints=str, + ) + sf.remove_columns(["X5", "X6"], inplace=True) _assert_sframe_equal(sf, sf_inferred_types) # Provide unhintable types sf_iter = test_data.__iter__() - expected_types = [int,float,str,array.array,list,dict,dt.datetime] + expected_types = [int, float, str, array.array, list, dict, dt.datetime] with self.assertRaises(TypeError): - sf = SFrame.from_sql(conn, - "SELECT * FROM test_table", type_inference_rows=5, - dbapi_module=dbapi2_mock(), column_type_hints=expected_types) + sf = SFrame.from_sql( + conn, + "SELECT * FROM test_table", + type_inference_rows=5, + dbapi_module=dbapi2_mock(), + column_type_hints=expected_types, + ) sf_iter = test_data.__iter__() - expected_types = {'X'+str(i+1):expected_types[i] for i in range(3)} - sf = SFrame.from_sql(conn, - "SELECT * FROM test_table", type_inference_rows=10, - dbapi_module=dbapi2_mock(), column_type_hints=expected_types) - _assert_sframe_equal(sf[5:],self.sf_all_types) + expected_types = {"X" + str(i + 1): expected_types[i] for i in range(3)} + sf = SFrame.from_sql( + conn, + "SELECT * FROM test_table", + type_inference_rows=10, + dbapi_module=dbapi2_mock(), + column_type_hints=expected_types, + ) + _assert_sframe_equal(sf[5:], self.sf_all_types) # Test a float forced to a str sf_iter = test_data.__iter__() - expected_types['X2'] = str - self.sf_all_types['X2'] = self.sf_all_types['X2'].apply(lambda x: str(x)) - sf = SFrame.from_sql(conn, - "SELECT * FROM test_table", type_inference_rows=10, - dbapi_module=dbapi2_mock(), column_type_hints=expected_types) - _assert_sframe_equal(sf[5:],self.sf_all_types) + expected_types["X2"] = str + self.sf_all_types["X2"] = self.sf_all_types["X2"].apply(lambda x: str(x)) + sf = SFrame.from_sql( + conn, + "SELECT * FROM test_table", + type_inference_rows=10, + dbapi_module=dbapi2_mock(), + column_type_hints=expected_types, + ) + _assert_sframe_equal(sf[5:], self.sf_all_types) # Type unsupported by sframe - curs.description = [['X1',44],['X2',44]] - sf_iter = [[complex(4.5,3),1], [complex(3.4,5),2]].__iter__() + curs.description = [["X1", 44], ["X2", 44]] + sf_iter = [[complex(4.5, 3), 1], [complex(3.4, 5), 2]].__iter__() sf = SFrame.from_sql(conn, "SELECT * FROM test_table") - expected_sf = SFrame({'X1':["(4.5+3j)","(3.4+5j)"],'X2':[1,2]}) + expected_sf = SFrame({"X1": ["(4.5+3j)", "(3.4+5j)"], "X2": [1, 2]}) _assert_sframe_equal(sf, expected_sf) # bad DBAPI version! bad_version = dbapi2_mock() bad_version.apilevel = "1.0 " with self.assertRaises(NotImplementedError): - sf = SFrame.from_sql(conn, "SELECT * FROM test_table", dbapi_module=bad_version) + sf = SFrame.from_sql( + conn, "SELECT * FROM test_table", dbapi_module=bad_version + ) # Bad module with self.assertRaises(AttributeError): @@ -3628,23 +4369,36 @@ def mock_fetchmany(size=1): # Empty query result curs.description = [] - sf = SFrame.from_sql(conn, "SELECT * FROM test_table", dbapi_module=dbapi2_mock()) + sf = SFrame.from_sql( + conn, "SELECT * FROM test_table", dbapi_module=dbapi2_mock() + ) _assert_sframe_equal(sf, SFrame()) - @mock.patch(__name__+'.sqlite3.Cursor', spec=True) - @mock.patch(__name__+'.sqlite3.Connection', spec=True) + @mock.patch(__name__ + ".sqlite3.Cursor", spec=True) + @mock.patch(__name__ + ".sqlite3.Connection", spec=True) def test_to_sql(self, mock_conn, mock_cursor): - conn = mock_conn('example.db') + conn = mock_conn("example.db") curs = mock_cursor() insert_stmt = "INSERT INTO ins_test (X1,X2,X3,X4,X5,X6,X7) VALUES ({0},{1},{2},{3},{4},{5},{6})" num_cols = len(self.sf_all_types.column_names()) test_cases = [ - ('qmark',insert_stmt.format(*['?' for i in range(num_cols)])), - ('numeric',insert_stmt.format(*[':'+str(i) for i in range(1,num_cols+1)])), - ('named',insert_stmt.format(*[':X'+str(i) for i in range(1,num_cols+1)])), - ('format',insert_stmt.format(*['%s' for i in range(num_cols)])), - ('pyformat',insert_stmt.format(*['%(X'+str(i)+')s' for i in range(1,num_cols+1)])), - ] + ("qmark", insert_stmt.format(*["?" for i in range(num_cols)])), + ( + "numeric", + insert_stmt.format(*[":" + str(i) for i in range(1, num_cols + 1)]), + ), + ( + "named", + insert_stmt.format(*[":X" + str(i) for i in range(1, num_cols + 1)]), + ), + ("format", insert_stmt.format(*["%s" for i in range(num_cols)])), + ( + "pyformat", + insert_stmt.format( + *["%(X" + str(i) + ")s" for i in range(1, num_cols + 1)] + ), + ), + ] for i in test_cases: conn.cursor.return_value = curs @@ -3655,10 +4409,10 @@ def test_to_sql(self, mock_conn, mock_cursor): calls = [] col_names = self.sf_all_types.column_names() for j in self.sf_all_types: - if i[0] == 'named' or i[0] == 'pyformat': - calls.append(mock.call(i[1],j)) + if i[0] == "named" or i[0] == "pyformat": + calls.append(mock.call(i[1], j)) else: - calls.append(mock.call(i[1],[j[k] for k in col_names])) + calls.append(mock.call(i[1], [j[k] for k in col_names])) curs.execute.assert_has_calls(calls, any_order=False) self.assertEqual(curs.execute.call_count, len(self.sf_all_types)) conn.commit.assert_called_once_with() @@ -3675,27 +4429,27 @@ def test_to_sql(self, mock_conn, mock_cursor): # bad paramstyle bad_paramstyle = dbapi2_mock() - bad_paramstyle.paramstyle = 'foo' + bad_paramstyle.paramstyle = "foo" with self.assertRaises(TypeError): self.sf_all_types.to_sql(conn, "ins_test", dbapi_module=bad_paramstyle) - def test_materialize(self): - sf = SFrame({'a':range(100)}) - sf = sf[sf['a'] > 10] + sf = SFrame({"a": range(100)}) + sf = sf[sf["a"] > 10] self.assertFalse(sf.is_materialized()) sf.materialize() self.assertTrue(sf.is_materialized()) def test_materialization_slicing(self): # Has been known to fail. - g=SFrame({'a':range(100)})[:10] - g['b'] = g['a'] + 1 - g['b'].materialize() + g = SFrame({"a": range(100)})[:10] + g["b"] = g["a"] + 1 + g["b"].materialize() g.materialize() def test_copy(self): from copy import copy + sf = generate_random_sframe(100, "Cns") sf_copy = copy(sf) @@ -3705,6 +4459,7 @@ def test_copy(self): def test_deepcopy(self): from copy import deepcopy + sf = generate_random_sframe(100, "Cns") sf_copy = deepcopy(sf) @@ -3716,85 +4471,90 @@ def test_builtins(self): import builtins import six - sf = SFrame({'dict': [builtins.dict({'foo': 'bar'})], - 'float': [builtins.float(3.14)], - 'int': [builtins.int(12)], - 'bool': [builtins.bool(False)], - 'list': [builtins.list([1,2,3])], - 'str': [builtins.str('foo')], - 'tuple': [builtins.tuple((1,2))], - }) - sf2 = SFrame({'dict': [{'foo': 'bar'}], - 'float': [3.14], - 'int': [12], - 'bool': [False], - 'list': [[1,2,3]], - 'str': ['foo'], - 'tuple': [(1,2)], - }) + sf = SFrame( + { + "dict": [builtins.dict({"foo": "bar"})], + "float": [builtins.float(3.14)], + "int": [builtins.int(12)], + "bool": [builtins.bool(False)], + "list": [builtins.list([1, 2, 3])], + "str": [builtins.str("foo")], + "tuple": [builtins.tuple((1, 2))], + } + ) + sf2 = SFrame( + { + "dict": [{"foo": "bar"}], + "float": [3.14], + "int": [12], + "bool": [False], + "list": [[1, 2, 3]], + "str": ["foo"], + "tuple": [(1, 2)], + } + ) if six.PY2: - sf = sf.add_columns(SFrame( - {'long': [builtins.long(12)], 'unicode': [builtins.unicode('foo')]})) - sf2 = sf2.add_columns(SFrame( - {'long': [12], 'unicode': [unicode('foo')]})) + sf = sf.add_columns( + SFrame( + {"long": [builtins.long(12)], "unicode": [builtins.unicode("foo")]} + ) + ) + sf2 = sf2.add_columns(SFrame({"long": [12], "unicode": [unicode("foo")]})) _assert_sframe_equal(sf, sf2) def test_add_column_nonSArray(self): sf = SFrame() - sf = sf.add_column([1,2,3,4],'x') + sf = sf.add_column([1, 2, 3, 4], "x") sf_test = SFrame() - sf_test['x'] = SArray([1,2,3,4]) + sf_test["x"] = SArray([1, 2, 3, 4]) _assert_sframe_equal(sf, sf_test) - def test_add_column_noniterable1(self): sf = SFrame() - sf = sf.add_column([1,2,3,4],'x') - sf = sf.add_column(5,'y') + sf = sf.add_column([1, 2, 3, 4], "x") + sf = sf.add_column(5, "y") sf_test = SFrame() - sf_test['x'] = SArray([1,2,3,4]) - sf_test['y'] = 5 + sf_test["x"] = SArray([1, 2, 3, 4]) + sf_test["y"] = 5 _assert_sframe_equal(sf, sf_test) - - def test_add_column_noniterable2(self): # If SFrame is empty then the passed data should be treated as an SArray of size 1 sf = SFrame() - sf = sf.add_column(5,'y') + sf = sf.add_column(5, "y") sf_test = SFrame() - sf_test['y'] = SArray([5]) + sf_test["y"] = SArray([5]) _assert_sframe_equal(sf, sf_test) - def test_filter_by_dict(self): # Check for dict in filter_by - sf = SFrame({'check':range(10)}) - d = {1:1} + sf = SFrame({"check": range(10)}) + d = {1: 1} - sf = sf.filter_by(d.keys(),'check') - sf_test = sf.filter_by(list(d.keys()),'check') + sf = sf.filter_by(d.keys(), "check") + sf_test = sf.filter_by(list(d.keys()), "check") _assert_sframe_equal(sf, sf_test) - sf = sf.filter_by(d.values(),'check') - sf_test = sf.filter_by(list(d.values()),'check') + sf = sf.filter_by(d.values(), "check") + sf_test = sf.filter_by(list(d.values()), "check") _assert_sframe_equal(sf, sf_test) def test_export_empty_SFrame(self): - f = tempfile.NamedTemporaryFile(suffix='.json', delete=False) + f = tempfile.NamedTemporaryFile(suffix=".json", delete=False) sf = SFrame() sf.export_json(f.name) sf2 = SFrame.read_json(f.name) _assert_sframe_equal(sf, sf2) + if __name__ == "__main__": import sys diff --git a/src/python/turicreate/test/test_sframe_builder.py b/src/python/turicreate/test/test_sframe_builder.py index 5a92e10d5e..c21a753f20 100644 --- a/src/python/turicreate/test/test_sframe_builder.py +++ b/src/python/turicreate/test/test_sframe_builder.py @@ -14,29 +14,36 @@ from .._cython.cy_flexible_type import GMT from ..util import _assert_sframe_equal + class SFrameBuilderTest(unittest.TestCase): def setUp(self): self.int_data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] - self.float_data = [1., 2., 3., 4., 5., 6., 7., 8., 9., 10.] + self.float_data = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0] self.string_data = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"] - self.vec_data = [array.array('d', [i, i+1]) for i in self.int_data] + self.vec_data = [array.array("d", [i, i + 1]) for i in self.int_data] self.list_data = [[i, str(i), i * 1.0] for i in self.int_data] - self.dict_data = [{str(i): i, i : float(i)} for i in self.int_data] - self.datetime_data = [dt.datetime(2013, 5, 7, 10, 4, 10), - dt.datetime(1902, 10, 21, 10, 34, 10).replace(tzinfo=GMT(0.0))] - self.all_type_cols = [self.int_data, - self.float_data, - self.string_data, - self.vec_data, - self.list_data, - self.dict_data, - self.datetime_data*5] - self.sf_all_types = SFrame({"X"+str(i[0]):i[1] for i in zip(range(1,8), - self.all_type_cols)}) - self.all_types = [int,float,str,array.array,list,dict,dt.datetime] + self.dict_data = [{str(i): i, i: float(i)} for i in self.int_data] + self.datetime_data = [ + dt.datetime(2013, 5, 7, 10, 4, 10), + dt.datetime(1902, 10, 21, 10, 34, 10).replace(tzinfo=GMT(0.0)), + ] + self.all_type_cols = [ + self.int_data, + self.float_data, + self.string_data, + self.vec_data, + self.list_data, + self.dict_data, + self.datetime_data * 5, + ] + self.sf_all_types = SFrame( + {"X" + str(i[0]): i[1] for i in zip(range(1, 8), self.all_type_cols)} + ) + self.all_types = [int, float, str, array.array, list, dict, dt.datetime] def test_basic(self): from ..data_structures.sframe_builder import SFrameBuilder + sf_data = list(zip(*self.all_type_cols)) sb = SFrameBuilder(self.all_types) @@ -74,7 +81,7 @@ def test_basic(self): ## type cannot be converted to target type sb = SFrameBuilder(self.all_types) # maks sure we replace the type int to str - assert(type(self.all_type_cols[0][0]) is int) + assert type(self.all_type_cols[0][0]) is int sf_data_wrong_type = list(zip(self.string_data, *self.all_type_cols[1:])) with self.assertRaises(TypeError): sb.append_multiple(sf_data_wrong_type[0]) @@ -85,57 +92,63 @@ def test_basic(self): def test_history(self): from ..data_structures.sframe_builder import SFrameBuilder - sb = SFrameBuilder([int,float], history_size=10) - sb.append_multiple(([i,i+0.0] for i in range(8))) + + sb = SFrameBuilder([int, float], history_size=10) + sb.append_multiple(([i, i + 0.0] for i in range(8))) hist = sb.read_history(3) - self.assertEqual(hist,[[5,5.0],[6,6.0],[7,7.0]]) + self.assertEqual(hist, [[5, 5.0], [6, 6.0], [7, 7.0]]) hist = sb.read_history(20) - self.assertEqual(hist, [[i,i+0.0] for i in range(8)]) + self.assertEqual(hist, [[i, i + 0.0] for i in range(8)]) hist = sb.read_history() - self.assertEqual(hist, [[i,i+0.0] for i in range(8)]) + self.assertEqual(hist, [[i, i + 0.0] for i in range(8)]) - sb.append_multiple(([i,i+0.0] for i in range(5))) + sb.append_multiple(([i, i + 0.0] for i in range(5))) hist = sb.read_history(10) - self.assertEqual(hist, [[i,i+0.0] for i in [3,4,5,6,7,0,1,2,3,4]]) + self.assertEqual(hist, [[i, i + 0.0] for i in [3, 4, 5, 6, 7, 0, 1, 2, 3, 4]]) - sb.append([50,50.0]) + sb.append([50, 50.0]) hist = sb.read_history(10) - self.assertEqual(hist, [[i,i+0.0] for i in [4,5,6,7,0,1,2,3,4,50]]) + self.assertEqual(hist, [[i, i + 0.0] for i in [4, 5, 6, 7, 0, 1, 2, 3, 4, 50]]) hist = sb.read_history(-1) self.assertEqual(hist, []) hist = sb.read_history(0) self.assertEqual(hist, []) - expected_data = [[i,i+0.0] for i in range(8)] + [[i,i+0.0] for i in range(5)] + [[50,50.0]] - cols = [[],[]] + expected_data = ( + [[i, i + 0.0] for i in range(8)] + + [[i, i + 0.0] for i in range(5)] + + [[50, 50.0]] + ) + cols = [[], []] for i in expected_data: cols[0].append(i[0]) cols[1].append(i[1]) - expected_sf = SFrame({'X1':cols[0],'X2':cols[1]}) + expected_sf = SFrame({"X1": cols[0], "X2": cols[1]}) sf = sb.close() - _assert_sframe_equal(sf,expected_sf) + _assert_sframe_equal(sf, expected_sf) def test_segments(self): from ..data_structures.sframe_builder import SFrameBuilder - sb = SFrameBuilder([int],num_segments=4) - sb.append_multiple(([i] for i in range(20,30)), segment=2) - sb.append_multiple(([i] for i in range(10,20)), segment=1) - sb.append_multiple(([i] for i in range(30,40)), segment=3) - sb.append_multiple(([i] for i in range(0,10)), segment=0) + sb = SFrameBuilder([int], num_segments=4) + + sb.append_multiple(([i] for i in range(20, 30)), segment=2) + sb.append_multiple(([i] for i in range(10, 20)), segment=1) + sb.append_multiple(([i] for i in range(30, 40)), segment=3) + sb.append_multiple(([i] for i in range(0, 10)), segment=0) hist = sb.read_history(3, segment=0) - self.assertSequenceEqual(hist, [[7],[8],[9]]) + self.assertSequenceEqual(hist, [[7], [8], [9]]) hist = sb.read_history(3, segment=1) - self.assertSequenceEqual(hist, [[17],[18],[19]]) + self.assertSequenceEqual(hist, [[17], [18], [19]]) hist = sb.read_history(3, segment=2) - self.assertSequenceEqual(hist, [[27],[28],[29]]) + self.assertSequenceEqual(hist, [[27], [28], [29]]) hist = sb.read_history(3, segment=3) - self.assertSequenceEqual(hist, [[37],[38],[39]]) + self.assertSequenceEqual(hist, [[37], [38], [39]]) sf = sb.close() - expected_sf = SFrame({'X1':range(40)}) + expected_sf = SFrame({"X1": range(40)}) _assert_sframe_equal(sf, expected_sf) diff --git a/src/python/turicreate/test/test_sframe_generation.py b/src/python/turicreate/test/test_sframe_generation.py index 064c033c9f..42fd3d5756 100644 --- a/src/python/turicreate/test/test_sframe_generation.py +++ b/src/python/turicreate/test/test_sframe_generation.py @@ -13,35 +13,36 @@ import unittest import array -class SFrameGeneration(unittest.TestCase): +class SFrameGeneration(unittest.TestCase): def test_data_types(self): column_codes = { - 'n': float, - 'N': float, - 'r': float, - 'R': float, - 'b': int, - 'z': int, - 'Z': int, - 'c': str, - 'C': str, - 's': str, - 'S': str, - 'x': str, - 'X': str, - 'h': str, - 'H': str, - 'v': array.array, - 'V': array.array, - 'l': list, - 'L': list, - 'm': list, - 'M': list, - 'd': dict, - 'D': dict} + "n": float, + "N": float, + "r": float, + "R": float, + "b": int, + "z": int, + "Z": int, + "c": str, + "C": str, + "s": str, + "S": str, + "x": str, + "X": str, + "h": str, + "H": str, + "v": array.array, + "V": array.array, + "l": list, + "L": list, + "m": list, + "M": list, + "d": dict, + "D": dict, + } - test_codes = ''.join(column_codes.keys()) + test_codes = "".join(column_codes.keys()) X = generate_random_sframe(10, test_codes) column_names = X.column_names() @@ -51,24 +52,35 @@ def test_data_types(self): def test_regression_result(self): for L in range(1, 10): - X = generate_random_regression_sframe(100, 'n' * L, target_noise_level=0) - X["target_2"] = X.apply(lambda d: sum(v for k, v in d.items() if k != "target")) + X = generate_random_regression_sframe(100, "n" * L, target_noise_level=0) + X["target_2"] = X.apply( + lambda d: sum(v for k, v in d.items() if k != "target") + ) X["target_2"] = X["target_2"] - X["target_2"].min() X["target_2"] = X["target_2"] / X["target_2"].max() - self.assertAlmostEqual( (X["target_2"] - X["target"]).std(), 0, delta = 0.001) + self.assertAlmostEqual((X["target_2"] - X["target"]).std(), 0, delta=0.001) def test_classification_result(self): for L in range(1, 10): - X = generate_random_classification_sframe(100, 'n' * L, misclassification_spread=0, - num_classes = 2, num_extra_class_bins = 0) - X["target_2"] = X.apply(lambda d: sum(v for k, v in d.items() if k != "target")) + X = generate_random_classification_sframe( + 100, + "n" * L, + misclassification_spread=0, + num_classes=2, + num_extra_class_bins=0, + ) + X["target_2"] = X.apply( + lambda d: sum(v for k, v in d.items() if k != "target") + ) X["target_2"] = X["target_2"] - X["target_2"].min() X["target_2"] = X["target_2"] / X["target_2"].max() x_1 = X["target_2"][X["target"] == 0] x_2 = X["target_2"][X["target"] == 1] - self.assertTrue((x_1.max() - 1e-4 <= x_2.min() + 1e-4) - or (x_2.max() - 1e-4 <= x_1.min() + 1e-4)) + self.assertTrue( + (x_1.max() - 1e-4 <= x_2.min() + 1e-4) + or (x_2.max() - 1e-4 <= x_1.min() + 1e-4) + ) diff --git a/src/python/turicreate/test/test_style_transfer.py b/src/python/turicreate/test/test_style_transfer.py index 09463f1370..28da5a3472 100644 --- a/src/python/turicreate/test/test_style_transfer.py +++ b/src/python/turicreate/test/test_style_transfer.py @@ -162,21 +162,19 @@ def test_create_with_incorrect_max_iterations_format_float(self): def test_create_with_verbose_False(self): args = [self.style_sf, self.content_sf] kwargs = { - 'style_feature': self.style_feature, - 'content_feature': self.content_feature, - 'max_iterations': 1, - 'model': self.pre_trained_model + "style_feature": self.style_feature, + "content_feature": self.content_feature, + "max_iterations": 1, + "model": self.pre_trained_model, } - test_util.assert_longer_verbose_logs( - tc.style_transfer.create, args, kwargs) + test_util.assert_longer_verbose_logs(tc.style_transfer.create, args, kwargs) def test_stylize_with_verbose_False(self): sf = self.content_sf[0:1] styles = self._get_valid_style_cases() args = [sf] - kwargs = {'style': styles[0]} - test_util.assert_longer_verbose_logs( - self.model.stylize, args, kwargs) + kwargs = {"style": styles[0]} + test_util.assert_longer_verbose_logs(self.model.stylize, args, kwargs) def _get_invalid_style_cases(self): style_cases = [] diff --git a/src/python/turicreate/test/test_supervised_learning_missing_value_actions.py b/src/python/turicreate/test/test_supervised_learning_missing_value_actions.py index ba9d6ba3f9..f353dcb467 100644 --- a/src/python/turicreate/test/test_supervised_learning_missing_value_actions.py +++ b/src/python/turicreate/test/test_supervised_learning_missing_value_actions.py @@ -11,31 +11,33 @@ import array from turicreate.toolkits._main import ToolkitError + class SupervisedLearningMissingValueTest(unittest.TestCase): """ Base class for missing values in supervised learning. """ + @classmethod def setUpClass(self): """ Set up (Run only once) """ - self.target = 'y' + self.target = "y" self.sf = tc.SFrame() - self.sf['y'] = tc.SArray([1, 2, 1], int) - self.sf['int'] = tc.SArray([1, 2, 3], int) - self.sf['float'] = tc.SArray([1, 2, 3], float) - self.sf['dict'] = tc.SArray([{'1': 3, '2': 2}, {'2': 1}, {}], dict) - self.sf['array'] = tc.SArray([[1, 2], [3, 4], [5, 6]], array.array) - self.sf['str'] = tc.SArray(['1', '2', '3'], str) + self.sf["y"] = tc.SArray([1, 2, 1], int) + self.sf["int"] = tc.SArray([1, 2, 3], int) + self.sf["float"] = tc.SArray([1, 2, 3], float) + self.sf["dict"] = tc.SArray([{"1": 3, "2": 2}, {"2": 1}, {}], dict) + self.sf["array"] = tc.SArray([[1, 2], [3, 4], [5, 6]], array.array) + self.sf["str"] = tc.SArray(["1", "2", "3"], str) test_sf = tc.SFrame() - test_sf['y'] = tc.SArray([2], int) - test_sf['int'] = tc.SArray([2], int) - test_sf['float'] = tc.SArray([2], float) - test_sf['dict'] = tc.SArray([{'1': 1, '2': 1}], dict) - test_sf['array'] = tc.SArray([[3, 4]], array.array) - test_sf['str'] = tc.SArray(['2'], str) + test_sf["y"] = tc.SArray([2], int) + test_sf["int"] = tc.SArray([2], int) + test_sf["float"] = tc.SArray([2], float) + test_sf["dict"] = tc.SArray([{"1": 1, "2": 1}], dict) + test_sf["array"] = tc.SArray([[3, 4]], array.array) + test_sf["str"] = tc.SArray(["2"], str) self.test_sf = test_sf self.model = None @@ -66,6 +68,7 @@ def fill_some_na(self, sf, colname): def get_create_function_of_model(self, model): import sys + mod_name = model.__module__ mod = sys.modules[mod_name] return mod.create @@ -82,13 +85,13 @@ def test_create(self): # Missing value in each column # ------------------------------- - for col in ['int', 'float', 'dict', 'array', 'str']: + for col in ["int", "float", "dict", "array", "str"]: train_sf_with_na = self.fill_with_na(train_sf, col) model = create_fun(train_sf_with_na, self.target, validation_set=None) # Missing value at top of each column # ------------------------------- - for col in ['int', 'float', 'dict', 'array', 'str']: + for col in ["int", "float", "dict", "array", "str"]: train_sf_with_na = self.fill_some_na(train_sf, col) model = create_fun(train_sf_with_na, self.target, validation_set=None) @@ -101,11 +104,11 @@ def test_create_with_missing_target(self): create_fun = self.get_create_function_of_model(self.model) with self.assertRaises(ToolkitError): - train_sf_with_na = self.fill_with_na(train_sf, 'y') + train_sf_with_na = self.fill_with_na(train_sf, "y") model = create_fun(train_sf_with_na, self.target, validation_set=None) with self.assertRaises(ToolkitError): - train_sf_with_na = self.fill_some_na(train_sf, 'y') + train_sf_with_na = self.fill_some_na(train_sf, "y") model = create_fun(train_sf_with_na, self.target, validation_set=None) def test_predict(self): @@ -119,24 +122,26 @@ def test_predict(self): test_sf = self.test_sf.copy() # Should pass - pred = model.predict(test_sf, missing_value_action='auto') - pred = model.predict(test_sf, missing_value_action='impute') - pred = model.predict(test_sf, missing_value_action='error') + pred = model.predict(test_sf, missing_value_action="auto") + pred = model.predict(test_sf, missing_value_action="impute") + pred = model.predict(test_sf, missing_value_action="error") # Missing value in each column # ------------------------------- - for col in ['int', 'float', 'dict', 'array', 'str']: + for col in ["int", "float", "dict", "array", "str"]: test_sf_with_na = self.fill_with_na(test_sf, col) pred_missing = model.predict(test_sf_with_na) - for col in ['int', 'float', 'array']: + for col in ["int", "float", "array"]: test_sf_with_na = self.fill_with_na(test_sf, col) - self.assertRaises(ToolkitError, lambda: model.predict(test_sf_with_na, - missing_value_action='error')) + self.assertRaises( + ToolkitError, + lambda: model.predict(test_sf_with_na, missing_value_action="error"), + ) # Missing entire columns # ---------------------------------- - for col in ['int', 'float', 'dict', 'array', 'str']: + for col in ["int", "float", "dict", "array", "str"]: del test_sf[col] pred = model.predict(test_sf) @@ -147,36 +152,38 @@ def test_extract_features(self): if self.model is None: return - if not hasattr(self.model, 'extract_features'): + if not hasattr(self.model, "extract_features"): return model = self.model test_sf = self.test_sf.copy() # Should pass - pred = model.extract_features(test_sf, missing_value_action='auto') - pred = model.extract_features(test_sf, missing_value_action='impute') - pred = model.extract_features(test_sf, missing_value_action='error') + pred = model.extract_features(test_sf, missing_value_action="auto") + pred = model.extract_features(test_sf, missing_value_action="impute") + pred = model.extract_features(test_sf, missing_value_action="error") # Missing value in each column # ------------------------------- - for col in ['int', 'float', 'dict', 'array', 'str']: + for col in ["int", "float", "dict", "array", "str"]: test_sf_with_na = self.fill_with_na(test_sf, col) pred_missing = model.extract_features(test_sf_with_na) - for col in ['int', 'float', 'array']: + for col in ["int", "float", "array"]: test_sf_with_na = self.fill_with_na(test_sf, col) - self.assertRaises(ToolkitError, - lambda: model.extract_features(test_sf_with_na, - missing_value_action='error')) + self.assertRaises( + ToolkitError, + lambda: model.extract_features( + test_sf_with_na, missing_value_action="error" + ), + ) # Missing entire columns # ---------------------------------- - for col in ['int', 'float', 'dict', 'array', 'str']: + for col in ["int", "float", "dict", "array", "str"]: del test_sf[col] pred = model.extract_features(test_sf) - def test_evaluate(self): """ Test evaluate missing value @@ -188,24 +195,26 @@ def test_evaluate(self): test_sf = self.test_sf.copy() # Should pass - eval = model.evaluate(test_sf, missing_value_action='auto') - eval = model.evaluate(test_sf, missing_value_action='impute') - eval = model.evaluate(test_sf, missing_value_action='error') + eval = model.evaluate(test_sf, missing_value_action="auto") + eval = model.evaluate(test_sf, missing_value_action="impute") + eval = model.evaluate(test_sf, missing_value_action="error") # Missing value in each col type # ------------------------------- - for col in ['int', 'float', 'dict', 'array', 'str']: + for col in ["int", "float", "dict", "array", "str"]: test_sf_with_na = self.fill_with_na(test_sf, col) eval_missing = model.evaluate(test_sf_with_na) - for col in ['int', 'float', 'array']: + for col in ["int", "float", "array"]: test_sf_with_na = self.fill_with_na(test_sf, col) - self.assertRaises(ToolkitError, lambda: model.evaluate(test_sf_with_na, - missing_value_action='error')) + self.assertRaises( + ToolkitError, + lambda: model.evaluate(test_sf_with_na, missing_value_action="error"), + ) # Missing columns # ---------------------------------- - for col in ['int', 'float', 'dict', 'array', 'str']: + for col in ["int", "float", "dict", "array", "str"]: del test_sf[col] model.evaluate(test_sf) @@ -213,31 +222,33 @@ def test_classify(self): """ Test classify missing value """ - if self.model is None or not hasattr(self.model, 'classify'): + if self.model is None or not hasattr(self.model, "classify"): return model = self.model test_sf = self.test_sf.copy() # Should pass - model.classify(test_sf, missing_value_action='auto') - model.classify(test_sf, missing_value_action='impute') - model.classify(test_sf, missing_value_action='error') + model.classify(test_sf, missing_value_action="auto") + model.classify(test_sf, missing_value_action="impute") + model.classify(test_sf, missing_value_action="error") # Missing value in each col type # ------------------------------- - for col in ['int', 'float', 'dict', 'array', 'str']: + for col in ["int", "float", "dict", "array", "str"]: test_sf_with_na = self.fill_with_na(test_sf, col) model.classify(test_sf_with_na) - for col in ['int', 'float', 'array']: + for col in ["int", "float", "array"]: test_sf_with_na = self.fill_with_na(test_sf, col) - self.assertRaises(ToolkitError, lambda: model.classify(test_sf_with_na, - missing_value_action='error')) + self.assertRaises( + ToolkitError, + lambda: model.classify(test_sf_with_na, missing_value_action="error"), + ) # Missing columns # ---------------------------------- - for col in ['int', 'float', 'dict', 'array', 'str']: + for col in ["int", "float", "dict", "array", "str"]: del test_sf[col] model.classify(test_sf) @@ -245,31 +256,35 @@ def test_predict_topk(self): """ Test predict topk with missing value """ - if self.model is None or not hasattr(self.model, 'predict_topk'): + if self.model is None or not hasattr(self.model, "predict_topk"): return model = self.model test_sf = self.test_sf.copy() # Should pass - model.predict_topk(test_sf, k=1, missing_value_action='auto') - model.predict_topk(test_sf, k=1, missing_value_action='impute') - model.predict_topk(test_sf, k=1, missing_value_action='error') + model.predict_topk(test_sf, k=1, missing_value_action="auto") + model.predict_topk(test_sf, k=1, missing_value_action="impute") + model.predict_topk(test_sf, k=1, missing_value_action="error") # Missing value in each col type # ------------------------------- - for col in ['int', 'float', 'dict', 'array', 'str']: + for col in ["int", "float", "dict", "array", "str"]: test_sf_with_na = self.fill_with_na(test_sf, col) model.predict_topk(test_sf_with_na, k=1) - for col in ['int', 'float', 'array']: + for col in ["int", "float", "array"]: test_sf_with_na = self.fill_with_na(test_sf, col) - self.assertRaises(ToolkitError, lambda: model.predict_topk(test_sf_with_na, k=1, - missing_value_action='error')) + self.assertRaises( + ToolkitError, + lambda: model.predict_topk( + test_sf_with_na, k=1, missing_value_action="error" + ), + ) # Missing columns # ---------------------------------- - for col in ['int', 'float', 'dict', 'array', 'str']: + for col in ["int", "float", "dict", "array", "str"]: del test_sf[col] model.predict_topk(test_sf, k=1) @@ -278,21 +293,28 @@ class LinearRegressionTest(SupervisedLearningMissingValueTest): @classmethod def setUpClass(self): super(LinearRegressionTest, self).setUpClass() - self.model = tc.linear_regression.create(self.sf, self.target, validation_set=None) + self.model = tc.linear_regression.create( + self.sf, self.target, validation_set=None + ) class RandomForestRegression(SupervisedLearningMissingValueTest): @classmethod def setUpClass(self): super(RandomForestRegression, self).setUpClass() - self.model = tc.random_forest_regression.create(self.sf, self.target, validation_set=None) + self.model = tc.random_forest_regression.create( + self.sf, self.target, validation_set=None + ) self.support_missing_value = True + class DecisionTreeRegression(SupervisedLearningMissingValueTest): @classmethod def setUpClass(self): super(DecisionTreeRegression, self).setUpClass() - self.model = tc.decision_tree_regression.create(self.sf, self.target, validation_set=None) + self.model = tc.decision_tree_regression.create( + self.sf, self.target, validation_set=None + ) self.support_missing_value = True @@ -300,7 +322,9 @@ class BoostedTreesRegression(SupervisedLearningMissingValueTest): @classmethod def setUpClass(self): super(BoostedTreesRegression, self).setUpClass() - self.model = tc.boosted_trees_regression.create(self.sf, self.target, validation_set=None) + self.model = tc.boosted_trees_regression.create( + self.sf, self.target, validation_set=None + ) self.support_missing_value = True @@ -308,7 +332,9 @@ class LogisticRegressionTest(SupervisedLearningMissingValueTest): @classmethod def setUpClass(self): super(LogisticRegressionTest, self).setUpClass() - self.model = tc.logistic_classifier.create(self.sf, self.target, validation_set=None) + self.model = tc.logistic_classifier.create( + self.sf, self.target, validation_set=None + ) class SVMClassifierTest(SupervisedLearningMissingValueTest): @@ -317,23 +343,32 @@ def setUpClass(self): super(SVMClassifierTest, self).setUpClass() self.model = tc.svm_classifier.create(self.sf, self.target, validation_set=None) + class RandomForestClassifierTest(SupervisedLearningMissingValueTest): @classmethod def setUpClass(self): super(RandomForestClassifierTest, self).setUpClass() - self.model = tc.random_forest_classifier.create(self.sf, self.target, validation_set=None) + self.model = tc.random_forest_classifier.create( + self.sf, self.target, validation_set=None + ) self.support_missing_value = True + class DecisionTreeClassifierTest(SupervisedLearningMissingValueTest): @classmethod def setUpClass(self): super(DecisionTreeClassifierTest, self).setUpClass() - self.model = tc.decision_tree_classifier.create(self.sf, self.target, validation_set=None) + self.model = tc.decision_tree_classifier.create( + self.sf, self.target, validation_set=None + ) self.support_missing_value = True + class BoostedTreesClassifierTest(SupervisedLearningMissingValueTest): @classmethod def setUpClass(self): super(BoostedTreesClassifierTest, self).setUpClass() - self.model = tc.boosted_trees_classifier.create(self.sf, self.target, validation_set=None) + self.model = tc.boosted_trees_classifier.create( + self.sf, self.target, validation_set=None + ) self.support_missing_value = True diff --git a/src/python/turicreate/test/test_supervised_learning_string_targets.py b/src/python/turicreate/test/test_supervised_learning_string_targets.py index b8df0ea604..d9cbd458de 100644 --- a/src/python/turicreate/test/test_supervised_learning_string_targets.py +++ b/src/python/turicreate/test/test_supervised_learning_string_targets.py @@ -16,27 +16,28 @@ class SupervisedLearningStringTargetBinary(unittest.TestCase): """ Base class for missing values in supervised learning. """ + @classmethod def setUpClass(self): """ Set up (Run only once) """ - self.target = 'y' + self.target = "y" self.sf = tc.SFrame() - self.sf['y'] = tc.SArray(["t1", "t2", "t1"], str) - self.sf['int'] = tc.SArray([1, 2, 3], int) - self.sf['float'] = tc.SArray([1, 2, 3], float) - self.sf['dict'] = tc.SArray([{'1': 3, '2': 2}, {'2': 1}, {}], dict) - self.sf['array'] = tc.SArray([[1, 2], [3, 4], [5, 6]], array.array) - self.sf['str'] = tc.SArray(['1', '2', '3'], str) + self.sf["y"] = tc.SArray(["t1", "t2", "t1"], str) + self.sf["int"] = tc.SArray([1, 2, 3], int) + self.sf["float"] = tc.SArray([1, 2, 3], float) + self.sf["dict"] = tc.SArray([{"1": 3, "2": 2}, {"2": 1}, {}], dict) + self.sf["array"] = tc.SArray([[1, 2], [3, 4], [5, 6]], array.array) + self.sf["str"] = tc.SArray(["1", "2", "3"], str) test_sf = tc.SFrame() - test_sf['y'] = tc.SArray(["foobar", "t1"], str) - test_sf['int'] = tc.SArray([2, 1], int) - test_sf['float'] = tc.SArray([2, 2.0], float) - test_sf['dict'] = tc.SArray([{'1': 1, '2': 1}, {}], dict) - test_sf['array'] = tc.SArray([[3, 4], [2, 2]], array.array) - test_sf['str'] = tc.SArray(['2', '2'], str) + test_sf["y"] = tc.SArray(["foobar", "t1"], str) + test_sf["int"] = tc.SArray([2, 1], int) + test_sf["float"] = tc.SArray([2, 2.0], float) + test_sf["dict"] = tc.SArray([{"1": 1, "2": 1}, {}], dict) + test_sf["array"] = tc.SArray([[3, 4], [2, 2]], array.array) + test_sf["str"] = tc.SArray(["2", "2"], str) self.test_sf = test_sf self.model = None @@ -44,6 +45,7 @@ def setUpClass(self): def get_create_function_of_model(self, model): import sys + mod_name = model.__module__ mod = sys.modules[mod_name] return mod.create @@ -57,12 +59,12 @@ def test_create(self): # Missing value in each column # ------------------------------- - for col in ['int', 'float', 'dict', 'array', 'str']: + for col in ["int", "float", "dict", "array", "str"]: model = create_fun(train_sf, self.target, validation_set=None) # Missing value at top of each column # ------------------------------- - for col in ['int', 'float', 'dict', 'array', 'str']: + for col in ["int", "float", "dict", "array", "str"]: model = create_fun(train_sf, self.target, validation_set=None) def test_evaluate(self): @@ -72,37 +74,38 @@ def test_evaluate(self): model = self.model test_sf = self.test_sf.copy() - # Should pass ev_train = model.evaluate(self.sf) ev_test = model.evaluate(test_sf) ev_test_one = model.evaluate(test_sf[0:1]) + class SupervisedLearningStringTargetMulticlass(unittest.TestCase): """ Base class for missing values in supervised learning. """ + @classmethod def setUpClass(self): """ Set up (Run only once) """ - self.target = 'y' + self.target = "y" self.sf = tc.SFrame() - self.sf['y'] = tc.SArray(["t1", "t2", "t3"], str) - self.sf['int'] = tc.SArray([1, 2, 3], int) - self.sf['float'] = tc.SArray([1, 2, 3], float) - self.sf['dict'] = tc.SArray([{'1': 3, '2': 2}, {'2': 1}, {}], dict) - self.sf['array'] = tc.SArray([[1, 2], [3, 4], [5, 6]], array.array) - self.sf['str'] = tc.SArray(['1', '2', '3'], str) + self.sf["y"] = tc.SArray(["t1", "t2", "t3"], str) + self.sf["int"] = tc.SArray([1, 2, 3], int) + self.sf["float"] = tc.SArray([1, 2, 3], float) + self.sf["dict"] = tc.SArray([{"1": 3, "2": 2}, {"2": 1}, {}], dict) + self.sf["array"] = tc.SArray([[1, 2], [3, 4], [5, 6]], array.array) + self.sf["str"] = tc.SArray(["1", "2", "3"], str) test_sf = tc.SFrame() - test_sf['y'] = tc.SArray(["foobar", "t1"], str) - test_sf['int'] = tc.SArray([2, 1], int) - test_sf['float'] = tc.SArray([2, 2.0], float) - test_sf['dict'] = tc.SArray([{'1': 1, '2': 1}, {}], dict) - test_sf['array'] = tc.SArray([[3, 4], [2, 2]], array.array) - test_sf['str'] = tc.SArray(['2', '2'], str) + test_sf["y"] = tc.SArray(["foobar", "t1"], str) + test_sf["int"] = tc.SArray([2, 1], int) + test_sf["float"] = tc.SArray([2, 2.0], float) + test_sf["dict"] = tc.SArray([{"1": 1, "2": 1}, {}], dict) + test_sf["array"] = tc.SArray([[3, 4], [2, 2]], array.array) + test_sf["str"] = tc.SArray(["2", "2"], str) self.test_sf = test_sf self.model = None @@ -110,6 +113,7 @@ def setUpClass(self): def get_create_function_of_model(self, model): import sys + mod_name = model.__module__ mod = sys.modules[mod_name] return mod.create @@ -123,12 +127,12 @@ def test_create(self): # Missing value in each column # ------------------------------- - for col in ['int', 'float', 'dict', 'array', 'str']: + for col in ["int", "float", "dict", "array", "str"]: model = create_fun(train_sf, self.target, validation_set=None) # Missing value at top of each column # ------------------------------- - for col in ['int', 'float', 'dict', 'array', 'str']: + for col in ["int", "float", "dict", "array", "str"]: model = create_fun(train_sf, self.target, validation_set=None) def test_evaluate(self): @@ -138,7 +142,6 @@ def test_evaluate(self): model = self.model test_sf = self.test_sf.copy() - # Should pass ev_train = model.evaluate(self.sf) ev_test = model.evaluate(test_sf) @@ -149,7 +152,9 @@ class LogisticRegressionTest(SupervisedLearningStringTargetBinary): @classmethod def setUpClass(self): super(LogisticRegressionTest, self).setUpClass() - self.model = tc.logistic_classifier.create(self.sf, self.target, validation_set=None) + self.model = tc.logistic_classifier.create( + self.sf, self.target, validation_set=None + ) class SVMClassifierTest(SupervisedLearningStringTargetBinary): @@ -158,47 +163,65 @@ def setUpClass(self): super(SVMClassifierTest, self).setUpClass() self.model = tc.svm_classifier.create(self.sf, self.target, validation_set=None) + class RandomForestClassifierTest(SupervisedLearningStringTargetBinary): @classmethod def setUpClass(self): super(RandomForestClassifierTest, self).setUpClass() - self.model = tc.random_forest_classifier.create(self.sf, self.target, validation_set=None) + self.model = tc.random_forest_classifier.create( + self.sf, self.target, validation_set=None + ) + class DecisionTreeClassifierTest(SupervisedLearningStringTargetBinary): @classmethod def setUpClass(self): super(DecisionTreeClassifierTest, self).setUpClass() - self.model = tc.decision_tree_classifier.create(self.sf, self.target, validation_set=None) + self.model = tc.decision_tree_classifier.create( + self.sf, self.target, validation_set=None + ) class BoostedTreesClassifierTest(SupervisedLearningStringTargetBinary): @classmethod def setUpClass(self): super(BoostedTreesClassifierTest, self).setUpClass() - self.model = tc.boosted_trees_classifier.create(self.sf, self.target, validation_set=None) + self.model = tc.boosted_trees_classifier.create( + self.sf, self.target, validation_set=None + ) + class MulticlassLogisticRegressionTest(SupervisedLearningStringTargetMulticlass): @classmethod def setUpClass(self): super(MulticlassLogisticRegressionTest, self).setUpClass() - self.model = tc.logistic_classifier.create(self.sf, self.target, validation_set=None) + self.model = tc.logistic_classifier.create( + self.sf, self.target, validation_set=None + ) class MulticlassRandomForestClassifierTest(SupervisedLearningStringTargetMulticlass): @classmethod def setUpClass(self): super(MulticlassRandomForestClassifierTest, self).setUpClass() - self.model = tc.random_forest_classifier.create(self.sf, self.target, validation_set=None) + self.model = tc.random_forest_classifier.create( + self.sf, self.target, validation_set=None + ) + class MulticlassDecisionTreeClassifierTest(SupervisedLearningStringTargetMulticlass): @classmethod def setUpClass(self): super(MulticlassDecisionTreeClassifierTest, self).setUpClass() - self.model = tc.decision_tree_classifier.create(self.sf, self.target, validation_set=None) + self.model = tc.decision_tree_classifier.create( + self.sf, self.target, validation_set=None + ) class MulticlassBoostedTreesClassifierTest(SupervisedLearningStringTargetMulticlass): @classmethod def setUpClass(self): super(MulticlassBoostedTreesClassifierTest, self).setUpClass() - self.model = tc.boosted_trees_classifier.create(self.sf, self.target, validation_set=None) + self.model = tc.boosted_trees_classifier.create( + self.sf, self.target, validation_set=None + ) diff --git a/src/python/turicreate/test/test_svm_classifier.py b/src/python/turicreate/test/test_svm_classifier.py index 10d7c9d0cb..228db83a0f 100644 --- a/src/python/turicreate/test/test_svm_classifier.py +++ b/src/python/turicreate/test/test_svm_classifier.py @@ -21,6 +21,7 @@ import os as _os + class SVMClassifierTest(unittest.TestCase): """ Unit test class for a LogisticRegressionModel that has already been created. @@ -43,90 +44,99 @@ def setUpClass(self): target[1] = 1 ## Create the model - self.sf['target'] = target + self.sf["target"] = target self.def_kwargs = _DEFAULT_SOLVER_OPTIONS - self.def_opts= dict(list(self.def_kwargs.items()) + list({'solver' : 'auto', - 'feature_rescaling' : 1, - 'class_weights' : None, - 'penalty' : 1.0}.items())) + self.def_opts = dict( + list(self.def_kwargs.items()) + + list( + { + "solver": "auto", + "feature_rescaling": 1, + "class_weights": None, + "penalty": 1.0, + }.items() + ) + ) self.opts = self.def_opts.copy() - self.opts['max_iterations'] = 500 - self.opts['solver'] = "lbfgs" + self.opts["max_iterations"] = 500 + self.opts["solver"] = "lbfgs" - self.features = ['X{}'.format(i) for i in range(1, d+1)] + self.features = ["X{}".format(i) for i in range(1, d + 1)] features = self.features - self.unpacked_features = ['X{}'.format(i) for i in range(1, d+1)] - self.target = 'target' + self.unpacked_features = ["X{}".format(i) for i in range(1, d + 1)] + self.target = "target" - self.model = tc.svm_classifier.create(self.sf, target='target', + self.model = tc.svm_classifier.create( + self.sf, + target="target", features=None, - feature_rescaling = True, + feature_rescaling=True, validation_set=None, - max_iterations = self.opts['max_iterations']) + max_iterations=self.opts["max_iterations"], + ) ## Compute the correct answers with Scikit-Learns target_name = self.target feature_names = self.features - X_train = list(self.sf.apply(lambda row: [row[k] for k in \ - features])) + X_train = list(self.sf.apply(lambda row: [row[k] for k in features])) y_train = list(self.sf[self.target]) - sm_model = svm.LinearSVC(C=1.0, loss='l1') + sm_model = svm.LinearSVC(C=1.0, loss="l1") sm_model.fit(X_train, y_train) self.coef = list(sm_model.intercept_) + list(sm_model.coef_[0]) predictions = list(sm_model.predict(X_train)) classes = predictions - margins = [np.concatenate(([1], x)).dot(np.array(self.coef)) \ - for x in X_train] + margins = [np.concatenate(([1], x)).dot(np.array(self.coef)) for x in X_train] self.yhat_class = tc.SArray(predictions) self.yhat_margins = tc.SArray(margins) self.sm_metrics = { - "accuracy" : accuracy_score(target, list(self.yhat_class)), - "confusion_matrix" : tc.toolkits.evaluation.confusion_matrix( - tc.SArray(target), tc.SArray(self.yhat_class)), - "f1_score" : f1_score(target, list(self.yhat_class)), - "precision" : precision_score(target, list(self.yhat_class)), - "recall" : recall_score(target, list(self.yhat_class)), + "accuracy": accuracy_score(target, list(self.yhat_class)), + "confusion_matrix": tc.toolkits.evaluation.confusion_matrix( + tc.SArray(target), tc.SArray(self.yhat_class) + ), + "f1_score": f1_score(target, list(self.yhat_class)), + "precision": precision_score(target, list(self.yhat_class)), + "recall": recall_score(target, list(self.yhat_class)), } self.get_ans = { - 'coefficients': lambda x: isinstance(x, tc.SFrame), - 'convergence_threshold': \ - lambda x: x == self.opts['convergence_threshold'], - 'unpacked_features': lambda x: x == self.unpacked_features, - 'feature_rescaling': lambda x: x == True, - 'features': lambda x: x == self.features, - 'lbfgs_memory_level': lambda x: x == 11, - 'max_iterations': lambda x: x == self.opts['max_iterations'], - 'num_classes': lambda x: x == 2, - 'num_coefficients': lambda x: x == 11, - 'num_examples': lambda x: x == 100, - 'classes': lambda x: set(x) == set([0,1]), - 'class_weights': lambda x: x == {0:1, 1:1}, - 'num_examples_per_class': lambda x: {0: (tc.SArray(target) == 0).sum(), - 1: (tc.SArray(target) == 1).sum()}, - 'num_features': lambda x: x == 10, - 'num_unpacked_features': lambda x: x == 10, - 'penalty': lambda x: x == self.opts['penalty'], - 'progress': lambda x: isinstance(x, tc.SFrame), - 'solver': lambda x: x == self.opts["solver"], - 'target': lambda x: x == self.target, - 'training_accuracy': lambda x: x >= 0 and x <= 1, - 'training_iterations': lambda x: x > 0, - 'training_loss': lambda x: x > 0, - 'training_solver_status':\ - lambda x: x == "SUCCESS: Optimal solution found.", - 'training_time': lambda x: x >= 0, - 'training_confusion_matrix': lambda x: len(x) > 0, - 'training_f1_score': lambda x: x > 0, - 'training_precision': lambda x: x > 0, - 'training_recall': lambda x: x > 0, - 'training_report_by_class': lambda x: len(x) > 0, - 'validation_data': lambda x: isinstance(x, tc.SFrame) and len(x) == 0, - 'disable_posttrain_evaluation' : lambda x: x == False, - } + "coefficients": lambda x: isinstance(x, tc.SFrame), + "convergence_threshold": lambda x: x == self.opts["convergence_threshold"], + "unpacked_features": lambda x: x == self.unpacked_features, + "feature_rescaling": lambda x: x == True, + "features": lambda x: x == self.features, + "lbfgs_memory_level": lambda x: x == 11, + "max_iterations": lambda x: x == self.opts["max_iterations"], + "num_classes": lambda x: x == 2, + "num_coefficients": lambda x: x == 11, + "num_examples": lambda x: x == 100, + "classes": lambda x: set(x) == set([0, 1]), + "class_weights": lambda x: x == {0: 1, 1: 1}, + "num_examples_per_class": lambda x: { + 0: (tc.SArray(target) == 0).sum(), + 1: (tc.SArray(target) == 1).sum(), + }, + "num_features": lambda x: x == 10, + "num_unpacked_features": lambda x: x == 10, + "penalty": lambda x: x == self.opts["penalty"], + "progress": lambda x: isinstance(x, tc.SFrame), + "solver": lambda x: x == self.opts["solver"], + "target": lambda x: x == self.target, + "training_accuracy": lambda x: x >= 0 and x <= 1, + "training_iterations": lambda x: x > 0, + "training_loss": lambda x: x > 0, + "training_solver_status": lambda x: x == "SUCCESS: Optimal solution found.", + "training_time": lambda x: x >= 0, + "training_confusion_matrix": lambda x: len(x) > 0, + "training_f1_score": lambda x: x > 0, + "training_precision": lambda x: x > 0, + "training_recall": lambda x: x > 0, + "training_report_by_class": lambda x: len(x) > 0, + "validation_data": lambda x: isinstance(x, tc.SFrame) and len(x) == 0, + "disable_posttrain_evaluation": lambda x: x == False, + } self.fields_ans = self.get_ans.keys() def test__list_fields(self): @@ -145,8 +155,10 @@ def test_get(self): model = self.model for field in self.fields_ans: ans = model._get(field) - self.assertTrue(self.get_ans[field](ans), \ - '''Get failed in field {}. Output was {}.'''.format(field, ans)) + self.assertTrue( + self.get_ans[field](ans), + """Get failed in field {}. Output was {}.""".format(field, ans), + ) def test_coefficients(self): """ @@ -154,7 +166,7 @@ def test_coefficients(self): """ model = self.model coefs = model.coefficients - coef_list = list(coefs['value']) + coef_list = list(coefs["value"]) def test_summary(self): """ @@ -168,7 +180,7 @@ def test_repr(self): Check the repr function. """ model = self.model - ans = str(model) + ans = str(model) self.assertTrue(type(ans) == str) def test_predict(self): @@ -176,50 +188,52 @@ def test_predict(self): Check the prediction function. """ model = self.model - ans = model.predict(self.sf) - ans = model.predict(self.sf, output_type='class') + ans = model.predict(self.sf) + ans = model.predict(self.sf, output_type="class") self.assertEqual(ans.dtype, int) - ans = model.predict(self.sf, output_type='margin') + ans = model.predict(self.sf, output_type="margin") def test_classify(self): """ Check the classify function. """ model = self.model - ans = model.classify(self.sf) - self.assertEqual(len(ans) ,len(self.sf)) + ans = model.classify(self.sf) + self.assertEqual(len(ans), len(self.sf)) def test_evaluate(self): """ Make sure that evaluate works. """ model = self.model + def check_cf_matrix(ans): self.assertTrue(ans is not None) - self.assertTrue('confusion_matrix' in ans) - cf = ans['confusion_matrix'].sort(['target_label', 'predicted_label']) - sm = self.sf_margin['confusion_matrix']\ - .sort(['target_label', 'predicted_label']) - self.assertTrue( - np.allclose(cf['count'], sm['count'])) + self.assertTrue("confusion_matrix" in ans) + cf = ans["confusion_matrix"].sort(["target_label", "predicted_label"]) + sm = self.sf_margin["confusion_matrix"].sort( + ["target_label", "predicted_label"] + ) + self.assertTrue(np.allclose(cf["count"], sm["count"])) def check_metric(ans, metric): - if metric == 'confusion_matrix': + if metric == "confusion_matrix": check_cf_matrix(ans) else: self.assertTrue(ans is not None) self.assertTrue(metric in ans) - self.assertAlmostEqual(ans[metric], - self.sm_metrics[metric], - places = 4, - msg = "%s = (%s,%s)" % \ - (metric, ans[metric], self.sm_metrics[metric])) + self.assertAlmostEqual( + ans[metric], + self.sm_metrics[metric], + places=4, + msg="%s = (%s,%s)" % (metric, ans[metric], self.sm_metrics[metric]), + ) def test_save_and_load(self): """ Make sure saving and loading retains everything. """ - filename = 'save_file{}'.format(uuid.uuid4()) + filename = "save_file{}".format(uuid.uuid4()) self.model.save(filename) self.model = tc.load_model(filename) @@ -265,41 +279,44 @@ def setUpClass(self): target[1] = 1 ## Create the model - self.sf['target'] = target + self.sf["target"] = target self.def_kwargs = _DEFAULT_SOLVER_OPTIONS - self.solver = 'auto' - self.features = ', '.join(['X{}'.format(i) for i in range(1, d+1)]) - self.target = 'target' + self.solver = "auto" + self.features = ", ".join(["X{}".format(i) for i in range(1, d + 1)]) + self.target = "target" - self.sf['target'] = target - self.features = ['X{}'.format(i) for i in range(1, d+1)] + self.sf["target"] = target + self.features = ["X{}".format(i) for i in range(1, d + 1)] ## Compute the correct answers with Scikit-Learns target_name = self.target feature_names = self.features - X_train = list(self.sf.apply(lambda row: [row[k] for k in \ - feature_names])) + X_train = list(self.sf.apply(lambda row: [row[k] for k in feature_names])) y_train = list(self.sf[self.target]) - sm_model = svm.LinearSVC(C=1.0, loss='l1') + sm_model = svm.LinearSVC(C=1.0, loss="l1") sm_model.fit(X_train, y_train) self.coef = list(sm_model.intercept_) + list(sm_model.coef_[0]) - def _test_create(self, sf, target, features, solver, kwargs): """ Test svm create. """ - model = tc.svm_classifier.create(sf, target, features, - solver=solver, verbose=False, - validation_set=None, - feature_rescaling = False, - **kwargs) + model = tc.svm_classifier.create( + sf, + target, + features, + solver=solver, + verbose=False, + validation_set=None, + feature_rescaling=False, + **kwargs + ) - test_case = 'solver = {}, kwargs = {}'.format(solver, kwargs) - self.assertTrue(model is not None, 'Model is None') - coefs = list(model.coefficients['value']) + test_case = "solver = {}, kwargs = {}".format(solver, kwargs) + self.assertTrue(model is not None, "Model is None") + coefs = list(model.coefficients["value"]) print(coefs, self.coef) self.assertTrue(np.allclose(coefs, self.coef, rtol=2e-01, atol=2e-01)) @@ -309,31 +326,47 @@ def test_class_weights(self): """ # Should train correctly - model = tc.svm_classifier.create(self.sf, self.target, self.features, - class_weights = 'auto', - validation_set=None) - model = tc.svm_classifier.create(self.sf, self.target, self.features, - class_weights = {0:1, 1:2}) + model = tc.svm_classifier.create( + self.sf, + self.target, + self.features, + class_weights="auto", + validation_set=None, + ) + model = tc.svm_classifier.create( + self.sf, self.target, self.features, class_weights={0: 1, 1: 2} + ) # Should fail try: - model = tc.svm_classifier.create(self.sf, self.target, self.features, - class_weights = 1.0, - validation_set = None) + model = tc.svm_classifier.create( + self.sf, + self.target, + self.features, + class_weights=1.0, + validation_set=None, + ) except ToolkitError: - pass + pass try: - model = tc.svm_classifier.create(self.sf, self.target, self.features, - class_weights = {2: 10}, - validation_set = None) + model = tc.svm_classifier.create( + self.sf, + self.target, + self.features, + class_weights={2: 10}, + validation_set=None, + ) except ToolkitError: - pass + pass try: - model = tc.svm_classifier.create(self.sf, self.target, self.features, - class_weights = [1,1], - validation_set = None) + model = tc.svm_classifier.create( + self.sf, + self.target, + self.features, + class_weights=[1, 1], + validation_set=None, + ) except ToolkitError: - pass - + pass def test_create_default_features(self): """ @@ -341,8 +374,8 @@ def test_create_default_features(self): """ kwargs = self.def_kwargs.copy() - kwargs['max_iterations'] = 100 - for solver in ['lbfgs', 'auto']: + kwargs["max_iterations"] = 100 + for solver in ["lbfgs", "auto"]: args = (self.sf, self.target, None, solver, kwargs) self._test_create(*args) @@ -351,11 +384,12 @@ def test_create(self): Test svm create. """ kwargs = self.def_kwargs.copy() - kwargs['max_iterations'] = 100 - for solver in ['lbfgs', 'auto']: + kwargs["max_iterations"] = 100 + for solver in ["lbfgs", "auto"]: args = (self.sf, self.target, self.features, solver, kwargs) self._test_create(*args) + class ListCategoricalSVMTest(unittest.TestCase): """ Unit test class for testing svm with a categorical feature. @@ -373,7 +407,7 @@ def setUpClass(self): self.sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) # Categorical column - species = np.array(['cat', 'dog', 'foosa']) + species = np.array(["cat", "dog", "foosa"]) idx = np.random.randint(3, size=n) # Stats models maps categorical in alphabetical order of categories. @@ -381,38 +415,36 @@ def setUpClass(self): idx[0] = 0 idx[1] = 1 idx[2] = 2 - self.sf['species'] = list(species[idx]) + self.sf["species"] = list(species[idx]) y = np.random.randint(2, size=n) y[0] = 0 y[1] = 1 - self.sf['target'] = y + self.sf["target"] = y ## Set the turicreate model params - self.target = 'target' - self.features = ['species', 'X1', 'X2', 'X3'] + self.target = "target" + self.features = ["species", "X1", "X2", "X3"] self.def_kwargs = _DEFAULT_SOLVER_OPTIONS ## Compute the correct answers with Scikit-Learns target_name = self.target - order = ['cat', 'dog', 'foosa'] - self.sf['species_0'] = self.sf['species'] == order[1] - self.sf['species_1'] = self.sf['species'] == order[2] - feature_names = ['species_0', 'species_1', 'X1', 'X2', 'X3'] - X_train = list(self.sf.apply(lambda row: [row[k] for k in \ - feature_names])) + order = ["cat", "dog", "foosa"] + self.sf["species_0"] = self.sf["species"] == order[1] + self.sf["species_1"] = self.sf["species"] == order[2] + feature_names = ["species_0", "species_1", "X1", "X2", "X3"] + X_train = list(self.sf.apply(lambda row: [row[k] for k in feature_names])) y_train = list(self.sf[self.target]) - sm_model = svm.LinearSVC(C=1.0, loss='l1') + sm_model = svm.LinearSVC(C=1.0, loss="l1") sm_model.fit(X_train, y_train) self.coef = list(sm_model.intercept_) + list(sm_model.coef_[0]) - self.sf['species'] = self.sf["species"].apply(lambda x: [x]) - + self.sf["species"] = self.sf["species"].apply(lambda x: [x]) def _test_coefficients(self, model): """ Check that the coefficient values are very close to the correct values. """ coefs = model.coefficients - coef_list = list(coefs['value']) + coef_list = list(coefs["value"]) def _test_create(self, sf, target, features, solver, kwargs): """ @@ -420,9 +452,9 @@ def _test_create(self, sf, target, features, solver, kwargs): """ test_label = "solver: {}\tkwargs: {}".format(solver, kwargs) - model = tc.svm_classifier.create(sf, target, features, - solver=solver, feature_rescaling = False, - **kwargs) + model = tc.svm_classifier.create( + sf, target, features, solver=solver, feature_rescaling=False, **kwargs + ) self.assertTrue(model is not None, "Model is None") self._test_coefficients(model) @@ -433,10 +465,9 @@ def test_create(self): """ kwargs = self.def_kwargs.copy() - kwargs['max_iterations'] = 100 - for solver in ['auto', 'lbfgs']: - self._test_create(self.sf, self.target, self.features, solver, - kwargs) + kwargs["max_iterations"] = 100 + for solver in ["auto", "lbfgs"]: + self._test_create(self.sf, self.target, self.features, solver, kwargs) class CategoricalSVMTest(unittest.TestCase): @@ -456,7 +487,7 @@ def setUpClass(self): self.sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) # Categorical column - species = np.array(['cat', 'dog', 'foosa']) + species = np.array(["cat", "dog", "foosa"]) idx = np.random.randint(3, size=n) # Stats models maps categorical in alphabetical order of categories. @@ -464,37 +495,35 @@ def setUpClass(self): idx[0] = 0 idx[1] = 1 idx[2] = 2 - self.sf['species'] = list(species[idx]) + self.sf["species"] = list(species[idx]) y = np.random.randint(2, size=n) y[0] = 0 y[1] = 1 - self.sf['target'] = y + self.sf["target"] = y ## Set the turicreate model params - self.target = 'target' - self.features = ['species', 'X1', 'X2', 'X3'] + self.target = "target" + self.features = ["species", "X1", "X2", "X3"] self.def_kwargs = _DEFAULT_SOLVER_OPTIONS ## Compute the correct answers with Scikit-Learns target_name = self.target - order = ['cat', 'dog', 'foosa'] - self.sf['species_0'] = self.sf['species'] == order[1] - self.sf['species_1'] = self.sf['species'] == order[2] - feature_names = ['species_0', 'species_1', 'X1', 'X2', 'X3'] - X_train = list(self.sf.apply(lambda row: [row[k] for k in \ - feature_names])) + order = ["cat", "dog", "foosa"] + self.sf["species_0"] = self.sf["species"] == order[1] + self.sf["species_1"] = self.sf["species"] == order[2] + feature_names = ["species_0", "species_1", "X1", "X2", "X3"] + X_train = list(self.sf.apply(lambda row: [row[k] for k in feature_names])) y_train = list(self.sf[self.target]) - sm_model = svm.LinearSVC(C=1.0, loss='l1') + sm_model = svm.LinearSVC(C=1.0, loss="l1") sm_model.fit(X_train, y_train) self.coef = list(sm_model.intercept_) + list(sm_model.coef_[0]) - def _test_coefficients(self, model): """ Check that the coefficient values are very close to the correct values. """ coefs = model.coefficients - coef_list = list(coefs['value']) + coef_list = list(coefs["value"]) def _test_create(self, sf, target, features, solver, kwargs): """ @@ -502,9 +531,9 @@ def _test_create(self, sf, target, features, solver, kwargs): """ test_label = "solver: {}\tkwargs: {}".format(solver, kwargs) - model = tc.svm_classifier.create(sf, target, features, - solver=solver, feature_rescaling = False, - **kwargs) + model = tc.svm_classifier.create( + sf, target, features, solver=solver, feature_rescaling=False, **kwargs + ) self.assertTrue(model is not None, "Model is None") self._test_coefficients(model) @@ -515,245 +544,268 @@ def test_create(self): """ kwargs = self.def_kwargs.copy() - kwargs['max_iterations'] = 100 - for solver in ['auto', 'lbfgs']: - self._test_create(self.sf, self.target, self.features, solver, - kwargs) + kwargs["max_iterations"] = 100 + for solver in ["auto", "lbfgs"]: + self._test_create(self.sf, self.target, self.features, solver, kwargs) def test_predict_new_categories(self): - model = tc.svm_classifier.create(self.sf, self.target, self.features, - feature_rescaling = False, - validation_set = None) - pred = model.predict(self.sf) - self.sf['species'] = self.sf['species'].apply(lambda x: 'rat' if x == 'foosa' - else x) - pred = model.evaluate(self.sf) - self.sf['species'] = self.sf['species'].apply(lambda x: 'foosa' if x == 'rat' - else x) + model = tc.svm_classifier.create( + self.sf, + self.target, + self.features, + feature_rescaling=False, + validation_set=None, + ) + pred = model.predict(self.sf) + self.sf["species"] = self.sf["species"].apply( + lambda x: "rat" if x == "foosa" else x + ) + pred = model.evaluate(self.sf) + self.sf["species"] = self.sf["species"].apply( + lambda x: "foosa" if x == "rat" else x + ) def test_evaluate_new_categories(self): - model = tc.svm_classifier.create(self.sf, self.target, self.features, - feature_rescaling = False, - validation_set = None) - pred = model.predict(self.sf) - self.sf['species'] = self.sf['species'].apply(lambda x: 'rat' if x == 'foosa' - else x) - pred = model.evaluate(self.sf) - self.sf['species'] = self.sf['species'].apply(lambda x: 'foosa' if x == 'rat' - else x) + model = tc.svm_classifier.create( + self.sf, + self.target, + self.features, + feature_rescaling=False, + validation_set=None, + ) + pred = model.predict(self.sf) + self.sf["species"] = self.sf["species"].apply( + lambda x: "rat" if x == "foosa" else x + ) + pred = model.evaluate(self.sf) + self.sf["species"] = self.sf["species"].apply( + lambda x: "foosa" if x == "rat" else x + ) """ Test detection of columns that are almost the same. """ + def test_zero_variance_detection(self): sf = self.sf try: - sf['error-column'] = 1 + sf["error-column"] = 1 model = tc.svm_classifier.create(sf, self.target) except ToolkitError: pass try: - sf['error-column'] = '1' + sf["error-column"] = "1" model = tc.svm_classifier.create(sf, self.target) except ToolkitError: pass try: - sf['error-column'] = [[1] for i in sf] + sf["error-column"] = [[1] for i in sf] model = tc.svm_classifier.create(sf, self.target) except ToolkitError: pass try: - sf['error-column'] = [{1:1} for i in sf] + sf["error-column"] = [{1: 1} for i in sf] model = tc.svm_classifier.create(sf, self.target) except ToolkitError: pass - del sf['error-column'] + del sf["error-column"] """ Test detection of columns have nan """ + def test_nan_detection(self): sf = self.sf try: - sf['error-column'] = np.nan + sf["error-column"] = np.nan model = tc.svm_classifier.create(sf, self.target) except ToolkitError: pass try: - sf['error-column'] = [[np.nan] for i in sf] + sf["error-column"] = [[np.nan] for i in sf] model = tc.svm_classifier.create(sf, self.target) except ToolkitError: pass try: - sf['error-column'] = [{1:np.nan} for i in sf] + sf["error-column"] = [{1: np.nan} for i in sf] model = tc.svm_classifier.create(sf, self.target) except ToolkitError: pass - del sf['error-column'] + del sf["error-column"] + class VectorSVMTest(unittest.TestCase): - """ + """ Unit test class for testing a svm create function. """ - @classmethod - def setUpClass(self): - """ + @classmethod + def setUpClass(self): + """ Set up (Run only once) """ - np.random.seed(15) - n, d = 100, 3 - self.sf = tc.SFrame() - - for i in range(d): - self.sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) - self.sf['target'] = np.random.randint(2, size=n) - self.target = 'target' - self.sf['vec'] = self.sf.apply(lambda row: [row['X{}'.format(i+1)] for i in - range(d)]) - self.sf['vec'] = self.sf['vec'].apply(lambda x:x, array.array) - - self.features = ['vec'] - self.unpacked_features = ['vec[%s]' % (i) for i in range(d)] - self.def_kwargs = _DEFAULT_SOLVER_OPTIONS - - ## Compute the correct answers with Scikit-Learn - target_name = self.target - feature_names = self.features - X_train = list(self.sf['vec']) - y_train = list(self.sf[self.target]) - sm_model = svm.LinearSVC(C=1.0, loss='l1') - sm_model.fit(X_train, y_train) - self.coef = list(sm_model.intercept_) + list(sm_model.coef_[0]) - - def _test_coefficients(self, model): - """ + np.random.seed(15) + n, d = 100, 3 + self.sf = tc.SFrame() + + for i in range(d): + self.sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) + self.sf["target"] = np.random.randint(2, size=n) + self.target = "target" + self.sf["vec"] = self.sf.apply( + lambda row: [row["X{}".format(i + 1)] for i in range(d)] + ) + self.sf["vec"] = self.sf["vec"].apply(lambda x: x, array.array) + + self.features = ["vec"] + self.unpacked_features = ["vec[%s]" % (i) for i in range(d)] + self.def_kwargs = _DEFAULT_SOLVER_OPTIONS + + ## Compute the correct answers with Scikit-Learn + target_name = self.target + feature_names = self.features + X_train = list(self.sf["vec"]) + y_train = list(self.sf[self.target]) + sm_model = svm.LinearSVC(C=1.0, loss="l1") + sm_model.fit(X_train, y_train) + self.coef = list(sm_model.intercept_) + list(sm_model.coef_[0]) + + def _test_coefficients(self, model): + """ Check that the coefficient values are very close to the correct values. """ - coefs = model.coefficients - coef_list = list(coefs['value']) + coefs = model.coefficients + coef_list = list(coefs["value"]) - def _test_create(self, sf, target, features, solver, - kwargs): + def _test_create(self, sf, target, features, solver, kwargs): - model = tc.svm_classifier.create(sf, target, features, solver = solver, - feature_rescaling = False, **kwargs) - test_case = 'solver = {solver}, kwargs = {kwargs}'.format(solver = solver, - kwargs = kwargs) + model = tc.svm_classifier.create( + sf, target, features, solver=solver, feature_rescaling=False, **kwargs + ) + test_case = "solver = {solver}, kwargs = {kwargs}".format( + solver=solver, kwargs=kwargs + ) - self.assertTrue(model is not None, 'Model is None') - self._test_coefficients(model) + self.assertTrue(model is not None, "Model is None") + self._test_coefficients(model) - def test_create(self): + def test_create(self): - kwargs = self.def_kwargs.copy() - kwargs['max_iterations'] = 1000 - for solver in ['auto', 'lbfgs']: - args = (self.sf, self.target, self.features, - solver, self.def_kwargs) - self._test_create(*args) + kwargs = self.def_kwargs.copy() + kwargs["max_iterations"] = 1000 + for solver in ["auto", "lbfgs"]: + args = (self.sf, self.target, self.features, solver, self.def_kwargs) + self._test_create(*args) - def test_features(self): + def test_features(self): - model = tc.svm_classifier.create(self.sf, self.target, self.features, - feature_rescaling = False) - self.assertEqual(model.num_features, len(self.features)) - self.assertEqual(model.features, self.features) - self.assertEqual(model.num_unpacked_features, len(self.unpacked_features)) - self.assertEqual(model.unpacked_features, self.unpacked_features) + model = tc.svm_classifier.create( + self.sf, self.target, self.features, feature_rescaling=False + ) + self.assertEqual(model.num_features, len(self.features)) + self.assertEqual(model.features, self.features) + self.assertEqual(model.num_unpacked_features, len(self.unpacked_features)) + self.assertEqual(model.unpacked_features, self.unpacked_features) class DictSVMTest(unittest.TestCase): - """ + """ Unit test class for testing a svm create function. """ - @classmethod - def setUpClass(self): - """ + @classmethod + def setUpClass(self): + """ Set up (Run only once) """ - np.random.seed(15) - n, d = 100, 3 - self.sf = tc.SFrame() - - for i in range(d): - self.sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) - self.sf['target'] = np.random.randint(2, size=n) - self.target = 'target' - self.sf['dict'] = self.sf.apply(lambda row: {i: row['X{}'.format(i+1)] for i in - range(d)}) - self.features = ['dict'] - self.unpacked_features = ['dict[%s]' % i for i in range(d)] - self.def_kwargs = _DEFAULT_SOLVER_OPTIONS - - ## Compute the correct answers with Scikit-Learn - target_name = self.target - feature_names = self.features - X_train = list(self.sf['dict'].apply(lambda x: [x[k] for k in \ - sorted(x.keys())])) - y_train = list(self.sf[self.target]) - sm_model = svm.LinearSVC(C=1.0, loss='l1') - sm_model.fit(X_train, y_train) - self.coef = list(sm_model.intercept_) + list(sm_model.coef_[0]) - - def _test_coefficients(self, model): - coefs = model.coefficients - coef_list = list(coefs['value']) - - def _test_create(self, sf, target, features, solver, - kwargs): - - model = tc.svm_classifier.create(sf, target, features, solver - = solver, feature_rescaling = False, **kwargs) - - test_case = 'solver = {solver}, kwargs = {kwargs}'.format(solver = solver, - kwargs = kwargs) - self.assertTrue(model is not None, 'Model is None') - self._test_coefficients(model) - - def test_create(self): - - kwargs = self.def_kwargs.copy() - kwargs['max_iterations'] = 100 - for solver in ['auto']: - args = (self.sf, self.target, self.features, - solver, self.def_kwargs) - self._test_create(*args) - - def test_predict_extra_cols(self): - - sf = self.sf[:] - model = tc.svm_classifier.create(sf, self.target, self.features, - feature_rescaling = False) - pred = model.predict(sf) - sf['dict'] = sf['dict'].apply(lambda x: dict(list(x.items()) - + list({'extra_col': 0}.items()))) - pred2 = model.predict(sf) - self.assertTrue((pred == pred2).all()) - - def test_evaluate_extra_cols(self): - sf = self.sf[:] - model = tc.svm_classifier.create(sf, self.target, self.features, - feature_rescaling = False) - eval1 = model.predict(sf) - sf['dict'] = sf['dict'].apply(lambda x: dict(list(x.items()) - + list({'extra_col': 0}.items()))) - eval2 = model.predict(sf) - self.assertTrue((eval1 == eval2).all()) - - def test_features(self): - - model = tc.svm_classifier.create(self.sf, self.target, self.features, - feature_rescaling = False) - self.assertEqual(model.num_features, len(self.features)) - self.assertEqual(model.features, self.features) - self.assertEqual(model.num_unpacked_features, len(self.unpacked_features)) - self.assertEqual(model.unpacked_features, self.unpacked_features) + np.random.seed(15) + n, d = 100, 3 + self.sf = tc.SFrame() + + for i in range(d): + self.sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) + self.sf["target"] = np.random.randint(2, size=n) + self.target = "target" + self.sf["dict"] = self.sf.apply( + lambda row: {i: row["X{}".format(i + 1)] for i in range(d)} + ) + self.features = ["dict"] + self.unpacked_features = ["dict[%s]" % i for i in range(d)] + self.def_kwargs = _DEFAULT_SOLVER_OPTIONS + + ## Compute the correct answers with Scikit-Learn + target_name = self.target + feature_names = self.features + X_train = list( + self.sf["dict"].apply(lambda x: [x[k] for k in sorted(x.keys())]) + ) + y_train = list(self.sf[self.target]) + sm_model = svm.LinearSVC(C=1.0, loss="l1") + sm_model.fit(X_train, y_train) + self.coef = list(sm_model.intercept_) + list(sm_model.coef_[0]) + + def _test_coefficients(self, model): + coefs = model.coefficients + coef_list = list(coefs["value"]) + + def _test_create(self, sf, target, features, solver, kwargs): + + model = tc.svm_classifier.create( + sf, target, features, solver=solver, feature_rescaling=False, **kwargs + ) + + test_case = "solver = {solver}, kwargs = {kwargs}".format( + solver=solver, kwargs=kwargs + ) + self.assertTrue(model is not None, "Model is None") + self._test_coefficients(model) + + def test_create(self): + + kwargs = self.def_kwargs.copy() + kwargs["max_iterations"] = 100 + for solver in ["auto"]: + args = (self.sf, self.target, self.features, solver, self.def_kwargs) + self._test_create(*args) + + def test_predict_extra_cols(self): + + sf = self.sf[:] + model = tc.svm_classifier.create( + sf, self.target, self.features, feature_rescaling=False + ) + pred = model.predict(sf) + sf["dict"] = sf["dict"].apply( + lambda x: dict(list(x.items()) + list({"extra_col": 0}.items())) + ) + pred2 = model.predict(sf) + self.assertTrue((pred == pred2).all()) + + def test_evaluate_extra_cols(self): + sf = self.sf[:] + model = tc.svm_classifier.create( + sf, self.target, self.features, feature_rescaling=False + ) + eval1 = model.predict(sf) + sf["dict"] = sf["dict"].apply( + lambda x: dict(list(x.items()) + list({"extra_col": 0}.items())) + ) + eval2 = model.predict(sf) + self.assertTrue((eval1 == eval2).all()) + + def test_features(self): + + model = tc.svm_classifier.create( + self.sf, self.target, self.features, feature_rescaling=False + ) + self.assertEqual(model.num_features, len(self.features)) + self.assertEqual(model.features, self.features) + self.assertEqual(model.num_unpacked_features, len(self.unpacked_features)) + self.assertEqual(model.unpacked_features, self.unpacked_features) class SVMStringTargetTest(unittest.TestCase): @@ -774,20 +826,20 @@ def setUpClass(self): for i in range(d): self.sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) target = np.random.randint(2, size=n) - self.sf['target'] = target - self.sf['target'] = self.sf['target'].astype(str) - + self.sf["target"] = target + self.sf["target"] = self.sf["target"].astype(str) def test_create(self): - model = tc.svm_classifier.create(self.sf, target='target') + model = tc.svm_classifier.create(self.sf, target="target") predictions = model.predict(self.sf) results = model.classify(self.sf) results = model.evaluate(self.sf) + class ValidationSetSVMTest(unittest.TestCase): @classmethod def setUpClass(self): - ## Simulate test data + ## Simulate test data np.random.seed(10) n, d = 100, 10 self.sf = tc.SFrame() @@ -796,58 +848,68 @@ def setUpClass(self): target = np.random.randint(2, size=n) ## Create the model - self.sf['target'] = target + self.sf["target"] = target self.def_kwargs = _DEFAULT_SOLVER_OPTIONS - self.def_opts= dict(list(self.def_kwargs.items()) + list({'solver' : 'auto', - 'feature_rescaling' : True, - 'class_weights' : None, - 'penalty' : 1.0}.items())) - - self.solver = 'auto' + self.def_opts = dict( + list(self.def_kwargs.items()) + + list( + { + "solver": "auto", + "feature_rescaling": True, + "class_weights": None, + "penalty": 1.0, + }.items() + ) + ) + + self.solver = "auto" self.opts = self.def_opts.copy() - self.opts['max_iterations'] = 500 - self.features = ['X{}'.format(i) for i in range(1, d+1)] - self.unpacked_features = ['X{}'.format(i) for i in range(1, d+1)] - self.target = 'target' + self.opts["max_iterations"] = 500 + self.features = ["X{}".format(i) for i in range(1, d + 1)] + self.unpacked_features = ["X{}".format(i) for i in range(1, d + 1)] + self.target = "target" def test_valid_set(self): - model = tc.svm_classifier.create(self.sf, target='target', - validation_set = 'auto') + model = tc.svm_classifier.create( + self.sf, target="target", validation_set="auto" + ) self.assertTrue(model is not None) self.assertTrue(isinstance(model.progress, tc.SFrame)) - model = tc.svm_classifier.create(self.sf, target='target', - validation_set =self.sf) + model = tc.svm_classifier.create( + self.sf, target="target", validation_set=self.sf + ) self.assertTrue(model is not None) self.assertTrue(isinstance(model.progress, tc.SFrame)) - model = tc.svm_classifier.create(self.sf, target='target', - validation_set = None) + model = tc.svm_classifier.create(self.sf, target="target", validation_set=None) self.assertTrue(model is not None) self.assertTrue(isinstance(model.progress, tc.SFrame)) class TestStringTarget(unittest.TestCase): - def test_cat(self): import numpy as np + # Arrange np.random.seed(8) n, d = 1000, 100 sf = tc.SFrame() for i in range(d): - sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) - target = np.random.randint(2, size=n) - sf['target'] = target + sf.add_column(tc.SArray(np.random.rand(n)), inplace=True) + target = np.random.randint(2, size=n) + sf["target"] = target - sf['target'] = sf['target'].astype(str) - sf['target'] = 'cat-' + sf['target'] - model = tc.logistic_classifier.create(sf, 'target') + sf["target"] = sf["target"].astype(str) + sf["target"] = "cat-" + sf["target"] + model = tc.logistic_classifier.create(sf, "target") # Act evaluation = model.evaluate(sf) # Assert - self.assertEqual(['cat-0', 'cat-1'], - sorted(list(evaluation['confusion_matrix']['target_label'].unique()))) + self.assertEqual( + ["cat-0", "cat-1"], + sorted(list(evaluation["confusion_matrix"]["target_label"].unique())), + ) diff --git a/src/python/turicreate/test/test_text_analytics.py b/src/python/turicreate/test/test_text_analytics.py index d7cf60f98a..153f26fd15 100644 --- a/src/python/turicreate/test/test_text_analytics.py +++ b/src/python/turicreate/test/test_text_analytics.py @@ -20,46 +20,131 @@ class FeatureEngineeringTest(unittest.TestCase): @classmethod def setUpClass(self): - self.sa_word = tc.SArray(["I like big dogs. They are fun. I LIKE BIG DOGS", "I like.", "I like big"]) - self.sa_char = tc.SArray(["Fun. is. fun","Fun is fun.","fu", "fun"]) - - self.languages = tc.SArray(["This is someurl http://someurl!!", - "中文 应该也 行", - "Сблъсъкът между"]) - - self.languages_double = tc.SArray(["This is someurl http://someurl!! This is someurl http://someurl!!", - "中文 应该也 行 中文 应该也 行", - "Сблъсъкът между Сблъсъкът между"]) - - self.punctuated = tc.SArray(["This is some url http://www.someurl.com!!", - "Should we? Yes, we should."]) - self.punctuated_double = tc.SArray(["This is some url http://www.someurl.com!! This is some url http://www.someurl.com!!", - "Should we? Yes, we should. Should we? Yes, we should."]) - - self.docs = tc.SArray([{'this': 1, 'is': 1, 'a': 2, 'sample': 1}, - {'this': 1, 'is': 1, 'another': 2, 'example': 3}]) + self.sa_word = tc.SArray( + ["I like big dogs. They are fun. I LIKE BIG DOGS", "I like.", "I like big"] + ) + self.sa_char = tc.SArray(["Fun. is. fun", "Fun is fun.", "fu", "fun"]) + + self.languages = tc.SArray( + ["This is someurl http://someurl!!", "中文 应该也 行", "Сблъсъкът между"] + ) + + self.languages_double = tc.SArray( + [ + "This is someurl http://someurl!! This is someurl http://someurl!!", + "中文 应该也 行 中文 应该也 行", + "Сблъсъкът между Сблъсъкът между", + ] + ) + + self.punctuated = tc.SArray( + ["This is some url http://www.someurl.com!!", "Should we? Yes, we should."] + ) + self.punctuated_double = tc.SArray( + [ + "This is some url http://www.someurl.com!! This is some url http://www.someurl.com!!", + "Should we? Yes, we should. Should we? Yes, we should.", + ] + ) + + self.docs = tc.SArray( + [ + {"this": 1, "is": 1, "a": 2, "sample": 1}, + {"this": 1, "is": 1, "another": 2, "example": 3}, + ] + ) self.sframe_comparer = util.SFrameComparer() - def test_tokenize(self): sa_word_results = tc.text_analytics.tokenize(self.sa_word) - self.assertEqual(sa_word_results[0], ['I', 'like', 'big', 'dogs', 'They', 'are', 'fun', 'I', 'LIKE', 'BIG', 'DOGS']) - self.assertEqual(sa_word_results[1], ['I', 'like']) - self.assertEqual(sa_word_results[2], ['I', 'like', 'big']) - + self.assertEqual( + sa_word_results[0], + [ + "I", + "like", + "big", + "dogs", + "They", + "are", + "fun", + "I", + "LIKE", + "BIG", + "DOGS", + ], + ) + self.assertEqual(sa_word_results[1], ["I", "like"]) + self.assertEqual(sa_word_results[2], ["I", "like", "big"]) def test_count_ngrams(self): # Testing word n-gram functionality result = tc.text_analytics.count_ngrams(self.sa_word, 3) result2 = tc.text_analytics.count_ngrams(self.sa_word, 2) - result3 = tc.text_analytics.count_ngrams(self.sa_word, 3,"word", to_lower=False) - result4 = tc.text_analytics.count_ngrams(self.sa_word, 2,"word", to_lower=False) - expected = [{'fun i like': 1, 'i like big': 2, 'they are fun': 1, 'big dogs they': 1, 'like big dogs': 2, 'are fun i': 1, 'dogs they are': 1}, {}, {'i like big': 1}] - expected2 = [{'i like': 2, 'dogs they': 1, 'big dogs': 2, 'are fun': 1, 'like big': 2, 'they are': 1, 'fun i': 1}, {'i like': 1}, {'i like': 1, 'like big': 1}] - expected3 = [{'I like big': 1, 'fun I LIKE': 1, 'I LIKE BIG': 1, 'LIKE BIG DOGS': 1, 'They are fun': 1, 'big dogs They': 1, 'like big dogs': 1, 'are fun I': 1, 'dogs They are': 1}, {}, {'I like big': 1}] - expected4 = [{'I like': 1, 'like big': 1, 'I LIKE': 1, 'BIG DOGS': 1, 'are fun': 1, 'LIKE BIG': 1, 'big dogs': 1, 'They are': 1, 'dogs They': 1, 'fun I': 1}, {'I like': 1}, {'I like': 1, 'like big': 1}] + result3 = tc.text_analytics.count_ngrams( + self.sa_word, 3, "word", to_lower=False + ) + result4 = tc.text_analytics.count_ngrams( + self.sa_word, 2, "word", to_lower=False + ) + expected = [ + { + "fun i like": 1, + "i like big": 2, + "they are fun": 1, + "big dogs they": 1, + "like big dogs": 2, + "are fun i": 1, + "dogs they are": 1, + }, + {}, + {"i like big": 1}, + ] + expected2 = [ + { + "i like": 2, + "dogs they": 1, + "big dogs": 2, + "are fun": 1, + "like big": 2, + "they are": 1, + "fun i": 1, + }, + {"i like": 1}, + {"i like": 1, "like big": 1}, + ] + expected3 = [ + { + "I like big": 1, + "fun I LIKE": 1, + "I LIKE BIG": 1, + "LIKE BIG DOGS": 1, + "They are fun": 1, + "big dogs They": 1, + "like big dogs": 1, + "are fun I": 1, + "dogs They are": 1, + }, + {}, + {"I like big": 1}, + ] + expected4 = [ + { + "I like": 1, + "like big": 1, + "I LIKE": 1, + "BIG DOGS": 1, + "are fun": 1, + "LIKE BIG": 1, + "big dogs": 1, + "They are": 1, + "dogs They": 1, + "fun I": 1, + }, + {"I like": 1}, + {"I like": 1, "like big": 1}, + ] self.assertEqual(result.dtype, dict) self.sframe_comparer._assert_sarray_equal(result, expected) @@ -70,32 +155,199 @@ def test_count_ngrams(self): self.assertEqual(result4.dtype, dict) self.sframe_comparer._assert_sarray_equal(result4, expected4) - - #Testing character n-gram functionality + # Testing character n-gram functionality result5 = tc.text_analytics.count_ngrams(self.sa_char, 3, "character") result6 = tc.text_analytics.count_ngrams(self.sa_char, 2, "character") - result7 = tc.text_analytics.count_ngrams(self.sa_char, 3, "character", to_lower=False) - result8 = tc.text_analytics.count_ngrams(self.sa_char, 2, "character", to_lower=False) - result9 = tc.text_analytics.count_ngrams(self.sa_char, 3, "character", to_lower=False, ignore_space=False) - result10 = tc.text_analytics.count_ngrams(self.sa_char, 2, "character", to_lower=False, ignore_space=False) - result11 = tc.text_analytics.count_ngrams(self.sa_char, 3, "character", to_lower=True, ignore_space=False) - result12 = tc.text_analytics.count_ngrams(self.sa_char, 2, "character", to_lower=True, ignore_space=False) - result13 = tc.text_analytics.count_ngrams(self.sa_char, 3, "character", to_lower=False, ignore_punct=False, ignore_space=False) - result14 = tc.text_analytics.count_ngrams(self.sa_char, 2, "character", to_lower=False, ignore_punct=False, ignore_space=False) - result15 = tc.text_analytics.count_ngrams(self.sa_char, 3, "character", to_lower=False, ignore_punct=False, ignore_space=True) - result16 = tc.text_analytics.count_ngrams(self.sa_char, 2, "character", to_lower=False, ignore_punct=False, ignore_space=True) - expected5 = [{'fun': 2, 'nis': 1, 'sfu': 1, 'isf': 1, 'uni': 1}, {'fun': 2, 'nis': 1, 'sfu': 1, 'isf': 1, 'uni': 1}, {}, {'fun': 1}] - expected6 = [{'ni': 1, 'is': 1, 'un': 2, 'sf': 1, 'fu': 2}, {'ni': 1, 'is': 1, 'un': 2, 'sf': 1, 'fu': 2}, {'fu': 1}, {'un': 1, 'fu': 1}] - expected7 = [{'sfu': 1, 'Fun': 1, 'uni': 1, 'fun': 1, 'nis': 1, 'isf': 1}, {'sfu': 1, 'Fun': 1, 'uni': 1, 'fun': 1, 'nis': 1, 'isf': 1}, {}, {'fun': 1}] - expected8 = [{'ni': 1, 'Fu': 1, 'is': 1, 'un': 2, 'sf': 1, 'fu': 1}, {'ni': 1, 'Fu': 1, 'is': 1, 'un': 2, 'sf': 1, 'fu': 1}, {'fu': 1}, {'un': 1, 'fu': 1}] - expected9 = [{' fu': 1, ' is': 1, 's f': 1, 'un ': 1, 'Fun': 1, 'n i': 1, 'fun': 1, 'is ': 1}, {' fu': 1, ' is': 1, 's f': 1, 'un ': 1, 'Fun': 1, 'n i': 1, 'fun': 1, 'is ': 1}, {}, {'fun': 1}] - expected10 = [{' f': 1, 'fu': 1, 'n ': 1, 'is': 1, ' i': 1, 'un': 2, 's ': 1, 'Fu': 1}, {' f': 1, 'fu': 1, 'n ': 1, 'is': 1, ' i': 1, 'un': 2, 's ': 1, 'Fu': 1}, {'fu': 1}, {'un': 1, 'fu': 1}] - expected11 = [{' fu': 1, ' is': 1, 's f': 1, 'un ': 1, 'n i': 1, 'fun': 2, 'is ': 1}, {' fu': 1, ' is': 1, 's f': 1, 'un ': 1, 'n i': 1, 'fun': 2, 'is ': 1}, {}, {'fun': 1}] - expected12 = [{' f': 1, 'fu': 2, 'n ': 1, 'is': 1, ' i': 1, 'un': 2, 's ': 1}, {' f': 1, 'fu': 2, 'n ': 1, 'is': 1, ' i': 1, 'un': 2, 's ': 1}, {'fu': 1}, {'un': 1, 'fu': 1}] - expected13 = [{' fu': 1, 's. ': 1, ' is': 1, 'n. ': 1, 'Fun': 1, '. i': 1, 'is.': 1, 'fun': 1, '. f': 1, 'un.': 1}, {' fu': 1, ' is': 1, 's f': 1, 'un ': 1, 'Fun': 1, 'n i': 1, 'fun': 1, 'is ': 1, 'un.': 1}, {}, {'fun': 1}] - expected14 = [{' f': 1, 'fu': 1, 'n.': 1, '. ': 2, 'is': 1, ' i': 1, 'un': 2, 's.': 1, 'Fu': 1}, {' f': 1, 'fu': 1, 'n.': 1, 'n ': 1, 'is': 1, ' i': 1, 'un': 2, 's ': 1, 'Fu': 1}, {'fu': 1}, {'un': 1, 'fu': 1}] - expected15 = [{'s.f': 1, 'n.i': 1, 'Fun': 1, '.fu': 1, 'is.': 1, 'fun': 1, '.is': 1, 'un.': 1}, {'sfu': 1, 'Fun': 1, 'uni': 1, 'fun': 1, 'nis': 1, 'isf': 1, 'un.': 1}, {}, {'fun': 1}] - expected16 = [{'.i': 1, 'fu': 1, 'n.': 1, 'is': 1, '.f': 1, 'un': 2, 's.': 1, 'Fu': 1}, {'ni': 1, 'fu': 1, 'n.': 1, 'is': 1, 'un': 2, 'sf': 1, 'Fu': 1}, {'fu': 1}, {'un': 1, 'fu': 1}] + result7 = tc.text_analytics.count_ngrams( + self.sa_char, 3, "character", to_lower=False + ) + result8 = tc.text_analytics.count_ngrams( + self.sa_char, 2, "character", to_lower=False + ) + result9 = tc.text_analytics.count_ngrams( + self.sa_char, 3, "character", to_lower=False, ignore_space=False + ) + result10 = tc.text_analytics.count_ngrams( + self.sa_char, 2, "character", to_lower=False, ignore_space=False + ) + result11 = tc.text_analytics.count_ngrams( + self.sa_char, 3, "character", to_lower=True, ignore_space=False + ) + result12 = tc.text_analytics.count_ngrams( + self.sa_char, 2, "character", to_lower=True, ignore_space=False + ) + result13 = tc.text_analytics.count_ngrams( + self.sa_char, + 3, + "character", + to_lower=False, + ignore_punct=False, + ignore_space=False, + ) + result14 = tc.text_analytics.count_ngrams( + self.sa_char, + 2, + "character", + to_lower=False, + ignore_punct=False, + ignore_space=False, + ) + result15 = tc.text_analytics.count_ngrams( + self.sa_char, + 3, + "character", + to_lower=False, + ignore_punct=False, + ignore_space=True, + ) + result16 = tc.text_analytics.count_ngrams( + self.sa_char, + 2, + "character", + to_lower=False, + ignore_punct=False, + ignore_space=True, + ) + expected5 = [ + {"fun": 2, "nis": 1, "sfu": 1, "isf": 1, "uni": 1}, + {"fun": 2, "nis": 1, "sfu": 1, "isf": 1, "uni": 1}, + {}, + {"fun": 1}, + ] + expected6 = [ + {"ni": 1, "is": 1, "un": 2, "sf": 1, "fu": 2}, + {"ni": 1, "is": 1, "un": 2, "sf": 1, "fu": 2}, + {"fu": 1}, + {"un": 1, "fu": 1}, + ] + expected7 = [ + {"sfu": 1, "Fun": 1, "uni": 1, "fun": 1, "nis": 1, "isf": 1}, + {"sfu": 1, "Fun": 1, "uni": 1, "fun": 1, "nis": 1, "isf": 1}, + {}, + {"fun": 1}, + ] + expected8 = [ + {"ni": 1, "Fu": 1, "is": 1, "un": 2, "sf": 1, "fu": 1}, + {"ni": 1, "Fu": 1, "is": 1, "un": 2, "sf": 1, "fu": 1}, + {"fu": 1}, + {"un": 1, "fu": 1}, + ] + expected9 = [ + { + " fu": 1, + " is": 1, + "s f": 1, + "un ": 1, + "Fun": 1, + "n i": 1, + "fun": 1, + "is ": 1, + }, + { + " fu": 1, + " is": 1, + "s f": 1, + "un ": 1, + "Fun": 1, + "n i": 1, + "fun": 1, + "is ": 1, + }, + {}, + {"fun": 1}, + ] + expected10 = [ + {" f": 1, "fu": 1, "n ": 1, "is": 1, " i": 1, "un": 2, "s ": 1, "Fu": 1}, + {" f": 1, "fu": 1, "n ": 1, "is": 1, " i": 1, "un": 2, "s ": 1, "Fu": 1}, + {"fu": 1}, + {"un": 1, "fu": 1}, + ] + expected11 = [ + {" fu": 1, " is": 1, "s f": 1, "un ": 1, "n i": 1, "fun": 2, "is ": 1}, + {" fu": 1, " is": 1, "s f": 1, "un ": 1, "n i": 1, "fun": 2, "is ": 1}, + {}, + {"fun": 1}, + ] + expected12 = [ + {" f": 1, "fu": 2, "n ": 1, "is": 1, " i": 1, "un": 2, "s ": 1}, + {" f": 1, "fu": 2, "n ": 1, "is": 1, " i": 1, "un": 2, "s ": 1}, + {"fu": 1}, + {"un": 1, "fu": 1}, + ] + expected13 = [ + { + " fu": 1, + "s. ": 1, + " is": 1, + "n. ": 1, + "Fun": 1, + ". i": 1, + "is.": 1, + "fun": 1, + ". f": 1, + "un.": 1, + }, + { + " fu": 1, + " is": 1, + "s f": 1, + "un ": 1, + "Fun": 1, + "n i": 1, + "fun": 1, + "is ": 1, + "un.": 1, + }, + {}, + {"fun": 1}, + ] + expected14 = [ + { + " f": 1, + "fu": 1, + "n.": 1, + ". ": 2, + "is": 1, + " i": 1, + "un": 2, + "s.": 1, + "Fu": 1, + }, + { + " f": 1, + "fu": 1, + "n.": 1, + "n ": 1, + "is": 1, + " i": 1, + "un": 2, + "s ": 1, + "Fu": 1, + }, + {"fu": 1}, + {"un": 1, "fu": 1}, + ] + expected15 = [ + { + "s.f": 1, + "n.i": 1, + "Fun": 1, + ".fu": 1, + "is.": 1, + "fun": 1, + ".is": 1, + "un.": 1, + }, + {"sfu": 1, "Fun": 1, "uni": 1, "fun": 1, "nis": 1, "isf": 1, "un.": 1}, + {}, + {"fun": 1}, + ] + expected16 = [ + {".i": 1, "fu": 1, "n.": 1, "is": 1, ".f": 1, "un": 2, "s.": 1, "Fu": 1}, + {"ni": 1, "fu": 1, "n.": 1, "is": 1, "un": 2, "sf": 1, "Fu": 1}, + {"fu": 1}, + {"un": 1, "fu": 1}, + ] self.assertEqual(result5.dtype, dict) self.sframe_comparer._assert_sarray_equal(result5, expected5) @@ -122,7 +374,6 @@ def test_count_ngrams(self): self.assertEqual(result16.dtype, dict) self.sframe_comparer._assert_sarray_equal(result16, expected16) - ## Bogus input types and values sa = tc.SArray([1, 2, 3]) with self.assertRaises(RuntimeError): @@ -139,10 +390,9 @@ def test_count_ngrams(self): with warnings.catch_warnings(record=True) as context: warnings.simplefilter("always") - tc.text_analytics.count_ngrams(self.sa_word, n=10, method='word') + tc.text_analytics.count_ngrams(self.sa_word, n=10, method="word") assert len(context) == 1 - def test_drop_words(self): ## Bogus input type sa = tc.SArray([1, 2, 3]) @@ -156,14 +406,17 @@ def test_drop_words(self): tc.text_analytics.drop_words(sa, stop_words=stop_words) ## Other languages - expected = ["this is someurl http someurl this is someurl http someurl", - "中文 应该也 行 中文 应该也 行", - "Сблъсъкът между Сблъсъкът между"] - - expected2 = ["This is someurl http someurl This is someurl http someurl", - "中文 应该也 行 中文 应该也 行", - "Сблъсъкът между Сблъсъкът между"] - + expected = [ + "this is someurl http someurl this is someurl http someurl", + "中文 应该也 行 中文 应该也 行", + "Сблъсъкът между Сблъсъкът между", + ] + + expected2 = [ + "This is someurl http someurl This is someurl http someurl", + "中文 应该也 行 中文 应该也 行", + "Сблъсъкът между Сблъсъкът между", + ] result = tc.text_analytics.drop_words(self.languages_double) self.assertEqual(result.dtype, str) @@ -173,16 +426,24 @@ def test_drop_words(self): self.assertEqual(result.dtype, str) self.sframe_comparer._assert_sarray_equal(result, expected2) - ## Check that delimiters work properly by default and when modified. - expected1 = ['this is some url http www someurl com this is some url http www someurl com', 'should we yes we should should we yes we should'] - expected2 = ['this is some url http://www.someurl.com this is some url http://www.someurl.com', 'should we yes we should. should we yes we should.'] - expected3 = ['url http www someurl url http www someurl', ''] + expected1 = [ + "this is some url http www someurl com this is some url http www someurl com", + "should we yes we should should we yes we should", + ] + expected2 = [ + "this is some url http://www.someurl.com this is some url http://www.someurl.com", + "should we yes we should. should we yes we should.", + ] + expected3 = ["url http www someurl url http www someurl", ""] word_counts1 = tc.text_analytics.drop_words(self.punctuated_double) - word_counts2 = tc.text_analytics.drop_words(self.punctuated_double, - delimiters=["?", "!", ","," "]) - word_counts3 = tc.text_analytics.drop_words(self.punctuated_double, stop_words=tc.text_analytics.stop_words()) + word_counts2 = tc.text_analytics.drop_words( + self.punctuated_double, delimiters=["?", "!", ",", " "] + ) + word_counts3 = tc.text_analytics.drop_words( + self.punctuated_double, stop_words=tc.text_analytics.stop_words() + ) self.assertEqual(word_counts1.dtype, str) self.sframe_comparer._assert_sarray_equal(word_counts1, expected1) @@ -191,7 +452,6 @@ def test_drop_words(self): self.assertEqual(word_counts3.dtype, str) self.sframe_comparer._assert_sarray_equal(word_counts3, expected3) - def test_count_words(self): ## Bogus input type sa = tc.SArray([1, 2, 3]) @@ -199,12 +459,16 @@ def test_count_words(self): tc.text_analytics.count_words(sa) ## Other languages - expected = [{"this": 1, "http": 1, "someurl": 2, "is": 1}, - {"中文": 1, "应该也": 1, "行": 1}, - {"Сблъсъкът": 1, "между": 1}] - expected2 = [{"This": 1, "http": 1, "someurl": 2, "is": 1}, - {"中文": 1, "应该也": 1, "行": 1}, - {"Сблъсъкът": 1, "между": 1}] + expected = [ + {"this": 1, "http": 1, "someurl": 2, "is": 1}, + {"中文": 1, "应该也": 1, "行": 1}, + {"Сблъсъкът": 1, "между": 1}, + ] + expected2 = [ + {"This": 1, "http": 1, "someurl": 2, "is": 1}, + {"中文": 1, "应该也": 1, "行": 1}, + {"Сблъсъкът": 1, "между": 1}, + ] result = tc.text_analytics.count_words(self.languages) self.assertEqual(result.dtype, dict) @@ -215,14 +479,28 @@ def test_count_words(self): self.sframe_comparer._assert_sarray_equal(result, expected2) ## Check that delimiters work properly by default and when modified. - expected1 = [{"this": 1, "is": 1, "some": 1, "url": 1, "http": 1, "www": 1, "someurl": 1, "com": 1}, - {"should": 2, "we": 2, "yes": 1}] - expected2 = [{"this is some url http://www.someurl.com": 1}, - {"should we": 1, " yes": 1, " we should.": 1}] + expected1 = [ + { + "this": 1, + "is": 1, + "some": 1, + "url": 1, + "http": 1, + "www": 1, + "someurl": 1, + "com": 1, + }, + {"should": 2, "we": 2, "yes": 1}, + ] + expected2 = [ + {"this is some url http://www.someurl.com": 1}, + {"should we": 1, " yes": 1, " we should.": 1}, + ] word_counts1 = tc.text_analytics.count_words(self.punctuated) - word_counts2 = tc.text_analytics.count_words(self.punctuated, - delimiters=["?", "!", ","]) + word_counts2 = tc.text_analytics.count_words( + self.punctuated, delimiters=["?", "!", ","] + ) self.assertEqual(word_counts1.dtype, dict) self.sframe_comparer._assert_sarray_equal(word_counts1, expected1) @@ -236,7 +514,7 @@ def test_stop_words(self): """ words = tc.text_analytics.stop_words() self.assertTrue(len(words) > 400) - self.assertTrue('a' in words) + self.assertTrue("a" in words) def test_tf_idf(self): """ @@ -245,8 +523,8 @@ def test_tf_idf(self): # Use the example on wikipedia tfidf_docs = tc.text_analytics.tf_idf(self.docs) - self.assertAlmostEqual(tfidf_docs[1]['example'], 3 * math.log(2)) - self.assertAlmostEqual(tfidf_docs[0]['is'], 1 * math.log(1)) + self.assertAlmostEqual(tfidf_docs[1]["example"], 3 * math.log(2)) + self.assertAlmostEqual(tfidf_docs[0]["is"], 1 * math.log(1)) empty_sa = tc.text_analytics.tf_idf(tc.SArray()) self.assertEqual(len(empty_sa), 0) @@ -256,10 +534,17 @@ def test_tf_idf(self): assert len(empty_dict_sf.apply(lambda x: len(x) == 0)) == 1 def test_count_words_dict_type(self): - sa = tc.SArray([{'Alice bob mike': 1, 'Bob Alice Sue': 0.5, '': 100}, - {'a dog cow': 0, 'a dog cat ': 5, 'mice dog': -1, 'mice cat': 2}]) + sa = tc.SArray( + [ + {"Alice bob mike": 1, "Bob Alice Sue": 0.5, "": 100}, + {"a dog cow": 0, "a dog cat ": 5, "mice dog": -1, "mice cat": 2}, + ] + ) result = tc.text_analytics.count_words(sa) - expected = [{'bob': 1.5, 'mike': 1.0, 'sue': 0.5, 'alice': 1.5}, {'a': 5.0, 'mice': 1.0, 'dog': 4.0, 'cow': 0.0, 'cat': 7.0}] + expected = [ + {"bob": 1.5, "mike": 1.0, "sue": 0.5, "alice": 1.5}, + {"a": 5.0, "mice": 1.0, "dog": 4.0, "cow": 0.0, "cat": 7.0}, + ] self.assertEqual(result.dtype, dict) self.sframe_comparer._assert_sarray_equal(result, expected) @@ -272,9 +557,7 @@ class RandomWordSplitTest(unittest.TestCase): @classmethod def setUpClass(self): - self.docs = tc.SArray([{'a': 3, 'b': 5}, - {'b': 5, 'c': 7}, - {'a': 2, 'd': 3}]) + self.docs = tc.SArray([{"a": 3, "b": 5}, {"b": 5, "c": 7}, {"a": 2, "d": 3}]) def test_random_split(self): """ @@ -309,7 +592,7 @@ def test_random_split(self): assert v == av + bv # Check that a low probability puts fewer items in the test set - train, test = tc.text_analytics.random_split(self.docs, prob=.001) + train, test = tc.text_analytics.random_split(self.docs, prob=0.001) total_in_train = train.dict_values().apply(lambda x: sum(x)).sum() total_in_test = test.dict_values().apply(lambda x: sum(x)).sum() @@ -323,11 +606,15 @@ class RetrievalTest(unittest.TestCase): @classmethod def setUpClass(self): - self.data = tc.SArray([{'a':5, 'b':7, 'c':10}, - {'a':3, 'c':1, 'd':2}, - {'a':10, 'b':3, 'e':5}, - {'a':1}, - {'f':5}]) + self.data = tc.SArray( + [ + {"a": 5, "b": 7, "c": 10}, + {"a": 3, "c": 1, "d": 2}, + {"a": 10, "b": 3, "e": 5}, + {"a": 1}, + {"f": 5}, + ] + ) def test_bm25(self): """ @@ -335,24 +622,27 @@ def test_bm25(self): """ # Test input formats - query = ['a','b','c'] + query = ["a", "b", "c"] assert tc.text_analytics.bm25(self.data, query) is not None - query = tc.SArray(['a','b','c']) + query = tc.SArray(["a", "b", "c"]) assert tc.text_analytics.bm25(self.data, query) is not None - query = {'a':5, 'b':3, 'c':1} + query = {"a": 5, "b": 3, "c": 1} assert tc.text_analytics.bm25(self.data, query) is not None # Only documents containing query words are included in result assert tc.text_analytics.bm25(self.data, query).num_rows() == 4 - dataset = tc.SArray([ - {'a':5, 'b':7, 'c':10}, - {'a':3, 'c':1, 'd':2}, - None, - {'a':1}, - {'f':5}]) + dataset = tc.SArray( + [ + {"a": 5, "b": 7, "c": 10}, + {"a": 3, "c": 1, "d": 2}, + None, + {"a": 1}, + {"f": 5}, + ] + ) res = tc.text_analytics.bm25(dataset, query) assert res.num_rows() == 3 diff --git a/src/python/turicreate/test/test_text_classifier.py b/src/python/turicreate/test/test_text_classifier.py index 87bf57bf73..1c70ee720c 100644 --- a/src/python/turicreate/test/test_text_classifier.py +++ b/src/python/turicreate/test/test_text_classifier.py @@ -16,6 +16,7 @@ from . import util as test_util import sys + if sys.version_info.major == 3: unittest.TestCase.assertItemsEqual = unittest.TestCase.assertCountEqual @@ -27,18 +28,17 @@ class TextClassifierTest(unittest.TestCase): @classmethod def setUpClass(self): - text = ['hello friend', 'how exciting', 'mostly exciting', 'hello again'] + text = ["hello friend", "how exciting", "mostly exciting", "hello again"] score = [0, 1, 1, 0] - self.docs = tc.SFrame({'text': text, 'score': score}) + self.docs = tc.SFrame({"text": text, "score": score}) - self.features = ['text'] + self.features = ["text"] self.num_features = 1 - self.target = 'score' - self.method = 'bow-logistic' - self.model = tc.text_classifier.create(self.docs, - target=self.target, - features=self.features, - method='auto') + self.target = "score" + self.method = "bow-logistic" + self.model = tc.text_classifier.create( + self.docs, target=self.target, features=self.features, method="auto" + ) self.num_examples = 4 @@ -47,12 +47,13 @@ def test__list_fields(self): Check the model list fields method. """ correct_fields = [ - 'classifier', - 'features', - 'num_features', - 'method', - 'num_examples', - 'target'] + "classifier", + "features", + "num_features", + "method", + "num_examples", + "target", + ] self.assertItemsEqual(self.model._list_fields(), correct_fields) @@ -61,11 +62,12 @@ def test_get(self): Check the various 'get' methods against known answers for each field. """ correct_fields = { - 'features': self.features, - 'num_features': self.num_features, - 'target': self.target, - 'method': self.method, - 'num_examples': self.num_examples} + "features": self.features, + "num_features": self.num_features, + "target": self.target, + "method": self.method, + "num_examples": self.num_examples, + } print(self.model) for field, ans in correct_fields.items(): @@ -73,7 +75,9 @@ def test_get(self): def test_model_access(self): m = self.model.classifier - self.assertTrue(isinstance(m, tc.classifier.logistic_classifier.LogisticClassifier)) + self.assertTrue( + isinstance(m, tc.classifier.logistic_classifier.LogisticClassifier) + ) def test_summaries(self): """ @@ -91,31 +95,34 @@ def test_evaluate(self): self.model.evaluate(self.docs) def test_export_coreml(self): - filename = tempfile.mkstemp('bingo.mlmodel')[1] + filename = tempfile.mkstemp("bingo.mlmodel")[1] self.model.export_coreml(filename) import platform + coreml_model = coremltools.models.MLModel(filename) - self.assertDictEqual({ - 'com.github.apple.turicreate.version': tc.__version__, - 'com.github.apple.os.platform': platform.platform(), - }, dict(coreml_model.user_defined_metadata) + self.assertDictEqual( + { + "com.github.apple.turicreate.version": tc.__version__, + "com.github.apple.os.platform": platform.platform(), + }, + dict(coreml_model.user_defined_metadata), + ) + expected_result = ( + "Text classifier created by Turi Create (version %s)" % tc.__version__ ) - expected_result = 'Text classifier created by Turi Create (version %s)' % tc.__version__ self.assertEquals(expected_result, coreml_model.short_description) - @unittest.skipIf(_mac_ver() < (10, 13), 'Only supported on macOS 10.13+') + @unittest.skipIf(_mac_ver() < (10, 13), "Only supported on macOS 10.13+") def test_export_coreml_with_predict(self): - filename = tempfile.mkstemp('bingo.mlmodel')[1] + filename = tempfile.mkstemp("bingo.mlmodel")[1] self.model.export_coreml(filename) - preds = self.model.predict(self.docs, output_type='probability_vector') + preds = self.model.predict(self.docs, output_type="probability_vector") coreml_model = coremltools.models.MLModel(filename) - coreml_preds = coreml_model.predict({ - 'text': {'hello': 1, 'friend': 1} - }) - self.assertAlmostEqual(preds[0][0], coreml_preds['scoreProbability'][0]) - self.assertAlmostEqual(preds[0][1], coreml_preds['scoreProbability'][1]) + coreml_preds = coreml_model.predict({"text": {"hello": 1, "friend": 1}}) + self.assertAlmostEqual(preds[0][0], coreml_preds["scoreProbability"][0]) + self.assertAlmostEqual(preds[0][1], coreml_preds["scoreProbability"][1]) def test_save_and_load(self): """ @@ -136,82 +143,107 @@ def test_save_and_load(self): self.test_summaries() print("Saved model summaries passed") + class TextClassifierCreateTests(unittest.TestCase): @classmethod def setUpClass(self): - self.data = tc.SFrame({ - 'rating': [1, 5, 2, 3, 3, 5], - 'place': ['a', 'a', 'b', 'b', 'b', 'c'], - 'text': ["The burrito was terrible and awful and I hated it", - "I will come here every day of my life because the burrito " - "is awesome and delicious", - "Meh, the waiter died while serving us. Other than that " - "the experience was OK, but the burrito was not great.", - "Mediocre burrito. Nothing much else to report.", - "My dad works here, so I guess I have to kinda like it. " - "Hate the burrito, though.", - "Love it! Mexican restaurant of my dreams and a burrito " - "from the gods."]}) - - self.rating_column = 'rating' - self.features = ['text'] - self.keywords = ['burrito', 'dad'] - self.model = tc.text_classifier.create(self.data, target='rating', - features=self.features) + self.data = tc.SFrame( + { + "rating": [1, 5, 2, 3, 3, 5], + "place": ["a", "a", "b", "b", "b", "c"], + "text": [ + "The burrito was terrible and awful and I hated it", + "I will come here every day of my life because the burrito " + "is awesome and delicious", + "Meh, the waiter died while serving us. Other than that " + "the experience was OK, but the burrito was not great.", + "Mediocre burrito. Nothing much else to report.", + "My dad works here, so I guess I have to kinda like it. " + "Hate the burrito, though.", + "Love it! Mexican restaurant of my dreams and a burrito " + "from the gods.", + ], + } + ) + + self.rating_column = "rating" + self.features = ["text"] + self.keywords = ["burrito", "dad"] + self.model = tc.text_classifier.create( + self.data, target="rating", features=self.features + ) def test_sentiment_create_no_features(self): - model = tc.text_classifier.create(self.data, target='rating') + model = tc.text_classifier.create(self.data, target="rating") self.assertTrue(isinstance(model, tc.text_classifier.TextClassifier)) def test_sentiment_create_string_target(self): data_str = self.data[:] - data_str['rating'] = data_str['rating'].astype(str) - model = tc.text_classifier.create(data_str, target='rating') + data_str["rating"] = data_str["rating"].astype(str) + model = tc.text_classifier.create(data_str, target="rating") self.assertTrue(isinstance(model, tc.text_classifier.TextClassifier)) def test_invalid_data_set(self): # infer dtype str - a = tc.SArray(['str', None]) - b = tc.SArray(['str', 'str']) + a = tc.SArray(["str", None]) + b = tc.SArray(["str", "str"]) # target contains none - sf = tc.SFrame({'a': a, 'b': b}) + sf = tc.SFrame({"a": a, "b": b}) with self.assertRaises(ToolkitError): - tc.text_classifier.create(sf, target='a', features=['b'], word_count_threshold=1) + tc.text_classifier.create( + sf, target="a", features=["b"], word_count_threshold=1 + ) # feature contains none, Github #2402 - sf = tc.SFrame({'b': a, 'a': b}) + sf = tc.SFrame({"b": a, "a": b}) with self.assertRaises(ToolkitError): - tc.text_classifier.create(sf, target='b', features=['a'], word_count_threshold=1) - + tc.text_classifier.create( + sf, target="b", features=["a"], word_count_threshold=1 + ) def test_validation_set(self): train = self.data valid = self.data # Test with a validation set - model = tc.text_classifier.create(train, target='rating', validation_set=valid) - self.assertTrue('Validation Accuracy' in model.classifier.progress.column_names()) + model = tc.text_classifier.create(train, target="rating", validation_set=valid) + self.assertTrue( + "Validation Accuracy" in model.classifier.progress.column_names() + ) # Test without a validation set - model = tc.text_classifier.create(train, target='rating', validation_set=None) - self.assertTrue('Validation Accuracy' not in model.classifier.progress.column_names()) + model = tc.text_classifier.create(train, target="rating", validation_set=None) + self.assertTrue( + "Validation Accuracy" not in model.classifier.progress.column_names() + ) # Test 'auto' validation set - big_data = train.append(tc.SFrame({ - 'rating': [5] * 100, - 'place': ['d'] * 100, - 'text': ['large enough data for %5 percent validation split to activate'] * 100 - })) - model = tc.text_classifier.create(big_data, target='rating', validation_set='auto') - self.assertTrue('Validation Accuracy' in model.classifier.progress.column_names()) + big_data = train.append( + tc.SFrame( + { + "rating": [5] * 100, + "place": ["d"] * 100, + "text": [ + "large enough data for %5 percent validation split to activate" + ] + * 100, + } + ) + ) + model = tc.text_classifier.create( + big_data, target="rating", validation_set="auto" + ) + self.assertTrue( + "Validation Accuracy" in model.classifier.progress.column_names() + ) # Test bad validation set string with self.assertRaises(TypeError): - tc.text_classifier.create(train, target='rating', validation_set='wrong') + tc.text_classifier.create(train, target="rating", validation_set="wrong") # Test bad validation set type with self.assertRaises(TypeError): - tc.text_classifier.create(train, target='rating', validation_set=5) + tc.text_classifier.create(train, target="rating", validation_set=5) def test_sentiment_classifier(self): m = self.model @@ -230,13 +262,15 @@ def test_classify(self): self.assertEqual(preds.column_names(), ["class", "probability"]) def test_not_sframe_create_error(self): - dataset = {'rating': [1,5], 'text': ['this is bad', 'this is good']} + dataset = {"rating": [1, 5], "text": ["this is bad", "this is good"]} try: # dataset is NOT an SFrame - tc.text_classifier.create(dataset, 'rating', features=['text']) + tc.text_classifier.create(dataset, "rating", features=["text"]) except ToolkitError as t: exception_msg = t.args[0] - self.assertTrue(exception_msg.startswith('Input dataset is not an SFrame. ')) + self.assertTrue( + exception_msg.startswith("Input dataset is not an SFrame. ") + ) else: self.fail("This should have thrown an exception") @@ -245,20 +279,25 @@ class TextClassifierCreateBadValues(unittest.TestCase): @classmethod def setUpClass(self): - self.data = tc.SFrame({ - 'rating': [1, 5, 2, 3], - 'place': ['a', 'a', 'b', 'b'], - 'text': ["The burrito was terrible and awful and I hated it", - "I will come here a lot", - "......", - ""]}) + self.data = tc.SFrame( + { + "rating": [1, 5, 2, 3], + "place": ["a", "a", "b", "b"], + "text": [ + "The burrito was terrible and awful and I hated it", + "I will come here a lot", + "......", + "", + ], + } + ) - self.rating_column = 'rating' - self.features = ['text'] - self.keywords = ['burrito', 'dad'] + self.rating_column = "rating" + self.features = ["text"] + self.keywords = ["burrito", "dad"] def test_create(self): - model = tc.text_classifier.create(self.data, - target=self.rating_column, - features=self.features) + model = tc.text_classifier.create( + self.data, target=self.rating_column, features=self.features + ) self.assertTrue(model is not None) diff --git a/src/python/turicreate/test/test_topic_model.py b/src/python/turicreate/test/test_topic_model.py index 43028d8c81..eec0b0427c 100644 --- a/src/python/turicreate/test/test_topic_model.py +++ b/src/python/turicreate/test/test_topic_model.py @@ -22,15 +22,13 @@ import numpy as np import itertools as _itertools -DELTA = .0000001 +DELTA = 0.0000001 examples = {} -def generate_bar_example(num_topics=10, - num_documents=500, - num_words_per_doc=100, - alpha=1, beta=1, - seed=None): +def generate_bar_example( + num_topics=10, num_documents=500, num_words_per_doc=100, alpha=1, beta=1, seed=None +): """ Generate the classic "bars" example, a synthetic data set of small black 5x5 pixel images with a single white bar that is either horizontal @@ -63,10 +61,10 @@ def generate_bar_example(num_topics=10, topic_squares = [zeros for i in range(num_topics)] for i in range(width): for j in range(width): - topic_squares[i][i][j] = 1./width + topic_squares[i][i][j] = 1.0 / width for i in range(width): for j in range(width): - topic_squares[width+i][j][i] = 1./width + topic_squares[width + i][j][i] = 1.0 / width topics = [] for k in range(num_topics): topics.append(list(_itertools.chain(*topic_squares[k]))) @@ -99,7 +97,7 @@ def weighted_choice(probs): sd = {} for i in range(width): for j in range(width): - k = str(i) + ',' + str(j) + k = str(i) + "," + str(j) sd[k] = d[i * width + j] sparse_documents.append(sd) bow_documents = turicreate.SArray(sparse_documents) @@ -107,7 +105,6 @@ def weighted_choice(probs): class BasicTest(unittest.TestCase): - @classmethod def setUpClass(self): @@ -117,18 +114,13 @@ def setUpClass(self): models = [] # Test a model that used CGS - m = topic_model.create(docs, - num_topics=10) + m = topic_model.create(docs, num_topics=10) models.append(m) # Test a model with many topics - m = topic_model.create(docs, method='cgs', - num_topics=100, - num_iterations=2) + m = topic_model.create(docs, method="cgs", num_topics=100, num_iterations=2) models.append(m) - m = topic_model.create(docs, method='alias', - num_topics=100, - num_iterations=2) + m = topic_model.create(docs, method="alias", num_topics=100, num_iterations=2) models.append(m) # Test a model serialized after using CGS @@ -138,43 +130,44 @@ def setUpClass(self): models.append(m2) - # Save - examples['synthetic'] = {'docs': docs, 'models': models} - self.docs = examples['synthetic']['docs'] - self.models = examples['synthetic']['models'] + examples["synthetic"] = {"docs": docs, "models": models} + self.docs = examples["synthetic"]["docs"] + self.models = examples["synthetic"]["models"] def test_set_burnin(self): m = topic_model.create(self.docs, num_burnin=25, num_iterations=1) self.assertTrue(m.num_burnin == 25) def test_no_validation_print(self): - m = topic_model.create(self.docs, num_burnin=25, num_iterations=2, print_interval=0) + m = topic_model.create( + self.docs, num_burnin=25, num_iterations=2, print_interval=0 + ) self.assertTrue(m is not None) self.assertEqual(m.num_burnin, 25) def test_validation_set(self): m = topic_model.create(self.docs, validation_set=self.docs) - self.assertTrue('validation_perplexity' in m._list_fields()) + self.assertTrue("validation_perplexity" in m._list_fields()) # Test that an SFrame can be used - sf = turicreate.SFrame({'text': self.docs}) + sf = turicreate.SFrame({"text": self.docs}) m = topic_model.create(self.docs, validation_set=sf) - self.assertTrue('validation_perplexity' in m._list_fields()) + self.assertTrue("validation_perplexity" in m._list_fields()) def test_set_associations(self): associations = turicreate.SFrame() - associations['word'] = ['1,1', '1,2', '1,3'] - associations['topic'] = [0, 0, 0] + associations["word"] = ["1,1", "1,2", "1,3"] + associations["topic"] = [0, 0, 0] m = topic_model.create(self.docs, associations=associations) # In this context, the "words" '1,1', '1,2', '1,3' should be # the first three words in the vocabulary. - self.assertEqual(list(m.topics['vocabulary'].head(3)), ['1,1', '1,2', '1,3']) + self.assertEqual(list(m.topics["vocabulary"].head(3)), ["1,1", "1,2", "1,3"]) # For each of these words, the probability of topic 0 should # be largest. - probs = m.topics['topic_probabilities'] + probs = m.topics["topic_probabilities"] largest = probs.apply(lambda x: np.argmax(x)) self.assertEqual(list(largest.head(3)), [0, 0, 0]) @@ -198,25 +191,30 @@ def test_get_topics(self): self.assertTrue(isinstance(topics, turicreate.SFrame)) self.assertEqual(topics.num_rows(), 25) self.assertEqual(topics.num_columns(), 2) - z = m.topics['topic_probabilities'] + z = m.topics["topic_probabilities"] for k in range(m.num_topics): - self.assertTrue(abs(sum(z.vector_slice(k)) - 1) < DELTA, \ - 'Returned probabilities do not sum to 1.') + self.assertTrue( + abs(sum(z.vector_slice(k)) - 1) < DELTA, + "Returned probabilities do not sum to 1.", + ) # Make sure returned object is an SFrame of the right size topics = m.get_topics() self.assertTrue(isinstance(topics, turicreate.SFrame)) - self.assertTrue(topics.num_columns() == 3, \ - 'Returned SFrame should have a topic, word, and probs.') + self.assertTrue( + topics.num_columns() == 3, + "Returned SFrame should have a topic, word, and probs.", + ) # Make sure that requesting a single topic returns only that topic num_words = 8 topics = m.get_topics([5], num_words=num_words) - self.assertTrue(all(topics['topic'] == 5), \ - 'Returned topics do not have the right id.') + self.assertTrue( + all(topics["topic"] == 5), "Returned topics do not have the right id." + ) self.assertEqual(topics.num_rows(), num_words) topics = m.get_topics([2, 4], num_words=num_words) - self.assertEqual(set(list(topics['topic'])), set([2, 4])) + self.assertEqual(set(list(topics["topic"])), set([2, 4])) self.assertEqual(topics.num_rows(), num_words + num_words) # Make sure the cumulative probability of the returned words is @@ -224,9 +222,13 @@ def test_get_topics(self): # A cutoff of 1.0 should return num_words for every topic. cutoff = 1.0 topics = m.get_topics(cdf_cutoff=cutoff, num_words=len(m.vocabulary)) - totals = topics.groupby('topic', {'total_score': turicreate.aggregate.SUM('score')}) - self.assertTrue(all(totals['total_score'] <= (cutoff + DELTA)), \ - 'More words were returned than expected for this cutoff.') + totals = topics.groupby( + "topic", {"total_score": turicreate.aggregate.SUM("score")} + ) + self.assertTrue( + all(totals["total_score"] <= (cutoff + DELTA)), + "More words were returned than expected for this cutoff.", + ) # Make sure we raise errors for bad input with self.assertRaises(ValueError): @@ -234,10 +236,10 @@ def test_get_topics(self): with self.assertRaises(ValueError): m.get_topics([10000]) with self.assertRaises(ToolkitError): - topics = m.get_topics(output_type='other') + topics = m.get_topics(output_type="other") # Test getting topic_words - topic_words = m.get_topics(output_type='topic_words', num_words=5) + topic_words = m.get_topics(output_type="topic_words", num_words=5) self.assertEqual(type(topic_words), turicreate.SFrame) # Test words are sorted correctly for the first topic @@ -270,20 +272,20 @@ def test_predict(self): self.assertEqual(len(preds), len(docs)) self.assertEqual(preds.dtype, int) - preds = m.predict(docs, output_type='probability') + preds = m.predict(docs, output_type="probability") self.assertTrue(isinstance(preds, turicreate.SArray)) self.assertTrue(len(preds) == len(docs)) s = preds.apply(lambda x: sum(x)) - self.assertTrue((s.apply(lambda x: abs(x-1)) < .000001).all()) + self.assertTrue((s.apply(lambda x: abs(x - 1)) < 0.000001).all()) # Test predictions when docs have new words - new_docs = turicreate.SArray([{'-1,-1':3.0, '0,4': 5.0, '0,3': 2.0}]) + new_docs = turicreate.SArray([{"-1,-1": 3.0, "0,4": 5.0, "0,3": 2.0}]) preds = m.predict(new_docs) self.assertEqual(len(preds), len(new_docs)) # Test additional burnin. Ideally we could show that things # converge as you increase burnin. - preds_no_burnin = m.predict(docs, output_type='probability', num_burnin=0) + preds_no_burnin = m.predict(docs, output_type="probability", num_burnin=0) self.assertEqual(len(preds_no_burnin), len(docs)) def test_save_load(self): @@ -297,9 +299,10 @@ def test_save_load(self): self.assertTrue(m2 is not None) self.assertEqual(m.__str__(), m2.__str__()) - diff = m.topics['topic_probabilities'] - \ - m2.topics['topic_probabilities'] - zeros = diff * 0 + diff = ( + m.topics["topic_probabilities"] - m2.topics["topic_probabilities"] + ) + zeros = diff * 0 for i in range(len(zeros)): observed = np.array(diff[i]) @@ -317,40 +320,48 @@ def test_initialize(self): for m in self.models: start_docs = turicreate.SArray(self.docs.tail(3)) - m = topic_model.create(start_docs, num_topics=20, - method='cgs', - alpha=.1, beta=.01, - num_iterations=1, - print_interval=1) + m = topic_model.create( + start_docs, + num_topics=20, + method="cgs", + alpha=0.1, + beta=0.01, + num_iterations=1, + print_interval=1, + ) start_topics = turicreate.SFrame(m.topics.head(100)) - m2 = topic_model.create(self.docs, num_topics=20, - initial_topics=start_topics, - method='cgs', - alpha=0.1, beta=0.01, - num_iterations=0, - print_interval=1) + m2 = topic_model.create( + self.docs, + num_topics=20, + initial_topics=start_topics, + method="cgs", + alpha=0.1, + beta=0.01, + num_iterations=0, + print_interval=1, + ) # Check that the vocabulary of the new model is the same as # the one we used to initialize the model. - self.assertTrue((start_topics['vocabulary'] == m2.topics['vocabulary']).all()) + self.assertTrue( + (start_topics["vocabulary"] == m2.topics["vocabulary"]).all() + ) # Check that the previously most probable word is still the most # probable after 0 iterations, i.e. just initialization. - old_prob = start_topics['topic_probabilities'].vector_slice(0) - new_prob = m2.topics['topic_probabilities'].vector_slice(0) + old_prob = start_topics["topic_probabilities"].vector_slice(0) + new_prob = m2.topics["topic_probabilities"].vector_slice(0) self.assertTrue(np.argmax(list(old_prob)) == np.argmax(list(new_prob))) - def test_exceptions(self): - good1 = turicreate.SArray([{'a':5, 'b':7}]) - good2 = turicreate.SFrame({'bow': good1}) + good1 = turicreate.SArray([{"a": 5, "b": 7}]) + good2 = turicreate.SFrame({"bow": good1}) good3 = turicreate.SArray([{}]) - bad1 = turicreate.SFrame({'x': [0, 1, 2, 3]}) - bad2 = turicreate.SFrame({'x': [{'0': 3}], - 'y': [{'3': 5}]}) - bad3 = turicreate.SArray([{'a':5, 'b':3}, None, {'a': 10}]) - bad4 = turicreate.SArray([{'a': 5, 'b': None}, {'a': 3}]) + bad1 = turicreate.SFrame({"x": [0, 1, 2, 3]}) + bad2 = turicreate.SFrame({"x": [{"0": 3}], "y": [{"3": 5}]}) + bad3 = turicreate.SArray([{"a": 5, "b": 3}, None, {"a": 10}]) + bad4 = turicreate.SArray([{"a": 5, "b": None}, {"a": 3}]) for d in [good1, good2, good3]: m = topic_model.create(d) @@ -362,10 +373,9 @@ def test_exceptions(self): with self.assertRaises(Exception): m = topic_model.create(bad2) with self.assertRaises(ToolkitError): - m = topic_model.create(bad3) + m = topic_model.create(bad3) with self.assertRaises(ToolkitError): - m = topic_model.create(bad4) - + m = topic_model.create(bad4) m = self.models[0] with self.assertRaises(Exception): @@ -375,18 +385,17 @@ def test_exceptions(self): with self.assertRaises(Exception): pr = m.predict(bad3) - def test_evaluate(self): for m in self.models: # Check evaluate on the training set returns an answer perp = m.evaluate(self.docs) self.assertTrue(isinstance(perp, dict)) - self.assertTrue(isinstance(perp['perplexity'], float)) + self.assertTrue(isinstance(perp["perplexity"], float)) # Check that the answer is within 5% of the one reported # by the model during training. - if 'validation_perplexity' in m._list_fields(): + if "validation_perplexity" in m._list_fields(): perp2 = m.validation_perplexity # TEMP: Disable since we now only compute this if validatino set is provided # assert abs(perp - perp2)/perp < .05 @@ -396,7 +405,7 @@ def test_evaluate(self): self.assertTrue(isinstance(perp, dict)) def test__training_stats(self): - expected_fields = ['training_iterations', 'training_time'] + expected_fields = ["training_iterations", "training_time"] for m in self.models: actual_fields = m._training_stats() for f in expected_fields: @@ -405,16 +414,18 @@ def test__training_stats(self): def test_summary(self): - expected_fields = ['num_topics', # model parameters - 'alpha', - 'beta', - 'topics', - 'vocabulary', - 'num_iterations', # stats and options - 'print_interval', - 'training_time', - 'training_iterations', - 'num_burnin'] + expected_fields = [ + "num_topics", # model parameters + "alpha", + "beta", + "topics", + "vocabulary", + "num_iterations", # stats and options + "print_interval", + "training_time", + "training_iterations", + "num_burnin", + ] for m in self.models: actual_fields = m._list_fields() @@ -422,37 +433,37 @@ def test_summary(self): self.assertTrue(f in actual_fields) self.assertTrue(m._get(f) is not None) -class ParsersTest(unittest.TestCase): +class ParsersTest(unittest.TestCase): def setUp(self): self.tmpfile_a = tempfile.NamedTemporaryFile(delete=False).name - with open(self.tmpfile_a, 'w') as o: - o.write('3 1:5 2:10 5:353\n') - o.write('0 0:7 6:3 3:100') + with open(self.tmpfile_a, "w") as o: + o.write("3 1:5 2:10 5:353\n") + o.write("0 0:7 6:3 3:100") self.tmpfile_vocab = tempfile.NamedTemporaryFile(delete=False).name - with open(self.tmpfile_vocab, 'w') as o: - o.write('\n'.join(['a', 'b', 'c', 'd', 'e', 'f', 'g'])) + with open(self.tmpfile_vocab, "w") as o: + o.write("\n".join(["a", "b", "c", "d", "e", "f", "g"])) self.tmpfile_b = tempfile.NamedTemporaryFile(delete=False).name - with open(self.tmpfile_b, 'w') as o: - o.write('2\n5\n6\n') - o.write('1 2 5\n') - o.write('1 3 10\n') - o.write('1 6 353\n') - o.write('2 1 7\n') - o.write('2 7 3\n') - o.write('2 4 100') + with open(self.tmpfile_b, "w") as o: + o.write("2\n5\n6\n") + o.write("1 2 5\n") + o.write("1 3 10\n") + o.write("1 6 353\n") + o.write("2 1 7\n") + o.write("2 7 3\n") + o.write("2 4 100") def test_parse_sparse(self): d = parse_sparse(self.tmpfile_a, self.tmpfile_vocab) - self.assertTrue(d[0] == {'b': 5, 'c': 10, 'f': 353}) - self.assertTrue(d[1] == {'a': 7, 'g': 3, 'd': 100}) + self.assertTrue(d[0] == {"b": 5, "c": 10, "f": 353}) + self.assertTrue(d[1] == {"a": 7, "g": 3, "d": 100}) def test_parse_docword(self): d = parse_docword(self.tmpfile_b, self.tmpfile_vocab) - self.assertTrue(d[0] == {'b': 5, 'c': 10, 'f': 353}) - self.assertTrue(d[1] == {'a': 7, 'g': 3, 'd': 100}) + self.assertTrue(d[0] == {"b": 5, "c": 10, "f": 353}) + self.assertTrue(d[1] == {"a": 7, "g": 3, "d": 100}) def tearDown(self): os.remove(self.tmpfile_a) @@ -461,22 +472,15 @@ def tearDown(self): class UtilitiesTest(unittest.TestCase): - def setUp(self): - docs = turicreate.SArray([{'b': 5, 'a': 3}, - {'c': 7, 'b': 5}, - {'a': 2, 'd': 3}]) + docs = turicreate.SArray([{"b": 5, "a": 3}, {"c": 7, "b": 5}, {"a": 2, "d": 3}]) - doc_topics = turicreate.SArray([[.9, .1], - [.7, .3], - [.1, .9]]) + doc_topics = turicreate.SArray([[0.9, 0.1], [0.7, 0.3], [0.1, 0.9]]) - word_topics = turicreate.SArray([[.5, .5], - [.1, .9], - [.25, .75]]) + word_topics = turicreate.SArray([[0.5, 0.5], [0.1, 0.9], [0.25, 0.75]]) - vocabulary = turicreate.SArray(['a', 'b', 'c']) + vocabulary = turicreate.SArray(["a", "b", "c"]) self.docs = docs self.word_topics = word_topics @@ -486,22 +490,21 @@ def setUp(self): def test_perplexity(self): - prob_0_a = .9 * .5 + .1 * .5 - prob_0_b = .9 * .1 + .1 * .9 - prob_1_b = .7 * .1 + .3 * .9 - prob_1_c = .7 * .25 + .3 * .75 - prob_2_a = .1 * .5 + .9 * .5 + prob_0_a = 0.9 * 0.5 + 0.1 * 0.5 + prob_0_b = 0.9 * 0.1 + 0.1 * 0.9 + prob_1_b = 0.7 * 0.1 + 0.3 * 0.9 + prob_1_c = 0.7 * 0.25 + 0.3 * 0.75 + prob_2_a = 0.1 * 0.5 + 0.9 * 0.5 prob_2_d = 0 perp = 0.0 perp += 3 * np.log(prob_0_a) + 5 * np.log(prob_0_b) perp += 5 * np.log(prob_1_b) + 7 * np.log(prob_1_c) perp += 2 * np.log(prob_2_a) - perp = np.exp(- perp / (3+5+5+7+2)) + perp = np.exp(-perp / (3 + 5 + 5 + 7 + 2)) - observed_perp = perplexity(self.docs, - self.doc_topics, - self.word_topics, - self.vocabulary) + observed_perp = perplexity( + self.docs, self.doc_topics, self.word_topics, self.vocabulary + ) self.assertAlmostEqual(perp, observed_perp, delta=0.0001) diff --git a/src/python/turicreate/test/test_tree_extract_features.py b/src/python/turicreate/test/test_tree_extract_features.py index 8d92131ae5..c913bb52e8 100644 --- a/src/python/turicreate/test/test_tree_extract_features.py +++ b/src/python/turicreate/test/test_tree_extract_features.py @@ -18,16 +18,19 @@ import numpy as np from array import array -class TreeExtractFeaturesTest(unittest.TestCase): +class TreeExtractFeaturesTest(unittest.TestCase): def _run_test(self, sf, expected_number_of_features): - sf['target'] = [0 if random.random() < 0.5 else 1 for i in range(sf.num_rows())] + sf["target"] = [0 if random.random() < 0.5 else 1 for i in range(sf.num_rows())] - for model in [tc.regression.boosted_trees_regression, - tc.classifier.boosted_trees_classifier]: - m = model.create(sf, 'target', validation_set = None, - max_iterations=5, max_depth=2) + for model in [ + tc.regression.boosted_trees_regression, + tc.classifier.boosted_trees_classifier, + ]: + m = model.create( + sf, "target", validation_set=None, max_iterations=5, max_depth=2 + ) out = m.extract_features(sf) self.assertEqual(len(out), len(sf)) @@ -35,12 +38,13 @@ def _run_test(self, sf, expected_number_of_features): out = m._extract_features_with_missing(sf) self.assertEqual(len(out), len(sf)) - - for model in [tc.regression.random_forest_regression, - tc.classifier.random_forest_classifier, - tc.regression.decision_tree_regression, - tc.classifier.decision_tree_classifier]: - m = model.create(sf, 'target', validation_set = None, max_depth=2) + for model in [ + tc.regression.random_forest_regression, + tc.classifier.random_forest_classifier, + tc.regression.decision_tree_regression, + tc.classifier.decision_tree_classifier, + ]: + m = model.create(sf, "target", validation_set=None, max_depth=2) out = m.extract_features(sf) self.assertEqual(len(out), len(sf)) @@ -50,110 +54,150 @@ def _run_test(self, sf, expected_number_of_features): def test_categorical_1(self): - sf = tc.SFrame({ - 'cat1': ['1', '1', '2', '2', '2'] * 100, - 'cat2': ['1', '3', '3', '1', '1'] * 100 - }) + sf = tc.SFrame( + { + "cat1": ["1", "1", "2", "2", "2"] * 100, + "cat2": ["1", "3", "3", "1", "1"] * 100, + } + ) self._run_test(sf, 4) def test_categorical_2(self): - sf = tc.SFrame({ - 'cat[1]': ['1', '1', '2', '2', '2'] * 100, - 'cat[2]': ['1', '3', '3', '1', '1'] * 100 - }) + sf = tc.SFrame( + { + "cat[1]": ["1", "1", "2", "2", "2"] * 100, + "cat[2]": ["1", "3", "3", "1", "1"] * 100, + } + ) self._run_test(sf, 4) def test_dict_1(self): - sf = tc.SFrame({ - 'dict1': [{'1' : 1, '2' : 3.2}, - {'1' : 3.1,}, - {'1' : 1, 'b' : 2}, - {'1' : 1, 'b' : 3}, - {'a' : 2, 'b' : 3} ] * 100 - }) + sf = tc.SFrame( + { + "dict1": [ + {"1": 1, "2": 3.2}, + {"1": 3.1,}, + {"1": 1, "b": 2}, + {"1": 1, "b": 3}, + {"a": 2, "b": 3}, + ] + * 100 + } + ) self._run_test(sf, 4) def test_dict_2(self): - sf = tc.SFrame({ - 'dict1': [{'1' : 1, '2' : 3.2}, - {'1' : 3.1,}, - {1 : 1, 'b' : 2}, - {1 : 1, 'b' : 3}, - {'a' : 2, 'b' : 3} ] * 100 - }) + sf = tc.SFrame( + { + "dict1": [ + {"1": 1, "2": 3.2}, + {"1": 3.1,}, + {1: 1, "b": 2}, + {1: 1, "b": 3}, + {"a": 2, "b": 3}, + ] + * 100 + } + ) self._run_test(sf, 5) def test_dict_3(self): - sf = tc.SFrame({ - 'dict': [{'1' : 1, '2' : 3.2}, - {'1' : 3.1,}, - {1 : 1, 'b' : 2}, - {1 : 1, 'b' : 3}, - {'a' : 2, 'b' : 3} ] * 100, - 'dict[2]': [{'1' : 1, '2' : 3.2}, - {'1' : 3.1,}, - {1 : 1, 'b' : 2}, - {1 : 1, 'b' : 3}, - {'a' : 2, 'b' : 3} ] * 100, - 'dict[3]': [{'1' : 1, '2' : 3.2}, - {'1' : 3.1,}, - {1 : 1, 'b' : 2}, - {1 : 1, 'b' : 3}, - {'a' : 2, 'b' : 3} ] * 100 - }) + sf = tc.SFrame( + { + "dict": [ + {"1": 1, "2": 3.2}, + {"1": 3.1,}, + {1: 1, "b": 2}, + {1: 1, "b": 3}, + {"a": 2, "b": 3}, + ] + * 100, + "dict[2]": [ + {"1": 1, "2": 3.2}, + {"1": 3.1,}, + {1: 1, "b": 2}, + {1: 1, "b": 3}, + {"a": 2, "b": 3}, + ] + * 100, + "dict[3]": [ + {"1": 1, "2": 3.2}, + {"1": 3.1,}, + {1: 1, "b": 2}, + {1: 1, "b": 3}, + {"a": 2, "b": 3}, + ] + * 100, + } + ) self._run_test(sf, 15) def test_cat_dict_1(self): - sf = tc.SFrame({ - 'cat1': [str(i) for i in range(500)], - 'dict2': [{'1' : 1, '2' : 3.2}, - {'1' : 3.1,}, - {1 : 1, 'b' : 2}, - {1 : 1, 'b' : 3}, - {'a' : 2, 'b' : 3} ] * 100 - }) + sf = tc.SFrame( + { + "cat1": [str(i) for i in range(500)], + "dict2": [ + {"1": 1, "2": 3.2}, + {"1": 3.1,}, + {1: 1, "b": 2}, + {1: 1, "b": 3}, + {"a": 2, "b": 3}, + ] + * 100, + } + ) self._run_test(sf, 505) def test_numeric_1(self): - sf = tc.SFrame({ - 'num1' : [1,2,3.5,4,5] * 100, - 'num2' : [1,2,3.5,4,5] * 100, - 'num3' : [1,2,3.5,4,5] * 100 - }) + sf = tc.SFrame( + { + "num1": [1, 2, 3.5, 4, 5] * 100, + "num2": [1, 2, 3.5, 4, 5] * 100, + "num3": [1, 2, 3.5, 4, 5] * 100, + } + ) self._run_test(sf, 3) def test_numeric_2(self): - sf = tc.SFrame({ - 'num1' : [1,2,3.5,4,5] * 100, - 'num2' : [1,2,3.5,4,5] * 100, - 'vect' : [[1,2,3.5,4,5]] * 500 - }) + sf = tc.SFrame( + { + "num1": [1, 2, 3.5, 4, 5] * 100, + "num2": [1, 2, 3.5, 4, 5] * 100, + "vect": [[1, 2, 3.5, 4, 5]] * 500, + } + ) self._run_test(sf, 7) - def test_numeric_dict(self): - sf = tc.SFrame({ - 'num1' : [1,2,3.5,4,5] * 100, - 'num2' : [1,2,3.5,4,5] * 100, - 'vect' : [[1,2,3.5,4,5]] * 500, - 'dict[2]': [{'1' : 1, '2' : 3.2}, - {'1' : 3.1,}, - {1 : 1, 'b' : 2}, - {1 : 1, 'b' : 3}, - {'a' : 2, 'b' : 3} ] * 100, - }) + sf = tc.SFrame( + { + "num1": [1, 2, 3.5, 4, 5] * 100, + "num2": [1, 2, 3.5, 4, 5] * 100, + "vect": [[1, 2, 3.5, 4, 5]] * 500, + "dict[2]": [ + {"1": 1, "2": 3.2}, + {"1": 3.1,}, + {1: 1, "b": 2}, + {1: 1, "b": 3}, + {"a": 2, "b": 3}, + ] + * 100, + } + ) self._run_test(sf, 12) def test_missing(self): - sf = tc.SFrame({ - 'num1' : [1,2,3.5,4,None] * 100, - 'num2' : [1,2,3.5,4,None] * 100, - 'num3' : [1,2,3.5,4,None] * 100 - }) + sf = tc.SFrame( + { + "num1": [1, 2, 3.5, 4, None] * 100, + "num2": [1, 2, 3.5, 4, None] * 100, + "num3": [1, 2, 3.5, 4, None] * 100, + } + ) self._run_test(sf, 3) diff --git a/src/python/turicreate/test/test_tree_json_dump.py b/src/python/turicreate/test/test_tree_json_dump.py index 468d591f0f..8de525a4f9 100644 --- a/src/python/turicreate/test/test_tree_json_dump.py +++ b/src/python/turicreate/test/test_tree_json_dump.py @@ -14,6 +14,7 @@ import os as _os + class Config: is_debug = False # always set this to False in production level = 0 @@ -27,14 +28,14 @@ class GBDTNode: NODE_TYPE_INDICATOR = 2 NODE_TYPE_LEAF = 3 type2typeid = { - 'indicator': NODE_TYPE_INDICATOR, - 'integer': NODE_TYPE_INT, - 'leaf': NODE_TYPE_LEAF + "indicator": NODE_TYPE_INDICATOR, + "integer": NODE_TYPE_INT, + "leaf": NODE_TYPE_LEAF, } typeid2type = { - NODE_TYPE_INDICATOR: 'indicator', - NODE_TYPE_INT: 'integer', - NODE_TYPE_LEAF: 'leaf' + NODE_TYPE_INDICATOR: "indicator", + NODE_TYPE_INT: "integer", + NODE_TYPE_LEAF: "leaf", } def __init__(self, node_type, value, gid, name, left=None, right=None): @@ -47,46 +48,46 @@ def __init__(self, node_type, value, gid, name, left=None, right=None): @classmethod def load_vertex(cls, v): - node_type = GBDTNode.type2typeid[v['type']] - name = v['name'] if (node_type != GBDTNode.NODE_TYPE_LEAF) else '' - gid = v['id'] - value = v['value'] + node_type = GBDTNode.type2typeid[v["type"]] + name = v["name"] if (node_type != GBDTNode.NODE_TYPE_LEAF) else "" + gid = v["id"] + value = v["value"] return GBDTNode(node_type, value, gid, name) @classmethod def load_turicreate_json_tree(cls, jstree): - vertices = jstree['vertices'] - edges = jstree['edges'] - vtup = map(lambda x: (x['id'], GBDTNode.load_vertex(x)), vertices) + vertices = jstree["vertices"] + edges = jstree["edges"] + vtup = map(lambda x: (x["id"], GBDTNode.load_vertex(x)), vertices) id2ver = dict(vtup) for e in edges: - src = id2ver[e['src']] - dst = id2ver[e['dst']] - val = e['value'] - if val == 'yes': + src = id2ver[e["src"]] + dst = id2ver[e["dst"]] + val = e["value"] + if val == "yes": src.left = dst else: src.right = dst - r = id2ver[0] # root node + r = id2ver[0] # root node return r def __str__(self): n = "%d:%s" % (self.gid, self.node_name) if self.node_type == GBDTNode.NODE_TYPE_INDICATOR: - n += '=' + n += "=" elif self.node_type == GBDTNode.NODE_TYPE_INT: - n += '<' + n += "<" if self.node_type == GBDTNode.NODE_TYPE_INT: n += "%f" % self.value else: n += "%s" % self.value - left = '_' + left = "_" if self.left is not None: if isinstance(self.left, GBDTNode): left = "%d" % self.left.gid else: left = self.left - right = '_' + right = "_" if self.right is not None: if isinstance(self.right, GBDTNode): right = "%d" % self.right.gid @@ -95,15 +96,15 @@ def __str__(self): n += ",%s,%s" % (left, right) return n - def traverse(self, level=''): + def traverse(self, level=""): print(level + str(self)) this_level = level if self.left: if isinstance(self.left, GBDTNode): - self.left.traverse(level=this_level + ' ') + self.left.traverse(level=this_level + " ") if self.right: if isinstance(self.right, GBDTNode): - self.right.traverse(level=this_level + ' ') + self.right.traverse(level=this_level + " ") def renumbering(self, val): self.gid = val @@ -118,7 +119,7 @@ def calculate_score(self, inp): if Config.is_debug: if Config.level == 0: print("=====================") - print((' ' * Config.level) + str(self)) + print((" " * Config.level) + str(self)) Config.level += 1 if self.node_type == GBDTNode.NODE_TYPE_LEAF: retval = self.value @@ -129,7 +130,7 @@ def calculate_score(self, inp): else: retval = self.right.calculate_score(inp) elif self.node_type == GBDTNode.NODE_TYPE_INDICATOR: - strkey, strval = self.node_name.split('=') + strkey, strval = self.node_name.split("=") inpval = str(inp[strkey]) if inpval == strval: retval = self.left.calculate_score(inp) @@ -143,10 +144,10 @@ def get_dict(self): d = { "id": self.gid, "type": GBDTNode.typeid2type[self.node_type], - "value": self.value + "value": self.value, } if self.node_type != GBDTNode.NODE_TYPE_LEAF: - d['name'] = self.node_name + d["name"] = self.node_name return d def get_all_vertice_helper(self, vhash): @@ -167,20 +168,12 @@ def get_all_vertices(self): def get_all_edge_helper(self, visited, edge_lst): visited.add(self.gid) if self.left: - edge_lst.append({ - "src" : self.gid, - "dst" : self.left.gid, - "value" : "yes" - }) + edge_lst.append({"src": self.gid, "dst": self.left.gid, "value": "yes"}) if self.left.gid not in visited: self.left.get_all_edge_helper(visited, edge_lst) if self.right: - edge_lst.append({ - "src" : self.gid, - "dst" : self.right.gid, - "value" : "no" - }) + edge_lst.append({"src": self.gid, "dst": self.right.gid, "value": "no"}) if self.right.gid not in visited: self.right.get_all_edge_helper(visited, edge_lst) @@ -193,10 +186,7 @@ def get_all_edges(self): def to_dict(self): all_vertices = self.get_all_vertices() all_edges = self.get_all_edges() - ret_dict = { - "vertices" : all_vertices, - "edges" : all_edges - } + ret_dict = {"vertices": all_vertices, "edges": all_edges} return ret_dict def is_leaf(self): @@ -219,7 +209,7 @@ def predict_one(self, record): @classmethod def parse_turicreate_json(cls, jslst): - treelst = map(lambda js : GBDTNode.load_turicreate_json_tree(js), jslst) + treelst = map(lambda js: GBDTNode.load_turicreate_json_tree(js), jslst) return treelst def predict(self, data): @@ -231,7 +221,7 @@ def predict(self, data): @classmethod def create_from_gbdt(cls, gbdt_model): - jss = [json.loads(s) for s in gbdt_model._get('trees_json')] + jss = [json.loads(s) for s in gbdt_model._get("trees_json")] retval = GBDTDecoder() retval.combination_method = 0 retval.tree_list = list(GBDTDecoder.parse_turicreate_json(jss)) @@ -245,19 +235,19 @@ def create_from_gbdt_json(cls, gbdt_json): return retval def get_json_trees(self): - dict_trees = map(lambda x : x.to_dict(), self.tree_list) - js_trees = map(lambda x : json.dumps(x), dict_trees) + dict_trees = map(lambda x: x.to_dict(), self.tree_list) + js_trees = map(lambda x: json.dumps(x), dict_trees) # ret_json = json.dumps(js_trees) return js_trees def get(self, key_to_get): - if key_to_get == 'trees_json': + if key_to_get == "trees_json": return self.get_json_trees() else: - raise Exception('Not implemented yet (get key = %s)' % key_to_get) + raise Exception("Not implemented yet (get key = %s)" % key_to_get) def save_json(self, fpath): - fp = open(fpath, 'wt') + fp = open(fpath, "wt") strjs = json.dumps(self.get_json_trees()) fp.write(strjs) fp.close() @@ -287,16 +277,27 @@ def _check_json_model_predict_consistency(self, glc_model, test_data): def test_synthetic_data(self): random.seed(0) num_rows = 1000 - sf = tc.SFrame({'num': [random.randint(0, 100) for i in range(num_rows)], - 'cat': [['a', 'b'][random.randint(0, 1)] for i in range(num_rows)]}) + sf = tc.SFrame( + { + "num": [random.randint(0, 100) for i in range(num_rows)], + "cat": [["a", "b"][random.randint(0, 1)] for i in range(num_rows)], + } + ) coeffs = [random.random(), random.random()] def make_target(row): - if row['cat'] == 'a': - return row['num'] * coeffs[0] + if row["cat"] == "a": + return row["num"] * coeffs[0] else: - return row['num'] * coeffs[1] - sf['target'] = sf.apply(make_target) - m = tc.boosted_trees_regression.create(sf, target='target', validation_set=None, random_seed=0, - max_depth=10, max_iterations=3) + return row["num"] * coeffs[1] + + sf["target"] = sf.apply(make_target) + m = tc.boosted_trees_regression.create( + sf, + target="target", + validation_set=None, + random_seed=0, + max_depth=10, + max_iterations=3, + ) self._check_json_model_predict_consistency(m, sf) diff --git a/src/python/turicreate/test/test_tree_tracking_metrics.py b/src/python/turicreate/test/test_tree_tracking_metrics.py index 209649cf41..3c1850d0b4 100644 --- a/src/python/turicreate/test/test_tree_tracking_metrics.py +++ b/src/python/turicreate/test/test_tree_tracking_metrics.py @@ -15,23 +15,31 @@ class TreeRegressionTrackingMetricsTest(unittest.TestCase): @classmethod def setUpClass(cls): - sf = tc.SFrame({'cat[1]': ['1', '1', '2', '2', '2'] * 20, - 'cat[2]': ['1', '3', '3', '1', '1'] * 20, - 'target': [random.random() for i in range(100)]}) + sf = tc.SFrame( + { + "cat[1]": ["1", "1", "2", "2", "2"] * 20, + "cat[2]": ["1", "3", "3", "1", "1"] * 20, + "target": [random.random() for i in range(100)], + } + ) cls.train, cls.test = sf.random_split(0.5, seed=5) - cls.default_metric = ['rmse', 'max_error'] - cls.test_metrics = ['rmse', 'max_error'] - cls.models = {'bst': tc.regression.boosted_trees_regression, - 'rf': tc.regression.random_forest_regression, - 'dt': tc.regression.decision_tree_regression} + cls.default_metric = ["rmse", "max_error"] + cls.test_metrics = ["rmse", "max_error"] + cls.models = { + "bst": tc.regression.boosted_trees_regression, + "rf": tc.regression.random_forest_regression, + "dt": tc.regression.decision_tree_regression, + } return cls def _metric_display_name(self, metric): - metric_display_names = {'accuracy' : 'Accuracy', - 'auc' : 'Area Under Curve', - 'log_loss' : 'Log Loss', - 'max_error' : 'Max Error', - 'rmse' : 'Root-Mean-Square Error'} + metric_display_names = { + "accuracy": "Accuracy", + "auc": "Area Under Curve", + "log_loss": "Log Loss", + "max_error": "Max Error", + "rmse": "Root-Mean-Square Error", + } if metric in metric_display_names: return metric_display_names[metric] else: @@ -39,9 +47,11 @@ def _metric_display_name(self, metric): def _run_test(self, train, valid, metric): for (name, model) in self.models.items(): - m = model.create(train, 'target', validation_set=valid, max_depth=2, metric=metric) + m = model.create( + train, "target", validation_set=valid, max_depth=2, metric=metric + ) history_header = m.progress.column_names() - if metric is 'auto': + if metric is "auto": metric = self.default_metric if type(metric) is str: @@ -49,23 +59,25 @@ def _run_test(self, train, valid, metric): elif type(metric) is list: test_metrics = metric else: - raise TypeError('Invalid metric type') + raise TypeError("Invalid metric type") for name in test_metrics: - column_name = 'Training %s' % self._metric_display_name(name) + column_name = "Training %s" % self._metric_display_name(name) self.assertTrue(column_name in history_header) final_eval = m.evaluate(train, name)[name] progress_evals = m.progress[column_name] - self.assertAlmostEqual(float(progress_evals[-1]), final_eval, delta=1e-4) + self.assertAlmostEqual( + float(progress_evals[-1]), final_eval, delta=1e-4 + ) if valid is not None: - column_name = 'Validation %s' % self._metric_display_name(name) + column_name = "Validation %s" % self._metric_display_name(name) self.assertTrue(column_name in history_header) def test_auto_metric(self): - self._run_test(self.train, self.test, 'auto') + self._run_test(self.train, self.test, "auto") def test_auto_metric_no_validation(self): - self._run_test(self.train, None, 'auto') + self._run_test(self.train, None, "auto") def test_single_metric(self): for m in self.test_metrics: @@ -82,14 +94,20 @@ def test_tracking_metrics_consistency(self): # Train model with increasing max_iterations for ntrees in [1, 2, 3]: - m = self.models['rf'].create(self.train, 'target', validation_set=self.test, max_iterations=ntrees, metric=self.test_metrics, - random_seed=1) + m = self.models["rf"].create( + self.train, + "target", + validation_set=self.test, + max_iterations=ntrees, + metric=self.test_metrics, + random_seed=1, + ) rf_models.append(m) m_last = rf_models[-1] for name in self.test_metrics: - train_column_name = 'Training %s' % self._metric_display_name(name) - test_column_name = 'Validation %s' % self._metric_display_name(name) + train_column_name = "Training %s" % self._metric_display_name(name) + test_column_name = "Validation %s" % self._metric_display_name(name) train_evals = [float(x) for x in m_last.progress[train_column_name]] test_evals = [float(x) for x in m_last.progress[test_column_name]] # Check the final model's metric at iteration i against the ith model's last metric @@ -104,46 +122,60 @@ def test_tracking_metrics_consistency(self): class BinaryTreeClassifierTrackingMetricsTest(TreeRegressionTrackingMetricsTest): @classmethod def setUpClass(cls): - sf = tc.SFrame({'cat[1]': ['1', '1', '2', '2', '2'] * 20, - 'cat[2]': ['1', '3', '3', '1', '1'] * 20, - 'target': ['0', '1'] * 50}) + sf = tc.SFrame( + { + "cat[1]": ["1", "1", "2", "2", "2"] * 20, + "cat[2]": ["1", "3", "3", "1", "1"] * 20, + "target": ["0", "1"] * 50, + } + ) cls.train, cls.test = sf.random_split(0.5, seed=5) - cls.default_metric = ['log_loss', 'accuracy'] - cls.test_metrics = ['log_loss', 'accuracy', 'auc'] - cls.models = {'bst': tc.classifier.boosted_trees_classifier, - 'rf': tc.classifier.random_forest_classifier, - 'dt': tc.classifier.decision_tree_classifier} + cls.default_metric = ["log_loss", "accuracy"] + cls.test_metrics = ["log_loss", "accuracy", "auc"] + cls.models = { + "bst": tc.classifier.boosted_trees_classifier, + "rf": tc.classifier.random_forest_classifier, + "dt": tc.classifier.decision_tree_classifier, + } return cls def test_unseen_label_in_validation(self): test = self.test.copy() l = len(test) - test['target'] = test['target'].head(l - 10).append(tc.SArray(['unknown'] * 10)) + test["target"] = test["target"].head(l - 10).append(tc.SArray(["unknown"] * 10)) self._run_test(self.train, test, self.test_metrics) def test_auto_metric_unseen_label_in_validation(self): test = self.test.copy() l = len(test) - test['target'] = test['target'].head(l - 10).append(tc.SArray(['unknown'] * 10)) - self._run_test(self.train, test, 'auto') + test["target"] = test["target"].head(l - 10).append(tc.SArray(["unknown"] * 10)) + self._run_test(self.train, test, "auto") -class MultiClassTreeClassifierTrackingMetricsTest(BinaryTreeClassifierTrackingMetricsTest): +class MultiClassTreeClassifierTrackingMetricsTest( + BinaryTreeClassifierTrackingMetricsTest +): @classmethod def setUpClass(cls): - sf = tc.SFrame({'cat[1]': ['1', '1', '2', '2', '2'] * 20, - 'cat[2]': ['1', '3', '3', '1', '1'] * 20, - 'target': ['0', '1', '2', '3'] * 25}) + sf = tc.SFrame( + { + "cat[1]": ["1", "1", "2", "2", "2"] * 20, + "cat[2]": ["1", "3", "3", "1", "1"] * 20, + "target": ["0", "1", "2", "3"] * 25, + } + ) cls.train, cls.test = sf.random_split(0.5, seed=5) - cls.default_metric = ['log_loss', 'accuracy'] - cls.test_metrics = ['log_loss', 'accuracy'] - cls.models = {'bst': tc.classifier.boosted_trees_classifier, - 'rf': tc.classifier.random_forest_classifier, - 'dt': tc.classifier.decision_tree_classifier} + cls.default_metric = ["log_loss", "accuracy"] + cls.test_metrics = ["log_loss", "accuracy"] + cls.models = { + "bst": tc.classifier.boosted_trees_classifier, + "rf": tc.classifier.random_forest_classifier, + "dt": tc.classifier.decision_tree_classifier, + } return cls def test_auc_exception(self): test = self.test.copy() l = len(test) - test['target'] = test['target'].head(l - 10).append(tc.SArray(['unknown'] * 10)) - self.assertRaises(ToolkitError, lambda: self._run_test(self.train, test, 'auc')) + test["target"] = test["target"].head(l - 10).append(tc.SArray(["unknown"] * 10)) + self.assertRaises(ToolkitError, lambda: self._run_test(self.train, test, "auc")) diff --git a/src/python/turicreate/test/test_unicode_strings.py b/src/python/turicreate/test/test_unicode_strings.py index 05ff52578b..57a17382ce 100644 --- a/src/python/turicreate/test/test_unicode_strings.py +++ b/src/python/turicreate/test/test_unicode_strings.py @@ -14,30 +14,30 @@ import unittest import turicreate as tc -class UnicodeStringTest(unittest.TestCase): +class UnicodeStringTest(unittest.TestCase): def test_unicode_column_accessor(self): - sf = tc.SFrame({'a': range(100)}) - self.assertEqual(sf[u'a'][0], sf['a'][0]) + sf = tc.SFrame({"a": range(100)}) + self.assertEqual(sf[u"a"][0], sf["a"][0]) def test_unicode_unpack_prefix(self): - sf = tc.SFrame({'a':[{'x':1}, {'x':2}, {'x':3}]}) - sf = sf.unpack('a', u'\u00aa') + sf = tc.SFrame({"a": [{"x": 1}, {"x": 2}, {"x": 3}]}) + sf = sf.unpack("a", u"\u00aa") for col in sf.column_names(): if six.PY2: # column names come out as str - self.assertTrue(col.startswith(u'\u00aa'.encode('utf-8'))) + self.assertTrue(col.startswith(u"\u00aa".encode("utf-8"))) else: # column names come out as unicode - self.assertTrue(col.startswith(u'\u00aa')) + self.assertTrue(col.startswith(u"\u00aa")) def test_unicode_column_construction(self): - sf = tc.SFrame({u'\u00aa': [1, 2, 3]}) - self.assertEqual(sf[u'\u00aa'][0], 1) + sf = tc.SFrame({u"\u00aa": [1, 2, 3]}) + self.assertEqual(sf[u"\u00aa"][0], 1) def test_access_nonexistent_column(self): - sf = tc.SFrame({u'\u00aa': [1,2,3], 'a': [4,5,6]}) + sf = tc.SFrame({u"\u00aa": [1, 2, 3], "a": [4, 5, 6]}) with self.assertRaises(RuntimeError): - sf['b'] + sf["b"] with self.assertRaises(RuntimeError): - sf[u'\u00ab'] + sf[u"\u00ab"] diff --git a/src/python/turicreate/test/test_util.py b/src/python/turicreate/test/test_util.py index 6f6f6f081c..37d818c6c8 100644 --- a/src/python/turicreate/test/test_util.py +++ b/src/python/turicreate/test/test_util.py @@ -24,33 +24,31 @@ class UtilTests(unittest.TestCase): def test_archive_utils(self): # Arrange sf = SFrame([1, 2, 3, 4, 5]) - dir = tempfile.mkdtemp(prefix='archive-tests') + dir = tempfile.mkdtemp(prefix="archive-tests") try: sf.save(dir) # Act & Assert self.assertTrue(glutil.is_directory_archive(dir)) - self.assertEqual(glutil.get_archive_type(dir), 'sframe') - self.assertFalse(glutil.is_directory_archive('/tmp')) - self.assertRaises(TypeError, lambda: glutil.get_archive_type('/tmp')) + self.assertEqual(glutil.get_archive_type(dir), "sframe") + self.assertFalse(glutil.is_directory_archive("/tmp")) + self.assertRaises(TypeError, lambda: glutil.get_archive_type("/tmp")) finally: shutil.rmtree(dir) def test_crossproduct(self): s = util.SFrameComparer() - d = {'opt1': [1, 2, 3], - 'opt2': ['a', 'b']} + d = {"opt1": [1, 2, 3], "opt2": ["a", "b"]} actual = glutil.crossproduct(d) - actual = actual.sort('opt1') - expected = SFrame({'opt1': [1, 1, 2, 2, 3, 3], - 'opt2': ['a', 'b', 'a', 'b', 'a', 'b']}) + actual = actual.sort("opt1") + expected = SFrame( + {"opt1": [1, 1, 2, 2, 3, 3], "opt2": ["a", "b", "a", "b", "a", "b"]} + ) # Check columns individually since there is no # guaranteed ordering among columns. for k in d.keys(): - s._assert_sarray_equal(actual[k], - expected[k]) - + s._assert_sarray_equal(actual[k], expected[k]) def _validate_gl_object_type(self, obj, expected): with util.TempDirectory() as temp_dir: @@ -59,18 +57,21 @@ def _validate_gl_object_type(self, obj, expected): self.assertEqual(t, expected) def test_get_turicreate_object_type(self): - sf = SFrame({"a":[1,2]}) - self._validate_gl_object_type(sf, 'sframe') + sf = SFrame({"a": [1, 2]}) + self._validate_gl_object_type(sf, "sframe") - sa = SArray([1,2]) - self._validate_gl_object_type(sa, 'sarray') + sa = SArray([1, 2]) + self._validate_gl_object_type(sa, "sarray") d = SFrame( - {"__src_id":[175343, 163607, 44041, 101370, 64892], - "__dst_id":[1011, 7928, 7718, 12966, 11080]}) + { + "__src_id": [175343, 163607, 44041, 101370, 64892], + "__dst_id": [1011, 7928, 7718, 12966, 11080], + } + ) g = SGraph() - self._validate_gl_object_type(g, 'sgraph') + self._validate_gl_object_type(g, "sgraph") def test_sframe_equals(self): # Empty SFrames should be equal @@ -79,26 +80,26 @@ def test_sframe_equals(self): glutil._assert_sframe_equal(sf_a, sf_b) - the_data = [i for i in range(0,10)] + the_data = [i for i in range(0, 10)] sf = SFrame() - sf['ints'] = SArray(data=the_data, dtype=int) - sf['floats'] = SArray(data=the_data, dtype=float) - sf['floats'] = sf['floats'] * .5 - sf['strings'] = SArray(data=the_data, dtype=str) - sf['strings'] = sf['strings'].apply(lambda x: x+x+x) + sf["ints"] = SArray(data=the_data, dtype=int) + sf["floats"] = SArray(data=the_data, dtype=float) + sf["floats"] = sf["floats"] * 0.5 + sf["strings"] = SArray(data=the_data, dtype=str) + sf["strings"] = sf["strings"].apply(lambda x: x + x + x) # Make sure these aren't pointing to the same SFrame - sf_a = sf.filter_by([43], 'ints', exclude=True) - sf_b = sf.filter_by([43], 'ints', exclude=True) + sf_a = sf.filter_by([43], "ints", exclude=True) + sf_b = sf.filter_by([43], "ints", exclude=True) glutil._assert_sframe_equal(sf_a, sf_b) # Difference in number of columns - sf_a['extra'] = SArray(data=the_data) + sf_a["extra"] = SArray(data=the_data) with self.assertRaises(AssertionError): glutil._assert_sframe_equal(sf_a, sf_b) - del sf_a['extra'] + del sf_a["extra"] glutil._assert_sframe_equal(sf_a, sf_b) # Difference in number of rows @@ -106,40 +107,40 @@ def test_sframe_equals(self): glutil._assert_sframe_equal(sf_a, sf_b[0:5]) # Difference in types - sf_a['diff_type'] = sf_a['ints'].astype(str) - sf_b['diff_type'] = sf_b['ints'] + sf_a["diff_type"] = sf_a["ints"].astype(str) + sf_b["diff_type"] = sf_b["ints"] with self.assertRaises(AssertionError): glutil._assert_sframe_equal(sf_a, sf_b) - del sf_a['diff_type'] - del sf_b['diff_type'] + del sf_a["diff_type"] + del sf_b["diff_type"] glutil._assert_sframe_equal(sf_a, sf_b) # Difference in column name - sf_a.rename({'strings':'string'}, inplace=True) + sf_a.rename({"strings": "string"}, inplace=True) with self.assertRaises(AssertionError): glutil._assert_sframe_equal(sf_a, sf_b) glutil._assert_sframe_equal(sf_a, sf_b, check_column_names=False) - sf_a.rename({'string':'strings'}, inplace=True) + sf_a.rename({"string": "strings"}, inplace=True) glutil._assert_sframe_equal(sf_a, sf_b) - sf_a.rename({'ints':'floats1'}, inplace=True) - sf_a.rename({'floats':'ints'}, inplace=True) - sf_a.rename({'floats1':'floats'}, inplace=True) + sf_a.rename({"ints": "floats1"}, inplace=True) + sf_a.rename({"floats": "ints"}, inplace=True) + sf_a.rename({"floats1": "floats"}, inplace=True) glutil._assert_sframe_equal(sf_a, sf_b, check_column_names=False) - sf_a = sf.filter_by([43], 'ints', exclude=True) + sf_a = sf.filter_by([43], "ints", exclude=True) # Difference in column order - sf_a.swap_columns('strings', 'ints', inplace=True) + sf_a.swap_columns("strings", "ints", inplace=True) with self.assertRaises(AssertionError): glutil._assert_sframe_equal(sf_a, sf_b) glutil._assert_sframe_equal(sf_a, sf_b, check_column_order=False) - sf_a.swap_columns('strings', 'ints', inplace=True) + sf_a.swap_columns("strings", "ints", inplace=True) glutil._assert_sframe_equal(sf_a, sf_b) # Difference in row order @@ -151,45 +152,57 @@ def test_sframe_equals(self): glutil._assert_sframe_equal(sf_a, sf_b, check_row_order=False) # Difference in column order AND row order - sf_a.swap_columns('floats', 'strings', inplace=True) + sf_a.swap_columns("floats", "strings", inplace=True) with self.assertRaises(AssertionError): glutil._assert_sframe_equal(sf_a, sf_b) - glutil._assert_sframe_equal(sf_a, sf_b, check_column_order=False, check_row_order=False) + glutil._assert_sframe_equal( + sf_a, sf_b, check_column_order=False, check_row_order=False + ) # Column order, row order, names - sf_a.rename({'floats':'foo','strings':'bar','ints':'baz'}, inplace=True) + sf_a.rename({"floats": "foo", "strings": "bar", "ints": "baz"}, inplace=True) with self.assertRaises(AssertionError): glutil._assert_sframe_equal(sf_a, sf_b) # Illegal stuff with self.assertRaises(ValueError): - glutil._assert_sframe_equal(sf_a, sf_b, check_column_names=False, check_column_order=False) + glutil._assert_sframe_equal( + sf_a, sf_b, check_column_names=False, check_column_order=False + ) with self.assertRaises(ValueError): - glutil._assert_sframe_equal(sf_a, sf_b, check_column_names=False, check_column_order=False, check_row_order=False) + glutil._assert_sframe_equal( + sf_a, + sf_b, + check_column_names=False, + check_column_order=False, + check_row_order=False, + ) with self.assertRaises(TypeError): - glutil._assert_sframe_equal(sf_b['floats'], sf_a['foo']) + glutil._assert_sframe_equal(sf_b["floats"], sf_a["foo"]) def test_get_temp_file_location(self): from ..util import _get_temp_file_location from ..util import _convert_slashes + location = _get_temp_file_location() self.assertTrue(os.path.isdir(location)) - tmp = tempfile.mkdtemp(prefix='test_gl_util') - default_tmp = get_runtime_config()['TURI_CACHE_FILE_LOCATIONS'] + tmp = tempfile.mkdtemp(prefix="test_gl_util") + default_tmp = get_runtime_config()["TURI_CACHE_FILE_LOCATIONS"] try: - set_runtime_config('TURI_CACHE_FILE_LOCATIONS', tmp) + set_runtime_config("TURI_CACHE_FILE_LOCATIONS", tmp) location = _convert_slashes(_get_temp_file_location()) self.assertTrue(location.startswith(_convert_slashes(tmp))) finally: shutil.rmtree(tmp) - set_runtime_config('TURI_CACHE_FILE_LOCATIONS', default_tmp) + set_runtime_config("TURI_CACHE_FILE_LOCATIONS", default_tmp) def test_make_temp_directory(self): from ..util import _make_temp_directory, _get_temp_file_location + tmp_root = _get_temp_file_location() location = _make_temp_directory(prefix=None) @@ -199,7 +212,7 @@ def test_make_temp_directory(self): finally: shutil.rmtree(location) - prefix = 'abc_' + prefix = "abc_" location = _make_temp_directory(prefix=prefix) try: self.assertTrue(os.path.isdir(location)) @@ -210,6 +223,7 @@ def test_make_temp_directory(self): def test_make_temp_filename(self): from ..util import _make_temp_filename, _get_temp_file_location + tmp_root = _get_temp_file_location() location = _make_temp_filename(prefix=None) @@ -218,7 +232,7 @@ def test_make_temp_filename(self): self.assertTrue(location.startswith(tmp_root)) self.assertTrue(isinstance(location, str)) - prefix = 'abc_' + prefix = "abc_" location = _make_temp_filename(prefix=prefix) self.assertFalse(os.path.isfile(location)) self.assertFalse(os.path.exists(location)) diff --git a/src/python/turicreate/test/util.py b/src/python/turicreate/test/util.py index f555012361..7e6024121c 100644 --- a/src/python/turicreate/test/util.py +++ b/src/python/turicreate/test/util.py @@ -18,12 +18,14 @@ import sys from six import StringIO -class SFrameComparer(): + +class SFrameComparer: """ Helper class for comparing sframe and sarrays Adapted from test_sframe.py """ + def _assert_sgraph_equal(self, sg1, sg2): self._assert_sframe_equal(sg1.vertices, sg2.vertices) self._assert_sframe_equal(sg1.edges, sg2.edges) @@ -50,12 +52,13 @@ def _assert_sarray_equal(self, sa1, sa2): assert key in v1 assert v1[key] == v2[key] - elif (hasattr(v1, "__iter__")): + elif hasattr(v1, "__iter__"): assert len(v1) == len(v2) for j in range(len(v1)): - t1 = v1[j]; t2 = v2[j] - if (type(t1) == float): - if (math.isnan(t1)): + t1 = v1[j] + t2 = v2[j] + if type(t1) == float: + if math.isnan(t1): assert math.isnan(t2) else: assert t1 == t2 @@ -64,12 +67,14 @@ def _assert_sarray_equal(self, sa1, sa2): else: assert v1 == v2 -class SubstringMatcher(): + +class SubstringMatcher: """ Helper class for testing substring matching Code adapted from http://www.michaelpollmeier.com/python-mock-how-to-assert-a-substring-of-logger-output/ """ + def __init__(self, containing): self.containing = containing.lower() @@ -80,21 +85,26 @@ def __unicode__(self): return 'a string containing "%s"' % self.containing def __str__(self): - return unicode(self).encode('utf-8') + return unicode(self).encode("utf-8") + __repr__ = __unicode__ -class TempDirectory(): + +class TempDirectory: name = None + def __init__(self): self.name = tempfile.mkdtemp() + def __enter__(self): return self.name + def __exit__(self, type, value, traceback): if self.name is not None: shutil.rmtree(self.name) -def uniform_string_column(n, word_length, alphabet_size, missingness=0.): +def uniform_string_column(n, word_length, alphabet_size, missingness=0.0): """ Return an SArray of strings constructed uniformly randomly from the first 'num_letters' of the lower case alphabet. @@ -129,12 +139,12 @@ def uniform_string_column(n, word_length, alphabet_size, missingness=0.): word = [] for j in range(word_length): word.append(np.random.choice(letters)) - result.append(''.join(word)) + result.append("".join(word)) return SArray(result) -def uniform_numeric_column(n, col_type=float, range=(0, 1), missingness=0.): +def uniform_numeric_column(n, col_type=float, range=(0, 1), missingness=0.0): """ Return an SArray of uniformly random numeric values. @@ -172,19 +182,18 @@ def uniform_numeric_column(n, col_type=float, range=(0, 1), missingness=0.): def assert_longer_verbose_logs(function_call, args, kwargs): # Run command with verbose=False - kwargs['verbose'] = False + kwargs["verbose"] = False old_stdout = sys.stdout sys.stdout = stdout_without_verbose = StringIO() function_call(*args, **kwargs) sys.stdout = old_stdout without_verbose = stdout_without_verbose.getvalue() # Run command with verbose=True - kwargs['verbose'] = True + kwargs["verbose"] = True old_stdout = sys.stdout sys.stdout = stdout_with_verbose = StringIO() function_call(*args, **kwargs) sys.stdout = old_stdout with_verbose = stdout_with_verbose.getvalue() # Assert that verbose logs are longer - assert (len(with_verbose) > len(without_verbose)) - + assert len(with_verbose) > len(without_verbose) diff --git a/src/python/turicreate/toolkits/__init__.py b/src/python/turicreate/toolkits/__init__.py index 0a94f33739..0f74bb1547 100644 --- a/src/python/turicreate/toolkits/__init__.py +++ b/src/python/turicreate/toolkits/__init__.py @@ -4,8 +4,8 @@ # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Turi Create offers a broad set of essential machine learning models as well as task specific toolkits that let you to get started quickly while still giving you the ability to go back and customize models later. -''' +""" diff --git a/src/python/turicreate/toolkits/_coreml_utils.py b/src/python/turicreate/toolkits/_coreml_utils.py index a7ccbb8030..0c8c297b99 100644 --- a/src/python/turicreate/toolkits/_coreml_utils.py +++ b/src/python/turicreate/toolkits/_coreml_utils.py @@ -6,12 +6,19 @@ from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ + + def _mlmodel_short_description(model_type): """ Returns a string to be used in an Core ML model's description metadata. """ from turicreate import __version__ - return '%s created by Turi Create (version %s)' % (model_type.capitalize(), __version__) + + return "%s created by Turi Create (version %s)" % ( + model_type.capitalize(), + __version__, + ) + def _get_tc_version_info(): """ @@ -19,24 +26,27 @@ def _get_tc_version_info(): """ from turicreate import __version__ import platform + return { - 'com.github.apple.turicreate.version': __version__, - 'com.github.apple.os.platform': platform.platform() + "com.github.apple.turicreate.version": __version__, + "com.github.apple.os.platform": platform.platform(), } + def _get_model_metadata(model_class, metadata, version=None): """ Returns user-defined metadata, making sure information all models should have is also available, as a dictionary """ info = _get_tc_version_info() - info['type'] = model_class + info["type"] = model_class if version is not None: - info['version'] = str(version) + info["version"] = str(version) if metadata is not None: info.update(metadata) return info + def _set_model_metadata(mlmodel, model_class, metadata, version=None): """ Sets user-defined metadata, making sure information all models should have diff --git a/src/python/turicreate/toolkits/_data_zoo.py b/src/python/turicreate/toolkits/_data_zoo.py index 2b30eabc48..0c4db076b0 100644 --- a/src/python/turicreate/toolkits/_data_zoo.py +++ b/src/python/turicreate/toolkits/_data_zoo.py @@ -16,25 +16,26 @@ from ._pre_trained_models import _download_and_checksum_files from ._pre_trained_models import _get_cache_dir -DATA_URL_ROOT = 'https://docs-assets.developer.apple.com/turicreate/data/' +DATA_URL_ROOT = "https://docs-assets.developer.apple.com/turicreate/data/" + class OneShotObjectDetectorBackgroundData(object): def __init__(self): self.source_tar_filename = "one_shot_backgrounds.sarray.tar" self.destination_tar_filename = "one_shot_backgrounds.sarray.tar" self.destination_sarray_filename = "one_shot_backgrounds.sarray" - self.destination_sarray_path = _os.path.join(_get_cache_dir("data"), - self.destination_sarray_filename) - self.sarray_url = _urlparse.urljoin( - DATA_URL_ROOT, self.source_tar_filename) + self.destination_sarray_path = _os.path.join( + _get_cache_dir("data"), self.destination_sarray_filename + ) + self.sarray_url = _urlparse.urljoin(DATA_URL_ROOT, self.source_tar_filename) self.sarray_url_md5_pairs = [ (self.sarray_url, "08830e90771897c1cd187a07cdcb52b4") - ] + ] def get_backgrounds(self): tarfile_path = _download_and_checksum_files( self.sarray_url_md5_pairs, _get_cache_dir("data") - )[0] + )[0] backgrounds_tar = _tarfile.open(tarfile_path) try: backgrounds = _tc.SArray(self.destination_sarray_path) diff --git a/src/python/turicreate/toolkits/_decision_tree.py b/src/python/turicreate/toolkits/_decision_tree.py index f321f6e74b..41feb41afd 100644 --- a/src/python/turicreate/toolkits/_decision_tree.py +++ b/src/python/turicreate/toolkits/_decision_tree.py @@ -10,17 +10,26 @@ from turicreate.toolkits._internal_utils import _numeric_param_check_range import sys as _sys + if _sys.version_info.major == 3: long = int -#----------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- # # Single Node (of a decision tree) # -#----------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- class Node(object): - def __init__(self, node_id, split_feature, value, node_type, - left_id = None, right_id = None, missing_id = None): + def __init__( + self, + node_id, + split_feature, + value, + node_type, + left_id=None, + right_id=None, + missing_id=None, + ): """ Simple class to make a node for a tree. @@ -40,7 +49,7 @@ def __init__(self, node_id, split_feature, value, node_type, self.node_id = node_id # If not a leaf node. if split_feature is not None: - self.split_feature_column = split_feature[0] + self.split_feature_column = split_feature[0] self.split_feature_index = split_feature[1] else: self.split_feature_column = None @@ -49,7 +58,7 @@ def __init__(self, node_id, split_feature, value, node_type, # float/int/str or leaf. # Set to leaf if the node type is leaf. self.node_type = node_type - is_leaf = node_type == u'leaf' + is_leaf = node_type == u"leaf" self.is_leaf = is_leaf self.value = value @@ -77,7 +86,7 @@ def __repr__(self): out += "\nValue : %s" % self.value return out - def get_decision(self, child, is_missing = False): + def get_decision(self, child, is_missing=False): """ Get the decision from this node to a child node. @@ -112,14 +121,14 @@ def get_decision(self, child, is_missing = False): value = None return { - "node_id" : self.node_id, - "node_type" : self.node_type, - "feature" : feature, - "index" : index, - "sign" : sign, - "value" : value, - "child_id" : child.node_id, - "is_missing" : is_missing + "node_id": self.node_id, + "node_type": self.node_type, + "feature": feature, + "index": index, + "sign": sign, + "value": value, + "child_id": child.node_id, + "is_missing": is_missing, } def to_dict(self): @@ -133,25 +142,28 @@ def to_dict(self): """ out = {} for key in self.__dict__.keys(): - if key not in ['left', 'right', 'missing', 'parent']: + if key not in ["left", "right", "missing", "parent"]: out[key] = self.__dict__[key] return out def __eq__(self, node): - return (self.node_id == node.node_id) and\ - (self.value == node.value) and\ - (self.split_feature_column == node.split_feature_column) and\ - (self.is_leaf == node.is_leaf) and\ - (self.left_id == node.left_id) and\ - (self.missing_id == node.missing_id) and\ - (self.right_id == node.right_id) and\ - (self.num_examples == node.num_examples) - -#----------------------------------------------------------------------------- + return ( + (self.node_id == node.node_id) + and (self.value == node.value) + and (self.split_feature_column == node.split_feature_column) + and (self.is_leaf == node.is_leaf) + and (self.left_id == node.left_id) + and (self.missing_id == node.missing_id) + and (self.right_id == node.right_id) + and (self.num_examples == node.num_examples) + ) + + +# ----------------------------------------------------------------------------- # # Decision Tree (of nodes) # -#----------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- class DecisionTree: def __init__(self): """ @@ -183,16 +195,18 @@ def __init__(self): pass @classmethod - def from_model(cls, model, tree_id = 0): + def from_model(cls, model, tree_id=0): import turicreate as _tc import json as _json - _raise_error_if_not_of_type(tree_id, [int,long], "tree_id") + _raise_error_if_not_of_type(tree_id, [int, long], "tree_id") _numeric_param_check_range("tree_id", tree_id, 0, model.num_trees - 1) tree = DecisionTree() tree_str = _tc.extensions._xgboost_get_tree(model.__proxy__, tree_id) - metadata_mapping = _tc.extensions._supervised_learning._get_metadata_mapping(model.__proxy__) + metadata_mapping = _tc.extensions._supervised_learning._get_metadata_mapping( + model.__proxy__ + ) trees_json = _json.loads(tree_str) # Parse the tree from the JSON. @@ -200,8 +214,12 @@ def from_model(cls, model, tree_id = 0): tree.root_id = 0 # Keep track of the attributes. - for key in {"num_examples", "num_features", "num_unpacked_features", - "max_depth"}: + for key in { + "num_examples", + "num_features", + "num_unpacked_features", + "max_depth", + }: setattr(tree, key, model._get(key)) return tree @@ -224,18 +242,19 @@ def _parse_tree_json_vertices(self, vertices, metadata_mapping): nodes = {} for v in vertices: - node_id = v.get('id', None) - split_feature = v.get('name', None) + node_id = v.get("id", None) + split_feature = v.get("name", None) if split_feature is not None: idx = int(split_feature.strip("{").strip("}")) split_feature = metadata_mapping[idx] - value = v.get('value', None) - node_type = v.get('type', None) - left_id = v.get('yes_child', None) - right_id = v.get('no_child', None) - missing_id = v.get('missing_child', None) - nodes[node_id] = Node(node_id, split_feature, value, node_type, - left_id, right_id, missing_id) + value = v.get("value", None) + node_type = v.get("type", None) + left_id = v.get("yes_child", None) + right_id = v.get("no_child", None) + missing_id = v.get("missing_child", None) + nodes[node_id] = Node( + node_id, split_feature, value, node_type, left_id, right_id, missing_id + ) return nodes def _make_tree(self, trees_json, metadata_mapping): @@ -249,36 +268,23 @@ def _make_tree(self, trees_json, metadata_mapping): for nid, node in self.nodes.items(): if not node.is_leaf: e = [ - { - 'src' : node.node_id, - 'dst' : node.left_id, - 'value' : 'left' - }, - - { - 'src' : node.node_id, - 'dst' : node.right_id, - 'value' : 'right' - }, - { - 'src' : node.node_id, - 'dst' : node.missing_id, - 'value' : 'missing' - }, - ] + {"src": node.node_id, "dst": node.left_id, "value": "left"}, + {"src": node.node_id, "dst": node.right_id, "value": "right"}, + {"src": node.node_id, "dst": node.missing_id, "value": "missing"}, + ] edges += e # Now, make a tree from the edges. for e in edges: - src = e['src'] - dst = e['dst'] - value = e['value'] + src = e["src"] + dst = e["dst"] + value = e["value"] # Left/Right pointers. - if value == 'left': + if value == "left": self.nodes[src].left_id = dst self.nodes[src].left = self.nodes[dst] - elif value == 'right': + elif value == "right": self.nodes[src].right_id = dst self.nodes[src].right = self.nodes[dst] else: @@ -297,7 +303,7 @@ def _make_tree(self, trees_json, metadata_mapping): self._root_id = n.node_id break - def to_json(self, root_id = 0, output = {}): + def to_json(self, root_id=0, output={}): """ Recursive function to dump this tree as a json blob. @@ -357,17 +363,17 @@ def to_json(self, root_id = 0, output = {}): 'split_feature_index': 'count_sum', 'value': 22.5} """ - _raise_error_if_not_of_type(root_id, [int,long], "root_id") + _raise_error_if_not_of_type(root_id, [int, long], "root_id") _numeric_param_check_range("root_id", root_id, 0, self.num_nodes - 1) node = self.nodes[root_id] output = node.to_dict() if node.left_id is not None: j = node.left_id - output['left'] = self.to_json(j, output) + output["left"] = self.to_json(j, output) if node.right_id is not None: j = node.right_id - output['right'] = self.to_json(j, output) + output["right"] = self.to_json(j, output) return output def get_prediction_score(self, node_id): @@ -395,12 +401,12 @@ def get_prediction_score(self, node_id): None """ - _raise_error_if_not_of_type(node_id, [int,long], "node_id") + _raise_error_if_not_of_type(node_id, [int, long], "node_id") _numeric_param_check_range("node_id", node_id, 0, self.num_nodes - 1) node = self.nodes[node_id] return None if node.is_leaf is False else node.value - def get_prediction_path(self, node_id, missing_id = []): + def get_prediction_path(self, node_id, missing_id=[]): """ Return the prediction path from this node to the parent node. @@ -431,24 +437,24 @@ def get_prediction_path(self, node_id, missing_id = []): 'sign': '<=', 'value': 146.5}] """ - _raise_error_if_not_of_type(node_id, [int,long], "node_id") + _raise_error_if_not_of_type(node_id, [int, long], "node_id") _numeric_param_check_range("node_id", node_id, 0, self.num_nodes - 1) def _deduplicate_path(path): - s_nodes = {} # super_nodes + s_nodes = {} # super_nodes s_path = [] # paths of super nodes. for node in path: - feature = node['feature'] - index = node['index'] + feature = node["feature"] + index = node["index"] if (feature, index) not in s_nodes: s_nodes[feature, index] = node s_path.append(node) else: s_node = s_nodes[feature, index] - s_sign = s_node['sign'] - sign = node['sign'] - value = node['value'] + s_sign = s_node["sign"] + sign = node["sign"] + value = node["value"] # Supernode has no range. if s_sign == "<": diff --git a/src/python/turicreate/toolkits/_evaluate_utils.py b/src/python/turicreate/toolkits/_evaluate_utils.py index 074e0b1210..23ac21e9f4 100644 --- a/src/python/turicreate/toolkits/_evaluate_utils.py +++ b/src/python/turicreate/toolkits/_evaluate_utils.py @@ -2,88 +2,141 @@ import math import turicreate as _tc + def entropy(probs): - return _reduce(lambda x, y: x + (y*math.log(1/y, 2) if y > 0 else 0) , probs, 0) / math.log(len(probs),2) + return _reduce( + lambda x, y: x + (y * math.log(1 / y, 2) if y > 0 else 0), probs, 0 + ) / math.log(len(probs), 2) + def confidence(probs): return max(probs) + def relative_confidence(probs): lp = len(probs) - return probs[lp-1] - probs[lp-2] + return probs[lp - 1] - probs[lp - 2] + def get_confusion_matrix(extended_test, labels): - #Init a matrix - sf_confusion_matrix = {'label':[], 'predicted_label':[], 'prob_default':[]} + # Init a matrix + sf_confusion_matrix = {"label": [], "predicted_label": [], "prob_default": []} for target_l in labels: for predicted_l in labels: - sf_confusion_matrix['label'].append(target_l) - sf_confusion_matrix['predicted_label'].append(predicted_l) - sf_confusion_matrix['prob_default'].append(0) + sf_confusion_matrix["label"].append(target_l) + sf_confusion_matrix["predicted_label"].append(predicted_l) + sf_confusion_matrix["prob_default"].append(0) sf_confusion_matrix = _tc.SFrame(sf_confusion_matrix) - sf_confusion_matrix = sf_confusion_matrix.join(extended_test.groupby(['label', 'predicted_label'], {'count' :_tc.aggregate.COUNT}), how='left', on=['label','predicted_label']) - sf_confusion_matrix = sf_confusion_matrix.fillna('count', 0) - - label_column = _tc.SFrame({'label': extended_test['label']}) - predictions = extended_test['probs'] + sf_confusion_matrix = sf_confusion_matrix.join( + extended_test.groupby( + ["label", "predicted_label"], {"count": _tc.aggregate.COUNT} + ), + how="left", + on=["label", "predicted_label"], + ) + sf_confusion_matrix = sf_confusion_matrix.fillna("count", 0) + + label_column = _tc.SFrame({"label": extended_test["label"]}) + predictions = extended_test["probs"] for i in range(0, len(labels)): - new_test_data = label_column.add_columns([predictions.apply(lambda probs: probs[i]), predictions.apply(lambda probs: labels[i])], ['prob','predicted_label']) - if (i==0): + new_test_data = label_column.add_columns( + [ + predictions.apply(lambda probs: probs[i]), + predictions.apply(lambda probs: labels[i]), + ], + ["prob", "predicted_label"], + ) + if i == 0: test_longer_form = new_test_data else: test_longer_form = test_longer_form.append(new_test_data) if len(extended_test) == 0: - sf_confusion_matrix = sf_confusion_matrix.rename({'prob_default': 'prob', 'label': 'target_label'}) + sf_confusion_matrix = sf_confusion_matrix.rename( + {"prob_default": "prob", "label": "target_label"} + ) else: - sf_confusion_matrix = sf_confusion_matrix.join(test_longer_form.groupby(['label', 'predicted_label'], {'prob': _tc.aggregate.SUM('prob')}), how='left', on=['label', 'predicted_label']) - sf_confusion_matrix = sf_confusion_matrix.rename({'label': 'target_label'}).fillna('prob', 0) - - def wo_divide_by_zero(a,b): - if b==0: + sf_confusion_matrix = sf_confusion_matrix.join( + test_longer_form.groupby( + ["label", "predicted_label"], {"prob": _tc.aggregate.SUM("prob")} + ), + how="left", + on=["label", "predicted_label"], + ) + sf_confusion_matrix = sf_confusion_matrix.rename( + {"label": "target_label"} + ).fillna("prob", 0) + + def wo_divide_by_zero(a, b): + if b == 0: return None else: - return a*1.0/b + return a * 1.0 / b - sf_confusion_matrix['norm_prob'] = sf_confusion_matrix.join(sf_confusion_matrix.groupby('target_label', {'sum_prob': _tc.aggregate.SUM('prob')}),how='left').apply(lambda x: wo_divide_by_zero(x['prob'], x['sum_prob'])) - return sf_confusion_matrix.fillna('norm_prob', 0) + sf_confusion_matrix["norm_prob"] = sf_confusion_matrix.join( + sf_confusion_matrix.groupby( + "target_label", {"sum_prob": _tc.aggregate.SUM("prob")} + ), + how="left", + ).apply(lambda x: wo_divide_by_zero(x["prob"], x["sum_prob"])) + return sf_confusion_matrix.fillna("norm_prob", 0) def hclusterSort(vectors, dist_fn): distances = [] vecs = list(vectors)[:] for i in range(0, len(vecs)): - for j in range(i+1, len(vecs)): - distances.append({'from': vecs[i], 'to': vecs[j], 'dist': dist_fn(vecs[i], vecs[j])}) - distances = sorted(distances, key=lambda d: d['dist']) + for j in range(i + 1, len(vecs)): + distances.append( + {"from": vecs[i], "to": vecs[j], "dist": dist_fn(vecs[i], vecs[j])} + ) + distances = sorted(distances, key=lambda d: d["dist"]) excluding_names = [] - while(len(distances) > 0): + while len(distances) > 0: min_dist = distances[0] - new_vec = {'name': str(min_dist['from']['name']) + '|'+ str(min_dist['to']['name']), - 'members': min_dist['from'].get('members', [min_dist['from']]) + min_dist['to'].get('members',[min_dist['to']])} + new_vec = { + "name": str(min_dist["from"]["name"]) + "|" + str(min_dist["to"]["name"]), + "members": min_dist["from"].get("members", [min_dist["from"]]) + + min_dist["to"].get("members", [min_dist["to"]]), + } - excluding_names = [min_dist['from']['name'], min_dist['to']['name']] + excluding_names = [min_dist["from"]["name"], min_dist["to"]["name"]] - vecs = list(filter(lambda v: v['name'] not in excluding_names, vecs)) - distances = list(filter(lambda dist: (dist['from']['name'] not in excluding_names) and (dist['to']['name'] not in excluding_names), distances)) + vecs = list(filter(lambda v: v["name"] not in excluding_names, vecs)) + distances = list( + filter( + lambda dist: (dist["from"]["name"] not in excluding_names) + and (dist["to"]["name"] not in excluding_names), + distances, + ) + ) for v in vecs: total = 0 - for vi in v.get('members', [v]): - for vj in new_vec['members']: + for vi in v.get("members", [v]): + for vj in new_vec["members"]: total += dist_fn(vi, vj) - distances.append({'from': v, 'to': new_vec, 'dist': total/len(v.get('members', [v]))/len(new_vec['members'])}) + distances.append( + { + "from": v, + "to": new_vec, + "dist": total + / len(v.get("members", [v])) + / len(new_vec["members"]), + } + ) vecs.append(new_vec) - distances = sorted(distances, key=lambda d: d['dist']) + distances = sorted(distances, key=lambda d: d["dist"]) return vecs + def l2Dist(v1, v2): dist = 0 - for i in range(0, len(v1['pos'])): - dist += math.pow(v1['pos'][i] - v2['pos'][i], 2) + for i in range(0, len(v1["pos"])): + dist += math.pow(v1["pos"][i] - v2["pos"][i], 2) return math.pow(dist, 0.5) diff --git a/src/python/turicreate/toolkits/_feature_engineering/__init__.py b/src/python/turicreate/toolkits/_feature_engineering/__init__.py index 14d0056500..fcce05dc7d 100644 --- a/src/python/turicreate/toolkits/_feature_engineering/__init__.py +++ b/src/python/turicreate/toolkits/_feature_engineering/__init__.py @@ -3,7 +3,7 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" A transformer is a stateful object that transforms input data (as an SFrame) from one form to another. Transformers are commonly used for feature engineering. In addition to the modules provided in Turi create, users can @@ -25,7 +25,7 @@ +---------------+---------------------------------------------------+ | save | Save the model to a Turi Create archive. | +---------------+---------------------------------------------------+ -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -44,6 +44,7 @@ from ._autovectorizer import AutoVectorizer from turicreate.toolkits._internal_utils import _raise_error_if_not_sframe + def create(dataset, transformers): """ Create a Transformer object to transform data for feature engineering. @@ -95,7 +96,7 @@ def create(dataset, transformers): _raise_error_if_not_sframe(dataset, "dataset") # List of transformers. - if (cls == list): + if cls == list: transformers = TransformerChain(transformers) # Transformer. else: diff --git a/src/python/turicreate/toolkits/_feature_engineering/_autovectorizer.py b/src/python/turicreate/toolkits/_feature_engineering/_autovectorizer.py index 7eef03cdbc..df26718512 100644 --- a/src/python/turicreate/toolkits/_feature_engineering/_autovectorizer.py +++ b/src/python/turicreate/toolkits/_feature_engineering/_autovectorizer.py @@ -26,6 +26,7 @@ from turicreate import SFrame as _SFrame import turicreate as _tc + class _ColumnFunctionTransformation(_TransformerBase, ExposeAttributesFromProxy): """ Utility transformer: Passes all specified columns through a given function. @@ -39,25 +40,32 @@ def _get_version(self): def _setup(self): self.__proxy__ = _PythonProxy() - - def __init__(self, features=None, excluded_features=None, output_column_prefix=None, - transform_function = lambda x: x, transform_function_name = "none"): + def __init__( + self, + features=None, + excluded_features=None, + output_column_prefix=None, + transform_function=lambda x: x, + transform_function_name="none", + ): self._setup() # Process and make a copy of the features, exclude. - _features, _exclude = _internal_utils.process_features(features, excluded_features) + _features, _exclude = _internal_utils.process_features( + features, excluded_features + ) - #Type check + # Type check _raise_error_if_not_of_type(output_column_prefix, [str, type(None)]) state = {} - state['output_column_prefix'] = output_column_prefix - state['features'] = _features - state['excluded_features'] = _exclude - state['fitted'] = False - state['transform_function'] = transform_function - state['transform_function_name'] = transform_function_name + state["output_column_prefix"] = output_column_prefix + state["features"] = _features + state["excluded_features"] = _exclude + state["fitted"] = False + state["transform_function"] = transform_function + state["transform_function_name"] = transform_function_name if _exclude: self._exclude = True @@ -83,8 +91,8 @@ def _load_version(cls, unpickler, version): """ state, _exclude, _features = unpickler.load() - features = state['features'] - excluded_features = state['excluded_features'] + features = state["features"] + excluded_features = state["excluded_features"] model = cls.__new__(cls) model._setup() @@ -114,8 +122,7 @@ def _save_impl(self, pickler): >>> loaded_model = turicreate.load_model('my_model_file') """ raise NotImplementedError("save/load not implemented for feature transformers") - pickler.dump( (self.__proxy__.state, self._exclude, self._features) ) - + pickler.dump((self.__proxy__.state, self._exclude, self._features)) def _get_summary_struct(self): """ @@ -138,9 +145,9 @@ def _get_summary_struct(self): fields = [ ("Features", "features"), ("Excluded_features", "excluded_features"), - ("Transform", "transform_function_name") + ("Transform", "transform_function_name"), ] - section_titles = ['Model fields'] + section_titles = ["Model fields"] return ([fields], section_titles) @@ -156,13 +163,15 @@ def fit(self, data): _raise_error_if_not_sframe(data, "data") fitted_state = {} - feature_columns = _internal_utils.get_column_names(data, self._exclude, self._features) + feature_columns = _internal_utils.get_column_names( + data, self._exclude, self._features + ) if not feature_columns: raise RuntimeError("No valid feature columns specified in transformation.") - fitted_state['features'] = feature_columns - fitted_state['fitted'] = True + fitted_state["features"] = feature_columns + fitted_state["fitted"] = True self.__proxy__.update(fitted_state) @@ -182,7 +191,7 @@ def transform(self, data): if output_column_prefix is None: prefix = "" else: - prefix = output_column_prefix + '.' + prefix = output_column_prefix + "." transform_function = self._get("transform_function") @@ -212,10 +221,14 @@ class _interpretations_class(object): def __get_copy_transform(self, column_name, output_column_prefix): if output_column_prefix: - return [_ColumnFunctionTransformation( - features = [column_name], transform_function = lambda x: x, - output_column_prefix = output_column_prefix, - transform_function_name = "identity")] + return [ + _ColumnFunctionTransformation( + features=[column_name], + transform_function=lambda x: x, + output_column_prefix=output_column_prefix, + transform_function_name="identity", + ) + ] else: return [] @@ -230,15 +243,20 @@ def short_text__str(self, column_name, output_column_prefix): from ._ngram_counter import NGramCounter from ._tfidf import TFIDF - return [NGramCounter(features=[column_name], - n = 3, - method = "character", - output_column_prefix = output_column_prefix), - - TFIDF(features=[column_name], - min_document_frequency=0.01, - max_document_frequency=0.5, - output_column_prefix = output_column_prefix)] + return [ + NGramCounter( + features=[column_name], + n=3, + method="character", + output_column_prefix=output_column_prefix, + ), + TFIDF( + features=[column_name], + min_document_frequency=0.01, + max_document_frequency=0.5, + output_column_prefix=output_column_prefix, + ), + ] short_text__str.description = "3-Character NGram Counts -> TFIDF" short_text__str.output_type = dict @@ -254,15 +272,20 @@ def long_text__str(self, column_name, output_column_prefix): from ._ngram_counter import NGramCounter from ._tfidf import TFIDF - return [NGramCounter(features=[column_name], - n = 2, - method = "word", - output_column_prefix = output_column_prefix), - - TFIDF(features=[column_name], - min_document_frequency=0.01, - max_document_frequency=0.5, - output_column_prefix = output_column_prefix)] + return [ + NGramCounter( + features=[column_name], + n=2, + method="word", + output_column_prefix=output_column_prefix, + ), + TFIDF( + features=[column_name], + min_document_frequency=0.01, + max_document_frequency=0.5, + output_column_prefix=output_column_prefix, + ), + ] long_text__str.description = "2-Word NGram Counts -> TFIDF" long_text__str.output_type = dict @@ -285,11 +308,14 @@ def categorical__int(self, column_name, output_column_prefix): Interprets an integer column as a categorical variable. """ - return [_ColumnFunctionTransformation( - features = [column_name], - output_column_prefix = output_column_prefix, - transform_function = lambda col: col.astype(str), - transform_function_name = "astype(str)")] + return [ + _ColumnFunctionTransformation( + features=[column_name], + output_column_prefix=output_column_prefix, + transform_function=lambda col: col.astype(str), + transform_function_name="astype(str)", + ) + ] categorical__int.description = "astype(str)" categorical__int.output_type = str @@ -301,11 +327,14 @@ def categorical__float(self, column_name, output_column_prefix): Interprets a float column as a categorical variable. """ - return [_ColumnFunctionTransformation( - features = [column_name], - output_column_prefix = output_column_prefix, - transform_function = lambda col: col.astype(str), - transform_function_name = "astype(str)")] + return [ + _ColumnFunctionTransformation( + features=[column_name], + output_column_prefix=output_column_prefix, + transform_function=lambda col: col.astype(str), + transform_function_name="astype(str)", + ) + ] categorical__float.description = "astype(str)" categorical__float.output_type = str @@ -317,8 +346,11 @@ def categorical__list(self, column_name, output_column_prefix): Interprets a list of categories as a sparse vector. """ - return [_TransformToFlatDictionary(features = [column_name], - output_column_prefix = output_column_prefix)] + return [ + _TransformToFlatDictionary( + features=[column_name], output_column_prefix=output_column_prefix + ) + ] categorical__list.description = "Flatten" categorical__list.output_type = dict @@ -330,8 +362,11 @@ def sparse_vector__dict(self, column_name, output_column_prefix): Interprets a dictionary as a sparse_vector. """ - return [_TransformToFlatDictionary(features = [column_name], - output_column_prefix = output_column_prefix)] + return [ + _TransformToFlatDictionary( + features=[column_name], output_column_prefix=output_column_prefix + ) + ] sparse_vector__dict.description = "Flatten" sparse_vector__dict.output_type = dict @@ -372,11 +407,12 @@ def vector__array(self, column_name, output_column_prefix): vector__array.description = "None" vector__array.output_type = _array - ############################################################ + _interpretations = _interpretations_class() + def _get_interpretation_function(interpretation, dtype): """ Retrieves the interpretation function used. @@ -388,11 +424,14 @@ def _get_interpretation_function(interpretation, dtype): global _interpretations if not hasattr(_interpretations, name): - raise ValueError("No transform available for type '%s' with interpretation '%s'." - % (type_string, interpretation)) + raise ValueError( + "No transform available for type '%s' with interpretation '%s'." + % (type_string, interpretation) + ) return getattr(_interpretations, name) + def _get_interpretation_description_and_output_type(interpretation, dtype): """ Returns the description and output type for a given interpretation. @@ -402,15 +441,18 @@ def _get_interpretation_description_and_output_type(interpretation, dtype): name = "%s__%s" % (interpretation, type_string) if not hasattr(_interpretations_class, name): - raise ValueError("No transform available for type '%s' with interpretation '%s'." - % (type_string, interpretation)) + raise ValueError( + "No transform available for type '%s' with interpretation '%s'." + % (type_string, interpretation) + ) # Need unbound method to get the attributes func = getattr(_interpretations_class, name) return func.description, func.output_type -def _get_embeddable_interpretation_doc(indent = 0): + +def _get_embeddable_interpretation_doc(indent=0): """ Returns a list of the available interpretations and what they do. @@ -429,11 +471,14 @@ def _get_embeddable_interpretation_doc(indent = 0): func = getattr(_interpretations, name) output_rows.append("%s (%s type):" % (interpretation, type_str)) - output_rows += [(" " + line) for line in _textwrap.dedent(func.__doc__).strip().split("\n")] + output_rows += [ + (" " + line) for line in _textwrap.dedent(func.__doc__).strip().split("\n") + ] output_rows.append("") - return "\n".join(" "*indent + line for line in output_rows) + return "\n".join(" " * indent + line for line in output_rows) + def infer_column_interpretation(column): """ @@ -441,12 +486,14 @@ def infer_column_interpretation(column): """ from turicreate.extensions import _infer_content_interpretation + return _infer_content_interpretation(column) class AutoVectorizer(_TransformerBase, ExposeAttributesFromProxy): - __doc__ = _textwrap.dedent( - """Creates a feature transformer based on the content in the provided + __doc__ = ( + _textwrap.dedent( + """Creates a feature transformer based on the content in the provided data that turns arbitrary content into informative features usable by any Turi ML algorithm. For example, text is parsed and converted into a sparse dictionary of features based on word @@ -492,12 +539,15 @@ class AutoVectorizer(_TransformerBase, ExposeAttributesFromProxy): %(interpretation_docstrings)s - """) % {"interpretation_docstrings" : _get_embeddable_interpretation_doc(indent = 2) } + """ + ) + % {"interpretation_docstrings": _get_embeddable_interpretation_doc(indent=2)} + ) @classmethod def _get_instance_and_data(cls): - sf = _tc.SFrame({'a' : [1, 2, 3, 2, 3], 'b' : ["a", "b", "a", "b", "b"]}) - encoder = AutoVectorizer( features = ['a', 'b'] ) + sf = _tc.SFrame({"a": [1, 2, 3, 2, 3], "b": ["a", "b", "a", "b", "b"]}) + encoder = AutoVectorizer(features=["a", "b"]) return encoder.fit(sf), sf def _setup(self): @@ -506,36 +556,47 @@ def _setup(self): """ self.__proxy__ = _PythonProxy() - def __init__(self, features = None, excluded_features = None, output_column_prefix = None, - column_interpretations = None, verbose = True): + def __init__( + self, + features=None, + excluded_features=None, + output_column_prefix=None, + column_interpretations=None, + verbose=True, + ): self._setup() - _features, _exclude = _internal_utils.process_features(features, excluded_features) + _features, _exclude = _internal_utils.process_features( + features, excluded_features + ) # Check the column_interpretations parameter type if column_interpretations is None: column_interpretations = {} - if (not isinstance(column_interpretations, dict) - or not all(isinstance(k, str) and isinstance(v, str) - for k, v in column_interpretations.items())): + if not isinstance(column_interpretations, dict) or not all( + isinstance(k, str) and isinstance(v, str) + for k, v in column_interpretations.items() + ): - raise TypeError("`column_interpretations` must be a dictionary of " - "column names to interpretation strings.") + raise TypeError( + "`column_interpretations` must be a dictionary of " + "column names to interpretation strings." + ) state = {} - state['user_column_interpretations'] = column_interpretations.copy() - state['column_interpretations'] = column_interpretations.copy() - state['output_column_prefix'] = output_column_prefix - state['fitted'] = False - state['verbose'] = verbose + state["user_column_interpretations"] = column_interpretations.copy() + state["column_interpretations"] = column_interpretations.copy() + state["output_column_prefix"] = output_column_prefix + state["fitted"] = False + state["verbose"] = verbose - state['transforms'] = {} - state['transform_chain'] = None + state["transforms"] = {} + state["transform_chain"] = None - state['features'] = _features - state['excluded_features'] = _exclude + state["features"] = _features + state["excluded_features"] = _exclude if _exclude: self._exclude = True @@ -546,7 +607,6 @@ def __init__(self, features = None, excluded_features = None, output_column_pref self.__proxy__.update(state) - def _setup_from_data(self, data): """ Sets up the content transforms. @@ -556,7 +616,9 @@ def _setup_from_data(self, data): _raise_error_if_not_of_type(data, [_SFrame]) - feature_columns = _internal_utils.get_column_names(data, self._exclude, self._features) + feature_columns = _internal_utils.get_column_names( + data, self._exclude, self._features + ) if not feature_columns: raise RuntimeError("No valid feature columns specified in transformation.") @@ -567,7 +629,9 @@ def _setup_from_data(self, data): # Helper functions def get_valid_interpretations(): - return list(n.split("__")[0] for n in dir(_interpretations) if not n.startswith("_")) + return list( + n.split("__")[0] for n in dir(_interpretations) if not n.startswith("_") + ) ################################################################################ # Check input data. @@ -582,16 +646,23 @@ def get_valid_interpretations(): # Make sure all the interpretations are valid. for k, v in column_interpretations.items(): if k not in all_col_names: - raise ValueError("Column '%s' in column_interpretations, but not found in `data`." % k) + raise ValueError( + "Column '%s' in column_interpretations, but not found in `data`." + % k + ) # Get the automatic column interpretations. for col_name in feature_columns: if col_name not in column_interpretations: - n = column_interpretations[col_name] = infer_column_interpretation(data[col_name]) + n = column_interpretations[col_name] = infer_column_interpretation( + data[col_name] + ) if n.startswith("unknown"): - raise ValueError("Interpretation inference failed on column '%s'; %s" - % (col_name, n[len("unknown"):].strip())) + raise ValueError( + "Interpretation inference failed on column '%s'; %s" + % (col_name, n[len("unknown") :].strip()) + ) # Now, build up the feature transforms. transforms = {} @@ -605,7 +676,9 @@ def get_valid_interpretations(): for col_name in feature_columns: in_type = input_types[col_name] = data[col_name].dtype - intr_func = _get_interpretation_function(column_interpretations[col_name], in_type) + intr_func = _get_interpretation_function( + column_interpretations[col_name], in_type + ) tr_list = intr_func(col_name, output_column_prefix) transforms[col_name] = tr_list tr_chain += tr_list @@ -639,7 +712,7 @@ def fit(self, data): self._setup_from_data(data) self.transform_chain.fit(data) - self.__proxy__.update({"fitted" : True}) + self.__proxy__.update({"fitted": True}) return self def fit_transform(self, data): @@ -667,7 +740,7 @@ def fit_transform(self, data): self._setup_from_data(data) ret = self.transform_chain.fit_transform(data) - self.__proxy__.update({"fitted" : True}) + self.__proxy__.update({"fitted": True}) return ret def transform(self, data): @@ -694,11 +767,12 @@ def transform(self, data): """ if self.transform_chain is None: - raise RuntimeError("`transform()` method called before `fit` or `fit_transform`.") + raise RuntimeError( + "`transform()` method called before `fit` or `fit_transform`." + ) return self.transform_chain.transform(data) - def _get_summary_struct(self): """ Returns a structured description of the model, including (where relevant) @@ -721,17 +795,23 @@ def _get_summary_struct(self): fields = [] _features = _precomputed_field(_internal_utils.pretty_print_list(self.features)) - _exclude = _precomputed_field(_internal_utils.pretty_print_list(self.excluded_features)) + _exclude = _precomputed_field( + _internal_utils.pretty_print_list(self.excluded_features) + ) - header_fields = [("Features", "features"), - ("Excluded Features", "excluded_features")] + header_fields = [ + ("Features", "features"), + ("Excluded Features", "excluded_features"), + ] sections.append("Model Fields") fields.append(header_fields) if self.user_column_interpretations: sections.append("User Specified Interpretations") - fields.append(list(sorted(self._get("user_column_interpretations").items()))) + fields.append( + list(sorted(self._get("user_column_interpretations").items())) + ) column_interpretations = self._get("column_interpretations") features = self._get("features") @@ -739,15 +819,25 @@ def _get_summary_struct(self): if self._get("fitted") and features is not None: n_rows = len(features) - transform_info = [None]*n_rows + transform_info = [None] * n_rows for i, f in enumerate(features): interpretation = column_interpretations[f] input_type = self.input_types[f] - description, output_type = _get_interpretation_description_and_output_type( - interpretation, input_type) - - transform_info[i] = (f, input_type.__name__, interpretation, description, output_type.__name__) + ( + description, + output_type, + ) = _get_interpretation_description_and_output_type( + interpretation, input_type + ) + + transform_info[i] = ( + f, + input_type.__name__, + interpretation, + description, + output_type.__name__, + ) transform_table = _SFrame() transform_table["Column"] = [t[0] for t in transform_info] @@ -782,8 +872,8 @@ def _load_version(cls, unpickler, version): """ state, _exclude, _features = unpickler.load() - features = state['features'] - excluded_features = state['excluded_features'] + features = state["features"] + excluded_features = state["excluded_features"] model = cls.__new__(cls) model._setup() @@ -812,4 +902,4 @@ def _save_impl(self, pickler): >>> model.save('my_model_file') >>> loaded_model = turicreate.load_model('my_model_file') """ - pickler.dump( (self.__proxy__.state, self._exclude, self._features) ) + pickler.dump((self.__proxy__.state, self._exclude, self._features)) diff --git a/src/python/turicreate/toolkits/_feature_engineering/_bm25.py b/src/python/turicreate/toolkits/_feature_engineering/_bm25.py index bff3673ae7..db97e259e6 100644 --- a/src/python/turicreate/toolkits/_feature_engineering/_bm25.py +++ b/src/python/turicreate/toolkits/_feature_engineering/_bm25.py @@ -13,11 +13,12 @@ from turicreate.toolkits._internal_utils import _toolkit_repr_print from turicreate.toolkits._internal_utils import _precomputed_field from turicreate.util import _raise_error_if_not_of_type + # Feature engineering utils from . import _internal_utils -_fit_examples_doc = ''' +_fit_examples_doc = """ >>> import turicreate as tc # Create the data @@ -46,9 +47,9 @@ | docs | example | 1 | +----------------+---------+--------------------+ [2 rows x 3 columns] -''' +""" -_fit_transform_examples_doc = ''' +_fit_transform_examples_doc = """ >>> import turicreate as tc # Create the data @@ -81,9 +82,9 @@ +----------------+ [3 rows x 1 columns] -''' +""" -_transform_examples_doc = ''' +_transform_examples_doc = """ >>>import turicreate as tc # Dictionary Input: @@ -148,11 +149,11 @@ | 0.0 | +----------------+ [3 rows x 1 columns] -''' +""" class BM25(Transformer): - ''' + """ Transform an SFrame into BM25 scores for a given query. If we have a query with words :math:`q_1, ..., q_n` the BM25 score for @@ -290,15 +291,23 @@ class BM25(Transformer): | docs | example | 1 | +----------------+---------+--------------------+ - ''' + """ # Doc strings _fit_examples_doc = _fit_examples_doc _fit_transform_examples_doc = _fit_transform_examples_doc - _transform_examples_doc = _transform_examples_doc - - def __init__(self, feature, query, k1 = 1.5, b = 0.75, min_document_frequency = 0.0, - max_document_frequency=1.0, output_column_name=None): + _transform_examples_doc = _transform_examples_doc + + def __init__( + self, + feature, + query, + k1=1.5, + b=0.75, + min_document_frequency=0.0, + max_document_frequency=1.0, + output_column_name=None, + ): # Convert query to list if necessary if isinstance(query, _tc.SArray): @@ -309,7 +318,7 @@ def __init__(self, feature, query, k1 = 1.5, b = 0.75, min_document_frequency = # Type checking _raise_error_if_not_of_type(feature, [str]) for q in query: - _raise_error_if_not_of_type(q, [str]) # query must be list of strings + _raise_error_if_not_of_type(q, [str]) # query must be list of strings _raise_error_if_not_of_type(k1, [float, int]) _raise_error_if_not_of_type(b, [float, int]) _raise_error_if_not_of_type(min_document_frequency, [float, int]) @@ -318,13 +327,13 @@ def __init__(self, feature, query, k1 = 1.5, b = 0.75, min_document_frequency = # Set up options opts = { - 'features': [feature], - 'query': query, - 'k1': k1, - 'b': b, - 'min_document_frequency': min_document_frequency, - 'max_document_frequency': max_document_frequency, - 'output_column_name' : output_column_name + "features": [feature], + "query": query, + "k1": k1, + "b": b, + "min_document_frequency": min_document_frequency, + "max_document_frequency": max_document_frequency, + "output_column_name": output_column_name, } # Initialize object @@ -334,17 +343,18 @@ def __init__(self, feature, query, k1 = 1.5, b = 0.75, min_document_frequency = def _get_summary_struct(self): _features = _precomputed_field( - _internal_utils.pretty_print_list(self.get('features'))) + _internal_utils.pretty_print_list(self.get("features")) + ) fields = [ ("Features", _features), - ("query", 'query'), - ("k1", 'k1'), - ("b", 'b'), - ("Minimum Document Frequency", 'min_document_frequency'), - ("Maximum Document Frequency", 'max_document_frequency'), - ("Output Column Name", 'output_column_name') + ("query", "query"), + ("k1", "k1"), + ("b", "b"), + ("Minimum Document Frequency", "min_document_frequency"), + ("Maximum Document Frequency", "max_document_frequency"), + ("Output Column Name", "output_column_name"), ] - section_titles = ['Model fields'] + section_titles = ["Model fields"] return ([fields], section_titles) def __repr__(self): @@ -353,7 +363,7 @@ def __repr__(self): @classmethod def _get_instance_and_data(self): - sf = _tc.SFrame({'docs': ["this is a test", "this is another test"]}) - encoder = _tc.feature_engineering.BM25('docs', ['a', 'test']) + sf = _tc.SFrame({"docs": ["this is a test", "this is another test"]}) + encoder = _tc.feature_engineering.BM25("docs", ["a", "test"]) encoder = encoder.fit(sf) return encoder, sf diff --git a/src/python/turicreate/toolkits/_feature_engineering/_feature_engineering.py b/src/python/turicreate/toolkits/_feature_engineering/_feature_engineering.py index 340f69f7a9..6d2a9353cd 100644 --- a/src/python/turicreate/toolkits/_feature_engineering/_feature_engineering.py +++ b/src/python/turicreate/toolkits/_feature_engineering/_feature_engineering.py @@ -8,9 +8,11 @@ from __future__ import absolute_import as _ import turicreate as _tc -from turicreate.toolkits._internal_utils import _toolkit_repr_print, \ - _precomputed_field, \ - _raise_error_if_not_sframe +from turicreate.toolkits._internal_utils import ( + _toolkit_repr_print, + _precomputed_field, + _raise_error_if_not_sframe, +) # Base class for models written by users # --------------------------------------------------- @@ -109,10 +111,10 @@ def __init__(self, **kwargs): def _get_summary_struct(self): model_fields = [] for attr in self.__dict__: - if not attr.startswith('_'): + if not attr.startswith("_"): model_fields.append((attr, _precomputed_field(getattr(self, attr)))) - return ([model_fields], ['Attributes'] ) + return ([model_fields], ["Attributes"]) def __repr__(self): (sections, section_titles) = self._get_summary_struct() @@ -213,18 +215,19 @@ def fit_transform(self, data): def _get_instance_and_data(self): raise NotImplementedError + # Base class for Models written in C++ using the SDK. # --------------------------------------------------- class Transformer(TransformerBase): - _fit_examples_doc = ''' - ''' - _fit_transform_examples_doc = ''' - ''' - _transform_examples_doc = ''' - ''' + _fit_examples_doc = """ + """ + _fit_transform_examples_doc = """ + """ + _transform_examples_doc = """ + """ - def __init__(self, model_proxy = None, _class = None): + def __init__(self, model_proxy=None, _class=None): self.__proxy__ = model_proxy if _class: self.__class__ = _class @@ -353,8 +356,10 @@ def _get(self, field): if field in self._list_fields(): return self.__proxy__.get(field) else: - raise KeyError('Field \"%s\" not in model. Available fields are ' - '%s.' % (field, ', '.join(self._list_fields()))) + raise KeyError( + 'Field "%s" not in model. Available fields are ' + "%s." % (field, ", ".join(self._list_fields())) + ) def __getitem__(self, key): return self.get(key) @@ -367,14 +372,14 @@ def _is_gl_pickle_safe(cls): """ return False -class _SampleTransformer(Transformer): +class _SampleTransformer(Transformer): def __init__(self, features=None, constant=0.5): # Set up options opts = {} - opts['features'] = features - opts['constant'] = constant + opts["features"] = features + opts["constant"] = constant # Initialize object proxy = _tc.extensions._SampleTransformer() @@ -400,9 +405,9 @@ def _get_summary_struct(self): The order matches that of the 'sections' object. """ section = [] - section_titles = ['Attributes'] + section_titles = ["Attributes"] for f in self._list_fields(): - section.append( ("%s" % f,"%s"% f) ) + section.append(("%s" % f, "%s" % f)) return ([section], section_titles) diff --git a/src/python/turicreate/toolkits/_feature_engineering/_internal_utils.py b/src/python/turicreate/toolkits/_feature_engineering/_internal_utils.py index 13e1d0d0c4..5ff6429061 100644 --- a/src/python/turicreate/toolkits/_feature_engineering/_internal_utils.py +++ b/src/python/turicreate/toolkits/_feature_engineering/_internal_utils.py @@ -15,6 +15,7 @@ NoneType = type(None) + def select_valid_features(data, feature_columns, valid_types): valid_features = [] @@ -22,10 +23,16 @@ def select_valid_features(data, feature_columns, valid_types): if data[f].dtype in valid_types: valid_features.append(f) else: - _logging.warning("Warning: Column " + f + " is excluded due to" + - " invalid column type " + str(data[f].dtype)) + _logging.warning( + "Warning: Column " + + f + + " is excluded due to" + + " invalid column type " + + str(data[f].dtype) + ) return valid_features + def select_feature_subset(data, feature_columns): total_set = set(data.column_names()) feature_set = set(feature_columns) @@ -33,10 +40,18 @@ def select_feature_subset(data, feature_columns): result = total_set.intersection(feature_set) if len(result) != len(feature_set): - _logging.warning("Warning: The model was fit with " + str(len(feature_columns)) + " feature columns but only " + str(len(result)) + " were present during transform()." +" Proceeding with transform by ignoring the missing columns.") + _logging.warning( + "Warning: The model was fit with " + + str(len(feature_columns)) + + " feature columns but only " + + str(len(result)) + + " were present during transform()." + + " Proceeding with transform by ignoring the missing columns." + ) return [f for f in feature_columns if f in result] + def get_column_names(data, interpret_as_excluded, column_names): assert interpret_as_excluded in [True, False] @@ -56,7 +71,6 @@ def get_column_names(data, interpret_as_excluded, column_names): return selected_columns - def validate_feature_columns(data_column_names, feature_column_names): if len(feature_column_names) == 0: @@ -65,7 +79,7 @@ def validate_feature_columns(data_column_names, feature_column_names): set_difference = set(feature_column_names) - set(data_column_names) if len(set_difference) > 0: - err = 'Feature(s) ' + err = "Feature(s) " for s in range(len(set_difference) - 1): err = err + str(list(set_difference)[s]) + ", " err = err + str(list(set_difference).pop()) + " are missing from the dataset." @@ -75,11 +89,18 @@ def validate_feature_columns(data_column_names, feature_column_names): def validate_feature_types(feature_names, feature_types, data): for col_name in feature_names: if data[col_name].dtype != feature_types[col_name]: - err = "Column '" + col_name + "' was of type " + \ - str(feature_types[col_name]) + " when fitted using .fit() but is of type " +\ - str(data[col_name].dtype) + "during .transform()" + err = ( + "Column '" + + col_name + + "' was of type " + + str(feature_types[col_name]) + + " when fitted using .fit() but is of type " + + str(data[col_name].dtype) + + "during .transform()" + ) raise ValueError(err) + def process_features(features, exclude): """ Parameters @@ -101,8 +122,8 @@ def process_features(features, exclude): """ # Check types - _raise_error_if_not_of_type(features, [NoneType, str, list], 'features') - _raise_error_if_not_of_type(exclude, [NoneType, str, list], 'exclude') + _raise_error_if_not_of_type(features, [NoneType, str, list], "features") + _raise_error_if_not_of_type(exclude, [NoneType, str, list], "exclude") # Make a copy of the parameters. _features = _copy.copy(features) @@ -110,12 +131,13 @@ def process_features(features, exclude): # Check of both are None or empty. if _features and _exclude: - raise ValueError("The parameters 'features' and 'exclude' cannot both be set." - " Please set one or the other.") + raise ValueError( + "The parameters 'features' and 'exclude' cannot both be set." + " Please set one or the other." + ) if _features == [] and not _exclude: raise ValueError("Features cannot be an empty list.") - # Allow a single list _features = [_features] if type(_features) == str else _features _exclude = [_exclude] if type(_exclude) == str else _exclude @@ -132,21 +154,23 @@ def process_features(features, exclude): feature_set = set(_features) for col_name in _exclude: if col_name in feature_set: - raise ValueError("'%s' appears in both features and excluded_features." % col_name) + raise ValueError( + "'%s' appears in both features and excluded_features." % col_name + ) return _features, _exclude -def pretty_print_list(lst, name = 'features', repr_format=True): +def pretty_print_list(lst, name="features", repr_format=True): """ Pretty print a list to be readable. """ if not lst or len(lst) < 8: if repr_format: return lst.__repr__() else: - return ', '.join(map(str, lst)) + return ", ".join(map(str, lst)) else: - topk = ', '.join(map(str, lst[:3])) + topk = ", ".join(map(str, lst[:3])) if repr_format: lst_separator = "[" lst_end_separator = "]" @@ -154,6 +178,11 @@ def pretty_print_list(lst, name = 'features', repr_format=True): lst_separator = "" lst_end_separator = "" - return "{start}{topk}, ... {last}{end} (total {size} {name})".format(\ - topk = topk, last = lst[-1], name = name, size = len(lst), - start = lst_separator, end = lst_end_separator) + return "{start}{topk}, ... {last}{end} (total {size} {name})".format( + topk=topk, + last=lst[-1], + name=name, + size=len(lst), + start=lst_separator, + end=lst_end_separator, + ) diff --git a/src/python/turicreate/toolkits/_feature_engineering/_ngram_counter.py b/src/python/turicreate/toolkits/_feature_engineering/_ngram_counter.py index 53ac42c681..bc4ad9db69 100644 --- a/src/python/turicreate/toolkits/_feature_engineering/_ngram_counter.py +++ b/src/python/turicreate/toolkits/_feature_engineering/_ngram_counter.py @@ -13,6 +13,7 @@ from turicreate.toolkits._internal_utils import _toolkit_repr_print from turicreate.toolkits._internal_utils import _precomputed_field from turicreate.util import _raise_error_if_not_of_type + # Feature engineering utils from . import _internal_utils import warnings @@ -20,7 +21,7 @@ _NoneType = type(None) -_fit_examples_doc = ''' +_fit_examples_doc = """ import turicreate as tc # Create the data @@ -41,9 +42,9 @@ # features. >>> encoder['features'] ['dict', 'list', 'string'] -''' +""" -_fit_transform_examples_doc = ''' +_fit_transform_examples_doc = """ import turicreate as tc # Create the data @@ -96,9 +97,9 @@ | {'sentence one': 1} | +---------------------+ [2 rows x 3 columns] -''' +""" -_transform_examples_doc = ''' +_transform_examples_doc = """ >>> import turicreate as tc # For list columns (string elements converted to lower case by default): @@ -181,12 +182,11 @@ +--------------------------------+ [2 rows x 1 columns] -''' - +""" class NGramCounter(Transformer): - ''' + """ __init__(self, features=None, excluded_features=None, n=2, method="word", to_lower=True, ignore_punct=True, ignore_space=True, delimiters=["\\\\r", "\\\\v", "\\\\n", "\\\\f", "\\\\t", " ", \ @@ -426,24 +426,68 @@ class NGramCounter(Transformer): +-----------------------------------------------------------------+ [2 rows x 1 columns] - ''' + """ # Doc strings _fit_examples_doc = _fit_examples_doc _fit_transform_examples_doc = _fit_transform_examples_doc - _transform_examples_doc = _transform_examples_doc - - def __init__(self, features=None, excluded_features=None, - n=2, method="word", to_lower=True, ignore_punct=True, ignore_space=True, - delimiters=["\r", "\v", "\n", "\f", "\t", " ", - "!", "#", "$", "%", "&", "'", "(", ")", - "*", "+", ",", "-", ".", "/", ":", ";", - "<", "=", ">", "?", "@", "[", "\\", "]", - "^", "_", "`", "{", "|", "}", "~"], - output_column_prefix=None): + _transform_examples_doc = _transform_examples_doc + + def __init__( + self, + features=None, + excluded_features=None, + n=2, + method="word", + to_lower=True, + ignore_punct=True, + ignore_space=True, + delimiters=[ + "\r", + "\v", + "\n", + "\f", + "\t", + " ", + "!", + "#", + "$", + "%", + "&", + "'", + "(", + ")", + "*", + "+", + ",", + "-", + ".", + "/", + ":", + ";", + "<", + "=", + ">", + "?", + "@", + "[", + "\\", + "]", + "^", + "_", + "`", + "{", + "|", + "}", + "~", + ], + output_column_prefix=None, + ): # Process and make a copy of the features, exclude. - _features, _exclude = _internal_utils.process_features(features, excluded_features) + _features, _exclude = _internal_utils.process_features( + features, excluded_features + ) # Type checking _raise_error_if_not_of_type(features, [list, str, _NoneType]) @@ -459,36 +503,38 @@ def __init__(self, features=None, excluded_features=None, if delimiters is not None: for delim in delimiters: _raise_error_if_not_of_type(delim, str, "delimiters") - if (len(delim) != 1): + if len(delim) != 1: raise ValueError("Delimiters must be single-character strings") if n < 1: raise ValueError("Input 'n' must be greater than 0") - if n > 5 and method == 'word': + if n > 5 and method == "word": warnings.warn("It is unusual for n-grams to be of size larger than 5.") if method != "word" and method != "character": - raise ValueError("Invalid 'method' input value. Please input " + - "either 'word' or 'character' ") + raise ValueError( + "Invalid 'method' input value. Please input " + + "either 'word' or 'character' " + ) # Set up options opts = { - 'n': n, - 'features': features, - 'ngram_type': method, - 'to_lower': to_lower, - 'ignore_punct': ignore_punct, - 'ignore_space': ignore_space, - 'delimiters': delimiters, - 'output_column_prefix' : output_column_prefix + "n": n, + "features": features, + "ngram_type": method, + "to_lower": to_lower, + "ignore_punct": ignore_punct, + "ignore_space": ignore_space, + "delimiters": delimiters, + "output_column_prefix": output_column_prefix, } if _exclude: - opts['exclude'] = True - opts['features'] = _exclude + opts["exclude"] = True + opts["features"] = _exclude else: - opts['exclude'] = False - opts['features'] = _features + opts["exclude"] = False + opts["features"] = _features # Initialize object proxy = _tc.extensions._NGramCounter() @@ -497,18 +543,19 @@ def __init__(self, features=None, excluded_features=None, def _get_summary_struct(self): _features = _precomputed_field( - _internal_utils.pretty_print_list(self.get('features'))) + _internal_utils.pretty_print_list(self.get("features")) + ) fields = [ - ("NGram length", 'n'), - ("NGram type (word or character)", 'ngram_type'), - ("Convert strings to lower case", 'to_lower'), - ("Ignore punctuation in character ngram", 'ignore_punct'), - ("Ignore space in character ngram", 'ignore_space'), + ("NGram length", "n"), + ("NGram type (word or character)", "ngram_type"), + ("Convert strings to lower case", "to_lower"), + ("Ignore punctuation in character ngram", "ignore_punct"), + ("Ignore space in character ngram", "ignore_space"), ("Delimiters", "delimiters"), ("Features", _features), - ("Output column prefix", 'output_column_prefix') + ("Output column prefix", "output_column_prefix"), ] - section_titles = ['Model fields'] + section_titles = ["Model fields"] return ([fields], section_titles) def __repr__(self): @@ -518,8 +565,13 @@ def __repr__(self): @classmethod def _get_instance_and_data(self): sf = _tc.SFrame( - {'docs': [{'this': 1, 'is': 1, 'a': 2, 'sample': 1}, - {'this': 1, 'is': 1, 'another': 2, 'example': 3}]}) - encoder = _tc.feature_engineering.NGramCounter('docs') + { + "docs": [ + {"this": 1, "is": 1, "a": 2, "sample": 1}, + {"this": 1, "is": 1, "another": 2, "example": 3}, + ] + } + ) + encoder = _tc.feature_engineering.NGramCounter("docs") encoder = encoder.fit(sf) return encoder, sf diff --git a/src/python/turicreate/toolkits/_feature_engineering/_tfidf.py b/src/python/turicreate/toolkits/_feature_engineering/_tfidf.py index 299fbea5dc..522f0b945b 100644 --- a/src/python/turicreate/toolkits/_feature_engineering/_tfidf.py +++ b/src/python/turicreate/toolkits/_feature_engineering/_tfidf.py @@ -13,11 +13,12 @@ from turicreate.toolkits._internal_utils import _toolkit_repr_print from turicreate.toolkits._internal_utils import _precomputed_field from turicreate.util import _raise_error_if_not_of_type + # Feature engineering utils from . import _internal_utils -_fit_examples_doc = ''' +_fit_examples_doc = """ import turicreate as tc # Create the data @@ -53,9 +54,9 @@ | docs | example | 1 | +----------------+---------+--------------------+ [6 rows x 3 columns] -''' +""" -_fit_transform_examples_doc = ''' +_fit_transform_examples_doc = """ import turicreate as tc # Create the data @@ -87,9 +88,9 @@ | {'this': 0.0, 'is': 0.0, '... | +-------------------------------+ [2 rows x 1 columns] -''' +""" -_transform_examples_doc = ''' +_transform_examples_doc = """ # For list columns: >>> l1 = ['a','good','example'] @@ -152,12 +153,11 @@ | {'this': 0.0, 'is': 0.0, '... | +-------------------------------+ [2 rows x 1 columns] -''' - +""" class TFIDF(Transformer): - ''' + """ Transform an SFrame into TF-IDF scores. The prototypical application of TF-IDF transformations involves @@ -276,20 +276,26 @@ class TFIDF(Transformer): | {'this': 0.0, 'is': 0.0, 'example': 2.0794415416798357, ... | +-------------------------------------------------------------+ - ''' + """ # Doc strings _fit_examples_doc = _fit_examples_doc _fit_transform_examples_doc = _fit_transform_examples_doc - _transform_examples_doc = _transform_examples_doc + _transform_examples_doc = _transform_examples_doc - def __init__(self, features=None, excluded_features=None, - min_document_frequency=0.0, - max_document_frequency=1.0, - output_column_prefix=None): + def __init__( + self, + features=None, + excluded_features=None, + min_document_frequency=0.0, + max_document_frequency=1.0, + output_column_prefix=None, + ): # Process and make a copy of the features, exclude. - _features, _exclude = _internal_utils.process_features(features, excluded_features) + _features, _exclude = _internal_utils.process_features( + features, excluded_features + ) # Type checking _raise_error_if_not_of_type(min_document_frequency, [float, int]) @@ -298,16 +304,16 @@ def __init__(self, features=None, excluded_features=None, # Set up options opts = { - 'min_document_frequency': min_document_frequency, - 'max_document_frequency': max_document_frequency, - 'output_column_prefix' : output_column_prefix + "min_document_frequency": min_document_frequency, + "max_document_frequency": max_document_frequency, + "output_column_prefix": output_column_prefix, } if _exclude: - opts['exclude'] = True - opts['features'] = _exclude + opts["exclude"] = True + opts["features"] = _exclude else: - opts['exclude'] = False - opts['features'] = _features + opts["exclude"] = False + opts["features"] = _features # Initialize object proxy = _tc.extensions._TFIDF() @@ -316,14 +322,15 @@ def __init__(self, features=None, excluded_features=None, def _get_summary_struct(self): _features = _precomputed_field( - _internal_utils.pretty_print_list(self.get('features'))) + _internal_utils.pretty_print_list(self.get("features")) + ) fields = [ ("Features", _features), - ("Minimum Document Frequency", 'min_document_frequency'), - ("Maximum Document Frequency", 'max_document_frequency'), - ("Output Column Prefix", 'output_column_prefix') + ("Minimum Document Frequency", "min_document_frequency"), + ("Maximum Document Frequency", "max_document_frequency"), + ("Output Column Prefix", "output_column_prefix"), ] - section_titles = ['Model fields'] + section_titles = ["Model fields"] return ([fields], section_titles) def __repr__(self): @@ -333,8 +340,13 @@ def __repr__(self): @classmethod def _get_instance_and_data(self): sf = _tc.SFrame( - {'docs': [{'this': 1, 'is': 1, 'a': 2, 'sample': 1}, - {'this': 1, 'is': 1, 'another': 2, 'example': 3}]}) - encoder = TFIDF(features=['docs']) + { + "docs": [ + {"this": 1, "is": 1, "a": 2, "sample": 1}, + {"this": 1, "is": 1, "another": 2, "example": 3}, + ] + } + ) + encoder = TFIDF(features=["docs"]) encoder = encoder.fit(sf) return encoder, sf diff --git a/src/python/turicreate/toolkits/_feature_engineering/_tokenizer.py b/src/python/turicreate/toolkits/_feature_engineering/_tokenizer.py index 4581de2170..02e81d65c8 100644 --- a/src/python/turicreate/toolkits/_feature_engineering/_tokenizer.py +++ b/src/python/turicreate/toolkits/_feature_engineering/_tokenizer.py @@ -17,7 +17,7 @@ _NoneType = type(None) -_fit_examples_doc = ''' +_fit_examples_doc = """ >>> import turicreate as tc # Create the data @@ -36,9 +36,9 @@ # features. >>> encoder['features'] [string, string2] -''' +""" -_fit_transform_examples_doc = ''' +_fit_transform_examples_doc = """ >>> import turicreate as tc # Create the data @@ -83,9 +83,9 @@ | [sentence, two...] | [One, Two, THREE] | +--------------------+-------------------+ [2 rows x 2 columns] -''' +""" -_transform_examples_doc = ''' +_transform_examples_doc = """ >>> import turicreate as tc # Create the data @@ -112,10 +112,11 @@ | [sentence, two...] | [One, Two, THREE] | +--------------------+-------------------+ [2 rows x 2 columns] -''' +""" + class Tokenizer(Transformer): - ''' + """ __init__(features=None, excluded_features=None, to_lower=False, delimiters=["\\\\r", "\\\\v", "\\\\n", "\\\\f", "\\\\t", " "], output_column_prefix=None) @@ -252,18 +253,25 @@ class Tokenizer(Transformer): >>> tokenizer['features'] # `features` are set to `None` - ''' + """ _fit_examples_doc = _fit_examples_doc _transform_examples_doc = _transform_examples_doc _fit_transform_examples_doc = _fit_transform_examples_doc - def __init__(self, features=None, excluded_features=None, - to_lower=False, delimiters=["\r", "\v", "\n", "\f", "\t", " "], - output_column_prefix=None): + def __init__( + self, + features=None, + excluded_features=None, + to_lower=False, + delimiters=["\r", "\v", "\n", "\f", "\t", " "], + output_column_prefix=None, + ): # Process and make a copy of the features, exclude. - _features, _exclude = _internal_utils.process_features(features, excluded_features) + _features, _exclude = _internal_utils.process_features( + features, excluded_features + ) # Type checking _raise_error_if_not_of_type(features, [list, str, _NoneType]) @@ -275,22 +283,22 @@ def __init__(self, features=None, excluded_features=None, if delimiters is not None: for delim in delimiters: _raise_error_if_not_of_type(delim, str, "delimiters") - if (len(delim) != 1): + if len(delim) != 1: raise ValueError("Delimiters must be single-character strings") # Set up options opts = { - 'features': features, - 'to_lower': to_lower, - 'delimiters': delimiters, - 'output_column_prefix' : output_column_prefix + "features": features, + "to_lower": to_lower, + "delimiters": delimiters, + "output_column_prefix": output_column_prefix, } if _exclude: - opts['exclude'] = True - opts['features'] = _exclude + opts["exclude"] = True + opts["features"] = _exclude else: - opts['exclude'] = False - opts['features'] = _features + opts["exclude"] = False + opts["features"] = _features # Initialize object proxy = _tc.extensions._Tokenizer() @@ -315,15 +323,16 @@ def _get_summary_struct(self): The order matches that of the 'sections' object. """ _features = _precomputed_field( - _internal_utils.pretty_print_list(self.get('features'))) + _internal_utils.pretty_print_list(self.get("features")) + ) fields = [ ("Features", _features), - ("Convert strings to lower case", 'to_lower'), + ("Convert strings to lower case", "to_lower"), ("Delimiters", "delimiters"), - ("Output column prefix", 'output_column_prefix') + ("Output column prefix", "output_column_prefix"), ] - section_titles = ['Model fields'] + section_titles = ["Model fields"] return ([fields], section_titles) @@ -333,6 +342,6 @@ def __repr__(self): @classmethod def _get_instance_and_data(self): - sf = _tc.SFrame({'docs': ["this is a test", "this is another test"]}) - encoder = Tokenizer('docs') + sf = _tc.SFrame({"docs": ["this is a test", "this is another test"]}) + encoder = Tokenizer("docs") return encoder.fit(sf), sf diff --git a/src/python/turicreate/toolkits/_feature_engineering/_transform_to_flat_dictionary.py b/src/python/turicreate/toolkits/_feature_engineering/_transform_to_flat_dictionary.py index 41cef033b5..ce9df7caad 100644 --- a/src/python/turicreate/toolkits/_feature_engineering/_transform_to_flat_dictionary.py +++ b/src/python/turicreate/toolkits/_feature_engineering/_transform_to_flat_dictionary.py @@ -13,11 +13,12 @@ from turicreate.toolkits._internal_utils import _toolkit_repr_print from turicreate.toolkits._internal_utils import _precomputed_field from turicreate.util import _raise_error_if_not_of_type + # Feature engineering utils from . import _internal_utils -_fit_examples_doc = ''' +_fit_examples_doc = """ # Create data >>> sf = turicreate.SFrame({'values': [{"a" : {"b" : 3}, "c": 2}, ... { "a" : { "b" : 3, "c" : 2.5 }, "c" : 2 }, @@ -36,9 +37,9 @@ None Tag : __none__ Output Column Prefix : -''' +""" -_fit_transform_examples_doc = ''' +_fit_transform_examples_doc = """ # Create data >>> sf = turicreate.SFrame({'values': [{"a" : {"b" : 3}, "c": 2}, ... { "a" : { "b" : 3, "c" : 2.5 }, "c" : 2 }, @@ -63,9 +64,9 @@ | {'c': 2, 'a.b': 1} | +--------------------------------+ [4 rows x 1 columns] -''' +""" -_transform_examples_doc = ''' +_transform_examples_doc = """ # Create data >>> sf = turicreate.SFrame({'values': [{"a" : {"b" : 3}, "c": 2}, ... { "a" : { "b" : 3, "c" : 2.5 }, "c" : 2 }, @@ -100,10 +101,11 @@ | {'c': 2, 'a.b': 1} | +--------------------------------+ [4 rows x 1 columns] -''' +""" + class TransformToFlatDictionary(Transformer): - ''' + """ Transforms column values into dictionaries with flat, non-nested string keys and numeric values. Each key in nested containers is a concatenation of the keys in each dictionary with `separator` @@ -183,37 +185,44 @@ class TransformToFlatDictionary(Transformer): | {'c': 2, 'a.b': 1} | +----------------------------------------------+ [4 rows x 1 columns] - ''' + """ # Doc strings _fit_examples_doc = _fit_examples_doc _fit_transform_examples_doc = _fit_transform_examples_doc - _transform_examples_doc = _transform_examples_doc + _transform_examples_doc = _transform_examples_doc - def __init__(self, features=None, excluded_features=None, - separator = ".", none_tag = "__none__", - output_column_prefix = None): + def __init__( + self, + features=None, + excluded_features=None, + separator=".", + none_tag="__none__", + output_column_prefix=None, + ): # Process and make a copy of the features, exclude. - _features, _exclude = _internal_utils.process_features(features, excluded_features) + _features, _exclude = _internal_utils.process_features( + features, excluded_features + ) # Type checking _raise_error_if_not_of_type(output_column_prefix, [str, type(None)]) if output_column_prefix is None: - output_column_prefix = '' + output_column_prefix = "" opts = { - 'separator' : separator, - 'none_tag' : none_tag, - 'output_column_prefix' : output_column_prefix - } + "separator": separator, + "none_tag": none_tag, + "output_column_prefix": output_column_prefix, + } if _exclude: - opts['exclude'] = True - opts['features'] = _exclude + opts["exclude"] = True + opts["features"] = _exclude else: - opts['exclude'] = False - opts['features'] = _features + opts["exclude"] = False + opts["features"] = _features # Initialize object proxy = _tc.extensions._TransformToFlatDictionary() @@ -222,18 +231,20 @@ def __init__(self, features=None, excluded_features=None, def _get_summary_struct(self): _features = _precomputed_field( - _internal_utils.pretty_print_list(self.get('features'))) + _internal_utils.pretty_print_list(self.get("features")) + ) _exclude = _precomputed_field( - _internal_utils.pretty_print_list(self.get('excluded_features'))) + _internal_utils.pretty_print_list(self.get("excluded_features")) + ) fields = [ ("Features", _features), ("Excluded_features", _exclude), ("Separator", "separator"), ("None Tag", "none_tag"), - ("Output Column Prefix", 'output_column_prefix') + ("Output Column Prefix", "output_column_prefix"), ] - section_titles = ['Model fields'] + section_titles = ["Model fields"] return ([fields], section_titles) def __repr__(self): @@ -243,8 +254,13 @@ def __repr__(self): @classmethod def _get_instance_and_data(self): sf = _tc.SFrame( - {'docs': [{'this': 1, 'is': 1, 'a': 2, 'sample': 1}, - {'this': 1, 'is': 1, 'another': 2, 'example': 3}]}) - encoder = _tc.feature_engineering.TFIDF(features=['docs']) + { + "docs": [ + {"this": 1, "is": 1, "a": 2, "sample": 1}, + {"this": 1, "is": 1, "another": 2, "example": 3}, + ] + } + ) + encoder = _tc.feature_engineering.TFIDF(features=["docs"]) encoder = encoder.fit(sf) return encoder, sf diff --git a/src/python/turicreate/toolkits/_feature_engineering/_transformer_chain.py b/src/python/turicreate/toolkits/_feature_engineering/_transformer_chain.py index 21e1875ee0..c16d608f67 100644 --- a/src/python/turicreate/toolkits/_feature_engineering/_transformer_chain.py +++ b/src/python/turicreate/toolkits/_feature_engineering/_transformer_chain.py @@ -21,6 +21,7 @@ import inspect as _inspect import sys as _sys + class TransformerChain(_TransformerBase): """ Sequentially apply a list of transforms. @@ -76,6 +77,7 @@ class of `TransformerBase`) which can be one of the following: >>> steps = chain['steps'] >>> steps = chain['steps_by_name'] """ + _TRANSFORMER_CHAIN_VERSION = 0 def __init__(self, steps): @@ -135,14 +137,14 @@ def _compact_class_repr(obj): init_func = init_func.__func__ fields = _inspect.getargspec(init_func).args - fields = fields[1:] # remove self - if 'features' in fields: - fields.remove('features') + fields = fields[1:] # remove self + if "features" in fields: + fields.remove("features") features = obj.get("features") if features is not None: - post_repr_string = ' on %s feature(s)' % len(features) - if 'excluded_features' in fields: - fields.remove('excluded_features') + post_repr_string = " on %s feature(s)" % len(features) + if "excluded_features" in fields: + fields.remove("excluded_features") # GLC transformers. if issubclass(obj.__class__, _Transformer): @@ -151,9 +153,8 @@ def _compact_class_repr(obj): # Chains elif obj.__class__ == TransformerChain: - _step_classes = list(map(lambda x: x.__class__.__name__, obj.get('steps'))) - _steps = _internal_utils.pretty_print_list( - _step_classes, 'steps', False) + _step_classes = list(map(lambda x: x.__class__.__name__, obj.get("steps"))) + _steps = _internal_utils.pretty_print_list(_step_classes, "steps", False) dict_str_list.append(_steps) # For user defined transformers. @@ -161,17 +162,21 @@ def _compact_class_repr(obj): for attr in fields: dict_str_list.append("%s=%s" % (attr, obj.__dict__[attr])) - return "%s(%s)%s" % (obj.__class__.__name__, ", ".join(dict_str_list), - post_repr_string) + return "%s(%s)%s" % ( + obj.__class__.__name__, + ", ".join(dict_str_list), + post_repr_string, + ) def _get_struct_summary(self): model_fields = [] for name, tr in self._transformers: - model_fields.append((name, - _precomputed_field(self._compact_class_repr(tr)))) + model_fields.append( + (name, _precomputed_field(self._compact_class_repr(tr))) + ) sections = [model_fields] - section_titles = ['Steps'] + section_titles = ["Steps"] return (sections, section_titles) @@ -183,10 +188,13 @@ def __repr__(self): def __get_steps_repr__(steps): def __repr__(steps): for name, tr in self._transformers: - model_fields.append((name, - _precomputed_field(self._compact_class_repr(tr)))) - return _toolkit_repr_print(steps, [model_fields], width=8, - section_titles = ['Steps']) + model_fields.append( + (name, _precomputed_field(self._compact_class_repr(tr))) + ) + return _toolkit_repr_print( + steps, [model_fields], width=8, section_titles=["Steps"] + ) + return __repr__ def _preprocess(self, data): @@ -197,9 +205,11 @@ def _preprocess(self, data): for name, step in self._transformers[:-1]: transformed_data = step.fit_transform(transformed_data) if type(transformed_data) != _tc.SFrame: - raise RuntimeError("The transform function in step '%s' did not" - " return an SFrame (got %s instead)." % (name, - type(transformed_data).__name__)) + raise RuntimeError( + "The transform function in step '%s' did not" + " return an SFrame (got %s instead)." + % (name, type(transformed_data).__name__) + ) return transformed_data def fit(self, data): @@ -303,8 +313,10 @@ def transform(self, data): for name, step in self._transformers: transformed_data = step.transform(transformed_data) if type(transformed_data) != _tc.SFrame: - raise TypeError("The transform function in step '%s' did not return" - " an SFrame." % name) + raise TypeError( + "The transform function in step '%s' did not return" + " an SFrame." % name + ) return transformed_data def _list_fields(self): diff --git a/src/python/turicreate/toolkits/_feature_engineering/_word_counter.py b/src/python/turicreate/toolkits/_feature_engineering/_word_counter.py index fb24189472..7d54bf76ec 100644 --- a/src/python/turicreate/toolkits/_feature_engineering/_word_counter.py +++ b/src/python/turicreate/toolkits/_feature_engineering/_word_counter.py @@ -13,13 +13,14 @@ from turicreate.toolkits._internal_utils import _toolkit_repr_print from turicreate.toolkits._internal_utils import _precomputed_field from turicreate.util import _raise_error_if_not_of_type + # Feature engineering utils from . import _internal_utils _NoneType = type(None) -_fit_examples_doc = ''' +_fit_examples_doc = """ >>> import turicreate as tc # Create the data @@ -40,9 +41,9 @@ # features. >>> encoder['features'] ['dict', 'list', 'string'] -''' +""" -_fit_transform_examples_doc = ''' +_fit_transform_examples_doc = """ >>> import turicreate as tc # Create the data @@ -85,9 +86,9 @@ | {'two...': 1, 'sentence': 1} | +------------------------------+ [2 rows x 3 columns] -''' +""" -_transform_examples_doc = ''' +_transform_examples_doc = """ >>> import turicreate as tc # For list columns (string elements converted to lower case by default): @@ -146,12 +147,11 @@ | {'this': 1, 'is': 1, 'example': 3, 'another': 2} | +--------------------------------------------------+ [2 rows x 1 columns] -''' - +""" class WordCounter(Transformer): - ''' + """ __init__(features=None, excluded_features=None, to_lower=True, delimiters=["\\\\r", "\\\\v", "\\\\n", "\\\\f", "\\\\t", " "], output_column_prefix=None) @@ -304,19 +304,26 @@ class WordCounter(Transformer): # Save the transformer. >>> encoder.save('save-path') -''' +""" # Doc strings _fit_examples_doc = _fit_examples_doc _fit_transform_examples_doc = _fit_transform_examples_doc - _transform_examples_doc = _transform_examples_doc + _transform_examples_doc = _transform_examples_doc - def __init__(self, features=None, excluded_features=None, - to_lower=True, delimiters=["\r", "\v", "\n", "\f", "\t", " "], - output_column_prefix=None): + def __init__( + self, + features=None, + excluded_features=None, + to_lower=True, + delimiters=["\r", "\v", "\n", "\f", "\t", " "], + output_column_prefix=None, + ): # Process and make a copy of the features, exclude. - _features, _exclude = _internal_utils.process_features(features, excluded_features) + _features, _exclude = _internal_utils.process_features( + features, excluded_features + ) # Type checking _raise_error_if_not_of_type(features, [list, str, _NoneType]) @@ -328,22 +335,22 @@ def __init__(self, features=None, excluded_features=None, if delimiters is not None: for delim in delimiters: _raise_error_if_not_of_type(delim, str, "delimiters") - if (len(delim) != 1): + if len(delim) != 1: raise ValueError("Delimiters must be single-character strings") # Set up options opts = { - 'features': features, - 'to_lower': to_lower, - 'delimiters': delimiters, - 'output_column_prefix' : output_column_prefix + "features": features, + "to_lower": to_lower, + "delimiters": delimiters, + "output_column_prefix": output_column_prefix, } if _exclude: - opts['exclude'] = True - opts['features'] = _exclude + opts["exclude"] = True + opts["features"] = _exclude else: - opts['exclude'] = False - opts['features'] = _features + opts["exclude"] = False + opts["features"] = _features # Initialize object proxy = _tc.extensions._WordCounter() @@ -352,14 +359,15 @@ def __init__(self, features=None, excluded_features=None, def _get_summary_struct(self): _features = _precomputed_field( - _internal_utils.pretty_print_list(self.get('features'))) + _internal_utils.pretty_print_list(self.get("features")) + ) fields = [ ("Features", _features), - ("Convert strings to lower case", 'to_lower'), + ("Convert strings to lower case", "to_lower"), ("Delimiters", "delimiters"), - ("Output column prefix", 'output_column_prefix') + ("Output column prefix", "output_column_prefix"), ] - section_titles = ['Model fields'] + section_titles = ["Model fields"] return ([fields], section_titles) def __repr__(self): @@ -369,8 +377,13 @@ def __repr__(self): @classmethod def _get_instance_and_data(self): sf = _tc.SFrame( - {'docs': [{'this': 1, 'is': 1, 'a': 2, 'sample': 1}, - {'this': 1, 'is': 1, 'another': 2, 'example': 3}]}) - encoder = WordCounter('docs') + { + "docs": [ + {"this": 1, "is": 1, "a": 2, "sample": 1}, + {"this": 1, "is": 1, "another": 2, "example": 3}, + ] + } + ) + encoder = WordCounter("docs") encoder = encoder.fit(sf) return encoder, sf diff --git a/src/python/turicreate/toolkits/_feature_engineering/_word_trimmer.py b/src/python/turicreate/toolkits/_feature_engineering/_word_trimmer.py index 02c088c943..c3e7fafcbe 100644 --- a/src/python/turicreate/toolkits/_feature_engineering/_word_trimmer.py +++ b/src/python/turicreate/toolkits/_feature_engineering/_word_trimmer.py @@ -14,11 +14,12 @@ from turicreate.toolkits._internal_utils import _precomputed_field from turicreate.toolkits._private_utils import _summarize_accessible_fields from turicreate.util import _raise_error_if_not_of_type + # Feature engineering utils from . import _internal_utils -_fit_examples_doc = ''' +_fit_examples_doc = """ >>> import turicreate as tc # Create the data @@ -61,9 +62,9 @@ | string | sentence | 2 | +--------+----------+-------+ [6 rows x 3 columns] -''' +""" -_fit_transform_examples_doc = ''' +_fit_transform_examples_doc = """ >>> import turicreate as tc # Create the data @@ -100,9 +101,9 @@ | {'this': 1, 'is': 1, 'exam... | [two, two] | sentence | +-------------------------------+------------+----------+ [2 rows x 3 columns] -''' +""" -_transform_examples_doc = ''' +_transform_examples_doc = """ >>> import turicreate as tc # For list columns (string elements converted to lower case by default): @@ -167,11 +168,11 @@ | {'this': 1, 'is': 1, 'exam... | +-------------------------------+ [2 rows x 1 columns] -''' +""" class RareWordTrimmer(Transformer): - ''' + """ Remove words that occur below a certain number of times in a given column. This is a common method of cleaning text before it is used, and can increase the quality and explainability of the models learned on the transformed data. @@ -291,20 +292,28 @@ class RareWordTrimmer(Transformer): # Save the transformer. >>> trimmer.save('save-path') -''' +""" # Doc strings _fit_examples_doc = _fit_examples_doc _transform_examples_doc = _transform_examples_doc _fit_transform_examples_doc = _fit_transform_examples_doc - def __init__(self, features=None, excluded_features=None, - threshold=2,stopwords=None,to_lower=True, delimiters=["\r", "\v", "\n", "\f", "\t", " "], -output_column_prefix = None): + def __init__( + self, + features=None, + excluded_features=None, + threshold=2, + stopwords=None, + to_lower=True, + delimiters=["\r", "\v", "\n", "\f", "\t", " "], + output_column_prefix=None, + ): # Process and make a copy of the features, exclude. _features, _exclude = _internal_utils.process_features( - features, excluded_features) + features, excluded_features + ) # Type checking _raise_error_if_not_of_type(features, [list, str, type(None)]) @@ -317,25 +326,23 @@ def __init__(self, features=None, excluded_features=None, if delimiters is not None: for delim in delimiters: _raise_error_if_not_of_type(delim, str, "delimiters") - if (len(delim) != 1): + if len(delim) != 1: raise ValueError("Delimiters must be single-character strings") - - # Set up options opts = { - 'threshold': threshold, - 'output_column_prefix': output_column_prefix, - 'to_lower' : to_lower, - 'stopwords' : stopwords, - 'delimiters': delimiters + "threshold": threshold, + "output_column_prefix": output_column_prefix, + "to_lower": to_lower, + "stopwords": stopwords, + "delimiters": delimiters, } if _exclude: - opts['exclude'] = True - opts['features'] = _exclude + opts["exclude"] = True + opts["features"] = _exclude else: - opts['exclude'] = False - opts['features'] = _features + opts["exclude"] = False + opts["features"] = _features # Initialize object proxy = _tc.extensions._RareWordTrimmer() @@ -360,22 +367,25 @@ def _get_summary_struct(self): The order matches that of the 'sections' object. """ _features = _precomputed_field( - _internal_utils.pretty_print_list(self.get('features'))) + _internal_utils.pretty_print_list(self.get("features")) + ) _exclude = _precomputed_field( - _internal_utils.pretty_print_list(self.get('excluded_features'))) + _internal_utils.pretty_print_list(self.get("excluded_features")) + ) _stopwords = _precomputed_field( - _internal_utils.pretty_print_list(self.get('stopwords'))) + _internal_utils.pretty_print_list(self.get("stopwords")) + ) fields = [ ("Features", _features), ("Excluded features", _exclude), - ("Output column name", 'output_column_prefix'), - ("Word count threshold", 'threshold'), + ("Output column name", "output_column_prefix"), + ("Word count threshold", "threshold"), ("Manually specified stopwords", _stopwords), ("Whether to convert to lowercase", "to_lower"), - ("Delimiters" , "delimiters") + ("Delimiters", "delimiters"), ] - section_titles = ['Model fields'] + section_titles = ["Model fields"] return ([fields], section_titles) @@ -389,8 +399,7 @@ def __repr__(self): out : string A description of the model. """ - accessible_fields = { - "vocabulary": "The vocabulary of the trimmed input."} + accessible_fields = {"vocabulary": "The vocabulary of the trimmed input."} (sections, section_titles) = self._get_summary_struct() out = _toolkit_repr_print(self, sections, section_titles, width=30) out2 = _summarize_accessible_fields(accessible_fields, width=30) @@ -398,7 +407,6 @@ def __repr__(self): @classmethod def _get_instance_and_data(cls): - sf = _tc.SFrame({'a' : ['dog', 'dog' , 'dog'], 'b' : ['cat', 'one' ,'one']}) - trimmer = RareWordTrimmer( - features = ['a', 'b']) + sf = _tc.SFrame({"a": ["dog", "dog", "dog"], "b": ["cat", "one", "one"]}) + trimmer = RareWordTrimmer(features=["a", "b"]) return trimmer.fit(sf), sf diff --git a/src/python/turicreate/toolkits/_image_feature_extractor.py b/src/python/turicreate/toolkits/_image_feature_extractor.py index 1ad5706dff..660ba5d8a0 100644 --- a/src/python/turicreate/toolkits/_image_feature_extractor.py +++ b/src/python/turicreate/toolkits/_image_feature_extractor.py @@ -10,6 +10,7 @@ from ._pre_trained_models import _get_cache_dir import turicreate.toolkits._tf_utils as _utils + def _create_feature_extractor(model_name): from platform import system from ._internal_utils import _mac_ver @@ -17,19 +18,18 @@ def _create_feature_extractor(model_name): from turicreate import extensions # If we don't have Core ML, use a TensorFlow model. - if system() != 'Darwin' or _mac_ver() < (10, 13): + if system() != "Darwin" or _mac_ver() < (10, 13): ptModel = IMAGE_MODELS[model_name]() return TensorFlowFeatureExtractor(ptModel) download_path = _get_cache_dir() result = extensions.__dict__["image_deep_feature_extractor"]() - result.init_options({'model_name': model_name, 'download_path': download_path}) + result.init_options({"model_name": model_name, "download_path": download_path}) return result class ImageFeatureExtractor(object): - def __init__(self, ptModel): """ Parameters @@ -83,7 +83,7 @@ def __init__(self, ptModel): self.coreml_data_layer = ptModel.coreml_data_layer self.coreml_feature_layer = ptModel.coreml_feature_layer - model_path = ptModel.get_model_path('tensorflow') + model_path = ptModel.get_model_path("tensorflow") self.model = keras.models.load_model(model_path) def __del__(self): @@ -95,25 +95,25 @@ def extract_features(self, dataset, feature, batch_size=64, verbose=False): import numpy as np # Only expose the feature column to the SFrame-to-NumPy loader. - image_sf = tc.SFrame({'image' : dataset[feature]}) + image_sf = tc.SFrame({"image": dataset[feature]}) # Encapsulate state in a dict to sidestep variable scoping issues. state = {} - state['num_started'] = 0 # Images read from the SFrame - state['num_processed'] = 0 # Images processed by TensorFlow - state['total'] = len(dataset) # Images in the dataset + state["num_started"] = 0 # Images read from the SFrame + state["num_processed"] = 0 # Images processed by TensorFlow + state["total"] = len(dataset) # Images in the dataset # We should be using SArrayBuilder, but it doesn't accept ndarray yet. # TODO: https://github.com/apple/turicreate/issues/2672 - #out = _tc.SArrayBuilder(dtype = array.array) - state['out'] = tc.SArray(dtype=array) + # out = _tc.SArrayBuilder(dtype = array.array) + state["out"] = tc.SArray(dtype=array) if verbose: print("Performing feature extraction on resized images...") # Provide an iterator-like interface around the SFrame. def has_next_batch(): - return state['num_started'] < state['total'] + return state["num_started"] < state["total"] # Yield a numpy array representing one batch of images. def next_batch(batch): @@ -122,9 +122,9 @@ def next_batch(batch): return None # Compute the range of the SFrame to yield. - start_index = state['num_started'] - end_index = min(start_index + batch_size, state['total']) - state['num_started'] = end_index + start_index = state["num_started"] + end_index = min(start_index + batch_size, state["total"]) + state["num_started"] = end_index num_images = end_index - start_index shape = (num_images,) + self.ptModel.input_image_shape @@ -134,9 +134,13 @@ def next_batch(batch): # Resize and load the images. future = tc.extensions.sframe_load_to_numpy.run_background( - image_sf, batch.ctypes.data, - batch.strides, batch.shape, - start_index, end_index) + image_sf, + batch.ctypes.data, + batch.strides, + batch.shape, + start_index, + end_index, + ) return future, batch @@ -150,7 +154,7 @@ def ready_batch(batch_info): batch = batch.transpose(0, 2, 3, 1) # NCHW -> NHWC if self.ptModel.input_is_BGR: - batch = batch[:,:,:,::-1] # RGB -> BGR + batch = batch[:, :, :, ::-1] # RGB -> BGR return batch @@ -164,55 +168,60 @@ def handle_request(batch): # progress. def consume_response(tf_out): sa = tc.SArray(tf_out, dtype=array) - state['out'] = state['out'].append(sa) + state["out"] = state["out"].append(sa) - state['num_processed'] += len(tf_out) + state["num_processed"] += len(tf_out) if verbose: - print('Completed {num_processed:{width}d}/{total:{width}d}'.format( - width = len(str(state['total'])), **state)) - + print( + "Completed {num_processed:{width}d}/{total:{width}d}".format( + width=len(str(state["total"])), **state + ) + ) + # These two arrays will swap off to avoid unnecessary allocations. + state["batch_store"] = [] - # These two arrays will swap off to avoid unnecessary allocations. - state['batch_store'] = [] def get_batch_array(): - batch_store = state['batch_store'] - + batch_store = state["batch_store"] + if not batch_store: - batch_store.append(np.zeros((batch_size,) + self.ptModel.input_image_shape, dtype=np.float32)) + batch_store.append( + np.zeros( + (batch_size,) + self.ptModel.input_image_shape, dtype=np.float32 + ) + ) return batch_store.pop() def batch_array_done(b): - state['batch_store'].append(b) - + state["batch_store"].append(b) # Seed the iteration batch_info = next_batch(get_batch_array()) - + # Iterate through the image batches, converting them into batches - # of feature vectors. Do the + # of feature vectors. Do the while batch_info is not None: # Get the now ready batch to process batch = ready_batch(batch_info) - # Start the next one in the background. + # Start the next one in the background. # Returns None if done. batch_info = next_batch(get_batch_array()) - # Now, process all this. + # Now, process all this. predictions_from_tf = handle_request(batch) consume_response(predictions_from_tf) - # Requeue the batch array now that we're done. + # Requeue the batch array now that we're done. batch_array_done(batch) - # Now we have this compiled in - return state['out'] + # Now we have this compiled in + return state["out"] def get_coreml_model(self): import coremltools - model_path = self.ptModel.get_model_path('coreml') + model_path = self.ptModel.get_model_path("coreml") return coremltools.models.MLModel(model_path) diff --git a/src/python/turicreate/toolkits/_internal_utils.py b/src/python/turicreate/toolkits/_internal_utils.py index 40ebbdf5b8..a27f08f251 100644 --- a/src/python/turicreate/toolkits/_internal_utils.py +++ b/src/python/turicreate/toolkits/_internal_utils.py @@ -26,9 +26,12 @@ import six as _six -_proxy_map = {UnitySFrameProxy: (lambda x: _SFrame(_proxy=x)), - UnitySArrayProxy: (lambda x: _SArray(_proxy=x)), - UnityGraphProxy: (lambda x: _SGraph(_proxy=x))} +_proxy_map = { + UnitySFrameProxy: (lambda x: _SFrame(_proxy=x)), + UnitySArrayProxy: (lambda x: _SArray(_proxy=x)), + UnityGraphProxy: (lambda x: _SGraph(_proxy=x)), +} + def _read_env_var_cpp(var_name): """ @@ -39,8 +42,10 @@ def _read_env_var_cpp(var_name): # TODO: Remove the old implementations, and this function, once the dust has # settled. import os as _os + return bool(int(_os.environ.get(var_name, "1"))) + def _toolkit_serialize_summary_struct(model, sections, section_titles): """ Serialize model summary into a dict with ordered lists of sections and section titles @@ -64,10 +69,14 @@ def _toolkit_serialize_summary_struct(model, sections, section_titles): 'section_titles' : ordered list of section labels """ output_dict = dict() - output_dict['sections'] = [ [ ( field[0], __extract_model_summary_value(model, field[1]) ) \ - for field in section ] - for section in sections ] - output_dict['section_titles'] = section_titles + output_dict["sections"] = [ + [ + (field[0], __extract_model_summary_value(model, field[1])) + for field in section + ] + for section in sections + ] + output_dict["section_titles"] = section_titles return output_dict @@ -85,11 +94,14 @@ def predict(x): ''' return x """ + def add_docstring_context(func): def wrapper(*args, **kwargs): return func(*args, **kwargs) + wrapper.__doc__ = func.__doc__.format(**format_dict) return wrapper + return add_docstring_context @@ -106,13 +118,24 @@ def _find_only_column_of_type(sframe, target_type, type_name, col_name): for name, ctype in zip(sframe.column_names(), sframe.column_types()): if ctype in target_type: if image_column_name is not None: - raise ToolkitError('No "{col_name}" column specified and more than one {type_name} column in "dataset". Can not infer correct {col_name} column.'.format(col_name=col_name, type_name=type_name)) + raise ToolkitError( + 'No "{col_name}" column specified and more than one {type_name} column in "dataset". Can not infer correct {col_name} column.'.format( + col_name=col_name, type_name=type_name + ) + ) image_column_name = name if image_column_name is None: - raise ToolkitError('No "{col_name}" column specified and no column with expected type "{type_name}" is found.'.format(col_name=col_name, type_name=type_name) - + ' "datasets" consists of columns with types: '+ ', '.join([x.__name__ for x in sframe.column_types()])+'.') + raise ToolkitError( + 'No "{col_name}" column specified and no column with expected type "{type_name}" is found.'.format( + col_name=col_name, type_name=type_name + ) + + ' "datasets" consists of columns with types: ' + + ", ".join([x.__name__ for x in sframe.column_types()]) + + "." + ) return image_column_name + def _find_only_image_column(sframe): """ Finds the only column in `sframe` with a type of turicreate.Image. @@ -120,8 +143,11 @@ def _find_only_image_column(sframe): be raised. """ from turicreate import Image - return _find_only_column_of_type(sframe, target_type=Image, - type_name='image', col_name='feature') + + return _find_only_column_of_type( + sframe, target_type=Image, type_name="image", col_name="feature" + ) + def _find_only_drawing_column(sframe): """ @@ -133,29 +159,35 @@ def _find_only_drawing_column(sframe): raised. """ from turicreate import Image + bitmap_success, stroke_success = False, False bitmap_error, stroke_error = None, None feature = None try: - feature = _find_only_column_of_type(sframe, - target_type=Image, type_name='drawing', col_name='feature') + feature = _find_only_column_of_type( + sframe, target_type=Image, type_name="drawing", col_name="feature" + ) bitmap_success = True except ToolkitError as err_from_bitmap_search: bitmap_error = err_from_bitmap_search try: - feature = _find_only_column_of_type(sframe, - target_type=list, type_name='drawing', col_name='feature') + feature = _find_only_column_of_type( + sframe, target_type=list, type_name="drawing", col_name="feature" + ) stroke_success = True except ToolkitError as err_from_stroke_search: stroke_error = err_from_stroke_search - more_than_one_image_columns = ("more than one" in str(bitmap_error) - if not bitmap_success else False) - more_than_one_stroke_columns = ("more than one" in str(stroke_error) - if not stroke_success else False) + more_than_one_image_columns = ( + "more than one" in str(bitmap_error) if not bitmap_success else False + ) + more_than_one_stroke_columns = ( + "more than one" in str(stroke_error) if not stroke_success else False + ) - corrective_action_for_user = ("\nThe feature column must contain either " + corrective_action_for_user = ( + "\nThe feature column must contain either " + "bitmap-based drawings or stroke-based drawings but not both.\n" + "Bitmap-based drawing input must be a grayscale " + "tc.Image of any size.\n" @@ -165,72 +197,111 @@ def _find_only_drawing_column(sframe): + "drawn on the canvas. " + "Every point must be a dictionary with two keys, 'x' and 'y', and " + "their respective values must be numerical, " - + "i.e. either integer or float.") + + "i.e. either integer or float." + ) - error_message = (lambda num1, type1, input1, num2, type2, input2: - (("No 'feature' column specified. Found {num1} column with type " + error_message = lambda num1, type1, input1, num2, type2, input2: ( + ( + "No 'feature' column specified. Found {num1} column with type " + "{type1} (for {input1}-based drawing input) and " + "{num2} column with type {type2} (for {input2}-based drawing " + "input) in 'input_dataset'. " - + "Can not infer correct 'feature' column.").format( - num1=num1, input1=input1, type1=type1, - num2=num2, input2=input2, type2=type2) - ) + + "Can not infer correct 'feature' column." + ).format( + num1=num1, input1=input1, type1=type1, num2=num2, input2=input2, type2=type2 ) + ) - if (bitmap_success ^ stroke_success + if ( + bitmap_success ^ stroke_success and not more_than_one_image_columns - and not more_than_one_stroke_columns): + and not more_than_one_stroke_columns + ): # success! # found exactly one of bitmap-based drawing column and # stroke-based drawing column, and found none of the other. return feature elif bitmap_success and stroke_success: - raise ToolkitError(error_message( - "one", "turicreate.Image", "bitmap", "one", "list", "stroke") - + corrective_action_for_user) + raise ToolkitError( + error_message("one", "turicreate.Image", "bitmap", "one", "list", "stroke") + + corrective_action_for_user + ) else: if more_than_one_image_columns and more_than_one_stroke_columns: - raise ToolkitError(error_message( - "more than one", "turicreate.Image", "bitmap", - "more than one", "list", "stroke") - + corrective_action_for_user) + raise ToolkitError( + error_message( + "more than one", + "turicreate.Image", + "bitmap", + "more than one", + "list", + "stroke", + ) + + corrective_action_for_user + ) elif more_than_one_image_columns and not more_than_one_stroke_columns: - raise ToolkitError(error_message( - "more than one", "turicreate.Image", "bitmap", - "no", "list", "stroke") - + corrective_action_for_user) + raise ToolkitError( + error_message( + "more than one", + "turicreate.Image", + "bitmap", + "no", + "list", + "stroke", + ) + + corrective_action_for_user + ) elif not more_than_one_image_columns and more_than_one_stroke_columns: - raise ToolkitError(error_message( - "more than one", "list", "stroke", - "no", "turicreate.Image", "bitmap") - + corrective_action_for_user) + raise ToolkitError( + error_message( + "more than one", + "list", + "stroke", + "no", + "turicreate.Image", + "bitmap", + ) + + corrective_action_for_user + ) else: - raise ToolkitError(error_message( - "no", "list", "stroke", - "no", "turicreate.Image", "bitmap") - + corrective_action_for_user) + raise ToolkitError( + error_message( + "no", "list", "stroke", "no", "turicreate.Image", "bitmap" + ) + + corrective_action_for_user + ) + def _SGraphFromJsonTree(json_str): """ Convert the Json Tree to SGraph """ g = json.loads(json_str) - vertices = [_Vertex(x['id'], - dict([(str(k), v) for k, v in _six.iteritems(x) if k != 'id'])) - for x in g['vertices']] - edges = [_Edge(x['src'], x['dst'], - dict([(str(k), v) for k, v in _six.iteritems(x) if k != 'src' and k != 'dst'])) - for x in g['edges']] + vertices = [ + _Vertex(x["id"], dict([(str(k), v) for k, v in _six.iteritems(x) if k != "id"])) + for x in g["vertices"] + ] + edges = [ + _Edge( + x["src"], + x["dst"], + dict( + [(str(k), v) for k, v in _six.iteritems(x) if k != "src" and k != "dst"] + ), + ) + for x in g["edges"] + ] sg = _SGraph().add_vertices(vertices) if len(edges) > 0: sg = sg.add_edges(edges) return sg + class _precomputed_field(object): def __init__(self, field): self.field = field + def _summarize_coefficients(top_coefs, bottom_coefs): """ Return a tuple of sections and section titles. @@ -252,27 +323,31 @@ def _summarize_coefficients(top_coefs, bottom_coefs): """ def get_row_name(row): - if row['index'] is None: - return row['name'] + if row["index"] is None: + return row["name"] else: - return "%s[%s]" % (row['name'], row['index']) + return "%s[%s]" % (row["name"], row["index"]) if len(top_coefs) == 0: - top_coefs_list = [('No Positive Coefficients', _precomputed_field('') )] + top_coefs_list = [("No Positive Coefficients", _precomputed_field(""))] else: - top_coefs_list = [ (get_row_name(row), - _precomputed_field(row['value'])) \ - for row in top_coefs ] + top_coefs_list = [ + (get_row_name(row), _precomputed_field(row["value"])) for row in top_coefs + ] if len(bottom_coefs) == 0: - bottom_coefs_list = [('No Negative Coefficients', _precomputed_field(''))] + bottom_coefs_list = [("No Negative Coefficients", _precomputed_field(""))] else: - bottom_coefs_list = [ (get_row_name(row), - _precomputed_field(row['value'])) \ - for row in bottom_coefs ] + bottom_coefs_list = [ + (get_row_name(row), _precomputed_field(row["value"])) + for row in bottom_coefs + ] + + return ( + [top_coefs_list, bottom_coefs_list], + ["Highest Positive Coefficients", "Lowest Negative Coefficients"], + ) - return ([top_coefs_list, bottom_coefs_list], \ - [ 'Highest Positive Coefficients', 'Lowest Negative Coefficients'] ) def _toolkit_get_topk_bottomk(values, k=5): """ @@ -296,17 +371,19 @@ def _toolkit_get_topk_bottomk(values, k=5): values """ - top_values = values.topk('value', k=k) - top_values = top_values[top_values['value'] > 0] + top_values = values.topk("value", k=k) + top_values = top_values[top_values["value"] > 0] - bottom_values = values.topk('value', k=k, reverse=True) - bottom_values = bottom_values[bottom_values['value'] < 0] + bottom_values = values.topk("value", k=k, reverse=True) + bottom_values = bottom_values[bottom_values["value"] < 0] return (top_values, bottom_values) + def _toolkit_summary_dict_to_json(summary_dict): return json.dumps(summary_dict, allow_nan=False, ensure_ascii=False) + def _toolkit_summary_to_json(model, sections, section_titles): """ Serialize model summary to JSON string. JSON is an object with ordered arrays of @@ -324,9 +401,10 @@ def _toolkit_summary_to_json(model, sections, section_titles): section_titles : Ordered list of section titles """ - return _toolkit_summary_dict_to_json( \ - _toolkit_serialize_summary_struct( \ - model, sections, section_titles) ) + return _toolkit_summary_dict_to_json( + _toolkit_serialize_summary_struct(model, sections, section_titles) + ) + def __extract_model_summary_value(model, value): """ @@ -344,6 +422,7 @@ def __extract_model_summary_value(model, value): pass return field_value + def _make_repr_table_from_sframe(X): """ Serializes an SFrame to a list of strings, that, when printed, creates a well-formatted table. @@ -353,7 +432,7 @@ def _make_repr_table_from_sframe(X): column_names = X.column_names() - out_data = [ [None]*len(column_names) for i in range(X.num_rows())] + out_data = [[None] * len(column_names) for i in range(X.num_rows())] column_sizes = [len(s) for s in column_names] @@ -363,14 +442,15 @@ def _make_repr_table_from_sframe(X): column_sizes[i] = max(column_sizes[i], len(e)) # now, go through and pad everything. - out_data = ([ [cn.ljust(k, ' ') for cn, k in zip(column_names, column_sizes)], - ["-"*k for k in column_sizes] ] - + [ [e.ljust(k, ' ') for e, k in zip(row, column_sizes)] for row in out_data] ) + out_data = [ + [cn.ljust(k, " ") for cn, k in zip(column_names, column_sizes)], + ["-" * k for k in column_sizes], + ] + [[e.ljust(k, " ") for e, k in zip(row, column_sizes)] for row in out_data] - return [' '.join(row) for row in out_data] + return [" ".join(row) for row in out_data] -def _toolkit_repr_print(model, fields, section_titles, width = None, class_name = 'auto'): +def _toolkit_repr_print(model, fields, section_titles, width=None, class_name="auto"): """ Display a toolkit repr according to some simple rules. @@ -413,14 +493,18 @@ def _toolkit_repr_print(model, fields, section_titles, width = None, class_name _toolkit_repr_print(model, fields, section_titles) """ - assert len(section_titles) == len(fields), \ - "The number of section titles ({0}) ".format(len(section_titles)) +\ - "doesn't match the number of groups of fields, {0}.".format(len(fields)) + assert len(section_titles) == len( + fields + ), "The number of section titles ({0}) ".format( + len(section_titles) + ) + "doesn't match the number of groups of fields, {0}.".format( + len(fields) + ) - if class_name == 'auto': - out_fields = [ ("Class", model.__class__.__name__), ""] + if class_name == "auto": + out_fields = [("Class", model.__class__.__name__), ""] else: - out_fields = [ ("Class", class_name), ""] + out_fields = [("Class", class_name), ""] # Record the max_width so that if width is not provided, we calculate it. max_width = len("Class") @@ -428,13 +512,13 @@ def _toolkit_repr_print(model, fields, section_titles, width = None, class_name for index, (section_title, field_list) in enumerate(zip(section_titles, fields)): # Add in the section header. - out_fields += [section_title, "-"*len(section_title)] + out_fields += [section_title, "-" * len(section_title)] # Add in all the key-value pairs for f in field_list: if isinstance(f, tuple): f = (str(f[0]), f[1]) - out_fields.append( (f[0], __extract_model_summary_value(model, f[1])) ) + out_fields.append((f[0], __extract_model_summary_value(model, f[1]))) max_width = max(max_width, len(f[0])) elif isinstance(f, _SFrame): out_fields.append("") @@ -452,12 +536,14 @@ def _toolkit_repr_print(model, fields, section_titles, width = None, class_name # Now, go through and format the key_value pairs nicely. def format_key_pair(key, value): if type(key) is list: - key = ','.join(str(k) for k in key) + key = ",".join(str(k) for k in key) + + return key.ljust(width, " ") + " : " + str(value) - return key.ljust(width, ' ') + ' : ' + str(value) out_fields = [s if type(s) is str else format_key_pair(*s) for s in out_fields] - return '\n'.join(out_fields) + return "\n".join(out_fields) + def _map_unity_proxy_to_object(value): """ @@ -469,10 +555,11 @@ def _map_unity_proxy_to_object(value): elif vtype == list: return [_map_unity_proxy_to_object(v) for v in value] elif vtype == dict: - return {k:_map_unity_proxy_to_object(v) for k,v in value.items()} + return {k: _map_unity_proxy_to_object(v) for k, v in value.items()} else: return value + def _toolkits_select_columns(dataset, columns): """ Same as select columns but redirect runtime error to ToolkitError. @@ -481,32 +568,40 @@ def _toolkits_select_columns(dataset, columns): return dataset.select_columns(columns) except RuntimeError: missing_features = list(set(columns).difference(set(dataset.column_names()))) - raise ToolkitError("Input data does not contain the following columns: " + - "{}".format(missing_features)) + raise ToolkitError( + "Input data does not contain the following columns: " + + "{}".format(missing_features) + ) + -def _raise_error_if_column_exists(dataset, column_name = 'dataset', - dataset_variable_name = 'dataset', - column_name_error_message_name = 'column_name'): +def _raise_error_if_column_exists( + dataset, + column_name="dataset", + dataset_variable_name="dataset", + column_name_error_message_name="column_name", +): """ Check if a column exists in an SFrame with error message. """ - err_msg = 'The SFrame {0} must contain the column {1}.'.format( - dataset_variable_name, - column_name_error_message_name) + err_msg = "The SFrame {0} must contain the column {1}.".format( + dataset_variable_name, column_name_error_message_name + ) if column_name not in dataset.column_names(): - raise ToolkitError(str(err_msg)) + raise ToolkitError(str(err_msg)) + def _check_categorical_option_type(option_name, option_value, possible_values): """ Check whether or not the requested option is one of the allowed values. """ - err_msg = '{0} is not a valid option for {1}. '.format(option_value, option_name) - err_msg += ' Expected one of: '.format(possible_values) + err_msg = "{0} is not a valid option for {1}. ".format(option_value, option_name) + err_msg += " Expected one of: ".format(possible_values) - err_msg += ', '.join(map(str, possible_values)) + err_msg += ", ".join(map(str, possible_values)) if option_value not in possible_values: raise ToolkitError(err_msg) + def _raise_error_if_not_sarray(dataset, variable_name="SArray"): """ Check if the input is an SArray. Provide a proper error @@ -514,14 +609,18 @@ def _raise_error_if_not_sarray(dataset, variable_name="SArray"): """ err_msg = "Input %s is not an SArray." if not isinstance(dataset, _SArray): - raise ToolkitError(err_msg % variable_name) + raise ToolkitError(err_msg % variable_name) + def _raise_error_if_sarray_not_expected_dtype(sa, name, types): - err_msg = "Column '%s' cannot be of type %s. Expecting a column of type in [%s]." % \ - (name, sa.dtype.__name__, ', '.join([x.__name__ for x in types])) + err_msg = ( + "Column '%s' cannot be of type %s. Expecting a column of type in [%s]." + % (name, sa.dtype.__name__, ", ".join([x.__name__ for x in types])) + ) if sa.dtype not in types: raise ToolkitError(err_msg) + def _raise_error_if_not_sframe(dataset, variable_name="SFrame"): """ Check if the input is an SFrame. Provide a proper error @@ -531,7 +630,8 @@ def _raise_error_if_not_sframe(dataset, variable_name="SFrame"): err_msg += " you may use the to_sframe() function to convert it to an SFrame." if not isinstance(dataset, _SFrame): - raise ToolkitError(err_msg % variable_name) + raise ToolkitError(err_msg % variable_name) + def _raise_error_if_sframe_empty(dataset, variable_name="SFrame"): """ @@ -543,15 +643,17 @@ def _raise_error_if_sframe_empty(dataset, variable_name="SFrame"): if dataset.num_rows() == 0 or dataset.num_columns() == 0: raise ToolkitError(err_msg % variable_name) + def _raise_error_if_not_iterable(dataset, variable_name="SFrame"): """ Check if the input is iterable. """ err_msg = "Input %s is not iterable: hasattr(%s, '__iter__') must be true." - if not hasattr(dataset, '__iter__'): + if not hasattr(dataset, "__iter__"): raise ToolkitError(err_msg % variable_name) + def _raise_error_evaluation_metric_is_valid(metric, allowed_metrics): """ Check if the input is an SFrame. Provide a proper error @@ -562,8 +664,10 @@ def _raise_error_evaluation_metric_is_valid(metric, allowed_metrics): err_msg += " metrics are (%s)." if metric not in allowed_metrics: - raise ToolkitError(err_msg % (metric, - ', '.join(map(lambda x: "'%s'" % x, allowed_metrics)))) + raise ToolkitError( + err_msg % (metric, ", ".join(map(lambda x: "'%s'" % x, allowed_metrics))) + ) + def _numeric_param_check_range(variable_name, variable_value, range_bottom, range_top): """ @@ -574,7 +678,8 @@ def _numeric_param_check_range(variable_name, variable_value, range_bottom, rang if variable_value < range_bottom or variable_value > range_top: raise ToolkitError(err_msg % (variable_name, range_bottom, range_top)) -def _validate_data(dataset, target, features=None, validation_set='auto'): + +def _validate_data(dataset, target, features=None, validation_set="auto"): """ Validate and canonicalize training and validation data. @@ -611,38 +716,46 @@ def _validate_data(dataset, target, features=None, validation_set='auto'): # Determine columns to keep if features is None: features = [feat for feat in dataset.column_names() if feat != target] - if not hasattr(features, '__iter__'): + if not hasattr(features, "__iter__"): raise TypeError("Input 'features' must be a list.") if not all([isinstance(x, str) for x in features]): - raise TypeError( - "Invalid feature %s: Feature names must be of type str" % x) + raise TypeError("Invalid feature %s: Feature names must be of type str" % x) # Check validation_set argument if isinstance(validation_set, str): # Only string value allowed is 'auto' - if validation_set != 'auto': - raise TypeError('Unrecognized value for validation_set.') + if validation_set != "auto": + raise TypeError("Unrecognized value for validation_set.") elif isinstance(validation_set, _SFrame): # Attempt to append the two datasets together to check schema validation_set.head().append(dataset.head()) # Reduce validation set to requested columns - validation_set = _toolkits_select_columns( - validation_set, features + [target]) + validation_set = _toolkits_select_columns(validation_set, features + [target]) elif not validation_set is None: - raise TypeError("validation_set must be either 'auto', None, or an " - "SFrame matching the training data.") + raise TypeError( + "validation_set must be either 'auto', None, or an " + "SFrame matching the training data." + ) # Reduce training set to requested columns dataset = _toolkits_select_columns(dataset, features + [target]) return dataset, validation_set + def _handle_missing_values(dataset, feature_column_name, variable_name="dataset"): if any(feat is None for feat in dataset[feature_column_name]): - raise ToolkitError("Missing value (None) encountered in column " + str(feature_column_name) + " in the " + variable_name + ". Use the SFrame's dropna function to drop rows with 'None' values in them.") + raise ToolkitError( + "Missing value (None) encountered in column " + + str(feature_column_name) + + " in the " + + variable_name + + ". Use the SFrame's dropna function to drop rows with 'None' values in them." + ) + -def _validate_row_label(dataset, label=None, default_label='__id'): +def _validate_row_label(dataset, label=None, default_label="__id"): """ Validate a row label column. If the row label is not specified, a column is created with row numbers, named with the string in the `default_label` @@ -681,17 +794,21 @@ def _validate_row_label(dataset, label=None, default_label='__id'): i = 1 while label in dataset.column_names(): - label = label_name_base + '.{}'.format(i) + label = label_name_base + ".{}".format(i) i += 1 dataset = dataset.add_row_number(column_name=label) ## Validate the label name and types. if not isinstance(label, str): - raise TypeError("The row label column name '{}' must be a string.".format(label)) + raise TypeError( + "The row label column name '{}' must be a string.".format(label) + ) if not label in dataset.column_names(): - raise ToolkitError("Row label column '{}' not found in the dataset.".format(label)) + raise ToolkitError( + "Row label column '{}' not found in the dataset.".format(label) + ) if not dataset[label].dtype in (str, int): raise TypeError("Row labels must be integers or strings.") @@ -699,19 +816,23 @@ def _validate_row_label(dataset, label=None, default_label='__id'): ## Return the modified dataset and label return dataset, label + def _model_version_check(file_version, code_version): """ Checks if a saved model file with version (file_version) is compatible with the current code version (code_version). Throws an exception telling the user to upgrade. """ - if (file_version > code_version): - raise RuntimeError("Failed to load model file.\n\n" - "The model that you are trying to load was saved with a newer version of\n" - "Turi Create than what you have. Please upgrade before attempting to load\n" - "the file again:\n" - "\n" - " pip install -U turicreate\n") + if file_version > code_version: + raise RuntimeError( + "Failed to load model file.\n\n" + "The model that you are trying to load was saved with a newer version of\n" + "Turi Create than what you have. Please upgrade before attempting to load\n" + "the file again:\n" + "\n" + " pip install -U turicreate\n" + ) + def _mac_ver(): """ @@ -720,29 +841,37 @@ def _mac_ver(): """ import platform import sys - if sys.platform == 'darwin': + + if sys.platform == "darwin": ver_str = platform.mac_ver()[0] - return tuple([int(v) for v in ver_str.split('.')]) + return tuple([int(v) for v in ver_str.split(".")]) else: return () -def _print_neural_compute_device(cuda_gpus, use_mps, cuda_mem_req=None, has_mps_impl=True): + +def _print_neural_compute_device( + cuda_gpus, use_mps, cuda_mem_req=None, has_mps_impl=True +): """ Print a message making it clear to the user what compute resource is used in neural network training. """ num_cuda_gpus = len(cuda_gpus) if num_cuda_gpus >= 1: - gpu_names = ', '.join(gpu['name'] for gpu in cuda_gpus) + gpu_names = ", ".join(gpu["name"] for gpu in cuda_gpus) if use_mps: from ._mps_utils import mps_device_name - print('Using GPU to create model ({})'.format(mps_device_name())) + + print("Using GPU to create model ({})".format(mps_device_name())) elif num_cuda_gpus >= 1: - plural = 's' if num_cuda_gpus >= 2 else '' - print('Using GPU{} to create model ({})'.format(plural, gpu_names)) + plural = "s" if num_cuda_gpus >= 2 else "" + print("Using GPU{} to create model ({})".format(plural, gpu_names)) else: import sys - print('Using CPU to create model') - if sys.platform == 'darwin' and _mac_ver() < (10, 14) and has_mps_impl: - print('NOTE: If available, an AMD GPU can be leveraged on macOS 10.14+ for faster model creation') + + print("Using CPU to create model") + if sys.platform == "darwin" and _mac_ver() < (10, 14) and has_mps_impl: + print( + "NOTE: If available, an AMD GPU can be leveraged on macOS 10.14+ for faster model creation" + ) diff --git a/src/python/turicreate/toolkits/_main.py b/src/python/turicreate/toolkits/_main.py index 8ed97658a0..44c522312e 100644 --- a/src/python/turicreate/toolkits/_main.py +++ b/src/python/turicreate/toolkits/_main.py @@ -52,12 +52,12 @@ def run(toolkit_name, options, verbose=True, show_progress=False): Raises RuntimeError if the server fail executing the toolkit. """ unity = glconnect.get_unity() - if (not verbose): + if not verbose: glconnect.get_server().set_log_progress(False) (success, message, params) = unity.run_toolkit(toolkit_name, options) - if (len(message) > 0): + if len(message) > 0: logging.getLogger(__name__).error("Toolkit error: " + message) # set the verbose level back to default diff --git a/src/python/turicreate/toolkits/_model.py b/src/python/turicreate/toolkits/_model.py index 9834a1bd04..c46b7b92cd 100644 --- a/src/python/turicreate/toolkits/_model.py +++ b/src/python/turicreate/toolkits/_model.py @@ -3,11 +3,11 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" @package turicreate.toolkits Defines a basic interface for a model object. -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -17,7 +17,10 @@ from turicreate.data_structures.sframe import SFrame as _SFrame import turicreate.extensions as _extensions from turicreate.extensions import _wrap_function_return -from turicreate.toolkits._internal_utils import _toolkit_serialize_summary_struct, _read_env_var_cpp +from turicreate.toolkits._internal_utils import ( + _toolkit_serialize_summary_struct, + _read_env_var_cpp, +) from turicreate.util import _make_internal_url from turicreate.toolkits._main import ToolkitError import turicreate.util._file_util as file_util @@ -29,16 +32,17 @@ MODEL_NAME_MAP = {} # Object detector use C++ codepath -OD_USE_CPP = _read_env_var_cpp('TURI_OD_USE_CPP_PATH') +OD_USE_CPP = _read_env_var_cpp("TURI_OD_USE_CPP_PATH") # Activity Classifier use C++ codepath -AC_USE_CPP = _read_env_var_cpp('TURI_AC_USE_CPP_PATH') +AC_USE_CPP = _read_env_var_cpp("TURI_AC_USE_CPP_PATH") # Style Transfer use C++ codepath -ST_USE_CPP = _read_env_var_cpp('TURI_ST_USE_CPP_PATH') +ST_USE_CPP = _read_env_var_cpp("TURI_ST_USE_CPP_PATH") # Drawing Classifier use C++ codepath -DC_USE_CPP = _read_env_var_cpp('TURI_DC_USE_CPP_PATH') +DC_USE_CPP = _read_env_var_cpp("TURI_DC_USE_CPP_PATH") + def load_model(location): """ @@ -67,16 +71,21 @@ def load_model(location): # 2) GLUnpickler does not support http protocol = file_util.get_protocol(location) dir_archive_exists = False - if protocol == '': + if protocol == "": model_path = file_util.expand_full_path(location) - dir_archive_exists = file_util.exists(os.path.join(model_path, 'dir_archive.ini')) + dir_archive_exists = file_util.exists( + os.path.join(model_path, "dir_archive.ini") + ) else: model_path = location - if protocol in ['http', 'https', 's3']: + if protocol in ["http", "https", "s3"]: dir_archive_exists = True else: import posixpath - dir_archive_exists = file_util.exists(posixpath.join(model_path, 'dir_archive.ini')) + + dir_archive_exists = file_util.exists( + posixpath.join(model_path, "dir_archive.ini") + ) if not dir_archive_exists: raise IOError("Directory %s does not exist" % location) @@ -84,62 +93,80 @@ def load_model(location): saved_state = glconnect.get_unity().load_model(_internal_url) saved_state = _wrap_function_return(saved_state) # The archive version could be both bytes/unicode - key = u'archive_version' - archive_version = saved_state[key] if key in saved_state else saved_state[key.encode()] + key = u"archive_version" + archive_version = ( + saved_state[key] if key in saved_state else saved_state[key.encode()] + ) if archive_version < 0: raise ToolkitError("File does not appear to be a Turi Create model.") elif archive_version > 1: - raise ToolkitError("Unable to load model.\n\n" - "This model looks to have been saved with a future version of Turi Create.\n" - "Please upgrade Turi Create before attempting to load this model file.") + raise ToolkitError( + "Unable to load model.\n\n" + "This model looks to have been saved with a future version of Turi Create.\n" + "Please upgrade Turi Create before attempting to load this model file." + ) elif archive_version == 1: - name = saved_state['model_name']; + name = saved_state["model_name"] if name in MODEL_NAME_MAP: cls = MODEL_NAME_MAP[name] - if 'model' in saved_state: - if name in ['activity_classifier', 'object_detector', 'style_transfer', 'drawing_classifier']: + if "model" in saved_state: + if name in [ + "activity_classifier", + "object_detector", + "style_transfer", + "drawing_classifier", + ]: import turicreate.toolkits.libtctensorflow # this is a native model - return cls(saved_state['model']) + return cls(saved_state["model"]) else: # this is a CustomModel - model_data = saved_state['side_data'] - model_version = model_data['model_version'] - del model_data['model_version'] + model_data = saved_state["side_data"] + model_version = model_data["model_version"] + del model_data["model_version"] - if name=='activity_classifier' and AC_USE_CPP: + if name == "activity_classifier" and AC_USE_CPP: import turicreate.toolkits.libtctensorflow + model = _extensions.activity_classifier() model.import_from_custom_model(model_data, model_version) return cls(model) - if name=='object_detector' and OD_USE_CPP: + if name == "object_detector" and OD_USE_CPP: import turicreate.toolkits.libtctensorflow + model = _extensions.object_detector() model.import_from_custom_model(model_data, model_version) return cls(model) - if name=='style_transfer' and ST_USE_CPP: + if name == "style_transfer" and ST_USE_CPP: import turicreate.toolkits.libtctensorflow + model = _extensions.style_transfer() model.import_from_custom_model(model_data, model_version) return cls(model) - if name=='drawing_classifier' and DC_USE_CPP: + if name == "drawing_classifier" and DC_USE_CPP: import turicreate.toolkits.libtctensorflow + model = _extensions.drawing_classifier() model.import_from_custom_model(model_data, model_version) return cls(model) - if name=='one_shot_object_detector' and OD_USE_CPP: + if name == "one_shot_object_detector" and OD_USE_CPP: import turicreate.toolkits.libtctensorflow - od_cls = MODEL_NAME_MAP['object_detector'] - if 'detector_model' in model_data['detector']: - model_data['detector'] = od_cls(model_data['detector']['detector_model']) + + od_cls = MODEL_NAME_MAP["object_detector"] + if "detector_model" in model_data["detector"]: + model_data["detector"] = od_cls( + model_data["detector"]["detector_model"] + ) else: model = _extensions.object_detector() - model.import_from_custom_model(model_data['detector'], model_data['_detector_version']) - model_data['detector'] = od_cls(model) + model.import_from_custom_model( + model_data["detector"], model_data["_detector_version"] + ) + model_data["detector"] = od_cls(model) return cls(model_data) return cls._load_version(model_data, model_version) @@ -147,37 +174,44 @@ def load_model(location): elif hasattr(_extensions, name): return saved_state["model"] else: - raise ToolkitError("Unable to load model of name '%s'; model name not registered." % name) + raise ToolkitError( + "Unable to load model of name '%s'; model name not registered." % name + ) else: # very legacy model format. Attempt pickle loading import sys - sys.stderr.write("This model was saved in a legacy model format. Compatibility cannot be guaranteed in future versions.\n") - if _six.PY3: - raise ToolkitError("Unable to load legacy model in Python 3.\n\n" - "To migrate a model, try loading it using Turi Create 4.0 or\n" - "later in Python 2 and then re-save it. The re-saved model should\n" - "work in Python 3.") - if 'graphlab' not in sys.modules: - sys.modules['graphlab'] = sys.modules['turicreate'] + sys.stderr.write( + "This model was saved in a legacy model format. Compatibility cannot be guaranteed in future versions.\n" + ) + if _six.PY3: + raise ToolkitError( + "Unable to load legacy model in Python 3.\n\n" + "To migrate a model, try loading it using Turi Create 4.0 or\n" + "later in Python 2 and then re-save it. The re-saved model should\n" + "work in Python 3." + ) + + if "graphlab" not in sys.modules: + sys.modules["graphlab"] = sys.modules["turicreate"] # backward compatibility. Otherwise old pickles will not load - sys.modules["turicreate_util"] = sys.modules['turicreate.util'] - sys.modules["graphlab_util"] = sys.modules['turicreate.util'] + sys.modules["turicreate_util"] = sys.modules["turicreate.util"] + sys.modules["graphlab_util"] = sys.modules["turicreate.util"] # More backwards compatibility with the turicreate namespace code. for k, v in list(sys.modules.items()): - if 'turicreate' in k: - sys.modules[k.replace('turicreate', 'graphlab')] = v - #legacy loader + if "turicreate" in k: + sys.modules[k.replace("turicreate", "graphlab")] = v + # legacy loader import pickle - model_wrapper = pickle.loads(saved_state[b'model_wrapper']) - return model_wrapper(saved_state[b'model_base']) + model_wrapper = pickle.loads(saved_state[b"model_wrapper"]) + return model_wrapper(saved_state[b"model_base"]) -def _get_default_options_wrapper(unity_server_model_name, - module_name='', - python_class_name='', - sdk_model = False): + +def _get_default_options_wrapper( + unity_server_model_name, module_name="", python_class_name="", sdk_model=False +): """ Internal function to return a get_default_options function. @@ -200,7 +234,8 @@ def _get_default_options_wrapper(unity_server_model_name, get_default_options = _get_default_options_wrapper('classifier_svm', 'svm', 'SVMClassifier') """ - def get_default_options_for_model(output_type = 'sframe'): + + def get_default_options_for_model(output_type="sframe"): """ Get the default options for the toolkit :class:`~turicreate.{module_name}.{python_class_name}`. @@ -257,30 +292,36 @@ def get_default_options_for_model(output_type = 'sframe'): """ if sdk_model: response = _tc.extensions._toolkits_sdk_get_default_options( - unity_server_model_name) + unity_server_model_name + ) else: response = _tc.extensions._toolkits_get_default_options( - unity_server_model_name) + unity_server_model_name + ) - if output_type == 'json': - return response + if output_type == "json": + return response else: - json_list = [{'name': k, '': v} for k,v in response.items()] - return _SFrame(json_list).unpack('X1', column_name_prefix='')\ - .unpack('X1', column_name_prefix='') + json_list = [{"name": k, "": v} for k, v in response.items()] + return ( + _SFrame(json_list) + .unpack("X1", column_name_prefix="") + .unpack("X1", column_name_prefix="") + ) # Change the doc string before returning. - get_default_options_for_model.__doc__ = get_default_options_for_model.\ - __doc__.format(python_class_name = python_class_name, - module_name = module_name) + get_default_options_for_model.__doc__ = get_default_options_for_model.__doc__.format( + python_class_name=python_class_name, module_name=module_name + ) return get_default_options_for_model + class RegistrationMetaClass(type): def __new__(meta, name, bases, class_dict): global MODEL_NAME_MAP cls = type.__new__(meta, name, bases, class_dict) # do nothing for the base Model/CustomModel classes - if name == 'Model' or name == 'CustomModel': + if name == "Model" or name == "CustomModel": return cls native_name = cls._native_name() @@ -331,12 +372,14 @@ def update(self, d): def get_state(self): return _copy(self.state) + class ExposeAttributesFromProxy(object): """Mixin to use when a __proxy__ class attribute should be used for additional fields. This allows tab-complete (i.e., calling __dir__ on the object) to include class methods as well as the results of __proxy__.list_fields(). """ + """The UnityModel Proxy Object""" __proxy__ = None @@ -347,7 +390,7 @@ def __dir__(self): """ # Combine dir(current class), the proxy's fields, and the method # list_fields (which is hidden in __getattribute__'s implementation. - return dir(self.__class__) + list(self._list_fields()) + ['_list_fields'] + return dir(self.__class__) + list(self._list_fields()) + ["_list_fields"] def _get(self, field): """ @@ -376,14 +419,14 @@ def __getattribute__(self, attr): """ Use the internal proxy object for obtaining list_fields. """ - proxy = object.__getattribute__(self, '__proxy__') + proxy = object.__getattribute__(self, "__proxy__") # If no proxy exists, use the properties defined for the current class if proxy is None: return object.__getattribute__(self, attr) # Get the fields defined by the proxy object - if not hasattr(proxy, 'list_fields'): + if not hasattr(proxy, "list_fields"): fields = [] else: fields = proxy.list_fields() @@ -391,13 +434,14 @@ def __getattribute__(self, attr): def list_fields(): return fields - if attr == '_list_fields': + if attr == "_list_fields": return list_fields elif attr in fields: return self._get(attr) else: return object.__getattribute__(self, attr) + @_six.add_metaclass(RegistrationMetaClass) class Model(ExposeAttributesFromProxy): """ @@ -464,8 +508,10 @@ def _get(self, field): if field in self._list_fields(): return self.__proxy__.get_value(field) else: - raise KeyError(('Field \"%s\" not in model. Available fields are ' - '%s.') % (field, ', '.join(self._list_fields()))) + raise KeyError( + ('Field "%s" not in model. Available fields are ' "%s.") + % (field, ", ".join(self._list_fields())) + ) @classmethod def _native_name(cls): @@ -515,18 +561,19 @@ def summary(self, output=None): -------- >>> m.summary() """ - if output is None or output == 'stdout': + if output is None or output == "stdout": try: print(self.__repr__()) except: return self.__class__.__name__ - elif (output == 'str'): + elif output == "str": return self.__repr__() - elif output == 'dict': - return _toolkit_serialize_summary_struct(self, \ - *self._get_summary_struct() ) + elif output == "dict": + return _toolkit_serialize_summary_struct(self, *self._get_summary_struct()) else: - raise ToolkitError("Unsupported argument " + str(output) + " for \"summary\" parameter.") + raise ToolkitError( + "Unsupported argument " + str(output) + ' for "summary" parameter.' + ) def __repr__(self): raise NotImplementedError @@ -654,7 +701,9 @@ def name(self): -------- >>> model_name = m.name() """ - warnings.warn("This function is deprecated. It will be removed in the next release. Please use python's builtin type function instead.") + warnings.warn( + "This function is deprecated. It will be removed in the next release. Please use python's builtin type function instead." + ) return self.__class__.__name__ def summary(self, output=None): @@ -680,19 +729,19 @@ def summary(self, output=None): -------- >>> m.summary() """ - if output is None or output == 'stdout': + if output is None or output == "stdout": try: print(self.__repr__()) except: return self.__class__.__name__ - elif (output == 'str'): + elif output == "str": return self.__repr__() - elif output == 'dict': - return _toolkit_serialize_summary_struct(self, \ - *self._get_summary_struct() ) + elif output == "dict": + return _toolkit_serialize_summary_struct(self, *self._get_summary_struct()) else: - raise ToolkitError("Unsupported argument " + str(output) + " for \"summary\" parameter.") - + raise ToolkitError( + "Unsupported argument " + str(output) + ' for "summary" parameter.' + ) def _get_version(self): raise NotImplementedError("_get_version not implemented") @@ -724,10 +773,12 @@ def save(self, location): """ import copy + state = copy.copy(self._get_native_state()) - state['model_version'] = self._get_version() + state["model_version"] = self._get_version() return glconnect.get_unity().save_model2( - self.__class__._native_name(), _make_internal_url(location), state) + self.__class__._native_name(), _make_internal_url(location), state + ) @classmethod def _native_name(cls): diff --git a/src/python/turicreate/toolkits/_mps_utils.py b/src/python/turicreate/toolkits/_mps_utils.py index dc57dda215..e5a13b7142 100644 --- a/src/python/turicreate/toolkits/_mps_utils.py +++ b/src/python/turicreate/toolkits/_mps_utils.py @@ -21,35 +21,36 @@ class MpsGraphNetworkType(object): kSingleReLUGraphNet = 0 kSingleConvGraphNet = 1 - kSingleBNGraphNet = 2 - kSingleMPGraphNet = 3 - kODGraphNet = 4 - kSTGraphNet = 5 + kSingleBNGraphNet = 2 + kSingleMPGraphNet = 3 + kODGraphNet = 4 + kSTGraphNet = 5 class MpsGraphMode(object): - Train = 0 + Train = 0 TrainReturnGrad = 1 - Inference = 2 + Inference = 2 class MpsLowLevelNetworkType(object): - kSingleReLUNet = 0 - kSingleConvNet = 1 - kSingleBNNet = 2 - kSingleMPNet = 3 - kSingle1DConvNet = 4 - kODNet = 5 - kSingleDropOut = 6 - kSingleFcNet = 7 - kSingleSoftMaxNet = 8 - kActivityClassifierNet= 9 - kSingleLstmNet = 10 + kSingleReLUNet = 0 + kSingleConvNet = 1 + kSingleBNNet = 2 + kSingleMPNet = 3 + kSingle1DConvNet = 4 + kODNet = 5 + kSingleDropOut = 6 + kSingleFcNet = 7 + kSingleSoftMaxNet = 8 + kActivityClassifierNet = 9 + kSingleLstmNet = 10 + class MpsLowLevelMode(object): - kLowLevelModeTrain = 0 - kLowLevelModeInference = 1 - kLowLevelModeTest = 2 + kLowLevelModeTrain = 0 + kLowLevelModeInference = 1 + kLowLevelModeTest = 2 def _decode_bytes_to_native_string(s): @@ -58,6 +59,7 @@ def _decode_bytes_to_native_string(s): else: return s + def _prepare_network_parameters(arg_dict): items = [] for name, arr in arg_dict.items(): @@ -71,8 +73,10 @@ def _prepare_network_parameters(arg_dict): arr[i] = items[i][1].handle return items, name, arr + _g_TCMPS_LIB = None + def _load_tcmps_lib(): """ Load global singleton of tcmps lib handler. @@ -91,7 +95,9 @@ def _load_tcmps_lib(): # activity_classifier toolkits will use the same Python/C++ bridge as # the other toolkits, and this usage of ctypes will go away. file_dir = _os.path.dirname(__file__) - lib_path = _os.path.abspath(_os.path.join(file_dir, _os.pardir, 'libunity_shared.dylib')) + lib_path = _os.path.abspath( + _os.path.join(file_dir, _os.pardir, "libunity_shared.dylib") + ) try: _g_TCMPS_LIB = _ctypes.CDLL(lib_path, _ctypes.RTLD_LOCAL) except OSError: @@ -164,9 +170,10 @@ def _xavier_init(weight): if dim > 2: scale = _np.prod(shape[1:-1]) - c = _np.sqrt(3. / (0.5 * (n_in * scale + n_out * scale))) + c = _np.sqrt(3.0 / (0.5 * (n_in * scale + n_out * scale))) return _np.random.uniform(-c, c, shape).astype(_np.float32) + def _shape_tuple_from_ctypes(shape_ptr, dim): # size_t* shape_ptr assert isinstance(shape_ptr, _ctypes.POINTER(_ctypes.c_size_t)) @@ -176,11 +183,13 @@ def _shape_tuple_from_ctypes(shape_ptr, dim): # Wrap size_t* as size_t[dim] shape_buf = (_ctypes.c_size_t * dim.value).from_address( - _ctypes.addressof(shape_ptr.contents)) + _ctypes.addressof(shape_ptr.contents) + ) # Convert size_t[dim] to tuple return tuple(shape_buf) + def _numpy_array_from_ctypes(data_ptr, shape_ptr, dim): # float* data_ptr assert isinstance(data_ptr, _ctypes.POINTER(_ctypes.c_float)) @@ -190,11 +199,13 @@ def _numpy_array_from_ctypes(data_ptr, shape_ptr, dim): # Wrap float* to float[size] size = _np.prod(shape) data_buf = (_ctypes.c_float * size).from_address( - _ctypes.addressof(data_ptr.contents)) + _ctypes.addressof(data_ptr.contents) + ) # Convert float[size] to numpy return _np.fromiter(data_buf, _np.float32, size).reshape(shape) + class MpsFloatArray(object): """ A Python wrapper owning a C++ float_array created by the TCMPS backend. @@ -239,7 +250,8 @@ def __init__(self, x): # Those two properties must outlive the resulting self.handle. self.handle = _ctypes.c_void_p() status_code = self._LIB.TCMPSCreateFloatArray( - _ctypes.byref(self.handle), data_ptr, sz, shape_ptr, dim) + _ctypes.byref(self.handle), data_ptr, sz, shape_ptr, dim + ) assert status_code == 0, "Error calling TCMPSCreateFloatArray" def __del__(self): @@ -251,11 +263,12 @@ def shape(self): # Create C variables that will serve as out parameters for TCMPS. shape_ptr = _ctypes.POINTER(_ctypes.c_size_t)() # size_t* shape_ptr - dim = _ctypes.c_size_t() # size_t dim + dim = _ctypes.c_size_t() # size_t dim # Obtain pointer into memory owned by the C++ object self.handle. status_code = self._LIB.TCMPSGetFloatArrayShape( - self.handle, _ctypes.byref(shape_ptr), _ctypes.byref(dim)) + self.handle, _ctypes.byref(shape_ptr), _ctypes.byref(dim) + ) assert status_code == 0, "Error calling TCMPSGetFloatArrayShape" return _shape_tuple_from_ctypes(shape_ptr, dim) @@ -264,16 +277,19 @@ def asnumpy(self): """Copy the data from TCMPS into a new numpy ndarray""" # Create C variables that will serve as out parameters for TCMPS. - data_ptr = _ctypes.POINTER(_ctypes.c_float)() # float* data_ptr + data_ptr = _ctypes.POINTER(_ctypes.c_float)() # float* data_ptr shape_ptr = _ctypes.POINTER(_ctypes.c_size_t)() # size_t* shape_ptr - dim = _ctypes.c_size_t() # size_t dim + dim = _ctypes.c_size_t() # size_t dim # Obtain pointers into memory owned by the C++ object self.handle. # Note that this may trigger synchronization with another thread # producing the data. status_code = self._LIB.TCMPSReadFloatArray( - self.handle, _ctypes.byref(data_ptr), _ctypes.byref(shape_ptr), - _ctypes.byref(dim)) + self.handle, + _ctypes.byref(data_ptr), + _ctypes.byref(shape_ptr), + _ctypes.byref(dim), + ) assert status_code == 0, "Error calling TCMPSReadFloatArray" return _numpy_array_from_ctypes(data_ptr, shape_ptr, dim) @@ -293,7 +309,9 @@ class MpsFloatArrayIterator(object): def __init__(self, handle): """Wrap the output of a TCMPSExport* function.""" self._LIB = _load_tcmps_lib() - assert self._LIB is not None, "Cannot use MpsFloatArrayIterator without libtcmps.dylib" + assert ( + self._LIB is not None + ), "Cannot use MpsFloatArrayIterator without libtcmps.dylib" self.handle = handle @@ -306,15 +324,19 @@ def __iter__(self): def __next__(self): # Create C variables that will serve as out parameters for TCMPS. - name_ptr = _ctypes.c_char_p() # char* name_ptr - data_ptr = _ctypes.POINTER(_ctypes.c_float)() # float* data_ptr + name_ptr = _ctypes.c_char_p() # char* name_ptr + data_ptr = _ctypes.POINTER(_ctypes.c_float)() # float* data_ptr shape_ptr = _ctypes.POINTER(_ctypes.c_size_t)() # size_t* shape_ptr - dim = _ctypes.c_size_t() # size_t dim + dim = _ctypes.c_size_t() # size_t dim # Obtain pointers into memory owned by the C++ object self.handle. status_code = self._LIB.TCMPSNextFloatArray( - self.handle, _ctypes.byref(name_ptr), _ctypes.byref(data_ptr), - _ctypes.byref(shape_ptr), _ctypes.byref(dim)) + self.handle, + _ctypes.byref(name_ptr), + _ctypes.byref(data_ptr), + _ctypes.byref(shape_ptr), + _ctypes.byref(dim), + ) if status_code != 0: raise StopIteration @@ -331,11 +353,11 @@ def next(self): return self.__next__() -#---------------------------------------------------------- +# ---------------------------------------------------------- # # MPS Graph level API, currently used by Object detector # -#---------------------------------------------------------- +# ---------------------------------------------------------- class MpsGraphAPI(object): @@ -360,14 +382,17 @@ def init(self, n, c_in, h_in, w_in, c_out, h_out, w_out, config=None, weights=No weights = {} if config is None: config = { - 'learning_rate': 1e-3, - 'gradient_clipping': 0.025, - 'weight_decay': 0.00005, - 'momentum': 0.9, + "learning_rate": 1e-3, + "gradient_clipping": 0.025, + "weight_decay": 0.00005, + "momentum": 0.9, } - self._mode = int(config.get('mode', MpsGraphMode.TrainReturnGrad)) - self._is_train = self._mode in {MpsGraphMode.TrainReturnGrad, MpsGraphMode.Train} + self._mode = int(config.get("mode", MpsGraphMode.TrainReturnGrad)) + self._is_train = self._mode in { + MpsGraphMode.TrainReturnGrad, + MpsGraphMode.Train, + } config_items, config_name, config_arr = _prepare_network_parameters(config) weights_items, weights_name, weights_arr = _prepare_network_parameters(weights) @@ -381,8 +406,12 @@ def init(self, n, c_in, h_in, w_in, c_out, h_out, w_out, config=None, weights=No _ctypes.c_int32(c_out), _ctypes.c_int32(h_out), _ctypes.c_int32(w_out), - config_name, config_arr, _ctypes.c_int32(len(config_items)), - weights_name, weights_arr, _ctypes.c_int32(len(weights_items)), + config_name, + config_arr, + _ctypes.c_int32(len(config_items)), + weights_name, + weights_arr, + _ctypes.c_int32(len(weights_items)), ) self._cur_config = _deepcopy(config) if self._mode == MpsGraphMode.TrainReturnGrad: @@ -409,8 +438,11 @@ def train(self, input, label): label_array = MpsFloatArray(label) result_handle = _ctypes.c_void_p() status_code = self._LIB.TCMPSTrainGraph( - self.handle, input_array.handle, label_array.handle, - _ctypes.byref(result_handle)) + self.handle, + input_array.handle, + label_array.handle, + _ctypes.byref(result_handle), + ) assert status_code == 0, "Error calling TCMPSTrainGraph" assert result_handle, "TCMPSTrainGraph unexpectedly returned NULL pointer" @@ -436,7 +468,8 @@ def predict(self, input): input_array = MpsFloatArray(input) result_handle = _ctypes.c_void_p() status_code = self._LIB.TCMPSPredictGraph( - self.handle, input_array.handle, _ctypes.byref(result_handle)) + self.handle, input_array.handle, _ctypes.byref(result_handle) + ) assert status_code == 0, "Error calling TCMPSPredictGraph" assert result_handle, "TCMPSPredictGraph unexpectedly returned NULL pointer" @@ -463,11 +496,16 @@ def train_return_grad(self, input, grad): grad_array = MpsFloatArray(grad) result_handle = _ctypes.c_void_p() status_code = self._LIB.TCMPSTrainGraph( - self.handle, input_array.handle, grad_array.handle, - _ctypes.byref(result_handle)) + self.handle, + input_array.handle, + grad_array.handle, + _ctypes.byref(result_handle), + ) assert status_code == 0, "Error calling TCMPSTrainReturnGradGraph" - assert result_handle, "TCMPSTrainReturnGradGraph unexpectedly returned NULL pointer" + assert ( + result_handle + ), "TCMPSTrainReturnGradGraph unexpectedly returned NULL pointer" result = MpsFloatArray(result_handle) assert result.shape() == self._ishape @@ -481,29 +519,41 @@ def set_learning_rate(self, new_lr): def load(self, weights): self._LIB.TCMPSDeleteGraphModule(self.handle) self.handle = _ctypes.c_void_p() - self._LIB.TCMPSCreateGraphModule(_ctypes.byref(self.handle), - _ctypes.c_int(self._mode)) + self._LIB.TCMPSCreateGraphModule( + _ctypes.byref(self.handle), _ctypes.c_int(self._mode) + ) n, h_in, w_in, c_in = self._ishape _, h_out, w_out, c_out = self._oshape - self.init(n, c_in, h_in, w_in, c_out, h_out, w_out, - config=self._cur_config, weights=weights) + self.init( + n, + c_in, + h_in, + w_in, + c_out, + h_out, + w_out, + config=self._cur_config, + weights=weights, + ) # Reload state if self._cur_learning_rate: self.set_learning_rate(self._cur_learning_rate) def export(self): iter_handle = _ctypes.c_void_p() - status_code = self._LIB.TCMPSExportGraph(self.handle, - _ctypes.byref(iter_handle)) + status_code = self._LIB.TCMPSExportGraph( + self.handle, _ctypes.byref(iter_handle) + ) assert status_code == 0 return dict(MpsFloatArrayIterator(iter_handle)) -#---------------------------------------------------------- +# ---------------------------------------------------------- # # MPS Graph level API, currently used by Activity Classifier # -#---------------------------------------------------------- +# ---------------------------------------------------------- + class MpsLowLevelAPI(object): def __init__(self, network_id=MpsLowLevelNetworkType.kActivityClassifierNet): @@ -533,7 +583,9 @@ def init(self, n, c_in, h_in, w_in, c_out, h_out, w_out, updater=1, config={}): _ctypes.c_int32(h_out), _ctypes.c_int32(w_out), _ctypes.c_int32(updater), - config_name, config_arr, _ctypes.c_int32(len(config_items)), + config_name, + config_arr, + _ctypes.c_int32(len(config_items)), ) sz = n * c_out * h_out * w_out self._buf = (_ctypes.c_float * sz)() @@ -545,12 +597,13 @@ def init(self, n, c_in, h_in, w_in, c_out, h_out, w_out, updater=1, config={}): def load(self, weights): weights_items, weights_name, weights_arr = _prepare_network_parameters(weights) - self._LIB.TCMPSLoad(self.handle, weights_name, weights_arr, _ctypes.c_int32(len(weights_items))) + self._LIB.TCMPSLoad( + self.handle, weights_name, weights_arr, _ctypes.c_int32(len(weights_items)) + ) def export(self): iter_handle = _ctypes.c_void_p() - status_code = self._LIB.TCMPSExport(self.handle, - _ctypes.byref(iter_handle)) + status_code = self._LIB.TCMPSExport(self.handle, _ctypes.byref(iter_handle)) assert status_code == 0 return dict(MpsFloatArrayIterator(iter_handle)) @@ -562,7 +615,7 @@ def initalize_weights(self): self.load(args) def _loss_or_iteration_call(self, lib_method, input, labels, weights): - expected_label_shape = (self._oshape[:-1] + (1,)) + expected_label_shape = self._oshape[:-1] + (1,) assert input.shape == self._ishape assert labels.shape == expected_label_shape assert weights.shape == expected_label_shape @@ -574,8 +627,12 @@ def _loss_or_iteration_call(self, lib_method, input, labels, weights): loss_handle = _ctypes.c_void_p() status_code = lib_method( self.handle, - input_array.handle, labels_array.handle, weights_array.handle, - _ctypes.byref(output_handle), _ctypes.byref(loss_handle)) + input_array.handle, + labels_array.handle, + weights_array.handle, + _ctypes.byref(output_handle), + _ctypes.byref(loss_handle), + ) assert status_code == 0, "Error calling TCMPS" assert output_handle, "TCMPS unexpectedly returned NULL pointer" @@ -590,12 +647,14 @@ def _loss_or_iteration_call(self, lib_method, input, labels, weights): return (output, loss) def train(self, input, labels, weights): - return self._loss_or_iteration_call(self._LIB.TCMPSTrain, input, labels, - weights) + return self._loss_or_iteration_call( + self._LIB.TCMPSTrain, input, labels, weights + ) def predict_with_loss(self, input, labels, weights): - return self._loss_or_iteration_call(self._LIB.TCMPSPredict, input, - labels, weights) + return self._loss_or_iteration_call( + self._LIB.TCMPSPredict, input, labels, weights + ) def predict(self, input): assert input.shape == self._ishape @@ -603,8 +662,13 @@ def predict(self, input): input_array = MpsFloatArray(input) output_handle = _ctypes.c_void_p() status_code = self._LIB.TCMPSPredict( - self.handle, input_array.handle, None, None, - _ctypes.byref(output_handle), None) + self.handle, + input_array.handle, + None, + None, + _ctypes.byref(output_handle), + None, + ) assert status_code == 0, "Error calling TCMPSPredict" assert output_handle, "TCMPSPredict unexpectedly returned NULL pointer" @@ -614,23 +678,26 @@ def predict(self, input): return output + class MpsStyleGraphAPI(object): - def __init__(self, n, c_in, h_in, w_in, c_out, h_out, w_out, config=None, weights=None): + def __init__( + self, n, c_in, h_in, w_in, c_out, h_out, w_out, config=None, weights=None + ): self.handle = _ctypes.c_void_p() self._LIB = _load_tcmps_lib() assert self._LIB is not None, "Cannot use MpsGraphAPI without libtcmps.dylib" - + self.network_id = MpsGraphNetworkType.kSTGraphNet self._cur_config = {} if weights is None: weights = {} - + if config is None: config = { - 'learning_rate': 1e-3, - 'gradient_clipping': 0.025, - 'weight_decay': 0.00005, - 'momentum': 0.9, + "learning_rate": 1e-3, + "gradient_clipping": 0.025, + "weight_decay": 0.00005, + "momentum": 0.9, } config_items, config_name, config_arr = _prepare_network_parameters(config) @@ -646,8 +713,12 @@ def __init__(self, n, c_in, h_in, w_in, c_out, h_out, w_out, config=None, weight _ctypes.c_int32(c_out), _ctypes.c_int32(h_out), _ctypes.c_int32(w_out), - config_name, config_arr, _ctypes.c_int32(len(config_items)), - weights_name, weights_arr, _ctypes.c_int32(len(weights_items)), + config_name, + config_arr, + _ctypes.c_int32(len(config_items)), + weights_name, + weights_arr, + _ctypes.c_int32(len(weights_items)), ) self._cur_config = _deepcopy(config) @@ -659,16 +730,22 @@ def train(self, input, label, index): label_array = MpsFloatArray(label) result_handle = _ctypes.c_void_p() - + status_code = self._LIB.TCMPSTrainStyleTransferGraph( - self.handle, _ctypes.c_int32(index), input_array.handle, label_array.handle, - _ctypes.byref(result_handle)) + self.handle, + _ctypes.c_int32(index), + input_array.handle, + label_array.handle, + _ctypes.byref(result_handle), + ) assert status_code == 0, "Error calling TCMPSTrainStyleTransferGraph" - assert result_handle, "TCMPSTrainStyleTransferGraph unexpectedly returned NULL pointer" + assert ( + result_handle + ), "TCMPSTrainStyleTransferGraph unexpectedly returned NULL pointer" result = MpsFloatArray(result_handle) - + return result def predict(self, input): @@ -676,7 +753,8 @@ def predict(self, input): result_handle = _ctypes.c_void_p() status_code = self._LIB.TCMPSPredictGraph( - self.handle, input_array.handle, _ctypes.byref(result_handle)) + self.handle, input_array.handle, _ctypes.byref(result_handle) + ) assert status_code == 0, "Error calling TCMPSPredictGraph" assert result_handle, "TCMPSPredictGraph unexpectedly returned NULL pointer" @@ -684,16 +762,20 @@ def predict(self, input): result = MpsFloatArray(result_handle) return result - + def train_return_grad(self, input, grad): pass + def set_learning_rate(self, new_lr): pass + def load(self, weights): pass + def export(self): iter_handle = _ctypes.c_void_p() - status_code = self._LIB.TCMPSExportGraph(self.handle, - _ctypes.byref(iter_handle)) + status_code = self._LIB.TCMPSExportGraph( + self.handle, _ctypes.byref(iter_handle) + ) assert status_code == 0 - return dict(MpsFloatArrayIterator(iter_handle)) \ No newline at end of file + return dict(MpsFloatArrayIterator(iter_handle)) diff --git a/src/python/turicreate/toolkits/_pre_trained_models.py b/src/python/turicreate/toolkits/_pre_trained_models.py index 3ca65c7c01..6a8630b38f 100644 --- a/src/python/turicreate/toolkits/_pre_trained_models.py +++ b/src/python/turicreate/toolkits/_pre_trained_models.py @@ -13,26 +13,32 @@ import hashlib as _hashlib from six.moves.urllib import parse as _urlparse -MODELS_URL_ROOT = 'https://docs-assets.developer.apple.com/turicreate/models/' +MODELS_URL_ROOT = "https://docs-assets.developer.apple.com/turicreate/models/" + def _get_cache_dir(_type="model"): - cache_dir = _tc.config.get_runtime_config()['TURI_CACHE_FILE_LOCATIONS'] + cache_dir = _tc.config.get_runtime_config()["TURI_CACHE_FILE_LOCATIONS"] if _type == "model": - download_path = _os.path.join(cache_dir, 'model_cache') + download_path = _os.path.join(cache_dir, "model_cache") else: - download_path = _os.path.join(cache_dir, 'data_cache') + download_path = _os.path.join(cache_dir, "data_cache") if not _os.path.exists(download_path): try: _os.makedirs(download_path) except: - raise RuntimeError("Could not write to the turicreate file cache, which is currently set to \"{cache_dir}\".\n" - "To continue you must update this location to a writable path by calling:\n" - "\ttc.config.set_runtime_config(\'TURI_CACHE_FILE_LOCATIONS\', )\n" - "Where is a writable file path that exists.".format(cache_dir=cache_dir)) + raise RuntimeError( + 'Could not write to the turicreate file cache, which is currently set to "{cache_dir}".\n' + "To continue you must update this location to a writable path by calling:\n" + "\ttc.config.set_runtime_config('TURI_CACHE_FILE_LOCATIONS', )\n" + "Where is a writable file path that exists.".format( + cache_dir=cache_dir + ) + ) return download_path + def _download_and_checksum_files(urls, dirname, delete=False): def url_sha_pair(url_or_pair): if isinstance(url_or_pair, tuple): @@ -59,8 +65,8 @@ def url_sha_pair(url_or_pair): if not _os.path.exists(fn): r = _requests.get(url, stream=True) assert r.status_code == 200, "%s (%d)" % (r.reason, r.status_code) - print('Downloading', url) - with open(fn, 'wb') as f: + print("Downloading", url) + with open(fn, "wb") as f: BUFFER = 1 << 16 for i, chunk in enumerate(r.iter_content(chunk_size=BUFFER)): if chunk: @@ -68,37 +74,40 @@ def url_sha_pair(url_or_pair): if sha is not None: md5.update(chunk) if sha is not None: - assert sha == md5.hexdigest(), "mismatched checksum, please try the command again" - print('Download completed:', fn) + assert ( + sha == md5.hexdigest() + ), "mismatched checksum, please try the command again" + print("Download completed:", fn) except (KeyboardInterrupt, AssertionError, _requests.RequestException) as e: # Only print if message is available (not the case for KeyboardInterrupt) if e: - print('ERROR: Download failed:', e, file=_sys.stderr) + print("ERROR: Download failed:", e, file=_sys.stderr) for fn in fns: if _os.path.exists(fn): _os.remove(fn) _sys.exit(1) return fns + class ImageClassifierPreTrainedModel(object): @classmethod def _is_gl_pickle_safe(cls): return False def get_model_path(self, format): - assert(format in ('coreml', 'tensorflow')) + assert format in ("coreml", "tensorflow") - filename = self.name + '-TuriCreate-6.0' - if(format == 'coreml'): - filename = filename + '.mlmodel' + filename = self.name + "-TuriCreate-6.0" + if format == "coreml": + filename = filename + ".mlmodel" else: - filename = filename + '.h5' + filename = filename + ".h5" url = _urlparse.urljoin(MODELS_URL_ROOT, filename) checksum = self.source_md5[format] - model_path = _download_and_checksum_files( - [(url, checksum)], _get_cache_dir() - )[0] + model_path = _download_and_checksum_files([(url, checksum)], _get_cache_dir())[ + 0 + ] return model_path @@ -107,15 +116,15 @@ class ResNetImageClassifier(ImageClassifierPreTrainedModel): input_image_shape = (3, 224, 224) def __init__(self): - self.name = 'resnet-50' + self.name = "resnet-50" self.input_is_BGR = False - - self.coreml_data_layer = 'data' - self.coreml_feature_layer = 'flatten0' + + self.coreml_data_layer = "data" + self.coreml_feature_layer = "flatten0" self.source_md5 = { - 'coreml': '8503ef18f368b65ebaaa07ba5689b5f8', - 'tensorflow': 'ac73d2cc03700035c6cd756742bd59d6' + "coreml": "8503ef18f368b65ebaaa07ba5689b5f8", + "tensorflow": "ac73d2cc03700035c6cd756742bd59d6", } @@ -123,23 +132,24 @@ class SqueezeNetImageClassifierV1_1(ImageClassifierPreTrainedModel): input_image_shape = (3, 227, 227) def __init__(self): - self.name = 'squeezenet_v1.1' + self.name = "squeezenet_v1.1" self.input_is_BGR = True - self.coreml_data_layer = 'data' - self.coreml_feature_layer = 'flatten' + self.coreml_data_layer = "data" + self.coreml_feature_layer = "flatten" self.source_md5 = { - 'coreml': '5d8a41bb9a48f71b779a98b345de0900', - 'tensorflow': '60d5afff4c5bc535bc29655feac5571f' + "coreml": "5d8a41bb9a48f71b779a98b345de0900", + "tensorflow": "60d5afff4c5bc535bc29655feac5571f", } IMAGE_MODELS = { - 'resnet-50': ResNetImageClassifier, - 'squeezenet_v1.1': SqueezeNetImageClassifierV1_1 + "resnet-50": ResNetImageClassifier, + "squeezenet_v1.1": SqueezeNetImageClassifierV1_1, } + class ObjectDetectorBasePreTrainedModel(object): @classmethod def _is_gl_pickle_safe(cls): @@ -148,20 +158,20 @@ def _is_gl_pickle_safe(cls): class DarkNetObjectDetectorBase(ObjectDetectorBasePreTrainedModel): def __init__(self): - self.name = 'darknet' + self.name = "darknet" self.spatial_reduction = 32 - self.source_url = _urlparse.urljoin(MODELS_URL_ROOT, 'darknet.params') - self.source_md5 = '1d7eea1fd286d2cfd7f2d9c93cbbdf9d' + self.source_url = _urlparse.urljoin(MODELS_URL_ROOT, "darknet.params") + self.source_md5 = "1d7eea1fd286d2cfd7f2d9c93cbbdf9d" self.weight_names = [] for i in range(7): self.weight_names += [ - 'conv%d_weight' % i, - 'batchnorm%d_gamma' % i, - 'batchnorm%d_beta' % i, + "conv%d_weight" % i, + "batchnorm%d_gamma" % i, + "batchnorm%d_beta" % i, ] - self.model_path = _download_and_checksum_files([ - (self.source_url, self.source_md5) - ], _get_cache_dir())[0] + self.model_path = _download_and_checksum_files( + [(self.source_url, self.source_md5)], _get_cache_dir() + )[0] def available_parameters_subset(self, mx_params): """ @@ -170,110 +180,114 @@ def available_parameters_subset(self, mx_params): """ from copy import copy from collections import OrderedDict + subset_params = copy(mx_params) - subset_params._params = OrderedDict([ - (k, v) for k, v in mx_params.items() if k in self.weight_names - ]) + subset_params._params = OrderedDict( + [(k, v) for k, v in mx_params.items() if k in self.weight_names] + ) return subset_params + class DarkNetObjectDetectorModel(ObjectDetectorBasePreTrainedModel): def __init__(self): - self.name = 'darknet' - self.source_url = _urlparse.urljoin(MODELS_URL_ROOT, 'darknet.mlmodel') - self.source_md5 = 'a06761976a0472cf0553b64ecc15b0fe' + self.name = "darknet" + self.source_url = _urlparse.urljoin(MODELS_URL_ROOT, "darknet.mlmodel") + self.source_md5 = "a06761976a0472cf0553b64ecc15b0fe" def get_model_path(self): model_path = _download_and_checksum_files( [(self.source_url, self.source_md5)], _get_cache_dir() - )[0] + )[0] return model_path + OBJECT_DETECTION_BASE_MODELS = { - 'darknet': DarkNetObjectDetectorBase, - 'darknet_mlmodel': DarkNetObjectDetectorModel + "darknet": DarkNetObjectDetectorBase, + "darknet_mlmodel": DarkNetObjectDetectorModel, } -class StyleTransferTransformer(): - +class StyleTransferTransformer: def __init__(self): - self.name = 'resnet-16' - self.source_md5 = { 'mxnet' : 'ac232afa6d0ead93a8c75b6c455f6dd3', - 'coreml': 'e0f3adaa9952ecc7d96f5e4eefb0d690' } - + self.name = "resnet-16" + self.source_md5 = { + "mxnet": "ac232afa6d0ead93a8c75b6c455f6dd3", + "coreml": "e0f3adaa9952ecc7d96f5e4eefb0d690", + } def get_model_path(self, format): - assert(format in ('coreml', 'mxnet')) - if(format == 'coreml'): - filename = self.name + '.mlmodel' + assert format in ("coreml", "mxnet") + if format == "coreml": + filename = self.name + ".mlmodel" else: - filename = self.name + '.params' + filename = self.name + ".params" url = _urlparse.urljoin(MODELS_URL_ROOT, filename) checksum = self.source_md5[format] - model_path = _download_and_checksum_files([(url, checksum)], _get_cache_dir())[0] + model_path = _download_and_checksum_files([(url, checksum)], _get_cache_dir())[ + 0 + ] return model_path -class Vgg16(): - +class Vgg16: def __init__(self): - self.name = 'Vgg16-conv1_1-4_3' + self.name = "Vgg16-conv1_1-4_3" self.source_md5 = { - 'mxnet': '52e75e03160e64e5aa9cfbbc62a92345', - 'coreml':'9c9508a8256d9ca1c113ac94bc9f8c6f' + "mxnet": "52e75e03160e64e5aa9cfbbc62a92345", + "coreml": "9c9508a8256d9ca1c113ac94bc9f8c6f", } - def get_model_path(self, format): - assert(format in ('coreml', 'mxnet')) - if (format in 'coreml') : - filename = 'vgg16-conv1_1-4_3.mlmodel' + assert format in ("coreml", "mxnet") + if format in "coreml": + filename = "vgg16-conv1_1-4_3.mlmodel" else: - filename = 'vgg16-conv1_1-4_3.params' + filename = "vgg16-conv1_1-4_3.params" url = _urlparse.urljoin(MODELS_URL_ROOT, filename) checksum = self.source_md5[format] - model_path = _download_and_checksum_files([(url, checksum)], _get_cache_dir())[0] + model_path = _download_and_checksum_files([(url, checksum)], _get_cache_dir())[ + 0 + ] return model_path -STYLE_TRANSFER_BASE_MODELS = { - 'resnet-16': StyleTransferTransformer, - 'Vgg16': Vgg16 -} +STYLE_TRANSFER_BASE_MODELS = {"resnet-16": StyleTransferTransformer, "Vgg16": Vgg16} -class VGGish(): +class VGGish: def __init__(self): - self.name = 'VGGishFeatureEmbedding-v1' + self.name = "VGGishFeatureEmbedding-v1" self.source_md5 = { - 'coreml': 'e8ae7d8cbcabb988b6ed6c0bf3f45571', - 'tensorflow': '1ae04d42492703e75fa79304873c642a' + "coreml": "e8ae7d8cbcabb988b6ed6c0bf3f45571", + "tensorflow": "1ae04d42492703e75fa79304873c642a", } def get_model_path(self, format): - assert(format in ('coreml', 'tensorflow')) + assert format in ("coreml", "tensorflow") - if(format == 'coreml'): - filename = self.name + '.mlmodel' + if format == "coreml": + filename = self.name + ".mlmodel" else: - filename = self.name + '.h5' + filename = self.name + ".h5" url = _urlparse.urljoin(MODELS_URL_ROOT, filename) checksum = self.source_md5[format] - model_path = _download_and_checksum_files( - [(url, checksum)], _get_cache_dir() - )[0] + model_path = _download_and_checksum_files([(url, checksum)], _get_cache_dir())[ + 0 + ] return model_path + class DrawingClassifierPreTrainedModel(object): def __init__(self, warm_start="auto"): self.model_to_filename = { "auto": "drawing_classifier_pre_trained_model_245_classes_v0.params", - "quickdraw_245_v0": "drawing_classifier_pre_trained_model_245_classes_v0.params" + "quickdraw_245_v0": "drawing_classifier_pre_trained_model_245_classes_v0.params", } self.source_url = _urlparse.urljoin( - MODELS_URL_ROOT, self.model_to_filename[warm_start]) + MODELS_URL_ROOT, self.model_to_filename[warm_start] + ) # @TODO: Think about how to bypass the md5 checksum if the user wants to # provide their own pretrained model. self.source_md5 = "71ba78e48a852f35fb22999650f0a655" @@ -281,18 +295,20 @@ def __init__(self, warm_start="auto"): def get_model_path(self): model_path = _download_and_checksum_files( [(self.source_url, self.source_md5)], _get_cache_dir() - )[0] + )[0] return model_path + class DrawingClassifierPreTrainedMLModel(object): def __init__(self): - self.source_url = _urlparse.urljoin(MODELS_URL_ROOT, - 'drawing_classifier_pre_trained_model_245_classes_v0.mlmodel') - self.source_md5 = 'fc1c04126728514c47991a62b9e66715' + self.source_url = _urlparse.urljoin( + MODELS_URL_ROOT, + "drawing_classifier_pre_trained_model_245_classes_v0.mlmodel", + ) + self.source_md5 = "fc1c04126728514c47991a62b9e66715" def get_model_path(self): model_path = _download_and_checksum_files( [(self.source_url, self.source_md5)], _get_cache_dir() - )[0] + )[0] return model_path - diff --git a/src/python/turicreate/toolkits/_private_utils.py b/src/python/turicreate/toolkits/_private_utils.py index cf2b8f2469..e9ccfc916f 100644 --- a/src/python/turicreate/toolkits/_private_utils.py +++ b/src/python/turicreate/toolkits/_private_utils.py @@ -60,13 +60,13 @@ def _robust_column_name(base_name, column_names): i = 1 while robust_name in column_names: - robust_name = base_name + '.{}'.format(i) + robust_name = base_name + ".{}".format(i) i += 1 return robust_name -def _select_valid_features(dataset, features, valid_feature_types, - target_column=None): + +def _select_valid_features(dataset, features, valid_feature_types, target_column=None): """ Utility function for selecting columns of only valid feature types. @@ -106,7 +106,7 @@ def _select_valid_features(dataset, features, valid_feature_types, >>> valid_columns = _select_valid_features(sf, ['X1', 'X2', 'X3'], [dict, array.array]) """ if features is not None: - if not hasattr(features, '__iter__'): + if not hasattr(features, "__iter__"): raise TypeError("Input 'features' must be an iterable type.") if not all([isinstance(x, str) for x in features]): @@ -117,31 +117,43 @@ def _select_valid_features(dataset, features, valid_feature_types, features = dataset.column_names() col_type_map = { - col_name: col_type for (col_name, col_type) in - zip(dataset.column_names(), dataset.column_types())} + col_name: col_type + for (col_name, col_type) in zip(dataset.column_names(), dataset.column_types()) + } valid_features = [] for col_name in features: if col_name not in dataset.column_names(): - _logging.warning("Column '{}' is not in the input dataset.".format(col_name)) + _logging.warning( + "Column '{}' is not in the input dataset.".format(col_name) + ) elif col_name == target_column: - _logging.warning("Excluding target column " + target_column + " as a feature.") + _logging.warning( + "Excluding target column " + target_column + " as a feature." + ) elif col_type_map[col_name] not in valid_feature_types: - _logging.warning("Column '{}' is excluded as a ".format(col_name) + - "feature due to invalid column type.") + _logging.warning( + "Column '{}' is excluded as a ".format(col_name) + + "feature due to invalid column type." + ) else: valid_features.append(col_name) if len(valid_features) == 0: - raise ValueError("The dataset does not contain any valid feature columns. " + - "Accepted feature types are " + str(valid_feature_types) + ".") + raise ValueError( + "The dataset does not contain any valid feature columns. " + + "Accepted feature types are " + + str(valid_feature_types) + + "." + ) return valid_features + def _check_elements_equal(lst): """ Returns true if all of the elements in the list are equal. @@ -149,8 +161,14 @@ def _check_elements_equal(lst): assert isinstance(lst, list), "Input value must be a list." return not lst or lst.count(lst[0]) == len(lst) -def _validate_lists(sa, allowed_types=[str], require_same_type=True, - require_equal_length=False, num_to_check=10): + +def _validate_lists( + sa, + allowed_types=[str], + require_same_type=True, + require_equal_length=False, + num_to_check=10, +): """ For a list-typed SArray, check whether the first elements are lists that - contain only the provided types @@ -216,8 +234,10 @@ def _validate_lists(sa, allowed_types=[str], require_same_type=True, return True -def _summarize_accessible_fields(field_descriptions, width=40, - section_title='Accessible fields'): + +def _summarize_accessible_fields( + field_descriptions, width=40, section_title="Accessible fields" +): """ Create a summary string for the accessible fields in a model. Unlike `_toolkit_repr_print`, this function does not look up the values of the diff --git a/src/python/turicreate/toolkits/_supervised_learning.py b/src/python/turicreate/toolkits/_supervised_learning.py index 0f4d9b75a3..a9f6b5b7c5 100644 --- a/src/python/turicreate/toolkits/_supervised_learning.py +++ b/src/python/turicreate/toolkits/_supervised_learning.py @@ -18,11 +18,13 @@ from turicreate.toolkits._main import ToolkitError from turicreate._cython.cy_server import QuietProgress + class SupervisedLearningModel(Model): """ Supervised learning module to predict a target variable as a function of several feature variables. """ + def __init__(self, model_proxy=None, name=None): self.__proxy__ = model_proxy self.__name__ = name @@ -55,8 +57,9 @@ def __repr__(self): """ return self.__class__.__name__ - def predict(self, dataset, missing_value_action='auto', - output_type='', options={}, **kwargs): + def predict( + self, dataset, missing_value_action="auto", output_type="", options={}, **kwargs + ): """ Return predictions for ``dataset``, using the trained supervised_learning model. Predictions are generated as class labels (0 or @@ -96,26 +99,34 @@ def predict(self, dataset, missing_value_action='auto', out : SArray An SArray with model predictions. """ - if missing_value_action == 'auto': - missing_value_action = select_default_missing_value_policy(self, 'predict') + if missing_value_action == "auto": + missing_value_action = select_default_missing_value_policy(self, "predict") # Low latency path if isinstance(dataset, list): return self.__proxy__.fast_predict( - dataset, missing_value_action, output_type) + dataset, missing_value_action, output_type + ) if isinstance(dataset, dict): return self.__proxy__.fast_predict( - [dataset], missing_value_action, output_type) + [dataset], missing_value_action, output_type + ) # Batch predictions path else: _raise_error_if_not_sframe(dataset, "dataset") - return self.__proxy__.predict( - dataset, missing_value_action, output_type) - - def evaluate(self, dataset, metric="auto", - missing_value_action='auto', with_predictions=False, options={}, **kwargs): + return self.__proxy__.predict(dataset, missing_value_action, output_type) + + def evaluate( + self, + dataset, + metric="auto", + missing_value_action="auto", + with_predictions=False, + options={}, + **kwargs + ): """ Evaluate the model by making predictions of target values and comparing these to actual values. @@ -148,13 +159,13 @@ def evaluate(self, dataset, metric="auto", kwargs : dict additional options to be passed into prediction """ - if missing_value_action == 'auto': - missing_value_action = select_default_missing_value_policy( - self, 'evaluate') + if missing_value_action == "auto": + missing_value_action = select_default_missing_value_policy(self, "evaluate") _raise_error_if_not_sframe(dataset, "dataset") results = self.__proxy__.evaluate( - dataset, missing_value_action, metric, with_predictions=with_predictions); + dataset, missing_value_action, metric, with_predictions=with_predictions + ) return results def _training_stats(self): @@ -195,8 +206,7 @@ class Classifier(SupervisedLearningModel): def _native_name(cls): return None - - def classify(self, dataset, missing_value_action='auto'): + def classify(self, dataset, missing_value_action="auto"): """ Return predictions for ``dataset``, using the trained supervised_learning model. Predictions are generated as class labels (0 or @@ -225,8 +235,8 @@ def classify(self, dataset, missing_value_action='auto'): out : SFrame An SFrame with model predictions. """ - if (missing_value_action == 'auto'): - missing_value_action = select_default_missing_value_policy(self, 'classify') + if missing_value_action == "auto": + missing_value_action = select_default_missing_value_policy(self, "classify") # Low latency path if isinstance(dataset, list): @@ -239,13 +249,23 @@ def classify(self, dataset, missing_value_action='auto'): def print_validation_track_notification(): - print ("PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.\n" - " You can set ``validation_set=None`` to disable validation tracking.\n") - - -def create(dataset, target, model_name, features=None, - validation_set='auto', distributed='auto', - verbose=True, seed=None, **kwargs): + print( + "PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.\n" + " You can set ``validation_set=None`` to disable validation tracking.\n" + ) + + +def create( + dataset, + target, + model_name, + features=None, + validation_set="auto", + distributed="auto", + verbose=True, + seed=None, + **kwargs +): """ Create a :class:`~turicreate.toolkits.SupervisedLearningModel`, @@ -293,16 +313,15 @@ def create(dataset, target, model_name, features=None, """ # Perform error-checking and trim inputs to specified columns - dataset, validation_set = _validate_data(dataset, target, features, - validation_set) + dataset, validation_set = _validate_data(dataset, target, features, validation_set) # Sample a validation set from the training data if requested if isinstance(validation_set, str): - assert validation_set == 'auto' + assert validation_set == "auto" if dataset.num_rows() >= 100: if verbose: print_validation_track_notification() - dataset, validation_set = dataset.random_split(.95, seed=seed, exact=True) + dataset, validation_set = dataset.random_split(0.95, seed=seed, exact=True) else: validation_set = _turicreate.SFrame() elif validation_set is None: @@ -319,8 +338,9 @@ def create(dataset, target, model_name, features=None, return SupervisedLearningModel(model, model_name) -def create_classification_with_model_selector(dataset, target, model_selector, - features=None, validation_set='auto', verbose=True): +def create_classification_with_model_selector( + dataset, target, model_selector, features=None, validation_set="auto", verbose=True +): """ Create a :class:`~turicreate.toolkits.SupervisedLearningModel`, @@ -352,14 +372,13 @@ def create_classification_with_model_selector(dataset, target, model_selector, """ # Perform error-checking and trim inputs to specified columns - dataset, validation_set = _validate_data(dataset, target, features, - validation_set) + dataset, validation_set = _validate_data(dataset, target, features, validation_set) # Sample the data features_sframe = dataset if features_sframe.num_rows() > 1e5: fraction = 1.0 * 1e5 / features_sframe.num_rows() - features_sframe = features_sframe.sample(fraction, seed = 0) + features_sframe = features_sframe.sample(fraction, seed=0) # Get available models for this dataset num_classes = len(dataset[target].unique()) @@ -367,54 +386,62 @@ def create_classification_with_model_selector(dataset, target, model_selector, # Create a validation set if isinstance(validation_set, str): - if validation_set == 'auto': + if validation_set == "auto": if dataset.num_rows() >= 100: if verbose: print_validation_track_notification() - dataset, validation_set = dataset.random_split(.95, exact=True) + dataset, validation_set = dataset.random_split(0.95, exact=True) else: validation_set = None else: - raise TypeError('Unrecognized value for validation_set.') + raise TypeError("Unrecognized value for validation_set.") # Match C++ model names with user model names - python_names = {'boosted_trees_classifier': 'BoostedTreesClassifier', - 'random_forest_classifier': 'RandomForestClassifier', - 'decision_tree_classifier': 'DecisionTreeClassifier', - 'classifier_logistic_regression': 'LogisticClassifier', - 'classifier_svm': 'SVMClassifier'} + python_names = { + "boosted_trees_classifier": "BoostedTreesClassifier", + "random_forest_classifier": "RandomForestClassifier", + "decision_tree_classifier": "DecisionTreeClassifier", + "classifier_logistic_regression": "LogisticClassifier", + "classifier_svm": "SVMClassifier", + } # Print useful user-facing progress messages if verbose: - print('PROGRESS: The following methods are available for this type of problem.') - print('PROGRESS: ' + ', '.join([python_names[x] for x in selected_model_names])) + print("PROGRESS: The following methods are available for this type of problem.") + print("PROGRESS: " + ", ".join([python_names[x] for x in selected_model_names])) if len(selected_model_names) > 1: - print('PROGRESS: The returned model will be chosen according to validation accuracy.') + print( + "PROGRESS: The returned model will be chosen according to validation accuracy." + ) models = {} metrics = {} for model_name in selected_model_names: # Fit each of the available models - m = create_selected(model_name, dataset, target, features, validation_set, verbose) + m = create_selected( + model_name, dataset, target, features, validation_set, verbose + ) models[model_name] = m - if 'validation_accuracy' in m._list_fields(): + if "validation_accuracy" in m._list_fields(): metrics[model_name] = m.validation_accuracy - elif 'training_accuracy' in m._list_fields(): + elif "training_accuracy" in m._list_fields(): metrics[model_name] = m.training_accuracy # Most models have this. - elif 'progress' in m._list_fields(): + elif "progress" in m._list_fields(): prog = m.progress - validation_column = 'Validation Accuracy' - accuracy_column = 'Training Accuracy' + validation_column = "Validation Accuracy" + accuracy_column = "Training Accuracy" if validation_column in prog.column_names(): metrics[model_name] = float(prog[validation_column].tail(1)[0]) else: metrics[model_name] = float(prog[accuracy_column].tail(1)[0]) else: - raise ValueError("Model does not have metrics that can be used for model selection.") + raise ValueError( + "Model does not have metrics that can be used for model selection." + ) # Choose model based on either validation, if available. best_model = None @@ -430,66 +457,68 @@ def create_classification_with_model_selector(dataset, target, model_selector, ret = [] width = 32 if len(selected_model_names) > 1: - ret.append('PROGRESS: Model selection based on validation accuracy:') - ret.append('---------------------------------------------') - key_str = '{:<{}}: {}' + ret.append("PROGRESS: Model selection based on validation accuracy:") + ret.append("---------------------------------------------") + key_str = "{:<{}}: {}" for model_name in selected_model_names: name = python_names[model_name] row = key_str.format(name, width, str(metrics[model_name])) ret.append(row) - ret.append('---------------------------------------------') - ret.append('Selecting ' + python_names[best_model] + ' based on validation set performance.') + ret.append("---------------------------------------------") + ret.append( + "Selecting " + + python_names[best_model] + + " based on validation set performance." + ) if verbose: - print('\nPROGRESS: '.join(ret)) + print("\nPROGRESS: ".join(ret)) return models[best_model] -def create_selected(selected_model_name, dataset, target, features, - validation_set='auto', verbose=True): + +def create_selected( + selected_model_name, dataset, target, features, validation_set="auto", verbose=True +): # Create the model - model = create(dataset, + model = create( + dataset, target, selected_model_name, features=features, validation_set=validation_set, - verbose=verbose) + verbose=verbose, + ) return wrap_model_proxy(model.__proxy__) + def wrap_model_proxy(model_proxy): selected_model_name = model_proxy.__class__.__name__ # Return the model - if selected_model_name == 'boosted_trees_regression': - return _turicreate.boosted_trees_regression.BoostedTreesRegression(\ - model_proxy) - elif selected_model_name == 'random_forest_regression': - return _turicreate.random_forest_regression.RandomForestRegression(\ - model_proxy) - elif selected_model_name == 'decision_tree_regression': - return _turicreate.decision_tree_classifier.DecisionTreeRegression(\ - model_proxy) - elif selected_model_name == 'regression_linear_regression': - return _turicreate.linear_regression.LinearRegression(\ - model_proxy) - elif selected_model_name == 'boosted_trees_classifier': - return _turicreate.boosted_trees_classifier.BoostedTreesClassifier(\ - model_proxy) - elif selected_model_name == 'random_forest_classifier': - return _turicreate.random_forest_classifier.RandomForestClassifier(\ - model_proxy) - elif selected_model_name == 'decision_tree_classifier': - return _turicreate.decision_tree_classifier.DecisionTreeClassifier(\ - model_proxy) - elif selected_model_name == 'classifier_logistic_regression': - return _turicreate.logistic_classifier.LogisticClassifier(\ - model_proxy) - elif selected_model_name == 'classifier_svm': + if selected_model_name == "boosted_trees_regression": + return _turicreate.boosted_trees_regression.BoostedTreesRegression(model_proxy) + elif selected_model_name == "random_forest_regression": + return _turicreate.random_forest_regression.RandomForestRegression(model_proxy) + elif selected_model_name == "decision_tree_regression": + return _turicreate.decision_tree_classifier.DecisionTreeRegression(model_proxy) + elif selected_model_name == "regression_linear_regression": + return _turicreate.linear_regression.LinearRegression(model_proxy) + elif selected_model_name == "boosted_trees_classifier": + return _turicreate.boosted_trees_classifier.BoostedTreesClassifier(model_proxy) + elif selected_model_name == "random_forest_classifier": + return _turicreate.random_forest_classifier.RandomForestClassifier(model_proxy) + elif selected_model_name == "decision_tree_classifier": + return _turicreate.decision_tree_classifier.DecisionTreeClassifier(model_proxy) + elif selected_model_name == "classifier_logistic_regression": + return _turicreate.logistic_classifier.LogisticClassifier(model_proxy) + elif selected_model_name == "classifier_svm": return _turicreate.svm_classifier.SVMClassifier(model_proxy) else: raise ToolkitError("Internal error: Incorrect model returned.") + def select_default_missing_value_policy(model, action): from .classifier.boosted_trees_classifier import BoostedTreesClassifier from .classifier.random_forest_classifier import RandomForestClassifier @@ -498,11 +527,16 @@ def select_default_missing_value_policy(model, action): from .regression.random_forest_regression import RandomForestRegression from .regression.decision_tree_regression import DecisionTreeRegression - tree_models = [BoostedTreesClassifier, BoostedTreesRegression, - RandomForestClassifier, RandomForestRegression, - DecisionTreeClassifier, DecisionTreeRegression] - - if (any(isinstance(model, tree_model) for tree_model in tree_models)): - return 'none' + tree_models = [ + BoostedTreesClassifier, + BoostedTreesRegression, + RandomForestClassifier, + RandomForestRegression, + DecisionTreeClassifier, + DecisionTreeRegression, + ] + + if any(isinstance(model, tree_model) for tree_model in tree_models): + return "none" else: - return 'impute' + return "impute" diff --git a/src/python/turicreate/toolkits/_tf_model.py b/src/python/turicreate/toolkits/_tf_model.py index 7af380df2a..49154c50d3 100644 --- a/src/python/turicreate/toolkits/_tf_model.py +++ b/src/python/turicreate/toolkits/_tf_model.py @@ -10,13 +10,14 @@ import abc as _abc import six as _six + @_six.add_metaclass(_abc.ABCMeta) class TensorFlowModel(object): - - """ + + """ Base Class for neural networks written in tensorflow used to abstract across model architectures. It defines the computational graph and initialize a session to run the graph. - + Make placeholders for input and targets self.data = tf.placeholder() self.target = tf.placeholder() @@ -33,25 +34,25 @@ class TensorFlowModel(object): Make the graph - conv = tf.nn.conv1d(self.data, self.weights['conv'], ..) + conv = tf.nn.conv1d(self.data, self.weights['conv'], ..) dense = tf.add(tf.matmul() + self.bias()) ... Make loss_op with the loss and train_op with the optimizer - loss_op = + loss_op = train_op = - Define Session + Define Session self.sess = tf.Session() """ - @_abc.abstractmethod - def __init__(self): - raise NotImplementedError + @_abc.abstractmethod + def __init__(self): + raise NotImplementedError - """ - Train will do a forward and backward pass and update weights - This accepts a dictionary that has feature/target as key and + """ + Train will do a forward and backward pass and update weights + This accepts a dictionary that has feature/target as key and the numpy arrays as value corresponding to them respectively. It returns a dictionary of loss and output (probabilities) This matches model backend train @@ -59,19 +60,20 @@ def __init__(self): Argument : A dictionary of input and true labels Returns : A dictionary of expected output (toolkit specific) - It will train a mini batch by running the optimizer in the session + It will train a mini batch by running the optimizer in the session Running the optimizer is thepart that does back propogation self.sess.run([train_op, loss_op, ..], feed_dict= {self.data = ..., self.target= ..}) """ - def train(self, feed_dict): - raise NotImplementedError - """ + def train(self, feed_dict): + raise NotImplementedError + + """ Predict does only a forward pass and does not update any weights - This accepts a dictionary that has feature/target as key and + This accepts a dictionary that has feature/target as key and the numpy arrays as value corresponding to them respectively. - It also returns a dictionary of loss and output + It also returns a dictionary of loss and output This matches the model backend predict Argument : A dictionary of input and true labels @@ -81,29 +83,31 @@ def train(self, feed_dict): self.sess.run([loss_op, ..], feed_dict= {self.data = ..., self.target= ..}) """ - def predict(self, feed_dict): - raise NotImplementedError + def predict(self, feed_dict): + raise NotImplementedError - """ + """ Exports the network weights in CoreML format. - Returns : A dictionary of weight names as keys and + Returns : A dictionary of weight names as keys and layer_names = tf.trainable_variables() layer_weights = self.sess.run(tvars) - - This will get you the layer names from tensorflow and their corresponding - values. They need to be converted to CoreML format and stored back in a + + This will get you the layer names from tensorflow and their corresponding + values. They need to be converted to CoreML format and stored back in a dictionary with their names and values of correct shapes. """ - def export_weights(self): - raise NotImplementedError - """ + def export_weights(self): + raise NotImplementedError + + """ Sets the optimizer to learn at the specified learning rate or using a learning rate scheduler. """ - def set_learning_rate(self, learning_rate): - raise NotImplementedError + + def set_learning_rate(self, learning_rate): + raise NotImplementedError diff --git a/src/python/turicreate/toolkits/_tf_utils.py b/src/python/turicreate/toolkits/_tf_utils.py index f8a2d13f49..bc01bf5c9a 100644 --- a/src/python/turicreate/toolkits/_tf_utils.py +++ b/src/python/turicreate/toolkits/_tf_utils.py @@ -9,6 +9,7 @@ from __future__ import absolute_import as _ import numpy as np + class TensorFlowGPUPolicy(object): """ Suppresses GPU usage within TensorFlow if the Turi Create configuration has @@ -17,25 +18,28 @@ class TensorFlowGPUPolicy(object): def start(self): import turicreate as tc + self.force_cpu = tc.config.get_num_gpus() == 0 if self.force_cpu: # Setting the environment variable CUDA_VISIBLE_DEVICES appears to # be the most reliable way to suppress GPU usage. Set that, but # first save the old value so we can restore it when we're done. import os - if 'CUDA_VISIBLE_DEVICES' in os.environ: - self.cuda_visible_devices = os.environ['CUDA_VISIBLE_DEVICES'] + + if "CUDA_VISIBLE_DEVICES" in os.environ: + self.cuda_visible_devices = os.environ["CUDA_VISIBLE_DEVICES"] else: self.cuda_visible_devices = None - os.environ['CUDA_VISIBLE_DEVICES'] = '' + os.environ["CUDA_VISIBLE_DEVICES"] = "" def stop(self): if self.force_cpu: import os + if self.cuda_visible_devices is not None: - os.environ['CUDA_VISIBLE_DEVICES'] = self.cuda_visible_devices - elif 'CUDA_VISIBLE_DEVICES' in os.environ: - del os.environ['CUDA_VISIBLE_DEVICES'] + os.environ["CUDA_VISIBLE_DEVICES"] = self.cuda_visible_devices + elif "CUDA_VISIBLE_DEVICES" in os.environ: + del os.environ["CUDA_VISIBLE_DEVICES"] def __enter__(self): self.start() @@ -43,26 +47,32 @@ def __enter__(self): def __exit__(self, exception_type, exception_val, exception_traceback): self.stop() + def suppress_tensorflow_warnings(): """ Suppresses tensorflow warnings """ import os - os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1' + + os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1" import tensorflow.compat.v1 as _tf + _tf.disable_v2_behavior() _tf.logging.set_verbosity(_tf.logging.ERROR) _tf.debugging.set_log_device_placement(False) + def get_gpu_names(): """ Gets the available GPU names. """ import tensorflow as _tf - gpu_names = _tf.config.experimental.list_physical_devices('GPU') + + gpu_names = _tf.config.experimental.list_physical_devices("GPU") return [str(gpu_name) for gpu_name in gpu_names] + def convert_shared_float_array_to_numpy(array): """ The initialization from C++ implementation is mapped to SharedFloatArray @@ -81,6 +91,7 @@ def convert_shared_float_array_to_numpy(array): """ return np.array(array, copy=False, dtype=np.float32) + def convert_conv1d_coreml_to_tf(conv_weights): """ The Convolutional weights in CoreML specification converted to @@ -102,6 +113,7 @@ def convert_conv1d_coreml_to_tf(conv_weights): conv_weights = np.transpose(conv_weights, (3, 1, 0, 2)) return np.squeeze(conv_weights, axis=3) + def convert_conv2d_coreml_to_tf(conv_weights): """ @@ -122,7 +134,10 @@ def convert_conv2d_coreml_to_tf(conv_weights): conv_weights = np.transpose(conv_weights, (2, 3, 1, 0)) return conv_weights -def convert_lstm_weight_coreml_to_tf(i2h_i, i2h_c, i2h_f, i2h_o, h2h_i, h2h_c, h2h_f, h2h_o): + +def convert_lstm_weight_coreml_to_tf( + i2h_i, i2h_c, i2h_f, i2h_o, h2h_i, h2h_c, h2h_f, h2h_o +): """ The weights of four gates of LSTM - input, state, forget, output are sent separately for input to hidden layer and hidden to hidden layer. @@ -152,6 +167,7 @@ def convert_lstm_weight_coreml_to_tf(i2h_i, i2h_c, i2h_f, i2h_o, h2h_i, h2h_c, h lstm = np.concatenate((i2h, h2h), axis=1) return np.transpose(lstm) + def convert_lstm_bias_coreml_to_tf(h2h_i, h2h_c, h2h_f, h2h_o): """ The biases of four gates of LSTM - input, state, forget, output are @@ -171,9 +187,10 @@ def convert_lstm_bias_coreml_to_tf(h2h_i, h2h_c, h2h_f, h2h_o): """ - lstm_bias = np.concatenate((h2h_i, h2h_c, h2h_f, h2h_o), axis=0) + lstm_bias = np.concatenate((h2h_i, h2h_c, h2h_f, h2h_o), axis=0) return lstm_bias + def convert_dense_coreml_to_tf(dense_weights): """ The Dense layer weights from CoreML are [C_out, C_in, 1, 1] and need to be @@ -193,6 +210,7 @@ def convert_dense_coreml_to_tf(dense_weights): dense_weights = np.transpose(dense_weights, (1, 0, 2, 3)) return np.reshape(dense_weights, (dense_weights.shape[0], dense_weights.shape[1])) + def convert_conv1d_tf_to_coreml(conv_weights): """ Convolutional weights from TensorFlow in the format [kernelWidth, kernelChannels, outputChannels] @@ -214,6 +232,7 @@ def convert_conv1d_tf_to_coreml(conv_weights): conv_weights = np.transpose(conv_weights, (3, 1, 2, 0)) return conv_weights + def convert_conv2d_tf_to_coreml(conv_weights): """ Convolutional weights from TensorFlow in the format [filter_height, filter_width, input_channels, output_channels] @@ -232,6 +251,7 @@ def convert_conv2d_tf_to_coreml(conv_weights): conv_weights = np.transpose(conv_weights, (3, 2, 0, 1)) return conv_weights + def convert_lstm_weight_tf_to_coreml(lstm_weight, split): """ The weights of four gates of LSTM - input, state, forget, output are @@ -252,7 +272,7 @@ def convert_lstm_weight_tf_to_coreml(lstm_weight, split): h2h_gate : [hidden_units, hidden_units] """ - lstm_i2h , lstm_h2h = np.split(lstm_weight, [split]) + lstm_i2h, lstm_h2h = np.split(lstm_weight, [split]) i2h_i, i2h_c, i2h_f, i2h_o = np.split(lstm_i2h, 4, axis=1) h2h_i, h2h_c, h2h_f, h2h_o = np.split(lstm_h2h, 4, axis=1) i2h_i = np.transpose(i2h_i) @@ -265,6 +285,7 @@ def convert_lstm_weight_tf_to_coreml(lstm_weight, split): h2h_o = np.transpose(h2h_o) return i2h_i, i2h_c, i2h_f, i2h_o, h2h_i, h2h_c, h2h_f, h2h_o + def convert_lstm_bias_tf_to_coreml(lstm_bias): """ The biases of four gates of LSTM - input, state, forget, output are @@ -286,6 +307,7 @@ def convert_lstm_bias_tf_to_coreml(lstm_bias): h2h_i, h2h_c, h2h_f, h2h_o = np.split(lstm_bias, 4) return h2h_i, h2h_c, h2h_f, h2h_o + def convert_dense_tf_to_coreml(dense_weights): """ The Dense layer weights from TensorFlow are converted @@ -303,4 +325,6 @@ def convert_dense_tf_to_coreml(dense_weights): """ dense_weights = np.transpose(dense_weights) - return np.reshape(dense_weights, (dense_weights.shape[0], dense_weights.shape[1], 1, 1)) + return np.reshape( + dense_weights, (dense_weights.shape[0], dense_weights.shape[1], 1, 1) + ) diff --git a/src/python/turicreate/toolkits/_tree_model_mixin.py b/src/python/turicreate/toolkits/_tree_model_mixin.py index 2d5807dd81..3826d2c442 100644 --- a/src/python/turicreate/toolkits/_tree_model_mixin.py +++ b/src/python/turicreate/toolkits/_tree_model_mixin.py @@ -11,6 +11,7 @@ from turicreate.toolkits._supervised_learning import select_default_missing_value_policy from turicreate.toolkits import _coreml_utils + class TreeModelMixin(object): """ Implements common methods among tree models: @@ -63,7 +64,7 @@ def get_feature_importance(self): """ return tc.extensions._xgboost_feature_importance(self.__proxy__) - def extract_features(self, dataset, missing_value_action='auto'): + def extract_features(self, dataset, missing_value_action="auto"): """ For each example in the dataset, extract the leaf indices of each tree as features. @@ -114,14 +115,16 @@ def extract_features(self, dataset, missing_value_action='auto'): >>> data['classification_tree_features'] = model.extract_features(data) """ _raise_error_if_not_sframe(dataset, "dataset") - if missing_value_action == 'auto': - missing_value_action = select_default_missing_value_policy(self, - 'extract_features') + if missing_value_action == "auto": + missing_value_action = select_default_missing_value_policy( + self, "extract_features" + ) return self.__proxy__.extract_features(dataset, missing_value_action) - def _extract_features_with_missing(self, dataset, tree_id = 0, - missing_value_action = 'auto'): + def _extract_features_with_missing( + self, dataset, tree_id=0, missing_value_action="auto" + ): """ Extract features along with all the missing features associated with a dataset. @@ -155,40 +158,41 @@ def _extract_features_with_missing(self, dataset, tree_id = 0, # Extract the features from only one tree. sf = dataset - sf['leaf_id'] = self.extract_features(dataset, missing_value_action)\ - .vector_slice(tree_id)\ - .astype(int) + sf["leaf_id"] = ( + self.extract_features(dataset, missing_value_action) + .vector_slice(tree_id) + .astype(int) + ) tree = self._get_tree(tree_id) type_map = dict(zip(dataset.column_names(), dataset.column_types())) def get_missing_features(row): - x = row['leaf_id'] + x = row["leaf_id"] path = tree.get_prediction_path(x) - missing_id = [] # List of "missing_id" children. + missing_id = [] # List of "missing_id" children. # For each node in the prediction path. for p in path: - fname = p['feature'] - idx = p['index'] + fname = p["feature"] + idx = p["index"] f = row[fname] if type_map[fname] in [int, float]: if f is None: - missing_id.append(p['child_id']) + missing_id.append(p["child_id"]) elif type_map[fname] in [dict]: if f is None: - missing_id.append(p['child_id']) + missing_id.append(p["child_id"]) if idx not in f: - missing_id.append(p['child_id']) + missing_id.append(p["child_id"]) else: pass return missing_id - sf['missing_id'] = sf.apply(get_missing_features, list) - return sf[['leaf_id', 'missing_id']] - + sf["missing_id"] = sf.apply(get_missing_features, list) + return sf[["leaf_id", "missing_id"]] def _dump_to_text(self, with_stats): """ @@ -206,7 +210,9 @@ def _dump_to_text(self, with_stats): A table with two columns: feature, count, ordered by 'count' in descending order. """ - return tc.extensions._xgboost_dump_model(self.__proxy__, with_stats=with_stats, format='text') + return tc.extensions._xgboost_dump_model( + self.__proxy__, with_stats=with_stats, format="text" + ) def _dump_to_json(self, with_stats): """ @@ -225,7 +231,10 @@ def _dump_to_json(self, with_stats): ordered by 'count' in descending order. """ import json - trees_json_str = tc.extensions._xgboost_dump_model(self.__proxy__, with_stats=with_stats, format='json') + + trees_json_str = tc.extensions._xgboost_dump_model( + self.__proxy__, with_stats=with_stats, format="json" + ) trees_json = [json.loads(x) for x in trees_json_str] # To avoid lose precision when using libjson, _dump_model with json format encode @@ -234,20 +243,25 @@ def _dump_to_json(self, with_stats): # in little endian import struct import sys + def hexadecimal_to_float(s): if sys.version_info[0] >= 3: - return struct.unpack('>> model.export_coreml("MyModel.mlmodel") """ - short_description = _coreml_utils._mlmodel_short_description('Activity classifier') + short_description = _coreml_utils._mlmodel_short_description( + "Activity classifier" + ) additional_user_defined_metadata = _coreml_utils._get_tc_version_info() - self.__proxy__.export_to_coreml(filename, short_description, - additional_user_defined_metadata) - + self.__proxy__.export_to_coreml( + filename, short_description, additional_user_defined_metadata + ) - def predict(self, dataset, output_type='class', output_frequency='per_row'): + def predict(self, dataset, output_type="class", output_frequency="per_row"): """ Return predictions for ``dataset``, using the trained activity classifier. Predictions can be generated as class labels, or as a probability @@ -346,14 +364,14 @@ class as a vector. The probability of the first class (sorted +---------------+------------+-----+ """ _tkutl._check_categorical_option_type( - 'output_frequency', output_frequency, ['per_window', 'per_row']) - if output_frequency == 'per_row': + "output_frequency", output_frequency, ["per_window", "per_row"] + ) + if output_frequency == "per_row": return self.__proxy__.predict(dataset, output_type) - elif output_frequency == 'per_window': + elif output_frequency == "per_window": return self.__proxy__.predict_per_window(dataset, output_type) - - def evaluate(self, dataset, metric='auto'): + def evaluate(self, dataset, metric="auto"): """ Evaluate the model by making predictions of target values and comparing these to actual values. @@ -400,7 +418,9 @@ def evaluate(self, dataset, metric='auto'): """ return self.__proxy__.evaluate(dataset, metric) - def predict_topk(self, dataset, output_type='probability', k=3, output_frequency='per_row'): + def predict_topk( + self, dataset, output_type="probability", k=3, output_frequency="per_row" + ): """ Return top-k predictions for the ``dataset``, using the trained model. Predictions are returned as an SFrame with three columns: `prediction_id`, @@ -460,11 +480,11 @@ def predict_topk(self, dataset, output_type='probability', k=3, output_frequency +---------------+-------+-------------------+ """ if not isinstance(k, int): - raise TypeError('k must be of type int') - _tkutl._numeric_param_check_range('k', k, 1, _six.MAXSIZE) - return self.__proxy__.predict_topk(dataset, output_type, k, output_frequency); + raise TypeError("k must be of type int") + _tkutl._numeric_param_check_range("k", k, 1, _six.MAXSIZE) + return self.__proxy__.predict_topk(dataset, output_type, k, output_frequency) - def classify(self, dataset, output_frequency='per_row'): + def classify(self, dataset, output_frequency="per_row"): """ Return a classification, for each ``prediction_window`` examples in the ``dataset``, using the trained activity classification model. The output @@ -498,7 +518,7 @@ def classify(self, dataset, output_frequency='per_row'): ---------- >>> classes = model.classify(data) """ - return self.__proxy__.classify(dataset, output_frequency); + return self.__proxy__.classify(dataset, output_frequency) def _get_summary_struct(self): """ @@ -518,16 +538,16 @@ def _get_summary_struct(self): The order matches that of the 'sections' object. """ model_fields = [ - ('Number of examples', 'num_examples'), - ('Number of sessions', 'num_sessions'), - ('Number of classes', 'num_classes'), - ('Number of feature columns', 'num_features'), - ('Prediction window', 'prediction_window'), + ("Number of examples", "num_examples"), + ("Number of sessions", "num_sessions"), + ("Number of classes", "num_classes"), + ("Number of feature columns", "num_features"), + ("Prediction window", "prediction_window"), ] training_fields = [ - ('Log-likelihood', 'training_log_loss'), - ('Training time (sec)', 'training_time'), + ("Log-likelihood", "training_log_loss"), + ("Training time (sec)", "training_time"), ] - section_titles = ['Schema', 'Training summary'] - return([model_fields, training_fields], section_titles) + section_titles = ["Schema", "Training summary"] + return ([model_fields, training_fields], section_titles) diff --git a/src/python/turicreate/toolkits/activity_classifier/_mps_model_architecture.py b/src/python/turicreate/toolkits/activity_classifier/_mps_model_architecture.py index b425ca4eca..c65e60cb68 100644 --- a/src/python/turicreate/toolkits/activity_classifier/_mps_model_architecture.py +++ b/src/python/turicreate/toolkits/activity_classifier/_mps_model_architecture.py @@ -8,15 +8,22 @@ from __future__ import absolute_import as _ import numpy as _np -from .._mps_utils import (MpsLowLevelAPI as _MpsLowLevelAPI, - MpsLowLevelNetworkType as _MpsLowLevelNetworkType, - MpsLowLevelMode as _MpsLowLevelMode) +from .._mps_utils import ( + MpsLowLevelAPI as _MpsLowLevelAPI, + MpsLowLevelNetworkType as _MpsLowLevelNetworkType, + MpsLowLevelMode as _MpsLowLevelMode, +) -def _define_model_mps(batch_size, num_features, num_classes, pred_win, seq_len, is_prediction_model): - config = {"mode" : _MpsLowLevelMode.kLowLevelModeTrain, - "ac_pred_window" : pred_win, - "ac_seq_len" : seq_len} +def _define_model_mps( + batch_size, num_features, num_classes, pred_win, seq_len, is_prediction_model +): + + config = { + "mode": _MpsLowLevelMode.kLowLevelModeTrain, + "ac_pred_window": pred_win, + "ac_seq_len": seq_len, + } input_width = pred_win * seq_len @@ -24,12 +31,24 @@ def _define_model_mps(batch_size, num_features, num_classes, pred_win, seq_len, config["mode"] = _MpsLowLevelMode.kLowLevelModeInference model = _MpsLowLevelAPI(network_id=_MpsLowLevelNetworkType.kActivityClassifierNet) - model.init(batch_size, num_features, 1, input_width, num_classes, 1, seq_len, updater=2, config=config) + model.init( + batch_size, + num_features, + 1, + input_width, + num_classes, + 1, + seq_len, + updater=2, + config=config, + ) return model -def _calc_batch_metrics(output, labels, weights, actual_seq_len, actual_batch_len, loss_per_sec): +def _calc_batch_metrics( + output, labels, weights, actual_seq_len, actual_batch_len, loss_per_sec +): prediction = _np.argmax(output, axis=-1) prediction = _np.expand_dims(prediction, 2) accuracy = (prediction == labels) * (weights > 0) @@ -39,6 +58,7 @@ def _calc_batch_metrics(output, labels, weights, actual_seq_len, actual_batch_le return batch_loss, batch_accuracy, acc_per_seq + def _fit_model_mps(model, data_iter, valid_iter, max_iterations, verbose): from time import time as _time @@ -46,25 +66,24 @@ def _fit_model_mps(model, data_iter, valid_iter, max_iterations, verbose): if verbose: # Print progress table header - column_names = ['Iteration', 'Train Accuracy', 'Train Loss'] + column_names = ["Iteration", "Train Accuracy", "Train Loss"] if valid_iter: - column_names += ['Validation Accuracy', 'Validation Loss'] - column_names.append('Elapsed Time') + column_names += ["Validation Accuracy", "Validation Loss"] + column_names.append("Elapsed Time") num_columns = len(column_names) column_width = max(map(lambda x: len(x), column_names)) + 2 - hr = '+' + '+'.join(['-' * column_width] * num_columns) + '+' + hr = "+" + "+".join(["-" * column_width] * num_columns) + "+" print(hr) - print(('| {:<{width}}' * num_columns + '|').format(*column_names, width=column_width-1)) + print( + ("| {:<{width}}" * num_columns + "|").format( + *column_names, width=column_width - 1 + ) + ) print(hr) begin = _time() for iteration in range(max_iterations): - log = { - 'train_loss': 0., - 'train_acc': 0., - 'valid_loss': 0., - 'valid_acc': 0. - } + log = {"train_loss": 0.0, "train_acc": 0.0, "valid_loss": 0.0, "valid_acc": 0.0} # Training iteration data_iter.reset() @@ -77,33 +96,56 @@ def start_batch(batch, batch_idx, is_train): weights = batch.weights actual_seq_len = _np.sum(weights, axis=1) actual_batch_len = _np.sum((actual_seq_len > 0)) - if (is_train and actual_batch_len > 0): + if is_train and actual_batch_len > 0: weights /= actual_batch_len # MPS model requires 4-dimensional NHWC input model_fn = model.train if is_train else model.predict_with_loss - (fwd_out, loss_out) = model_fn(_np.expand_dims(input_data, 1), - _np.expand_dims(labels, 1), - _np.expand_dims(weights, 1)) - - return {'labels' : labels, 'weights' : weights, 'actual_seq_len' : actual_seq_len, 'actual_batch_len' : actual_batch_len, 'fwd_out' : fwd_out, 'loss_out' : loss_out} + (fwd_out, loss_out) = model_fn( + _np.expand_dims(input_data, 1), + _np.expand_dims(labels, 1), + _np.expand_dims(weights, 1), + ) + + return { + "labels": labels, + "weights": weights, + "actual_seq_len": actual_seq_len, + "actual_batch_len": actual_batch_len, + "fwd_out": fwd_out, + "loss_out": loss_out, + } # Encapsulates the work for processing a response from the model. - def finish_batch(batch_idx, is_train, labels, weights, actual_seq_len, actual_batch_len, fwd_out, loss_out): + def finish_batch( + batch_idx, + is_train, + labels, + weights, + actual_seq_len, + actual_batch_len, + fwd_out, + loss_out, + ): # MPS yields 4-dimensional NHWC output. Collapse the H dimension, # which should have size 1. forward_output = _np.squeeze(fwd_out.asnumpy(), axis=1) loss_per_sequence = _np.squeeze(loss_out.asnumpy(), axis=1) batch_loss, batch_accuracy, acc_per_sequence = _calc_batch_metrics( - forward_output, labels, weights, actual_seq_len, - actual_batch_len, loss_per_sequence) + forward_output, + labels, + weights, + actual_seq_len, + actual_batch_len, + loss_per_sequence, + ) if is_train: - log['train_loss'] += batch_loss / train_batches - log['train_acc'] += batch_accuracy / train_batches + log["train_loss"] += batch_loss / train_batches + log["train_acc"] += batch_accuracy / train_batches else: - log['valid_loss'] += _np.sum(loss_per_sequence) / valid_num_seq_in_epoch - log['valid_acc'] += _np.sum(acc_per_sequence) / valid_num_seq_in_epoch + log["valid_loss"] += _np.sum(loss_per_sequence) / valid_num_seq_in_epoch + log["valid_acc"] += _np.sum(acc_per_sequence) / valid_num_seq_in_epoch # Perform the following sequence of calls, effectively double buffering: # start_batch(1) @@ -120,9 +162,12 @@ def perform_batches(data_iter, is_train=True): prev_batch_info = None last_batch_info = None for batch in data_iter: - (prev_batch_info, last_batch_info) = (last_batch_info, start_batch(batch, batch_count, is_train)) + (prev_batch_info, last_batch_info) = ( + last_batch_info, + start_batch(batch, batch_count, is_train), + ) if batch_count > 0: - finish_batch(batch_count - 1, is_train, **prev_batch_info) + finish_batch(batch_count - 1, is_train, **prev_batch_info) batch_count += 1 if batch_count > 0: finish_batch(batch_count - 1, is_train, **last_batch_info) @@ -139,25 +184,39 @@ def perform_batches(data_iter, is_train=True): elapsed_time = _time() - begin if valid_iter is None: # print progress row without validation info - print("| {cur_iter:<{width}}| {train_acc:<{width}.3f}| {train_loss:<{width}.3f}| {time:<{width}.1f}|".format( - cur_iter = iteration + 1, train_acc = log['train_acc'], train_loss = log['train_loss'], - time = elapsed_time, width = column_width-1)) + print( + "| {cur_iter:<{width}}| {train_acc:<{width}.3f}| {train_loss:<{width}.3f}| {time:<{width}.1f}|".format( + cur_iter=iteration + 1, + train_acc=log["train_acc"], + train_loss=log["train_loss"], + time=elapsed_time, + width=column_width - 1, + ) + ) else: # print progress row with validation info - print("| {cur_iter:<{width}}| {train_acc:<{width}.3f}| {train_loss:<{width}.3f}" - "| {valid_acc:<{width}.3f}| {valid_loss:<{width}.3f}| {time:<{width}.1f}| ".format( - cur_iter = iteration + 1, train_acc = log['train_acc'], train_loss = log['train_loss'], - valid_acc = log['valid_acc'], valid_loss = log['valid_loss'], time = elapsed_time, - width = column_width-1)) + print( + "| {cur_iter:<{width}}| {train_acc:<{width}.3f}| {train_loss:<{width}.3f}" + "| {valid_acc:<{width}.3f}| {valid_loss:<{width}.3f}| {time:<{width}.1f}| ".format( + cur_iter=iteration + 1, + train_acc=log["train_acc"], + train_loss=log["train_loss"], + valid_acc=log["valid_acc"], + valid_loss=log["valid_loss"], + time=elapsed_time, + width=column_width - 1, + ) + ) if verbose: print(hr) - print('Training complete') + print("Training complete") end = _time() - print('Total Time Spent: %gs' % (end - begin)) + print("Total Time Spent: %gs" % (end - begin)) return log + def _predict_mps(pred_model, data_iter): data_iter.reset() @@ -172,7 +231,7 @@ def _predict_mps(pred_model, data_iter): raw_output = pred_model.predict(_np.expand_dims(input_data, 1)) output = _np.squeeze(raw_output.asnumpy(), axis=1) - trimmed_output = output[0:output.shape[0]-pad].copy() + trimmed_output = output[0 : output.shape[0] - pad].copy() output_list.append(trimmed_output) prediction_output = _np.concatenate([output for output in output_list]) diff --git a/src/python/turicreate/toolkits/activity_classifier/_tf_model_architecture.py b/src/python/turicreate/toolkits/activity_classifier/_tf_model_architecture.py index 1e91dbc1bb..4194300f50 100644 --- a/src/python/turicreate/toolkits/activity_classifier/_tf_model_architecture.py +++ b/src/python/turicreate/toolkits/activity_classifier/_tf_model_architecture.py @@ -9,6 +9,7 @@ import turicreate.toolkits._tf_utils as _utils import tensorflow.compat.v1 as _tf + _tf.disable_v2_behavior() from .._tf_model import TensorFlowModel @@ -19,16 +20,25 @@ LSTM_H = 200 DENSE_H = 128 -class ActivityTensorFlowModel(TensorFlowModel): - def __init__(self, net_params, batch_size, num_features, num_classes, prediction_window, seq_len): +class ActivityTensorFlowModel(TensorFlowModel): + def __init__( + self, + net_params, + batch_size, + num_features, + num_classes, + prediction_window, + seq_len, + ): self.gpu_policy = _utils.TensorFlowGPUPolicy() self.gpu_policy.start() for key in net_params.keys(): - net_params[key] = _utils.convert_shared_float_array_to_numpy(net_params[key]) - + net_params[key] = _utils.convert_shared_float_array_to_numpy( + net_params[key] + ) self.ac_graph = _tf.Graph() self.num_classes = num_classes @@ -36,12 +46,18 @@ def __init__(self, net_params, batch_size, num_features, num_classes, prediction self.seq_len = seq_len self.sess = _tf.Session(graph=self.ac_graph) with self.ac_graph.as_default(): - self.init_activity_classifier_graph(net_params, num_features, prediction_window) + self.init_activity_classifier_graph( + net_params, num_features, prediction_window + ) - def init_activity_classifier_graph(self, net_params, num_features, prediction_window): + def init_activity_classifier_graph( + self, net_params, num_features, prediction_window + ): # Vars - self.data = _tf.placeholder(_tf.float32, [None, prediction_window*self.seq_len, num_features]) + self.data = _tf.placeholder( + _tf.float32, [None, prediction_window * self.seq_len, num_features] + ) self.weight = _tf.placeholder(_tf.float32, [None, self.seq_len, 1]) self.target = _tf.placeholder(_tf.int32, [None, self.seq_len, 1]) self.is_training = _tf.placeholder(_tf.bool) @@ -55,56 +71,99 @@ def init_activity_classifier_graph(self, net_params, num_features, prediction_wi # Weights self.weights = { - 'conv_weight' : _tf.Variable(_tf.zeros([prediction_window, num_features, CONV_H]), name='conv_weight'), - 'dense0_weight': _tf.Variable(_tf.zeros([LSTM_H, DENSE_H]), name='dense0_weight'), - 'dense1_weight' : _tf.Variable(_tf.zeros([DENSE_H, self.num_classes]), name='dense1_weight') + "conv_weight": _tf.Variable( + _tf.zeros([prediction_window, num_features, CONV_H]), name="conv_weight" + ), + "dense0_weight": _tf.Variable( + _tf.zeros([LSTM_H, DENSE_H]), name="dense0_weight" + ), + "dense1_weight": _tf.Variable( + _tf.zeros([DENSE_H, self.num_classes]), name="dense1_weight" + ), } # Biases self.biases = { - 'conv_bias' : _tf.Variable(_tf.zeros([CONV_H]), name='conv_bias'), - 'dense0_bias': _tf.Variable(_tf.zeros([DENSE_H]), name='dense0_bias'), - 'dense1_bias' : _tf.Variable(_tf.zeros([self.num_classes]), name='dense1_bias') + "conv_bias": _tf.Variable(_tf.zeros([CONV_H]), name="conv_bias"), + "dense0_bias": _tf.Variable(_tf.zeros([DENSE_H]), name="dense0_bias"), + "dense1_bias": _tf.Variable( + _tf.zeros([self.num_classes]), name="dense1_bias" + ), } # Convolution - conv = _tf.nn.conv1d(self.data, self.weights['conv_weight'], stride=prediction_window, padding='SAME') - conv = _tf.nn.bias_add(conv, self.biases['conv_bias']) + conv = _tf.nn.conv1d( + self.data, + self.weights["conv_weight"], + stride=prediction_window, + padding="SAME", + ) + conv = _tf.nn.bias_add(conv, self.biases["conv_bias"]) conv = _tf.nn.relu(conv) dropout = _tf.layers.dropout(conv, rate=0.2, training=self.is_training) # Long Stem Term Memory lstm = self.load_lstm_weights_params(net_params) - cells = _tf.nn.rnn_cell.LSTMCell(num_units=LSTM_H, reuse=_tf.AUTO_REUSE, forget_bias=0.0, - initializer=_tf.initializers.constant(lstm, verify_shape=True)) + cells = _tf.nn.rnn_cell.LSTMCell( + num_units=LSTM_H, + reuse=_tf.AUTO_REUSE, + forget_bias=0.0, + initializer=_tf.initializers.constant(lstm, verify_shape=True), + ) init_state = cells.zero_state(self.batch_size, _tf.float32) - rnn_outputs, final_state = _tf.nn.dynamic_rnn(cells, dropout, initial_state=init_state) + rnn_outputs, final_state = _tf.nn.dynamic_rnn( + cells, dropout, initial_state=init_state + ) # Dense dense = _tf.reshape(rnn_outputs, (-1, LSTM_H)) - dense = _tf.add(_tf.matmul(dense, self.weights['dense0_weight']), self.biases['dense0_bias']) - dense = _tf.layers.batch_normalization(inputs=dense, - beta_initializer=_tf.initializers.constant(net_params['bn_beta'], verify_shape=True), - gamma_initializer=_tf.initializers.constant(net_params['bn_gamma'], verify_shape=True), - moving_mean_initializer=_tf.initializers.constant(net_params['bn_running_mean'], verify_shape=True), - moving_variance_initializer=_tf.initializers.constant(net_params['bn_running_var'], verify_shape=True), training=self.is_training ) + dense = _tf.add( + _tf.matmul(dense, self.weights["dense0_weight"]), self.biases["dense0_bias"] + ) + dense = _tf.layers.batch_normalization( + inputs=dense, + beta_initializer=_tf.initializers.constant( + net_params["bn_beta"], verify_shape=True + ), + gamma_initializer=_tf.initializers.constant( + net_params["bn_gamma"], verify_shape=True + ), + moving_mean_initializer=_tf.initializers.constant( + net_params["bn_running_mean"], verify_shape=True + ), + moving_variance_initializer=_tf.initializers.constant( + net_params["bn_running_var"], verify_shape=True + ), + training=self.is_training, + ) dense = _tf.nn.relu(dense) dense = _tf.layers.dropout(dense, rate=0.5, training=self.is_training) # Output - out = _tf.add(_tf.matmul(dense, self.weights['dense1_weight']), self.biases['dense1_bias']) + out = _tf.add( + _tf.matmul(dense, self.weights["dense1_weight"]), self.biases["dense1_bias"] + ) out = _tf.reshape(out, (-1, self.seq_len, self.num_classes)) self.probs = _tf.nn.softmax(out) # Weights seq_sum_weights = _tf.reduce_sum(reshaped_weight, axis=1) - binary_seq_sum_weights = _tf.reduce_sum(_tf.cast(seq_sum_weights > 0, dtype=_tf.float32)) + binary_seq_sum_weights = _tf.reduce_sum( + _tf.cast(seq_sum_weights > 0, dtype=_tf.float32) + ) # Loss - loss = _tf.losses.softmax_cross_entropy(logits=out, onehot_labels=one_hot_target, weights=reshaped_weight, reduction=_tf.losses.Reduction.NONE) + loss = _tf.losses.softmax_cross_entropy( + logits=out, + onehot_labels=one_hot_target, + weights=reshaped_weight, + reduction=_tf.losses.Reduction.NONE, + ) self.loss_per_seq = _tf.reduce_sum(loss, axis=1) / (seq_sum_weights + 1e-5) - self.loss_op = _tf.reduce_sum(self.loss_per_seq) / (binary_seq_sum_weights + 1e-5) + self.loss_op = _tf.reduce_sum(self.loss_per_seq) / ( + binary_seq_sum_weights + 1e-5 + ) # Optimizer update_ops = _tf.get_collection(_tf.GraphKeys.UPDATE_OPS) @@ -136,18 +195,19 @@ def load_lstm_weights_params(self, net_params): lstm: lstm weights in Tensorflow Format """ - i2h_i = net_params['lstm_i2h_i_weight'] - i2h_f = net_params['lstm_i2h_f_weight'] - i2h_c = net_params['lstm_i2h_c_weight'] - i2h_o = net_params['lstm_i2h_o_weight'] - h2h_i = net_params['lstm_h2h_i_weight'] - h2h_f = net_params['lstm_h2h_f_weight'] - h2h_c = net_params['lstm_h2h_c_weight'] - h2h_o = net_params['lstm_h2h_o_weight'] - lstm = _utils.convert_lstm_weight_coreml_to_tf(i2h_i, i2h_c, i2h_f, i2h_o, h2h_i, h2h_c, h2h_f, h2h_o) + i2h_i = net_params["lstm_i2h_i_weight"] + i2h_f = net_params["lstm_i2h_f_weight"] + i2h_c = net_params["lstm_i2h_c_weight"] + i2h_o = net_params["lstm_i2h_o_weight"] + h2h_i = net_params["lstm_h2h_i_weight"] + h2h_f = net_params["lstm_h2h_f_weight"] + h2h_c = net_params["lstm_h2h_c_weight"] + h2h_o = net_params["lstm_h2h_o_weight"] + lstm = _utils.convert_lstm_weight_coreml_to_tf( + i2h_i, i2h_c, i2h_f, i2h_o, h2h_i, h2h_c, h2h_f, h2h_o + ) return lstm - def load_weights(self, net_params): """ Function to load weights from the C++ implementation into TensorFlow @@ -160,21 +220,45 @@ def load_weights(self, net_params): """ for key in net_params.keys(): if key in self.weights.keys(): - if key.startswith('conv'): - net_params[key] = _utils.convert_conv1d_coreml_to_tf(net_params[key]) - self.sess.run(_tf.assign(_tf.get_default_graph().get_tensor_by_name(key+":0"), net_params[key])) - elif key.startswith('dense'): + if key.startswith("conv"): + net_params[key] = _utils.convert_conv1d_coreml_to_tf( + net_params[key] + ) + self.sess.run( + _tf.assign( + _tf.get_default_graph().get_tensor_by_name(key + ":0"), + net_params[key], + ) + ) + elif key.startswith("dense"): net_params[key] = _utils.convert_dense_coreml_to_tf(net_params[key]) - self.sess.run(_tf.assign(_tf.get_default_graph().get_tensor_by_name(key+":0"), net_params[key] )) + self.sess.run( + _tf.assign( + _tf.get_default_graph().get_tensor_by_name(key + ":0"), + net_params[key], + ) + ) elif key in self.biases.keys(): - self.sess.run(_tf.assign(_tf.get_default_graph().get_tensor_by_name(key+":0"), net_params[key])) - - h2h_i_bias = net_params['lstm_h2h_i_bias'] - h2h_c_bias = net_params['lstm_h2h_c_bias'] - h2h_f_bias = net_params['lstm_h2h_f_bias'] - h2h_o_bias = net_params['lstm_h2h_o_bias'] - lstm_bias = _utils.convert_lstm_bias_coreml_to_tf(h2h_i_bias, h2h_c_bias, h2h_f_bias, h2h_o_bias) - self.sess.run(_tf.assign(_tf.get_default_graph().get_tensor_by_name('rnn/lstm_cell/bias:0'), lstm_bias)) + self.sess.run( + _tf.assign( + _tf.get_default_graph().get_tensor_by_name(key + ":0"), + net_params[key], + ) + ) + + h2h_i_bias = net_params["lstm_h2h_i_bias"] + h2h_c_bias = net_params["lstm_h2h_c_bias"] + h2h_f_bias = net_params["lstm_h2h_f_bias"] + h2h_o_bias = net_params["lstm_h2h_o_bias"] + lstm_bias = _utils.convert_lstm_bias_coreml_to_tf( + h2h_i_bias, h2h_c_bias, h2h_f_bias, h2h_o_bias + ) + self.sess.run( + _tf.assign( + _tf.get_default_graph().get_tensor_by_name("rnn/lstm_cell/bias:0"), + lstm_bias, + ) + ) def train(self, feed_dict): """ @@ -194,14 +278,30 @@ def train(self, feed_dict): for key in feed_dict.keys(): feed_dict[key] = _utils.convert_shared_float_array_to_numpy(feed_dict[key]) feed_dict[key] = _np.squeeze(feed_dict[key], axis=1) - feed_dict[key] = _np.reshape(feed_dict[key], (feed_dict[key].shape[0], feed_dict[key].shape[1], feed_dict[key].shape[2])) - - _, loss, probs = self.sess.run([self.train_op, self.loss_per_seq, self.probs], - feed_dict={self.data : feed_dict['input'], self.target : feed_dict['labels'], self.weight : feed_dict['weights'], self.is_training : True}) + feed_dict[key] = _np.reshape( + feed_dict[key], + ( + feed_dict[key].shape[0], + feed_dict[key].shape[1], + feed_dict[key].shape[2], + ), + ) + + _, loss, probs = self.sess.run( + [self.train_op, self.loss_per_seq, self.probs], + feed_dict={ + self.data: feed_dict["input"], + self.target: feed_dict["labels"], + self.weight: feed_dict["weights"], + self.is_training: True, + }, + ) prob = _np.array(probs) - probabilities = _np.reshape(prob, (prob.shape[0], prob.shape[1]*prob.shape[2])) - result = {'loss' : _np.array(loss), 'output': probabilities } + probabilities = _np.reshape( + prob, (prob.shape[0], prob.shape[1] * prob.shape[2]) + ) + result = {"loss": _np.array(loss), "output": probabilities} return result def predict(self, feed_dict): @@ -225,23 +325,42 @@ def predict(self, feed_dict): for key in feed_dict.keys(): feed_dict[key] = _utils.convert_shared_float_array_to_numpy(feed_dict[key]) feed_dict[key] = _np.squeeze(feed_dict[key], axis=1) - feed_dict[key] = _np.reshape(feed_dict[key], (feed_dict[key].shape[0], feed_dict[key].shape[1], feed_dict[key].shape[2])) + feed_dict[key] = _np.reshape( + feed_dict[key], + ( + feed_dict[key].shape[0], + feed_dict[key].shape[1], + feed_dict[key].shape[2], + ), + ) if len(feed_dict.keys()) == 1: - probs = self.sess.run(self.probs, - feed_dict={self.data : feed_dict['input'], self.is_training: False}) + probs = self.sess.run( + self.probs, + feed_dict={self.data: feed_dict["input"], self.is_training: False}, + ) prob = _np.array(probs) - probabilities = _np.reshape(prob, (prob.shape[0], prob.shape[1]*prob.shape[2])) - result = { 'output' : probabilities} + probabilities = _np.reshape( + prob, (prob.shape[0], prob.shape[1] * prob.shape[2]) + ) + result = {"output": probabilities} else: - loss, probs= self.sess.run([self.loss_per_seq, self.probs], - feed_dict={self.data : feed_dict['input'], self.target : feed_dict['labels'], self.weight : feed_dict['weights'], self.is_training: False}) + loss, probs = self.sess.run( + [self.loss_per_seq, self.probs], + feed_dict={ + self.data: feed_dict["input"], + self.target: feed_dict["labels"], + self.weight: feed_dict["weights"], + self.is_training: False, + }, + ) prob = _np.array(probs) - probabilities = _np.reshape(prob, (prob.shape[0], prob.shape[1]*prob.shape[2])) - result = {'loss' : _np.array(loss), 'output': probabilities } + probabilities = _np.reshape( + prob, (prob.shape[0], prob.shape[1] * prob.shape[2]) + ) + result = {"loss": _np.array(loss), "output": probabilities} return result - def export_weights(self): """ Function to store TensorFlow weights back to into a dict in CoreML format to be used @@ -258,45 +377,64 @@ def export_weights(self): tvars_vals = self.sess.run(tvars) for var, val in zip(tvars, tvars_vals): - if 'weight' in var.name: - if var.name.startswith('conv'): - - tf_export_params[var.name.split(':')[0]] = _utils.convert_conv1d_tf_to_coreml(val) - elif var.name.startswith('dense'): - tf_export_params[var.name.split(':')[0]] = _utils.convert_dense_tf_to_coreml(val) - elif var.name.startswith('rnn/lstm_cell/kernel'): - i2h_i, i2h_c, i2h_f, i2h_o, h2h_i, h2h_c, h2h_f, h2h_o = _utils.convert_lstm_weight_tf_to_coreml(val, CONV_H) - tf_export_params['lstm_i2h_i_weight'] = i2h_i - tf_export_params['lstm_i2h_c_weight'] = i2h_c - tf_export_params['lstm_i2h_f_weight'] = i2h_f - tf_export_params['lstm_i2h_o_weight'] = i2h_o - tf_export_params['lstm_h2h_i_weight'] = h2h_i - tf_export_params['lstm_h2h_c_weight'] = h2h_c - tf_export_params['lstm_h2h_f_weight'] = h2h_f - tf_export_params['lstm_h2h_o_weight'] = h2h_o - elif var.name.startswith('rnn/lstm_cell/bias'): - h2h_i_bias, h2h_c_bias, h2h_f_bias, h2h_o_bias = _utils.convert_lstm_bias_tf_to_coreml(val) - tf_export_params['lstm_h2h_i_bias'] = h2h_i_bias - tf_export_params['lstm_h2h_c_bias'] = h2h_c_bias - tf_export_params['lstm_h2h_f_bias'] = h2h_f_bias - tf_export_params['lstm_h2h_o_bias'] = h2h_o_bias - elif var.name.startswith('batch_normalization'): - tf_export_params['bn_'+var.name.split('/')[-1][0:-2]] = _np.array(val) + if "weight" in var.name: + if var.name.startswith("conv"): + + tf_export_params[ + var.name.split(":")[0] + ] = _utils.convert_conv1d_tf_to_coreml(val) + elif var.name.startswith("dense"): + tf_export_params[ + var.name.split(":")[0] + ] = _utils.convert_dense_tf_to_coreml(val) + elif var.name.startswith("rnn/lstm_cell/kernel"): + ( + i2h_i, + i2h_c, + i2h_f, + i2h_o, + h2h_i, + h2h_c, + h2h_f, + h2h_o, + ) = _utils.convert_lstm_weight_tf_to_coreml(val, CONV_H) + tf_export_params["lstm_i2h_i_weight"] = i2h_i + tf_export_params["lstm_i2h_c_weight"] = i2h_c + tf_export_params["lstm_i2h_f_weight"] = i2h_f + tf_export_params["lstm_i2h_o_weight"] = i2h_o + tf_export_params["lstm_h2h_i_weight"] = h2h_i + tf_export_params["lstm_h2h_c_weight"] = h2h_c + tf_export_params["lstm_h2h_f_weight"] = h2h_f + tf_export_params["lstm_h2h_o_weight"] = h2h_o + elif var.name.startswith("rnn/lstm_cell/bias"): + ( + h2h_i_bias, + h2h_c_bias, + h2h_f_bias, + h2h_o_bias, + ) = _utils.convert_lstm_bias_tf_to_coreml(val) + tf_export_params["lstm_h2h_i_bias"] = h2h_i_bias + tf_export_params["lstm_h2h_c_bias"] = h2h_c_bias + tf_export_params["lstm_h2h_f_bias"] = h2h_f_bias + tf_export_params["lstm_h2h_o_bias"] = h2h_o_bias + elif var.name.startswith("batch_normalization"): + tf_export_params["bn_" + var.name.split("/")[-1][0:-2]] = _np.array(val) else: - tf_export_params[var.name.split(':')[0]] = _np.array(val) + tf_export_params[var.name.split(":")[0]] = _np.array(val) tvars = _tf.global_variables() tvars_vals = self.sess.run(tvars) for var, val in zip(tvars, tvars_vals): - if 'moving_mean' in var.name: - tf_export_params['bn_running_mean'] = _np.array(val) - if 'moving_variance' in var.name: - tf_export_params['bn_running_var'] = _np.array(val) + if "moving_mean" in var.name: + tf_export_params["bn_running_mean"] = _np.array(val) + if "moving_variance" in var.name: + tf_export_params["bn_running_var"] = _np.array(val) for layer_name in tf_export_params.keys(): - tf_export_params[layer_name] = _np.ascontiguousarray(tf_export_params[layer_name]) + tf_export_params[layer_name] = _np.ascontiguousarray( + tf_export_params[layer_name] + ) return tf_export_params - def set_learning_rate(self, lr): """ Set the learning rate diff --git a/src/python/turicreate/toolkits/activity_classifier/util.py b/src/python/turicreate/toolkits/activity_classifier/util.py index 4aa9ab001e..cf89f76087 100644 --- a/src/python/turicreate/toolkits/activity_classifier/util.py +++ b/src/python/turicreate/toolkits/activity_classifier/util.py @@ -12,11 +12,13 @@ from turicreate.toolkits._internal_utils import _numeric_param_check_range import sys as _sys + if _sys.version_info.major > 2: long = int _MIN_NUM_SESSIONS_FOR_SPLIT = 100 + def random_split_by_session(dataset, session_id, fraction=0.9, seed=None): """ Randomly split an SFrame into two SFrames based on the `session_id` such @@ -63,26 +65,28 @@ def random_split_by_session(dataset, session_id, fraction=0.9, seed=None): """ from random import Random - _raise_error_if_not_of_type(dataset, _SFrame, 'dataset') - _raise_error_if_not_of_type(session_id, str, 'session_id') - _raise_error_if_not_of_type(fraction, float, 'fraction') - _raise_error_if_not_of_type(seed, [int, type(None)], 'seed') - _numeric_param_check_range('fraction', fraction, 0, 1) + _raise_error_if_not_of_type(dataset, _SFrame, "dataset") + _raise_error_if_not_of_type(session_id, str, "session_id") + _raise_error_if_not_of_type(fraction, float, "fraction") + _raise_error_if_not_of_type(seed, [int, type(None)], "seed") + _numeric_param_check_range("fraction", fraction, 0, 1) if session_id not in dataset.column_names(): raise _ToolkitError( - 'Input "dataset" must contain a column called %s.' % session_id) + 'Input "dataset" must contain a column called %s.' % session_id + ) if seed is None: # Include the nanosecond component as well. import time + seed = abs(hash("%0.20f" % time.time())) % (2 ** 31) # The cython bindings require this to be an int, so cast if we can. try: seed = int(seed) except ValueError: - raise ValueError('The \'seed\' parameter must be of type int.') + raise ValueError("The 'seed' parameter must be of type int.") random = Random() diff --git a/src/python/turicreate/toolkits/audio_analysis/__init__.py b/src/python/turicreate/toolkits/audio_analysis/__init__.py index 12cb29daec..b4b316a2d3 100644 --- a/src/python/turicreate/toolkits/audio_analysis/__init__.py +++ b/src/python/turicreate/toolkits/audio_analysis/__init__.py @@ -8,6 +8,6 @@ from __future__ import division as _ from __future__ import absolute_import as _ -__all__ = ['audio_analysis'] +__all__ = ["audio_analysis"] from . import audio_analysis diff --git a/src/python/turicreate/toolkits/audio_analysis/audio_analysis.py b/src/python/turicreate/toolkits/audio_analysis/audio_analysis.py index 8c49345345..2d14230139 100644 --- a/src/python/turicreate/toolkits/audio_analysis/audio_analysis.py +++ b/src/python/turicreate/toolkits/audio_analysis/audio_analysis.py @@ -18,7 +18,9 @@ from turicreate.toolkits._main import ToolkitError as _ToolkitError -def load_audio(path, with_path=True, recursive=True, ignore_failure=True, random_order=False): +def load_audio( + path, with_path=True, recursive=True, ignore_failure=True, random_order=False +): """ Loads WAV file(s) from a path. @@ -62,20 +64,22 @@ def load_audio(path, with_path=True, recursive=True, ignore_failure=True, random all_wav_files = [] - if _fnmatch(path, '*.wav'): # single file + if _fnmatch(path, "*.wav"): # single file all_wav_files.append(path) elif recursive: for (dir_path, _, file_names) in _os.walk(path): for cur_file in file_names: - if _fnmatch(cur_file, '*.wav'): - all_wav_files.append(dir_path + '/' + cur_file) + if _fnmatch(cur_file, "*.wav"): + all_wav_files.append(dir_path + "/" + cur_file) else: - all_wav_files = _glob(path + '/*.wav') + all_wav_files = _glob(path + "/*.wav") if random_order: _shuffle(all_wav_files) - result_builder = _tc.SFrameBuilder(column_types=[dict, str], column_names=['audio', 'path']) + result_builder = _tc.SFrameBuilder( + column_types=[dict, str], column_names=["audio", "path"] + ) for cur_file_path in all_wav_files: try: sample_rate, data = _wavfile.read(cur_file_path) @@ -87,9 +91,11 @@ def load_audio(path, with_path=True, recursive=True, ignore_failure=True, random print(error_string) continue - result_builder.append([{'sample_rate': sample_rate, 'data': data}, cur_file_path]) + result_builder.append( + [{"sample_rate": sample_rate, "data": data}, cur_file_path] + ) result = result_builder.close() if not with_path: - del result['path'] + del result["path"] return result diff --git a/src/python/turicreate/toolkits/classifier/_classifier.py b/src/python/turicreate/toolkits/classifier/_classifier.py index 5a4642aead..ab8fba1770 100644 --- a/src/python/turicreate/toolkits/classifier/_classifier.py +++ b/src/python/turicreate/toolkits/classifier/_classifier.py @@ -9,8 +9,8 @@ import turicreate as _turicreate from turicreate.toolkits import _supervised_learning as _sl -def create(dataset, target, features=None, validation_set = 'auto', - verbose=True): + +def create(dataset, target, features=None, validation_set="auto", verbose=True): """ Automatically create a suitable classifier model based on the provided training data. @@ -100,7 +100,8 @@ class and 1 being the reference class. Use `model.classes` to return _sl.create_classification_with_model_selector( dataset, target, - model_selector = _turicreate.extensions._supervised_learning._classifier_available_models, - features = features, - validation_set = validation_set, - verbose = verbose) + model_selector=_turicreate.extensions._supervised_learning._classifier_available_models, + features=features, + validation_set=validation_set, + verbose=verbose, + ) diff --git a/src/python/turicreate/toolkits/classifier/boosted_trees_classifier.py b/src/python/turicreate/toolkits/classifier/boosted_trees_classifier.py index 9afb4090c2..073db6de30 100644 --- a/src/python/turicreate/toolkits/classifier/boosted_trees_classifier.py +++ b/src/python/turicreate/toolkits/classifier/boosted_trees_classifier.py @@ -23,13 +23,14 @@ from turicreate.util import _make_internal_url -__doc_string_context = ''' +__doc_string_context = """ >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv' >>> data = turicreate.SFrame.read_csv(url) >>> train, test = data.random_split(0.8) >>> model = turicreate.boosted_trees_classifier.create(train, target='label') -''' +""" + class BoostedTreesClassifier(_Classifier, _TreeModelMixin): """ @@ -59,6 +60,7 @@ class BoostedTreesClassifier(_Classifier, _TreeModelMixin): create """ + def __init__(self, proxy): """__init__(self)""" self.__proxy__ = proxy @@ -150,7 +152,7 @@ def _get(self, field): """ return super(_Classifier, self)._get(field) - def evaluate(self, dataset, metric='auto', missing_value_action='auto'): + def evaluate(self, dataset, metric="auto", missing_value_action="auto"): """ Evaluate the model by making predictions of target values and comparing these to actual values. @@ -210,14 +212,25 @@ def evaluate(self, dataset, metric='auto', missing_value_action='auto'): >>> results = model.evaluate(test_data, metric='confusion_matrix') """ - _raise_error_evaluation_metric_is_valid(metric, - ['auto', 'accuracy', 'confusion_matrix', 'roc_curve', 'auc', - 'log_loss', 'precision', 'recall', 'f1_score']) - return super(_Classifier, self).evaluate(dataset, - missing_value_action=missing_value_action, - metric=metric) - - def predict(self, dataset, output_type='class', missing_value_action='auto'): + _raise_error_evaluation_metric_is_valid( + metric, + [ + "auto", + "accuracy", + "confusion_matrix", + "roc_curve", + "auc", + "log_loss", + "precision", + "recall", + "f1_score", + ], + ) + return super(_Classifier, self).evaluate( + dataset, missing_value_action=missing_value_action, metric=metric + ) + + def predict(self, dataset, output_type="class", missing_value_action="auto"): """ A flexible and advanced prediction API. @@ -274,13 +287,18 @@ class as a vector. The probability of the first class (sorted >>> m.predict(testdata, output_type='probability') >>> m.predict(testdata, output_type='margin') """ - _check_categorical_option_type('output_type', output_type, - ['class', 'margin', 'probability', 'probability_vector']) - return super(_Classifier, self).predict(dataset, - output_type=output_type, - missing_value_action=missing_value_action) - - def predict_topk(self, dataset, output_type="probability", k=3, missing_value_action='auto'): + _check_categorical_option_type( + "output_type", + output_type, + ["class", "margin", "probability", "probability_vector"], + ) + return super(_Classifier, self).predict( + dataset, output_type=output_type, missing_value_action=missing_value_action + ) + + def predict_topk( + self, dataset, output_type="probability", k=3, missing_value_action="auto" + ): """ Return top-k predictions for the ``dataset``, using the trained model. Predictions are returned as an SFrame with three columns: `id`, @@ -346,23 +364,30 @@ def predict_topk(self, dataset, output_type="probability", k=3, missing_value_ac +--------+-------+-------------------+ [35688 rows x 3 columns] """ - _check_categorical_option_type('output_type', output_type, ['rank', 'margin', 'probability']) - if missing_value_action == 'auto': - missing_value_action = _sl.select_default_missing_value_policy(self, 'predict') + _check_categorical_option_type( + "output_type", output_type, ["rank", "margin", "probability"] + ) + if missing_value_action == "auto": + missing_value_action = _sl.select_default_missing_value_policy( + self, "predict" + ) # Low latency path if isinstance(dataset, list): return self.__proxy__.fast_predict_topk( - dataset, missing_value_action, output_type, k) + dataset, missing_value_action, output_type, k + ) if isinstance(dataset, dict): return self.__proxy__.fast_predict_topk( - [dataset], missing_value_action, output_type, k) + [dataset], missing_value_action, output_type, k + ) # Fast path _raise_error_if_not_sframe(dataset, "dataset") return self.__proxy__.predict_topk( - dataset, missing_value_action, output_type, k) + dataset, missing_value_action, output_type, k + ) - def classify(self, dataset, missing_value_action='auto'): + def classify(self, dataset, missing_value_action="auto"): """ Return a classification, for each example in the ``dataset``, using the trained boosted trees model. The output SFrame contains predictions @@ -409,9 +434,9 @@ def classify(self, dataset, missing_value_action='auto'): >>> classes = model.classify(data) """ - return super(BoostedTreesClassifier, self).classify(dataset, - missing_value_action=missing_value_action) - + return super(BoostedTreesClassifier, self).classify( + dataset, missing_value_action=missing_value_action + ) def export_coreml(self, filename): """ @@ -427,26 +452,36 @@ def export_coreml(self, filename): >>> model.export_coreml("MyModel.mlmodel") """ from turicreate.toolkits import _coreml_utils + display_name = "boosted trees classifier" short_description = _coreml_utils._mlmodel_short_description(display_name) - context = {"mode" : "classification", - "model_type" : "boosted_trees", - "class": self.__class__.__name__, - "short_description": short_description, - } + context = { + "mode": "classification", + "model_type": "boosted_trees", + "class": self.__class__.__name__, + "short_description": short_description, + } self._export_coreml_impl(filename, context) -def create(dataset, target, - features=None, max_iterations=10, - validation_set='auto', - class_weights = None, - max_depth=6, step_size=0.3, - min_loss_reduction=0.0, min_child_weight=0.1, - row_subsample=1.0, column_subsample=1.0, - verbose=True, - random_seed = None, - metric='auto', - **kwargs): + +def create( + dataset, + target, + features=None, + max_iterations=10, + validation_set="auto", + class_weights=None, + max_depth=6, + step_size=0.3, + min_loss_reduction=0.0, + min_child_weight=0.1, + row_subsample=1.0, + column_subsample=1.0, + verbose=True, + random_seed=None, + metric="auto", + **kwargs +): """ Create a (binary or multi-class) classifier model of type :class:`~turicreate.boosted_trees_classifier.BoostedTreesClassifier` using @@ -602,26 +637,32 @@ def create(dataset, target, >>> results = model.evaluate(test) """ if random_seed is not None: - kwargs['random_seed'] = random_seed - if 'model_checkpoint_path' in kwargs: - kwargs['model_checkpoint_path'] = _make_internal_url(kwargs['model_checkpoint_path']) - if 'resume_from_checkpoint' in kwargs: - kwargs['resume_from_checkpoint'] = _make_internal_url(kwargs['resume_from_checkpoint']) - - model = _sl.create(dataset = dataset, - target = target, - features = features, - model_name = 'boosted_trees_classifier', - max_iterations = max_iterations, - validation_set = validation_set, - class_weights = class_weights, - max_depth = max_depth, - step_size = step_size, - min_loss_reduction = min_loss_reduction, - min_child_weight = min_child_weight, - row_subsample = row_subsample, - column_subsample = column_subsample, - verbose = verbose, - metric = metric, - **kwargs) + kwargs["random_seed"] = random_seed + if "model_checkpoint_path" in kwargs: + kwargs["model_checkpoint_path"] = _make_internal_url( + kwargs["model_checkpoint_path"] + ) + if "resume_from_checkpoint" in kwargs: + kwargs["resume_from_checkpoint"] = _make_internal_url( + kwargs["resume_from_checkpoint"] + ) + + model = _sl.create( + dataset=dataset, + target=target, + features=features, + model_name="boosted_trees_classifier", + max_iterations=max_iterations, + validation_set=validation_set, + class_weights=class_weights, + max_depth=max_depth, + step_size=step_size, + min_loss_reduction=min_loss_reduction, + min_child_weight=min_child_weight, + row_subsample=row_subsample, + column_subsample=column_subsample, + verbose=verbose, + metric=metric, + **kwargs + ) return BoostedTreesClassifier(model.__proxy__) diff --git a/src/python/turicreate/toolkits/classifier/decision_tree_classifier.py b/src/python/turicreate/toolkits/classifier/decision_tree_classifier.py index ba30f76e14..9872fed59d 100644 --- a/src/python/turicreate/toolkits/classifier/decision_tree_classifier.py +++ b/src/python/turicreate/toolkits/classifier/decision_tree_classifier.py @@ -19,20 +19,34 @@ from turicreate.toolkits._tree_model_mixin import TreeModelMixin as _TreeModelMixin -_DECISION_TREE_MODEL_PARAMS_KEYS = ['max_depth', 'min_child_weight', -'min_loss_reduction'] -_DECISION_TREE_TRAINING_PARAMS_KEYS = ['objective', 'training_time', -'training_error', 'validation_error', 'evaluation_metric'] -_DECISION_TREE_TRAINING_DATA_PARAMS_KEYS = ['target', 'features', -'num_features', 'num_examples', 'num_validation_examples'] - -__doc_string_context = ''' +_DECISION_TREE_MODEL_PARAMS_KEYS = [ + "max_depth", + "min_child_weight", + "min_loss_reduction", +] +_DECISION_TREE_TRAINING_PARAMS_KEYS = [ + "objective", + "training_time", + "training_error", + "validation_error", + "evaluation_metric", +] +_DECISION_TREE_TRAINING_DATA_PARAMS_KEYS = [ + "target", + "features", + "num_features", + "num_examples", + "num_validation_examples", +] + +__doc_string_context = """ >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv' >>> data = turicreate.SFrame.read_csv(url) >>> train, test = data.random_split(0.8) >>> model = turicreate.decision_tree_classifier.create(train, target='label') -''' +""" + class DecisionTreeClassifier(_Classifier, _TreeModelMixin): """ @@ -56,6 +70,7 @@ class DecisionTreeClassifier(_Classifier, _TreeModelMixin): create """ + def __init__(self, proxy): """__init__(self)""" self.__proxy__ = proxy @@ -138,7 +153,7 @@ def _get(self, field): """ return super(_Classifier, self)._get(field) - def evaluate(self, dataset, metric='auto', missing_value_action='auto'): + def evaluate(self, dataset, metric="auto", missing_value_action="auto"): """ Evaluate the model by making predictions of target values and comparing these to actual values. @@ -198,14 +213,25 @@ def evaluate(self, dataset, metric='auto', missing_value_action='auto'): >>> results = model.evaluate(test_data, metric='confusion_matrix') """ - _raise_error_evaluation_metric_is_valid(metric, - ['auto', 'accuracy', 'confusion_matrix', 'roc_curve', 'auc', - 'log_loss', 'precision', 'recall', 'f1_score']) - return super(_Classifier, self).evaluate(dataset, - missing_value_action=missing_value_action, - metric=metric) - - def predict(self, dataset, output_type='class', missing_value_action='auto'): + _raise_error_evaluation_metric_is_valid( + metric, + [ + "auto", + "accuracy", + "confusion_matrix", + "roc_curve", + "auc", + "log_loss", + "precision", + "recall", + "f1_score", + ], + ) + return super(_Classifier, self).evaluate( + dataset, missing_value_action=missing_value_action, metric=metric + ) + + def predict(self, dataset, output_type="class", missing_value_action="auto"): """ A flexible and advanced prediction API. @@ -262,13 +288,18 @@ class as a vector. The probability of the first class (sorted >>> m.predict(testdata, output_type='probability') >>> m.predict(testdata, output_type='margin') """ - _check_categorical_option_type('output_type', output_type, - ['class', 'margin', 'probability', 'probability_vector']) - return super(_Classifier, self).predict(dataset, - output_type=output_type, - missing_value_action=missing_value_action) - - def predict_topk(self, dataset, output_type="probability", k=3, missing_value_action='auto'): + _check_categorical_option_type( + "output_type", + output_type, + ["class", "margin", "probability", "probability_vector"], + ) + return super(_Classifier, self).predict( + dataset, output_type=output_type, missing_value_action=missing_value_action + ) + + def predict_topk( + self, dataset, output_type="probability", k=3, missing_value_action="auto" + ): """ Return top-k predictions for the ``dataset``, using the trained model. Predictions are returned as an SFrame with three columns: `id`, @@ -334,23 +365,30 @@ def predict_topk(self, dataset, output_type="probability", k=3, missing_value_ac +--------+-------+-------------------+ [35688 rows x 3 columns] """ - _check_categorical_option_type('output_type', output_type, ['rank', 'margin', 'probability']) - if missing_value_action == 'auto': - missing_value_action = _sl.select_default_missing_value_policy(self, 'predict') + _check_categorical_option_type( + "output_type", output_type, ["rank", "margin", "probability"] + ) + if missing_value_action == "auto": + missing_value_action = _sl.select_default_missing_value_policy( + self, "predict" + ) # Low latency path if isinstance(dataset, list): return self.__proxy__.fast_predict_topk( - dataset, missing_value_action, output_type, k) + dataset, missing_value_action, output_type, k + ) if isinstance(dataset, dict): return self.__proxy__.fast_predict_topk( - [dataset], missing_value_action, output_type, k) + [dataset], missing_value_action, output_type, k + ) # Fast path _raise_error_if_not_sframe(dataset, "dataset") return self.__proxy__.predict_topk( - dataset, missing_value_action, output_type, k) + dataset, missing_value_action, output_type, k + ) - def classify(self, dataset, missing_value_action='auto'): + def classify(self, dataset, missing_value_action="auto"): """ Return a classification, for each example in the ``dataset``, using the trained model. The output SFrame contains predictions as class labels @@ -397,9 +435,9 @@ def classify(self, dataset, missing_value_action='auto'): >>> classes = model.classify(data) """ - return super(DecisionTreeClassifier, self).classify(dataset, - missing_value_action=missing_value_action) - + return super(DecisionTreeClassifier, self).classify( + dataset, missing_value_action=missing_value_action + ) def export_coreml(self, filename): """ @@ -415,27 +453,33 @@ def export_coreml(self, filename): >>> model.export_coreml("MyModel.mlmodel") """ from turicreate.toolkits import _coreml_utils + display_name = "decision tree classifier" short_description = _coreml_utils._mlmodel_short_description(display_name) - context = {"mode" : "classification", - "model_type" : "decision_tree", - "class": self.__class__.__name__, - "short_description": short_description, - } + context = { + "mode": "classification", + "model_type": "decision_tree", + "class": self.__class__.__name__, + "short_description": short_description, + } self._export_coreml_impl(filename, context) -def create(dataset, target, - features=None, - validation_set='auto', - class_weights=None, - max_depth=6, - min_loss_reduction=0.0, - min_child_weight=0.1, - verbose=True, - random_seed=None, - metric='auto', - **kwargs): + +def create( + dataset, + target, + features=None, + validation_set="auto", + class_weights=None, + max_depth=6, + min_loss_reduction=0.0, + min_child_weight=0.1, + verbose=True, + random_seed=None, + metric="auto", + **kwargs +): """ Create a (binary or multi-class) classifier model of type :class:`~turicreate.decision_tree_classifier.DecisionTreeClassifier`. This @@ -545,18 +589,20 @@ def create(dataset, target, >>> results = model.evaluate(test) """ if random_seed is not None: - kwargs['random_seed'] = random_seed - - model = _sl.create(dataset = dataset, - target = target, - features = features, - model_name = 'decision_tree_classifier', - validation_set = validation_set, - class_weights = class_weights, - max_depth = max_depth, - min_loss_reduction = min_loss_reduction, - min_child_weight = min_child_weight, - verbose = verbose, - metric = metric, - **kwargs) + kwargs["random_seed"] = random_seed + + model = _sl.create( + dataset=dataset, + target=target, + features=features, + model_name="decision_tree_classifier", + validation_set=validation_set, + class_weights=class_weights, + max_depth=max_depth, + min_loss_reduction=min_loss_reduction, + min_child_weight=min_child_weight, + verbose=verbose, + metric=metric, + **kwargs + ) return DecisionTreeClassifier(model.__proxy__) diff --git a/src/python/turicreate/toolkits/classifier/logistic_classifier.py b/src/python/turicreate/toolkits/classifier/logistic_classifier.py index 1b6c288cd7..2e6110a407 100644 --- a/src/python/turicreate/toolkits/classifier/logistic_classifier.py +++ b/src/python/turicreate/toolkits/classifier/logistic_classifier.py @@ -12,30 +12,40 @@ import turicreate.toolkits._supervised_learning as _sl from turicreate.toolkits._supervised_learning import Classifier as _Classifier -from turicreate.toolkits._internal_utils import _toolkit_repr_print, \ - _toolkit_get_topk_bottomk, \ - _raise_error_if_not_sframe, \ - _check_categorical_option_type, \ - _raise_error_evaluation_metric_is_valid, \ - _summarize_coefficients +from turicreate.toolkits._internal_utils import ( + _toolkit_repr_print, + _toolkit_get_topk_bottomk, + _raise_error_if_not_sframe, + _check_categorical_option_type, + _raise_error_evaluation_metric_is_valid, + _summarize_coefficients, +) _DEFAULT_SOLVER_OPTIONS = { -'convergence_threshold': 1e-2, -'step_size': 1.0, -'lbfgs_memory_level': 11, -'max_iterations': 10} - -def create(dataset, target, features=None, - l2_penalty=0.01, l1_penalty=0.0, - solver='auto', feature_rescaling=True, - convergence_threshold = _DEFAULT_SOLVER_OPTIONS['convergence_threshold'], - step_size = _DEFAULT_SOLVER_OPTIONS['step_size'], - lbfgs_memory_level = _DEFAULT_SOLVER_OPTIONS['lbfgs_memory_level'], - max_iterations = _DEFAULT_SOLVER_OPTIONS['max_iterations'], - class_weights = None, - validation_set = 'auto', + "convergence_threshold": 1e-2, + "step_size": 1.0, + "lbfgs_memory_level": 11, + "max_iterations": 10, +} + + +def create( + dataset, + target, + features=None, + l2_penalty=0.01, + l1_penalty=0.0, + solver="auto", + feature_rescaling=True, + convergence_threshold=_DEFAULT_SOLVER_OPTIONS["convergence_threshold"], + step_size=_DEFAULT_SOLVER_OPTIONS["step_size"], + lbfgs_memory_level=_DEFAULT_SOLVER_OPTIONS["lbfgs_memory_level"], + max_iterations=_DEFAULT_SOLVER_OPTIONS["max_iterations"], + class_weights=None, + validation_set="auto", verbose=True, - seed=None): + seed=None, +): """ Create a :class:`~turicreate.logistic_classifier.LogisticClassifier` (using logistic regression as a classifier) to predict the class of a discrete @@ -292,25 +302,32 @@ def create(dataset, target, features=None, """ - # Regression model names. model_name = "classifier_logistic_regression" solver = solver.lower() - model = _sl.create(dataset, target, model_name, features=features, - validation_set = validation_set, verbose = verbose, - l2_penalty=l2_penalty, l1_penalty = l1_penalty, - feature_rescaling = feature_rescaling, - convergence_threshold = convergence_threshold, - step_size = step_size, - solver = solver, - lbfgs_memory_level = lbfgs_memory_level, - max_iterations = max_iterations, - class_weights = class_weights, - seed=seed) + model = _sl.create( + dataset, + target, + model_name, + features=features, + validation_set=validation_set, + verbose=verbose, + l2_penalty=l2_penalty, + l1_penalty=l1_penalty, + feature_rescaling=feature_rescaling, + convergence_threshold=convergence_threshold, + step_size=step_size, + solver=solver, + lbfgs_memory_level=lbfgs_memory_level, + max_iterations=max_iterations, + class_weights=class_weights, + seed=seed, + ) return LogisticClassifier(model.__proxy__) + class LogisticClassifier(_Classifier): """ Logistic regression models a discrete target variable as a function of @@ -379,6 +396,7 @@ class LogisticClassifier(_Classifier): """ + def __init__(self, model_proxy): self.__proxy__ = model_proxy @@ -418,36 +436,34 @@ def _get_summary_struct(self): """ model_fields = [ - ('Number of coefficients', 'num_coefficients'), - ('Number of examples', 'num_examples'), - ('Number of classes', 'num_classes'), - ('Number of feature columns', 'num_features'), - ('Number of unpacked features', 'num_unpacked_features')] - - hyperparam_fields = [ - ("L1 penalty", 'l1_penalty'), - ("L2 penalty", 'l2_penalty') + ("Number of coefficients", "num_coefficients"), + ("Number of examples", "num_examples"), + ("Number of classes", "num_classes"), + ("Number of feature columns", "num_features"), + ("Number of unpacked features", "num_unpacked_features"), ] + hyperparam_fields = [("L1 penalty", "l1_penalty"), ("L2 penalty", "l2_penalty")] + solver_fields = [ - ("Solver", 'solver'), - ("Solver iterations", 'training_iterations'), - ("Solver status", 'training_solver_status'), - ("Training time (sec)", 'training_time')] + ("Solver", "solver"), + ("Solver iterations", "training_iterations"), + ("Solver status", "training_solver_status"), + ("Training time (sec)", "training_time"), + ] - training_fields = [ - ("Log-likelihood", 'training_loss')] + training_fields = [("Log-likelihood", "training_loss")] coefs = self.coefficients - top_coefs, bottom_coefs = _toolkit_get_topk_bottomk(coefs,k=5) + top_coefs, bottom_coefs = _toolkit_get_topk_bottomk(coefs, k=5) - (coefs_list, titles_list) = _summarize_coefficients(top_coefs, \ - bottom_coefs) + (coefs_list, titles_list) = _summarize_coefficients(top_coefs, bottom_coefs) - return ([ model_fields, hyperparam_fields, \ - solver_fields, training_fields ] + coefs_list, \ - [ 'Schema', 'Hyperparameters', \ - 'Training Summary', 'Settings' ] + titles_list ) + return ( + [model_fields, hyperparam_fields, solver_fields, training_fields] + + coefs_list, + ["Schema", "Hyperparameters", "Training Summary", "Settings"] + titles_list, + ) def __repr__(self): """ @@ -474,12 +490,14 @@ def export_coreml(self, filename): """ from turicreate.extensions import _logistic_classifier_export_as_model_asset from turicreate.toolkits import _coreml_utils + display_name = "logistic classifier" short_description = _coreml_utils._mlmodel_short_description(display_name) - context = {"class": self.__class__.__name__, - "short_description": short_description, - } - context['user_defined'] = _coreml_utils._get_tc_version_info() + context = { + "class": self.__class__.__name__, + "short_description": short_description, + } + context["user_defined"] = _coreml_utils._get_tc_version_info() _logistic_classifier_export_as_model_asset(self.__proxy__, filename, context) def _get(self, field): @@ -545,8 +563,7 @@ def _get(self, field): """ return super(_Classifier, self)._get(field) - def predict(self, dataset, output_type='class', - missing_value_action='auto'): + def predict(self, dataset, output_type="class", missing_value_action="auto"): """ Return predictions for ``dataset``, using the trained logistic regression model. Predictions can be generated as class labels, @@ -618,13 +635,11 @@ class as a vector. The probability of the first class (sorted """ - return super(_Classifier, self).predict(dataset, - output_type=output_type, - missing_value_action=missing_value_action) - - + return super(_Classifier, self).predict( + dataset, output_type=output_type, missing_value_action=missing_value_action + ) - def classify(self, dataset, missing_value_action='auto'): + def classify(self, dataset, missing_value_action="auto"): """ Return a classification, for each example in the ``dataset``, using the trained logistic regression model. The output SFrame contains predictions @@ -672,10 +687,13 @@ def classify(self, dataset, missing_value_action='auto'): """ - return super(LogisticClassifier, self).classify(dataset, - missing_value_action=missing_value_action) + return super(LogisticClassifier, self).classify( + dataset, missing_value_action=missing_value_action + ) - def predict_topk(self, dataset, output_type="probability", k=3, missing_value_action='auto'): + def predict_topk( + self, dataset, output_type="probability", k=3, missing_value_action="auto" + ): """ Return top-k predictions for the ``dataset``, using the trained model. Predictions are returned as an SFrame with three columns: `id`, @@ -741,30 +759,41 @@ def predict_topk(self, dataset, output_type="probability", k=3, missing_value_ac +--------+-------+-------------------+ [35688 rows x 3 columns] """ - _check_categorical_option_type('output_type', output_type, - ['rank', 'margin', 'probability']) - _check_categorical_option_type('missing_value_action', missing_value_action, - ['auto', 'impute', 'error']) - if missing_value_action == 'auto': - missing_value_action = 'impute' + _check_categorical_option_type( + "output_type", output_type, ["rank", "margin", "probability"] + ) + _check_categorical_option_type( + "missing_value_action", missing_value_action, ["auto", "impute", "error"] + ) + if missing_value_action == "auto": + missing_value_action = "impute" # Low latency path if isinstance(dataset, list): return self.__proxy__.fast_predict_topk( - dataset, missing_value_action, output_type, k) + dataset, missing_value_action, output_type, k + ) if isinstance(dataset, dict): return self.__proxy__.fast_predict_topk( - [dataset], missing_value_action, output_type, k) + [dataset], missing_value_action, output_type, k + ) # Fast path _raise_error_if_not_sframe(dataset, "dataset") - if (missing_value_action == 'auto'): + if missing_value_action == "auto": missing_value_action = _sl.select_default_missing_value_policy( - self, 'predict') + self, "predict" + ) return self.__proxy__.predict_topk( - dataset, missing_value_action, output_type, k) - - - def evaluate(self, dataset, metric='auto', missing_value_action='auto', with_predictions=False): + dataset, missing_value_action, output_type, k + ) + + def evaluate( + self, + dataset, + metric="auto", + missing_value_action="auto", + with_predictions=False, + ): """ Evaluate the model by making predictions of target values and comparing these to actual values. @@ -828,10 +857,23 @@ def evaluate(self, dataset, metric='auto', missing_value_action='auto', with_pre >>> print results['accuracy'] """ - _raise_error_evaluation_metric_is_valid(metric, - ['auto', 'accuracy', 'confusion_matrix', 'roc_curve', 'auc', - 'log_loss', 'precision', 'recall', 'f1_score']) - return super(_Classifier, self).evaluate(dataset, - missing_value_action=missing_value_action, - metric=metric, - with_predictions=with_predictions) + _raise_error_evaluation_metric_is_valid( + metric, + [ + "auto", + "accuracy", + "confusion_matrix", + "roc_curve", + "auc", + "log_loss", + "precision", + "recall", + "f1_score", + ], + ) + return super(_Classifier, self).evaluate( + dataset, + missing_value_action=missing_value_action, + metric=metric, + with_predictions=with_predictions, + ) diff --git a/src/python/turicreate/toolkits/classifier/nearest_neighbor_classifier.py b/src/python/turicreate/toolkits/classifier/nearest_neighbor_classifier.py index ce966bfe0b..4753aeb26c 100644 --- a/src/python/turicreate/toolkits/classifier/nearest_neighbor_classifier.py +++ b/src/python/turicreate/toolkits/classifier/nearest_neighbor_classifier.py @@ -36,7 +36,7 @@ def _sort_topk_votes(x, k): votes, then truncate to the highest 'k' classes. """ y = sorted(x.items(), key=lambda x: x[1], reverse=True)[:k] - return [{'class': i[0], 'votes': i[1]} for i in y] + return [{"class": i[0], "votes": i[1]} for i in y] def _construct_auto_distance(features, column_types): @@ -74,8 +74,9 @@ def _construct_auto_distance(features, column_types): try: ftr_type = column_types[ftr] except: - raise ValueError("The specified feature does not exist in the " + - "input data.") + raise ValueError( + "The specified feature does not exist in the " + "input data." + ) if ftr_type == str: string_ftrs.append(ftr) @@ -87,28 +88,29 @@ def _construct_auto_distance(features, column_types): numeric_ftrs.append(ftr) else: - raise TypeError("Unable to automatically construct a distance " + - "function for feature '{}'. ".format(ftr) + - "For the nearest neighbor classifier, features " + - "must be of type integer, float, string, dictionary, " + - "or array.array.") + raise TypeError( + "Unable to automatically construct a distance " + + "function for feature '{}'. ".format(ftr) + + "For the nearest neighbor classifier, features " + + "must be of type integer, float, string, dictionary, " + + "or array.array." + ) ## Construct the distance function dist = [] for ftr in string_ftrs: - dist.append([[ftr], 'levenshtein', 1]) + dist.append([[ftr], "levenshtein", 1]) if len(dict_ftrs) > 0: - dist.append([dict_ftrs, 'weighted_jaccard', len(dict_ftrs)]) + dist.append([dict_ftrs, "weighted_jaccard", len(dict_ftrs)]) if len(numeric_ftrs) > 0: - dist.append([numeric_ftrs, 'euclidean', len(numeric_ftrs)]) + dist.append([numeric_ftrs, "euclidean", len(numeric_ftrs)]) return dist - ## -------------- ## ## Model creation ## ## -------------- ## @@ -224,7 +226,6 @@ def create(dataset, target, features=None, distance=None, verbose=True): ## ------ start_time = _time.time() - ## Validation and preprocessing ## ---------------------------- @@ -232,24 +233,25 @@ def create(dataset, target, features=None, distance=None, verbose=True): _raise_error_if_not_sframe(dataset, "dataset") _raise_error_if_sframe_empty(dataset, "dataset") - ## 'target' must be a string, in 'dataset', and the type of the target must # be string or integer. if not isinstance(target, str) or target not in dataset.column_names(): - raise _ToolkitError("The 'target' parameter must be the name of a " - "column in the input dataset.") + raise _ToolkitError( + "The 'target' parameter must be the name of a " + "column in the input dataset." + ) if not dataset[target].dtype == str and not dataset[target].dtype == int: raise TypeError("The target column must contain integers or strings.") - ## Warn that 'None' values in the target may lead to ambiguous predictions. if dataset[target].countna() > 0: - _logging.warning("Missing values detected in the target column. This " + - "may lead to ambiguous 'None' predictions, if the " + - "'radius' parameter is set too small in the prediction, " + - "classification, or evaluation methods.") - + _logging.warning( + "Missing values detected in the target column. This " + + "may lead to ambiguous 'None' predictions, if the " + + "'radius' parameter is set too small in the prediction, " + + "classification, or evaluation methods." + ) ## convert features and distance arguments into a composite distance ## NOTE: this is done here instead of in the nearest neighbors toolkit @@ -260,46 +262,47 @@ def create(dataset, target, features=None, distance=None, verbose=True): else: _features = [x for x in features if x != target] - if isinstance(distance, list): distance = _copy.deepcopy(distance) - elif (hasattr(distance, '__call__') or - (isinstance(distance, str) and not distance == 'auto')): + elif hasattr(distance, "__call__") or ( + isinstance(distance, str) and not distance == "auto" + ): distance = [[_features, distance, 1]] - elif distance is None or distance == 'auto': - col_types = {k: v for k, v in zip(dataset.column_names(), - dataset.column_types())} + elif distance is None or distance == "auto": + col_types = { + k: v for k, v in zip(dataset.column_names(), dataset.column_types()) + } distance = _construct_auto_distance(_features, col_types) else: - raise TypeError("Input 'distance' not understood. The 'distance' " + - "parameter must be a string or a composite distance, " + - " or left unspecified.") - + raise TypeError( + "Input 'distance' not understood. The 'distance' " + + "parameter must be a string or a composite distance, " + + " or left unspecified." + ) ## Construct and query the nearest neighbors model ## ----------------------------------------------- - knn_model = _tc.nearest_neighbors.create(dataset, label=target, - distance=distance, - verbose=verbose) - + knn_model = _tc.nearest_neighbors.create( + dataset, label=target, distance=distance, verbose=verbose + ) ## Postprocessing and formatting ## ----------------------------- state = { - 'verbose' : verbose, - 'distance' : knn_model.distance, - 'num_distance_components' : knn_model.num_distance_components, - 'num_examples' : dataset.num_rows(), - 'features' : knn_model.features, - 'target': target, - 'num_classes': len(dataset[target].unique()), - 'num_features': knn_model.num_features, - 'num_unpacked_features': knn_model.num_unpacked_features, - 'training_time': _time.time() - start_time, - '_target_type': dataset[target].dtype, + "verbose": verbose, + "distance": knn_model.distance, + "num_distance_components": knn_model.num_distance_components, + "num_examples": dataset.num_rows(), + "features": knn_model.features, + "target": target, + "num_classes": len(dataset[target].unique()), + "num_features": knn_model.num_features, + "num_unpacked_features": knn_model.num_unpacked_features, + "training_time": _time.time() - start_time, + "_target_type": dataset[target].dtype, } model = NearestNeighborClassifier(knn_model, state) return model @@ -323,7 +326,7 @@ class of any observation to be the most common class among the observation's def __init__(self, knn_model, state=None): self.__proxy__ = _PythonProxy(state) - assert(isinstance(knn_model, _tc.nearest_neighbors.NearestNeighborsModel)) + assert isinstance(knn_model, _tc.nearest_neighbors.NearestNeighborsModel) self._knn_model = knn_model @classmethod @@ -335,8 +338,8 @@ def _get_version(self): def _get_native_state(self): retstate = self.__proxy__.get_state() - retstate['knn_model'] = self._knn_model.__proxy__ - retstate['_target_type'] = self._target_type.__name__ + retstate["knn_model"] = self._knn_model.__proxy__ + retstate["_target_type"] = self._target_type.__name__ return retstate @classmethod @@ -352,10 +355,10 @@ def _load_version(cls, state, version): version : int Version number maintained by the class writer. """ - assert(version == cls._PYTHON_NN_CLASSIFIER_MODEL_VERSION) - knn_model = _tc.nearest_neighbors.NearestNeighborsModel(state['knn_model']) - del state['knn_model'] - state['_target_type'] = eval(state['_target_type']) + assert version == cls._PYTHON_NN_CLASSIFIER_MODEL_VERSION + knn_model = _tc.nearest_neighbors.NearestNeighborsModel(state["knn_model"]) + del state["knn_model"] + state["_target_type"] = eval(state["_target_type"]) return cls(knn_model, state) def __repr__(self): @@ -396,16 +399,16 @@ def _get_summary_struct(self): The order matches that of the 'sections' object. """ model_fields = [ - ('Number of examples', 'num_examples'), - ('Number of feature columns', 'num_features'), - ('Number of unpacked features', 'num_unpacked_features'), - ('Number of distance components', 'num_distance_components'), - ('Number of classes', 'num_classes')] + ("Number of examples", "num_examples"), + ("Number of feature columns", "num_features"), + ("Number of unpacked features", "num_unpacked_features"), + ("Number of distance components", "num_distance_components"), + ("Number of classes", "num_classes"), + ] - training_fields = [ - ('Training time (seconds)', 'training_time')] + training_fields = [("Training time (seconds)", "training_time")] - section_titles = [ 'Schema', 'Training Summary'] + section_titles = ["Schema", "Training Summary"] return ([model_fields, training_fields], section_titles) def classify(self, dataset, max_neighbors=10, radius=None, verbose=True): @@ -489,41 +492,49 @@ def classify(self, dataset, max_neighbors=10, radius=None, verbose=True): if max_neighbors <= 0: raise ValueError("Input 'max_neighbors' must be larger than 0.") - ## Find the nearest neighbors for each query and count the number of # votes for each class. - knn = self._knn_model.query(dataset, k=max_neighbors, radius=radius, - verbose=verbose) + knn = self._knn_model.query( + dataset, k=max_neighbors, radius=radius, verbose=verbose + ) ## If there are *no* results for *any* query make an SFrame of nothing. if knn.num_rows() == 0: ystar = _tc.SFrame( - {'class': _tc.SArray([None] * n_query, self._target_type), - 'probability': _tc.SArray([None] * n_query, int)}) + { + "class": _tc.SArray([None] * n_query, self._target_type), + "probability": _tc.SArray([None] * n_query, int), + } + ) else: ## Find the class with the most votes for each query and postprocess. - grp = knn.groupby(['query_label', 'reference_label'], _tc.aggregate.COUNT) + grp = knn.groupby(["query_label", "reference_label"], _tc.aggregate.COUNT) - ystar = grp.groupby('query_label', - {'class': _tc.aggregate.ARGMAX('Count', 'reference_label'), - 'max_votes': _tc.aggregate.MAX('Count'), - 'total_votes': _tc.aggregate.SUM('Count')}) + ystar = grp.groupby( + "query_label", + { + "class": _tc.aggregate.ARGMAX("Count", "reference_label"), + "max_votes": _tc.aggregate.MAX("Count"), + "total_votes": _tc.aggregate.SUM("Count"), + }, + ) - ystar['probability'] = ystar['max_votes'] / ystar['total_votes'] + ystar["probability"] = ystar["max_votes"] / ystar["total_votes"] ## Fill in 'None' for query points that don't have any near neighbors. - row_ids = _tc.SFrame({'query_label': range(n_query)}) - ystar = ystar.join(row_ids, how='right') + row_ids = _tc.SFrame({"query_label": range(n_query)}) + ystar = ystar.join(row_ids, how="right") ## Sort by row number (because row number is not returned) and return - ystar = ystar.sort('query_label', ascending=True) - ystar = ystar[['class', 'probability']] + ystar = ystar.sort("query_label", ascending=True) + ystar = ystar[["class", "probability"]] return ystar - def predict(self, dataset, max_neighbors=10, radius=None, - output_type='class', verbose=True): + def predict( + self, dataset, max_neighbors=10, radius=None, output_type="class", verbose=True + ): """ Return predicted class labels for instances in *dataset*. This model makes predictions based on the closest neighbors stored in the nearest @@ -586,21 +597,23 @@ def predict(self, dataset, max_neighbors=10, radius=None, ['dog', 'fossa'] """ - ystar = self.classify(dataset=dataset, max_neighbors=max_neighbors, - radius=radius, verbose=verbose) + ystar = self.classify( + dataset=dataset, max_neighbors=max_neighbors, radius=radius, verbose=verbose + ) - if output_type == 'class': - return ystar['class'] + if output_type == "class": + return ystar["class"] - elif output_type == 'probability': - return ystar['probability'] + elif output_type == "probability": + return ystar["probability"] else: - raise ValueError("Input 'output_type' not understood. 'output_type' " - "must be either 'class' or 'probability'.") + raise ValueError( + "Input 'output_type' not understood. 'output_type' " + "must be either 'class' or 'probability'." + ) - def predict_topk(self, dataset, max_neighbors=10, radius=None, k=3, - verbose=False): + def predict_topk(self, dataset, max_neighbors=10, radius=None, k=3, verbose=False): """ Return top-k most likely predictions for each observation in ``dataset``. Predictions are returned as an SFrame with three columns: @@ -667,9 +680,10 @@ def predict_topk(self, dataset, max_neighbors=10, radius=None, k=3, # 'max_neighbors' and 'radius' parameters are validated by the nearest # neighbor model's query method. if not isinstance(k, int) or k < 1: - raise TypeError("The number of results to return for each point, " + - "'k', must be an integer greater than 0.") - + raise TypeError( + "The number of results to return for each point, " + + "'k', must be an integer greater than 0." + ) ## Validate the query dataset. _raise_error_if_not_sframe(dataset, "dataset") @@ -685,43 +699,40 @@ def predict_topk(self, dataset, max_neighbors=10, radius=None, k=3, if max_neighbors <= 0: raise ValueError("Input 'max_neighbors' must be larger than 0.") - ## Find the nearest neighbors for each query and count the number of # votes for each class. - knn = self._knn_model.query(dataset, k=max_neighbors, radius=radius, - verbose=verbose) + knn = self._knn_model.query( + dataset, k=max_neighbors, radius=radius, verbose=verbose + ) ## If there are *no* results for *any* query make an empty SFrame. if knn.num_rows() == 0: - ystar = _tc.SFrame({'row_id': [], 'class': [], 'probability': []}) - ystar['row_id'] = ystar['row_id'].astype(int) - ystar['class'] = ystar['class'].astype(str) - + ystar = _tc.SFrame({"row_id": [], "class": [], "probability": []}) + ystar["row_id"] = ystar["row_id"].astype(int) + ystar["class"] = ystar["class"].astype(str) else: ## Find the classes with the top-k vote totals - grp = knn.groupby(['query_label', 'reference_label'], - _tc.aggregate.COUNT) + grp = knn.groupby(["query_label", "reference_label"], _tc.aggregate.COUNT) - ystar = grp.unstack(column_names=['reference_label', 'Count'], - new_column_name='votes') + ystar = grp.unstack( + column_names=["reference_label", "Count"], new_column_name="votes" + ) - ystar['topk'] = ystar['votes'].apply( - lambda x: _sort_topk_votes(x, k)) - ystar['total_votes'] = ystar['votes'].apply( - lambda x: sum(x.values())) + ystar["topk"] = ystar["votes"].apply(lambda x: _sort_topk_votes(x, k)) + ystar["total_votes"] = ystar["votes"].apply(lambda x: sum(x.values())) ## Re-stack, unpack, and rename the results - ystar = ystar.stack('topk', new_column_name='topk') - ystar = ystar.unpack('topk') - ystar.rename({'topk.class': 'class', 'query_label': 'row_id'}, inplace=True) - ystar['probability'] = ystar['topk.votes'] / ystar['total_votes'] - ystar = ystar[['row_id', 'class', 'probability']] + ystar = ystar.stack("topk", new_column_name="topk") + ystar = ystar.unpack("topk") + ystar.rename({"topk.class": "class", "query_label": "row_id"}, inplace=True) + ystar["probability"] = ystar["topk.votes"] / ystar["total_votes"] + ystar = ystar[["row_id", "class", "probability"]] return ystar # - def evaluate(self, dataset, metric='auto', max_neighbors=10, radius=None): + def evaluate(self, dataset, metric="auto", max_neighbors=10, radius=None): """ Evaluate the model's predictive accuracy. This is done by predicting the target class for instances in a new dataset and comparing to known @@ -788,50 +799,58 @@ def evaluate(self, dataset, metric='auto', max_neighbors=10, radius=None): """ ## Validate the metric name - _raise_error_evaluation_metric_is_valid(metric, - ['auto', 'accuracy', 'confusion_matrix', 'roc_curve']) + _raise_error_evaluation_metric_is_valid( + metric, ["auto", "accuracy", "confusion_matrix", "roc_curve"] + ) ## Make sure the input dataset has a target column with an appropriate # type. target = self.target - _raise_error_if_column_exists(dataset, target, 'dataset', target) + _raise_error_if_column_exists(dataset, target, "dataset", target) if not dataset[target].dtype == str and not dataset[target].dtype == int: - raise TypeError("The target column of the evaluation dataset must " - "contain integers or strings.") + raise TypeError( + "The target column of the evaluation dataset must " + "contain integers or strings." + ) if self.num_classes != 2: - if (metric == 'roc_curve') or (metric == ['roc_curve']): - err_msg = "Currently, ROC curve is not supported for " + if (metric == "roc_curve") or (metric == ["roc_curve"]): + err_msg = "Currently, ROC curve is not supported for " err_msg += "multi-class classification in this model." raise _ToolkitError(err_msg) else: - warn_msg = "WARNING: Ignoring `roc_curve`. " + warn_msg = "WARNING: Ignoring `roc_curve`. " warn_msg += "Not supported for multi-class classification." print(warn_msg) ## Compute predictions with the input dataset. - ystar = self.predict(dataset, output_type='class', - max_neighbors=max_neighbors, radius=radius) - ystar_prob = self.predict(dataset, output_type='probability', - max_neighbors=max_neighbors, radius=radius) - + ystar = self.predict( + dataset, output_type="class", max_neighbors=max_neighbors, radius=radius + ) + ystar_prob = self.predict( + dataset, + output_type="probability", + max_neighbors=max_neighbors, + radius=radius, + ) ## Compile accuracy metrics results = {} - if metric in ['accuracy', 'auto']: - results['accuracy'] = _evaluation.accuracy(targets=dataset[target], - predictions=ystar) + if metric in ["accuracy", "auto"]: + results["accuracy"] = _evaluation.accuracy( + targets=dataset[target], predictions=ystar + ) - if metric in ['confusion_matrix', 'auto']: - results['confusion_matrix'] = \ - _evaluation.confusion_matrix(targets=dataset[target], - predictions=ystar) + if metric in ["confusion_matrix", "auto"]: + results["confusion_matrix"] = _evaluation.confusion_matrix( + targets=dataset[target], predictions=ystar + ) if self.num_classes == 2: - if metric in ['roc_curve', 'auto']: - results['roc_curve'] = \ - _evaluation.roc_curve(targets=dataset[target], - predictions=ystar_prob) + if metric in ["roc_curve", "auto"]: + results["roc_curve"] = _evaluation.roc_curve( + targets=dataset[target], predictions=ystar_prob + ) return results diff --git a/src/python/turicreate/toolkits/classifier/random_forest_classifier.py b/src/python/turicreate/toolkits/classifier/random_forest_classifier.py index a219f6dcaa..9c8e167ca3 100644 --- a/src/python/turicreate/toolkits/classifier/random_forest_classifier.py +++ b/src/python/turicreate/toolkits/classifier/random_forest_classifier.py @@ -19,13 +19,14 @@ from turicreate.toolkits._tree_model_mixin import TreeModelMixin as _TreeModelMixin from turicreate.util import _make_internal_url -__doc_string_context = ''' +__doc_string_context = """ >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv' >>> data = turicreate.SFrame.read_csv(url) >>> train, test = data.random_split(0.8) >>> model = turicreate.random_forest_classifier.create(train, target='label') -''' +""" + class RandomForestClassifier(_Classifier, _TreeModelMixin): """ @@ -55,6 +56,7 @@ class RandomForestClassifier(_Classifier, _TreeModelMixin): create """ + def __init__(self, proxy): """__init__(self)""" self.__proxy__ = proxy @@ -146,7 +148,7 @@ def _get(self, field): """ return super(_Classifier, self)._get(field) - def evaluate(self, dataset, metric='auto', missing_value_action='auto'): + def evaluate(self, dataset, metric="auto", missing_value_action="auto"): """ Evaluate the model by making predictions of target values and comparing these to actual values. @@ -206,14 +208,25 @@ def evaluate(self, dataset, metric='auto', missing_value_action='auto'): >>> results = model.evaluate(test_data, metric='confusion_matrix') """ - _raise_error_evaluation_metric_is_valid(metric, - ['auto', 'accuracy', 'confusion_matrix', 'roc_curve', 'auc', - 'log_loss', 'precision', 'recall', 'f1_score']) - return super(_Classifier, self).evaluate(dataset, - missing_value_action=missing_value_action, - metric=metric) - - def predict(self, dataset, output_type='class', missing_value_action='auto'): + _raise_error_evaluation_metric_is_valid( + metric, + [ + "auto", + "accuracy", + "confusion_matrix", + "roc_curve", + "auc", + "log_loss", + "precision", + "recall", + "f1_score", + ], + ) + return super(_Classifier, self).evaluate( + dataset, missing_value_action=missing_value_action, metric=metric + ) + + def predict(self, dataset, output_type="class", missing_value_action="auto"): """ A flexible and advanced prediction API. @@ -269,13 +282,18 @@ class as a vector. The probability of the first class (sorted >>> m.predict(testdata, output_type='probability') >>> m.predict(testdata, output_type='margin') """ - _check_categorical_option_type('output_type', output_type, - ['class', 'margin', 'probability', 'probability_vector']) - return super(_Classifier, self).predict(dataset, - output_type=output_type, - missing_value_action=missing_value_action) - - def predict_topk(self, dataset, output_type="probability", k=3, missing_value_action='auto'): + _check_categorical_option_type( + "output_type", + output_type, + ["class", "margin", "probability", "probability_vector"], + ) + return super(_Classifier, self).predict( + dataset, output_type=output_type, missing_value_action=missing_value_action + ) + + def predict_topk( + self, dataset, output_type="probability", k=3, missing_value_action="auto" + ): """ Return top-k predictions for the ``dataset``, using the trained model. Predictions are returned as an SFrame with three columns: `id`, @@ -341,22 +359,29 @@ def predict_topk(self, dataset, output_type="probability", k=3, missing_value_ac +--------+-------+-------------------+ [35688 rows x 3 columns] """ - _check_categorical_option_type('output_type', output_type, ['rank', 'margin', 'probability']) - if missing_value_action == 'auto': - missing_value_action = _sl.select_default_missing_value_policy(self, 'predict') + _check_categorical_option_type( + "output_type", output_type, ["rank", "margin", "probability"] + ) + if missing_value_action == "auto": + missing_value_action = _sl.select_default_missing_value_policy( + self, "predict" + ) # Low latency path if isinstance(dataset, list): return self.__proxy__.fast_predict_topk( - dataset, missing_value_action, output_type, k) + dataset, missing_value_action, output_type, k + ) if isinstance(dataset, dict): return self.__proxy__.fast_predict_topk( - [dataset], missing_value_action, output_type, k) + [dataset], missing_value_action, output_type, k + ) return self.__proxy__.predict_topk( - dataset, missing_value_action, output_type, k) + dataset, missing_value_action, output_type, k + ) - def classify(self, dataset, missing_value_action='auto'): + def classify(self, dataset, missing_value_action="auto"): """ Return a classification, for each example in the ``dataset``, using the trained random forest model. The output SFrame contains predictions @@ -401,9 +426,9 @@ def classify(self, dataset, missing_value_action='auto'): >>> features=['bath', 'bedroom', 'size']) >>> classes = model.classify(data) """ - return super(RandomForestClassifier, self).classify(dataset, - missing_value_action=missing_value_action) - + return super(RandomForestClassifier, self).classify( + dataset, missing_value_action=missing_value_action + ) def export_coreml(self, filename): """ @@ -419,23 +444,30 @@ def export_coreml(self, filename): >>> model.export_coreml("MyModel.mlmodel") """ from turicreate.toolkits import _coreml_utils + display_name = "random forest classifier" short_description = _coreml_utils._mlmodel_short_description(display_name) - context = {"mode" : "classification", - "model_type" : "random_forest", - "class": self.__class__.__name__, - "short_description": short_description, - } + context = { + "mode": "classification", + "model_type": "random_forest", + "class": self.__class__.__name__, + "short_description": short_description, + } self._export_coreml_impl(filename, context) -def create(dataset, target, - features=None, - max_iterations=10, - validation_set='auto', - verbose=True, class_weights=None, - random_seed=None, - metric='auto', - **kwargs): + +def create( + dataset, + target, + features=None, + max_iterations=10, + validation_set="auto", + verbose=True, + class_weights=None, + random_seed=None, + metric="auto", + **kwargs +): """ Create a (binary or multi-class) classifier model of type :class:`~turicreate.random_forest_classifier.RandomForestClassifier` using @@ -572,20 +604,26 @@ def create(dataset, target, """ if random_seed is not None: - kwargs['random_seed'] = random_seed - if 'model_checkpoint_path' in kwargs: - kwargs['model_checkpoint_path'] = _make_internal_url(kwargs['model_checkpoint_path']) - if 'resume_from_checkpoint' in kwargs: - kwargs['resume_from_checkpoint'] = _make_internal_url(kwargs['resume_from_checkpoint']) - - model = _sl.create(dataset = dataset, - target = target, - features = features, - model_name = 'random_forest_classifier', - max_iterations = max_iterations, - validation_set = validation_set, - class_weights = class_weights, - verbose = verbose, - metric = metric, - **kwargs) + kwargs["random_seed"] = random_seed + if "model_checkpoint_path" in kwargs: + kwargs["model_checkpoint_path"] = _make_internal_url( + kwargs["model_checkpoint_path"] + ) + if "resume_from_checkpoint" in kwargs: + kwargs["resume_from_checkpoint"] = _make_internal_url( + kwargs["resume_from_checkpoint"] + ) + + model = _sl.create( + dataset=dataset, + target=target, + features=features, + model_name="random_forest_classifier", + max_iterations=max_iterations, + validation_set=validation_set, + class_weights=class_weights, + verbose=verbose, + metric=metric, + **kwargs + ) return RandomForestClassifier(model.__proxy__) diff --git a/src/python/turicreate/toolkits/classifier/svm_classifier.py b/src/python/turicreate/toolkits/classifier/svm_classifier.py index 32c566bc38..5a8b90ba9d 100644 --- a/src/python/turicreate/toolkits/classifier/svm_classifier.py +++ b/src/python/turicreate/toolkits/classifier/svm_classifier.py @@ -12,28 +12,36 @@ import turicreate.toolkits._supervised_learning as _sl from turicreate.toolkits._supervised_learning import Classifier as _Classifier -from turicreate.toolkits._internal_utils import _toolkit_repr_print, \ - _toolkit_get_topk_bottomk, \ - _raise_error_evaluation_metric_is_valid, \ - _check_categorical_option_type, \ - _summarize_coefficients +from turicreate.toolkits._internal_utils import ( + _toolkit_repr_print, + _toolkit_get_topk_bottomk, + _raise_error_evaluation_metric_is_valid, + _check_categorical_option_type, + _summarize_coefficients, +) _DEFAULT_SOLVER_OPTIONS = { -'convergence_threshold': 1e-2, -'max_iterations': 10, -'lbfgs_memory_level': 11, + "convergence_threshold": 1e-2, + "max_iterations": 10, + "lbfgs_memory_level": 11, } -def create(dataset, target, features=None, - penalty=1.0, solver='auto', + +def create( + dataset, + target, + features=None, + penalty=1.0, + solver="auto", feature_rescaling=True, - convergence_threshold = _DEFAULT_SOLVER_OPTIONS['convergence_threshold'], - lbfgs_memory_level = _DEFAULT_SOLVER_OPTIONS['lbfgs_memory_level'], - max_iterations = _DEFAULT_SOLVER_OPTIONS['max_iterations'], - class_weights = None, - validation_set = 'auto', - verbose=True): + convergence_threshold=_DEFAULT_SOLVER_OPTIONS["convergence_threshold"], + lbfgs_memory_level=_DEFAULT_SOLVER_OPTIONS["lbfgs_memory_level"], + max_iterations=_DEFAULT_SOLVER_OPTIONS["max_iterations"], + class_weights=None, + validation_set="auto", + verbose=True, +): """ Create a :class:`~turicreate.svm_classifier.SVMClassifier` to predict the class of a binary target variable based on a model of which side of a hyperplane the example @@ -215,14 +223,20 @@ def create(dataset, target, features=None, model_name = "classifier_svm" solver = solver.lower() - model = _sl.create(dataset, target, model_name, features=features, - validation_set = validation_set, verbose = verbose, - penalty = penalty, - feature_rescaling = feature_rescaling, - convergence_threshold = convergence_threshold, - lbfgs_memory_level = lbfgs_memory_level, - max_iterations = max_iterations, - class_weights = class_weights) + model = _sl.create( + dataset, + target, + model_name, + features=features, + validation_set=validation_set, + verbose=verbose, + penalty=penalty, + feature_rescaling=feature_rescaling, + convergence_threshold=convergence_threshold, + lbfgs_memory_level=lbfgs_memory_level, + max_iterations=max_iterations, + class_weights=class_weights, + ) return SVMClassifier(model.__proxy__) @@ -289,8 +303,9 @@ class SVMClassifier(_Classifier): create """ + def __init__(self, model_proxy): - '''__init__(self)''' + """__init__(self)""" self.__proxy__ = model_proxy self.__name__ = self.__class__._native_name() @@ -327,35 +342,36 @@ def _get_summary_struct(self): The order matches that of the 'sections' object. """ model_fields = [ - ('Number of coefficients', 'num_coefficients'), - ('Number of examples', 'num_examples'), - ('Number of classes', 'num_classes'), - ('Number of feature columns', 'num_features'), - ('Number of unpacked features', 'num_unpacked_features')] + ("Number of coefficients", "num_coefficients"), + ("Number of examples", "num_examples"), + ("Number of classes", "num_classes"), + ("Number of feature columns", "num_features"), + ("Number of unpacked features", "num_unpacked_features"), + ] hyperparam_fields = [ - ("Mis-classification penalty", 'penalty'), + ("Mis-classification penalty", "penalty"), ] solver_fields = [ - ("Solver", 'solver'), - ("Solver iterations", 'training_iterations'), - ("Solver status", 'training_solver_status'), - ("Training time (sec)", 'training_time')] + ("Solver", "solver"), + ("Solver iterations", "training_iterations"), + ("Solver status", "training_solver_status"), + ("Training time (sec)", "training_time"), + ] - training_fields = [ - ("Train Loss", 'training_loss')] + training_fields = [("Train Loss", "training_loss")] coefs = self.coefficients (top_coefs, bottom_coefs) = _toolkit_get_topk_bottomk(coefs, k=5) - (coefs_list, titles_list) = _summarize_coefficients(top_coefs, \ - bottom_coefs) + (coefs_list, titles_list) = _summarize_coefficients(top_coefs, bottom_coefs) - return ([model_fields, hyperparam_fields, solver_fields, \ - training_fields] + coefs_list, - [ 'Schema', 'Hyperparameters', \ - 'Training Summary', 'Settings'] + titles_list, ) + return ( + [model_fields, hyperparam_fields, solver_fields, training_fields] + + coefs_list, + ["Schema", "Hyperparameters", "Training Summary", "Settings"] + titles_list, + ) def __repr__(self): """ @@ -381,12 +397,14 @@ def export_coreml(self, filename): """ from turicreate.extensions import _linear_svm_export_as_model_asset from turicreate.toolkits import _coreml_utils + display_name = "svm classifier" short_description = _coreml_utils._mlmodel_short_description(display_name) - context = {"class": self.__class__.__name__, - "short_description": short_description, - } - context['user_defined'] = _coreml_utils._get_tc_version_info() + context = { + "class": self.__class__.__name__, + "short_description": short_description, + } + context["user_defined"] = _coreml_utils._get_tc_version_info() _linear_svm_export_as_model_asset(self.__proxy__, filename, context) def _get(self, field): @@ -448,7 +466,7 @@ def _get(self, field): """ return super(_Classifier, self)._get(field) - def predict(self, dataset, output_type='class', missing_value_action='auto'): + def predict(self, dataset, output_type="class", missing_value_action="auto"): """ Return predictions for ``dataset``, using the trained logistic regression model. Predictions can be generated as class labels (0 or @@ -510,13 +528,12 @@ def predict(self, dataset, output_type='class', missing_value_action='auto'): """ - _check_categorical_option_type('output_type', output_type, - ['class', 'margin']) - return super(_Classifier, self).predict(dataset, - output_type=output_type, - missing_value_action=missing_value_action) + _check_categorical_option_type("output_type", output_type, ["class", "margin"]) + return super(_Classifier, self).predict( + dataset, output_type=output_type, missing_value_action=missing_value_action + ) - def classify(self, dataset, missing_value_action='auto'): + def classify(self, dataset, missing_value_action="auto"): """ Return a classification, for each example in the ``dataset``, using the trained SVM model. The output SFrame contains predictions @@ -561,10 +578,17 @@ def classify(self, dataset, missing_value_action='auto'): >>> classes = model.classify(data) """ - return super(SVMClassifier, self).classify(dataset, missing_value_action=missing_value_action) - - - def evaluate(self, dataset, metric='auto', missing_value_action='auto', with_predictions=False): + return super(SVMClassifier, self).classify( + dataset, missing_value_action=missing_value_action + ) + + def evaluate( + self, + dataset, + metric="auto", + missing_value_action="auto", + with_predictions=False, + ): """ Evaluate the model by making predictions of target values and comparing these to actual values. @@ -628,10 +652,13 @@ def evaluate(self, dataset, metric='auto', missing_value_action='auto', with_pre >>> results = model.progressvaluate(data) >>> print results['accuracy'] """ - _raise_error_evaluation_metric_is_valid(metric, - ['auto', 'accuracy', 'confusion_matrix', 'precision', 'recall', - 'f1_score']) - return super(_Classifier, self).evaluate(dataset, - missing_value_action=missing_value_action, - metric=metric, - with_predictions=with_predictions) + _raise_error_evaluation_metric_is_valid( + metric, + ["auto", "accuracy", "confusion_matrix", "precision", "recall", "f1_score"], + ) + return super(_Classifier, self).evaluate( + dataset, + missing_value_action=missing_value_action, + metric=metric, + with_predictions=with_predictions, + ) diff --git a/src/python/turicreate/toolkits/clustering/__init__.py b/src/python/turicreate/toolkits/clustering/__init__.py index ac079de301..26cc4669ca 100644 --- a/src/python/turicreate/toolkits/clustering/__init__.py +++ b/src/python/turicreate/toolkits/clustering/__init__.py @@ -19,7 +19,7 @@ from __future__ import division as _ from __future__ import absolute_import as _ -__all__ = ['kmeans', 'dbscan'] +__all__ = ["kmeans", "dbscan"] from . import kmeans from . import dbscan diff --git a/src/python/turicreate/toolkits/clustering/dbscan.py b/src/python/turicreate/toolkits/clustering/dbscan.py index 3135b5849a..72d1574267 100644 --- a/src/python/turicreate/toolkits/clustering/dbscan.py +++ b/src/python/turicreate/toolkits/clustering/dbscan.py @@ -21,8 +21,14 @@ from turicreate.toolkits._model import PythonProxy as _PythonProxy -def create(dataset, features=None, distance=None, radius=1., - min_core_neighbors=10, verbose=True): +def create( + dataset, + features=None, + distance=None, + radius=1.0, + min_core_neighbors=10, + verbose=True, +): """ Create a DBSCAN clustering model. The DBSCAN method partitions the input dataset into three types of points, based on the estimated probability @@ -175,31 +181,34 @@ def create(dataset, features=None, distance=None, radius=1., _tkutl._raise_error_if_not_sframe(dataset, "dataset") _tkutl._raise_error_if_sframe_empty(dataset, "dataset") - ## Validate neighborhood parameters if not isinstance(min_core_neighbors, int) or min_core_neighbors < 0: - raise ValueError("Input 'min_core_neighbors' must be a non-negative " + - "integer.") + raise ValueError( + "Input 'min_core_neighbors' must be a non-negative " + "integer." + ) if not isinstance(radius, (int, float)) or radius < 0: - raise ValueError("Input 'radius' must be a non-negative integer " + - "or float.") - + raise ValueError("Input 'radius' must be a non-negative integer " + "or float.") ## Compute all-point nearest neighbors within `radius` and count # neighborhood sizes - knn_model = _tc.nearest_neighbors.create(dataset, features=features, - distance=distance, - method='brute_force', - verbose=verbose) - - knn = knn_model.similarity_graph(k=None, radius=radius, - include_self_edges=False, - output_type='SFrame', - verbose=verbose) - - neighbor_counts = knn.groupby('query_label', _agg.COUNT) - + knn_model = _tc.nearest_neighbors.create( + dataset, + features=features, + distance=distance, + method="brute_force", + verbose=verbose, + ) + + knn = knn_model.similarity_graph( + k=None, + radius=radius, + include_self_edges=False, + output_type="SFrame", + verbose=verbose, + ) + + neighbor_counts = knn.groupby("query_label", _agg.COUNT) ### NOTE: points with NO neighbors are already dropped here! @@ -209,13 +218,12 @@ def create(dataset, features=None, distance=None, radius=1., if verbose: logger.info("Identifying noise points and core points.") - boundary_mask = neighbor_counts['Count'] < min_core_neighbors + boundary_mask = neighbor_counts["Count"] < min_core_neighbors core_mask = 1 - boundary_mask # this includes too small clusters - boundary_idx = neighbor_counts[boundary_mask]['query_label'] - core_idx = neighbor_counts[core_mask]['query_label'] - + boundary_idx = neighbor_counts[boundary_mask]["query_label"] + core_idx = neighbor_counts[core_mask]["query_label"] ## Build a similarity graph on the core points ## NOTE: careful with singleton core points - the second filter removes them @@ -223,98 +231,105 @@ def create(dataset, features=None, distance=None, radius=1., if verbose: logger.info("Constructing the core point similarity graph.") - core_vertices = knn.filter_by(core_idx, 'query_label') - core_edges = core_vertices.filter_by(core_idx, 'reference_label') + core_vertices = knn.filter_by(core_idx, "query_label") + core_edges = core_vertices.filter_by(core_idx, "reference_label") core_graph = _tc.SGraph() - core_graph = core_graph.add_vertices(core_vertices[['query_label']], - vid_field='query_label') - core_graph = core_graph.add_edges(core_edges, src_field='query_label', - dst_field='reference_label') - + core_graph = core_graph.add_vertices( + core_vertices[["query_label"]], vid_field="query_label" + ) + core_graph = core_graph.add_edges( + core_edges, src_field="query_label", dst_field="reference_label" + ) ## Compute core point connected components and relabel to be consecutive # integers cc = _tc.connected_components.create(core_graph, verbose=verbose) - cc_labels = cc.component_size.add_row_number('__label') - core_assignments = cc.component_id.join(cc_labels, on='component_id', - how='left')[['__id', '__label']] - core_assignments['type'] = 'core' - + cc_labels = cc.component_size.add_row_number("__label") + core_assignments = cc.component_id.join(cc_labels, on="component_id", how="left")[ + ["__id", "__label"] + ] + core_assignments["type"] = "core" ## Join potential boundary points to core cluster labels (points that aren't # really on a boundary are implicitly dropped) if verbose: logger.info("Processing boundary points.") - boundary_edges = knn.filter_by(boundary_idx, 'query_label') + boundary_edges = knn.filter_by(boundary_idx, "query_label") # separate real boundary points from points in small isolated clusters - boundary_core_edges = boundary_edges.filter_by(core_idx, 'reference_label') + boundary_core_edges = boundary_edges.filter_by(core_idx, "reference_label") # join a boundary point to its single closest core point. - boundary_assignments = boundary_core_edges.groupby('query_label', - {'reference_label': _agg.ARGMIN('rank', 'reference_label')}) - - boundary_assignments = boundary_assignments.join(core_assignments, - on={'reference_label': '__id'}) - - boundary_assignments = boundary_assignments.rename({'query_label': '__id'}, inplace=True) - boundary_assignments = boundary_assignments.remove_column('reference_label', inplace=True) - boundary_assignments['type'] = 'boundary' - + boundary_assignments = boundary_core_edges.groupby( + "query_label", {"reference_label": _agg.ARGMIN("rank", "reference_label")} + ) + + boundary_assignments = boundary_assignments.join( + core_assignments, on={"reference_label": "__id"} + ) + + boundary_assignments = boundary_assignments.rename( + {"query_label": "__id"}, inplace=True + ) + boundary_assignments = boundary_assignments.remove_column( + "reference_label", inplace=True + ) + boundary_assignments["type"] = "boundary" ## Identify boundary candidates that turned out to be in small clusters but # not on real cluster boundaries - small_cluster_idx = set(boundary_idx).difference( - boundary_assignments['__id']) - + small_cluster_idx = set(boundary_idx).difference(boundary_assignments["__id"]) ## Identify individual noise points by the fact that they have no neighbors. noise_idx = set(range(dataset.num_rows())).difference( - neighbor_counts['query_label']) + neighbor_counts["query_label"] + ) noise_idx = noise_idx.union(small_cluster_idx) - noise_assignments = _tc.SFrame({'row_id': _tc.SArray(list(noise_idx), int)}) - noise_assignments['cluster_id'] = None - noise_assignments['cluster_id'] = noise_assignments['cluster_id'].astype(int) - noise_assignments['type'] = 'noise' - + noise_assignments = _tc.SFrame({"row_id": _tc.SArray(list(noise_idx), int)}) + noise_assignments["cluster_id"] = None + noise_assignments["cluster_id"] = noise_assignments["cluster_id"].astype(int) + noise_assignments["type"] = "noise" ## Append core, boundary, and noise results to each other. master_assignments = _tc.SFrame() num_clusters = 0 if core_assignments.num_rows() > 0: - core_assignments = core_assignments.rename({'__id': 'row_id', - '__label': 'cluster_id'}, inplace=True) + core_assignments = core_assignments.rename( + {"__id": "row_id", "__label": "cluster_id"}, inplace=True + ) master_assignments = master_assignments.append(core_assignments) - num_clusters = len(core_assignments['cluster_id'].unique()) + num_clusters = len(core_assignments["cluster_id"].unique()) if boundary_assignments.num_rows() > 0: - boundary_assignments = boundary_assignments.rename({'__id': 'row_id', - '__label': 'cluster_id'}, inplace=True) + boundary_assignments = boundary_assignments.rename( + {"__id": "row_id", "__label": "cluster_id"}, inplace=True + ) master_assignments = master_assignments.append(boundary_assignments) if noise_assignments.num_rows() > 0: master_assignments = master_assignments.append(noise_assignments) - ## Post-processing and formatting - state = {'verbose': verbose, - 'radius': radius, - 'min_core_neighbors': min_core_neighbors, - 'distance': knn_model.distance, - 'num_distance_components': knn_model.num_distance_components, - 'num_examples': dataset.num_rows(), - 'features': knn_model.features, - 'num_features': knn_model.num_features, - 'unpacked_features': knn_model.unpacked_features, - 'num_unpacked_features': knn_model.num_unpacked_features, - 'cluster_id': master_assignments, - 'num_clusters': num_clusters, - 'training_time': _time.time() - start_time} + state = { + "verbose": verbose, + "radius": radius, + "min_core_neighbors": min_core_neighbors, + "distance": knn_model.distance, + "num_distance_components": knn_model.num_distance_components, + "num_examples": dataset.num_rows(), + "features": knn_model.features, + "num_features": knn_model.num_features, + "unpacked_features": knn_model.unpacked_features, + "num_unpacked_features": knn_model.num_unpacked_features, + "cluster_id": master_assignments, + "num_clusters": num_clusters, + "training_time": _time.time() - start_time, + } return DBSCANModel(state) @@ -329,6 +344,7 @@ class DBSCANModel(_CustomModel): :func:`turicreate.clustering.dbscan.create` to create an instance of this model. """ + _PYTHON_DBSCAN_MODEL_VERSION = 1 def __init__(self, state): @@ -380,7 +396,8 @@ def __repr__(self): sections, section_titles = self._get_summary_struct() accessible_fields = { - "cluster_id": "Cluster label for each row in the input dataset."} + "cluster_id": "Cluster label for each row in the input dataset." + } out = _toolkit_repr_print(self, sections, section_titles, width=width) out2 = _summarize_accessible_fields(accessible_fields, width=width) @@ -404,15 +421,17 @@ def _get_summary_struct(self): The order matches that of the 'sections' object. """ model_fields = [ - ('Number of examples', 'num_examples'), - ('Number of feature columns', 'num_features'), - ('Max distance to a neighbor (radius)', 'radius'), - ('Min number of neighbors for core points', 'min_core_neighbors'), - ('Number of distance components', 'num_distance_components')] + ("Number of examples", "num_examples"), + ("Number of feature columns", "num_features"), + ("Max distance to a neighbor (radius)", "radius"), + ("Min number of neighbors for core points", "min_core_neighbors"), + ("Number of distance components", "num_distance_components"), + ] training_fields = [ - ('Total training time (seconds)', 'training_time'), - ('Number of clusters', 'num_clusters')] + ("Total training time (seconds)", "training_time"), + ("Number of clusters", "num_clusters"), + ] section_titles = ["Schema", "Training summary"] - return([model_fields, training_fields], section_titles) + return ([model_fields, training_fields], section_titles) diff --git a/src/python/turicreate/toolkits/clustering/kmeans.py b/src/python/turicreate/toolkits/clustering/kmeans.py index 8b8f38e3a3..c17ca9873e 100644 --- a/src/python/turicreate/toolkits/clustering/kmeans.py +++ b/src/python/turicreate/toolkits/clustering/kmeans.py @@ -22,6 +22,7 @@ class which provides methods for inspecting the returned cluster information. from turicreate.toolkits._main import ToolkitError as _ToolkitError from turicreate._cython.cy_server import QuietProgress as _QuietProgress + def _validate_dataset(dataset): """ Validate the main Kmeans dataset. @@ -51,8 +52,9 @@ def _validate_initial_centers(initial_centers): raise TypeError("Input 'initial_centers' must be an SFrame.") if initial_centers.num_rows() == 0 or initial_centers.num_columns() == 0: - raise ValueError("An 'initial_centers' argument is provided " + - "but has no data.") + raise ValueError( + "An 'initial_centers' argument is provided " + "but has no data." + ) def _validate_num_clusters(num_clusters, initial_centers, num_rows): @@ -87,9 +89,11 @@ def _validate_num_clusters(num_clusters, initial_centers, num_rows): ## Determine the correct number of clusters. if initial_centers is None: if num_clusters is None: - raise ValueError("Number of clusters cannot be determined from " + - "'num_clusters' or 'initial_centers'. You must " + - "specify one of these arguments.") + raise ValueError( + "Number of clusters cannot be determined from " + + "'num_clusters' or 'initial_centers'. You must " + + "specify one of these arguments." + ) else: _num_clusters = num_clusters @@ -100,17 +104,21 @@ def _validate_num_clusters(num_clusters, initial_centers, num_rows): _num_clusters = num_centers else: if num_clusters != num_centers: - raise ValueError("The value of 'num_clusters' does not match " + - "the number of provided initial centers. " + - "Please provide only one of these arguments " + - "or ensure the values match.") + raise ValueError( + "The value of 'num_clusters' does not match " + + "the number of provided initial centers. " + + "Please provide only one of these arguments " + + "or ensure the values match." + ) else: _num_clusters = num_clusters if _num_clusters > num_rows: - raise ValueError("The desired number of clusters exceeds the number " + - "of data points. Please set 'num_clusters' to be " + - "smaller than the number of data points.") + raise ValueError( + "The desired number of clusters exceeds the number " + + "of data points. Please set 'num_clusters' to be " + + "smaller than the number of data points." + ) return _num_clusters @@ -144,8 +152,9 @@ def _validate_features(features, column_type_map, valid_types, label): raise TypeError("Input 'features' must be a list, if specified.") if len(features) == 0: - raise ValueError("If specified, input 'features' must contain " + - "at least one column name.") + raise ValueError( + "If specified, input 'features' must contain " + "at least one column name." + ) ## Remove duplicates num_original_features = len(features) @@ -164,24 +173,32 @@ def _validate_features(features, column_type_map, valid_types, label): for ftr in features: if not isinstance(ftr, str): - _logging.warning("Feature '{}' excluded. ".format(ftr) + - "Features must be specified as strings " + - "corresponding to column names in the input dataset.") + _logging.warning( + "Feature '{}' excluded. ".format(ftr) + + "Features must be specified as strings " + + "corresponding to column names in the input dataset." + ) elif ftr not in column_type_map.keys(): - _logging.warning("Feature '{}' excluded because ".format(ftr) + - "it is not in the input dataset.") + _logging.warning( + "Feature '{}' excluded because ".format(ftr) + + "it is not in the input dataset." + ) elif column_type_map[ftr] not in valid_types: - _logging.warning("Feature '{}' excluded because of its type. ".format(ftr) + - "Kmeans features must be int, float, dict, or array.array type.") + _logging.warning( + "Feature '{}' excluded because of its type. ".format(ftr) + + "Kmeans features must be int, float, dict, or array.array type." + ) else: valid_features.append(ftr) if len(valid_features) == 0: - raise _ToolkitError("All specified features have been excluded. " + - "Please specify valid features.") + raise _ToolkitError( + "All specified features have been excluded. " + + "Please specify valid features." + ) return valid_features @@ -197,6 +214,7 @@ class KmeansModel(_Model): detailed list of parameter options and code samples are available in the documentation for the create function. """ + def __init__(self, model): self.__proxy__ = model self.__name__ = self.__class__._native_name() @@ -205,7 +223,7 @@ def __init__(self, model): def _native_name(cls): return "kmeans" - def predict(self, dataset, output_type='cluster_id', verbose=True): + def predict(self, dataset, output_type="cluster_id", verbose=True): """ Return predicted cluster label for instances in the new 'dataset'. K-means predictions are made by assigning each new instance to the @@ -263,28 +281,32 @@ def predict(self, dataset, output_type='cluster_id', verbose=True): if not isinstance(output_type, str): raise TypeError("The 'output_type' parameter must be a string.") - if not output_type in ('cluster_id', 'distance'): - raise ValueError("The 'output_type' parameter must be either " + - "'cluster_label' or 'distance'.") + if not output_type in ("cluster_id", "distance"): + raise ValueError( + "The 'output_type' parameter must be either " + + "'cluster_label' or 'distance'." + ) ## Get model features. ref_features = self.features sf_features = _tkutl._toolkits_select_columns(dataset, ref_features) ## Compute predictions. - opts = {'model': self.__proxy__, - 'model_name': self.__name__, - 'dataset': sf_features} + opts = { + "model": self.__proxy__, + "model_name": self.__name__, + "dataset": sf_features, + } with _QuietProgress(verbose): result = _tc.extensions._kmeans.predict(opts) - sf_result = result['predictions'] + sf_result = result["predictions"] - if output_type == 'distance': - return sf_result['distance'] + if output_type == "distance": + return sf_result["distance"] else: - return sf_result['cluster_id'] + return sf_result["cluster_id"] def _get(self, field): """ @@ -337,12 +359,10 @@ def _get(self, field): out Value of the requested field """ - opts = {'model': self.__proxy__, - 'model_name': self.__name__, - 'field': field} + opts = {"model": self.__proxy__, "model_name": self.__name__, "field": field} response = _tc.extensions._kmeans.get_value(opts) - return response['value'] + return response["value"] def __str__(self): """ @@ -373,19 +393,21 @@ def _get_summary_struct(self): The order matches that of the 'sections' object. """ model_fields = [ - ('Number of clusters', 'num_clusters'), - ('Number of examples', 'num_examples'), - ('Number of feature columns', 'num_features'), - ('Number of unpacked features', 'num_unpacked_features'), - ('Row label name', 'row_label_name')] + ("Number of clusters", "num_clusters"), + ("Number of examples", "num_examples"), + ("Number of feature columns", "num_features"), + ("Number of unpacked features", "num_unpacked_features"), + ("Row label name", "row_label_name"), + ] training_fields = [ - ('Training method', 'method'), - ('Number of training iterations', 'training_iterations'), - ('Batch size' , 'batch_size'), - ('Total training time (seconds)', 'training_time')] + ("Training method", "method"), + ("Number of training iterations", "training_iterations"), + ("Batch size", "batch_size"), + ("Total training time (seconds)", "training_time"), + ] - section_titles = [ 'Schema', 'Training Summary'] + section_titles = ["Schema", "Training Summary"] return ([model_fields, training_fields], section_titles) @@ -400,16 +422,24 @@ def __repr__(self): (sections, section_titles) = self._get_summary_struct() accessible_fields = { "cluster_id": "An SFrame containing the cluster assignments.", - "cluster_info": "An SFrame containing the cluster centers."} + "cluster_info": "An SFrame containing the cluster centers.", + } - out = _tkutl._toolkit_repr_print(self, sections, section_titles, - width=width) + out = _tkutl._toolkit_repr_print(self, sections, section_titles, width=width) out2 = _summarize_accessible_fields(accessible_fields, width=width) return out + "\n" + out2 -def create(dataset, num_clusters=None, features=None, label=None, - initial_centers=None, max_iterations=10, batch_size=None, - verbose=True): + +def create( + dataset, + num_clusters=None, + features=None, + label=None, + initial_centers=None, + max_iterations=10, + batch_size=None, + verbose=True, +): """ Create a k-means clustering model. The KmeansModel object contains the computed cluster centers and the cluster assignment for each instance in @@ -524,9 +554,10 @@ def create(dataset, num_clusters=None, features=None, label=None, ... >>> model = turicreate.kmeans.create(sf, num_clusters=3) """ - opts = {'model_name': 'kmeans', - 'max_iterations': max_iterations, - } + opts = { + "model_name": "kmeans", + "max_iterations": max_iterations, + } ## Validate the input dataset and initial centers. _validate_dataset(dataset) @@ -535,9 +566,9 @@ def create(dataset, num_clusters=None, features=None, label=None, _validate_initial_centers(initial_centers) ## Validate and determine the correct number of clusters. - opts['num_clusters'] = _validate_num_clusters(num_clusters, - initial_centers, - dataset.num_rows()) + opts["num_clusters"] = _validate_num_clusters( + num_clusters, initial_centers, dataset.num_rows() + ) ## Validate the row label col_type_map = {c: dataset[c].dtype for c in dataset.column_names()} @@ -545,58 +576,62 @@ def create(dataset, num_clusters=None, features=None, label=None, if label is not None: _validate_row_label(label, col_type_map) - if label in ['cluster_id', 'distance']: - raise ValueError("Row label column name cannot be 'cluster_id' " + - "or 'distance'; these are reserved for other " + - "columns in the Kmeans model's output.") + if label in ["cluster_id", "distance"]: + raise ValueError( + "Row label column name cannot be 'cluster_id' " + + "or 'distance'; these are reserved for other " + + "columns in the Kmeans model's output." + ) - opts['row_labels'] = dataset[label] - opts['row_label_name'] = label + opts["row_labels"] = dataset[label] + opts["row_label_name"] = label else: - opts['row_labels'] = _tc.SArray.from_sequence(dataset.num_rows()) - opts['row_label_name'] = 'row_id' - + opts["row_labels"] = _tc.SArray.from_sequence(dataset.num_rows()) + opts["row_label_name"] = "row_id" ## Validate the features relative to the input dataset. if features is None: features = dataset.column_names() - valid_features = _validate_features(features, col_type_map, - valid_types=[_array, dict, int, float], - label=label) + valid_features = _validate_features( + features, col_type_map, valid_types=[_array, dict, int, float], label=label + ) sf_features = dataset.select_columns(valid_features) - opts['features'] = sf_features + opts["features"] = sf_features ## Validate the features in the initial centers (if provided) if initial_centers is not None: try: initial_centers = initial_centers.select_columns(valid_features) except: - raise ValueError("Specified features cannot be extracted from " + - "the provided initial centers.") + raise ValueError( + "Specified features cannot be extracted from " + + "the provided initial centers." + ) if initial_centers.column_types() != sf_features.column_types(): - raise TypeError("Feature types are different in the dataset and " + - "initial centers.") + raise TypeError( + "Feature types are different in the dataset and " + "initial centers." + ) else: initial_centers = _tc.SFrame() - opts['initial_centers'] = initial_centers + opts["initial_centers"] = initial_centers ## Validate the batch size and determine the training method. if batch_size is None: - opts['method'] = 'elkan' - opts['batch_size'] = dataset.num_rows() + opts["method"] = "elkan" + opts["batch_size"] = dataset.num_rows() else: - opts['method'] = 'minibatch' - opts['batch_size'] = batch_size + opts["method"] = "minibatch" + opts["batch_size"] = batch_size ## Create and return the model with _QuietProgress(verbose): params = _tc.extensions._kmeans.train(opts) - return KmeansModel(params['model']) + return KmeansModel(params["model"]) diff --git a/src/python/turicreate/toolkits/distances/__init__.py b/src/python/turicreate/toolkits/distances/__init__.py index 479751fcea..377a4dbed2 100644 --- a/src/python/turicreate/toolkits/distances/__init__.py +++ b/src/python/turicreate/toolkits/distances/__init__.py @@ -121,10 +121,17 @@ from __future__ import division as _ from __future__ import absolute_import as _ -__all__ = ['_distances', '_util'] +__all__ = ["_distances", "_util"] from ._distances import euclidean, squared_euclidean, manhattan -from ._distances import cosine, dot_product, transformed_dot_product, jaccard, weighted_jaccard, gaussian_kernel +from ._distances import ( + cosine, + dot_product, + transformed_dot_product, + jaccard, + weighted_jaccard, + gaussian_kernel, +) from ._distances import levenshtein from . import _util diff --git a/src/python/turicreate/toolkits/distances/_distances.py b/src/python/turicreate/toolkits/distances/_distances.py index c603ae3845..aced7b44a8 100644 --- a/src/python/turicreate/toolkits/distances/_distances.py +++ b/src/python/turicreate/toolkits/distances/_distances.py @@ -17,6 +17,7 @@ ### Standard distance functions ### ### --------------------------- ### + def euclidean(x, y): """ Compute the Euclidean distance between two dictionaries or two lists @@ -58,6 +59,7 @@ def euclidean(x, y): """ return _tc.extensions._distances.euclidean(x, y) + def gaussian_kernel(x, y): """ Compute a Gaussian-type distance between two dictionaries or two lists @@ -99,6 +101,7 @@ def gaussian_kernel(x, y): """ return _tc.extensions._distances.gaussian_kernel(x, y) + def squared_euclidean(x, y): """ Compute the squared Euclidean distance between two dictionaries or @@ -145,6 +148,7 @@ def squared_euclidean(x, y): """ return _tc.extensions._distances.squared_euclidean(x, y) + def manhattan(x, y): """ Compute the Manhattan distance between between two dictionaries or @@ -189,6 +193,7 @@ def manhattan(x, y): """ return _tc.extensions._distances.manhattan(x, y) + def cosine(x, y): """ Compute the cosine distance between between two dictionaries or two @@ -236,6 +241,7 @@ def cosine(x, y): """ return _tc.extensions._distances.cosine(x, y) + def levenshtein(x, y): """ Compute the Levenshtein distance between between strings. The @@ -277,6 +283,7 @@ def levenshtein(x, y): """ return _tc.extensions._distances.levenshtein(x, y) + def dot_product(x, y): """ Compute the dot_product between two dictionaries or two lists of @@ -322,6 +329,7 @@ def dot_product(x, y): """ return _tc.extensions._distances.dot_product(x, y) + def transformed_dot_product(x, y): """ Compute the "transformed_dot_product" distance between two dictionaries or @@ -370,6 +378,7 @@ def transformed_dot_product(x, y): """ return _tc.extensions._distances.transformed_dot_product(x, y) + def jaccard(x, y): """ Compute the Jaccard distance between between two dictionaries. @@ -408,6 +417,7 @@ def jaccard(x, y): """ return _tc.extensions._distances.jaccard(x, y) + def weighted_jaccard(x, y): """ Compute the weighted Jaccard distance between between two diff --git a/src/python/turicreate/toolkits/distances/_util.py b/src/python/turicreate/toolkits/distances/_util.py index 2c421bb3e9..e39bde96f8 100644 --- a/src/python/turicreate/toolkits/distances/_util.py +++ b/src/python/turicreate/toolkits/distances/_util.py @@ -18,9 +18,11 @@ import turicreate as _tc import sys as _sys + if _sys.version_info.major == 3: from functools import reduce + def compute_composite_distance(distance, x, y): """ Compute the value of a composite distance function on two dictionaries, @@ -69,20 +71,24 @@ def compute_composite_distance(distance, x, y): distance = _convert_distance_names_to_functions(distance) if not isinstance(x, dict) or not isinstance(y, dict): - raise TypeError("Inputs 'x' and 'y' must be in dictionary form. " + - "Selecting individual rows of an SFrame yields the " + - "correct format.") + raise TypeError( + "Inputs 'x' and 'y' must be in dictionary form. " + + "Selecting individual rows of an SFrame yields the " + + "correct format." + ) - ans = 0. + ans = 0.0 for d in distance: ftrs, dist, weight = d ## Special check for multiple columns with levenshtein distance. if dist == _tc.distances.levenshtein and len(ftrs) > 1: - raise ValueError("levenshtein distance cannot be used with multiple" + - "columns. Please concatenate strings into a single " + - "column before computing the distance.") + raise ValueError( + "levenshtein distance cannot be used with multiple" + + "columns. Please concatenate strings into a single " + + "column before computing the distance." + ) ## Extract values for specified features. a = {} @@ -90,7 +96,9 @@ def compute_composite_distance(distance, x, y): for ftr in ftrs: if type(x[ftr]) != type(y[ftr]): - if not isinstance(x[ftr], (int, float)) or not isinstance(y[ftr], (int, float)): + if not isinstance(x[ftr], (int, float)) or not isinstance( + y[ftr], (int, float) + ): raise ValueError("Input data has different types.") if isinstance(x[ftr], (int, float, str)): @@ -99,10 +107,10 @@ def compute_composite_distance(distance, x, y): elif isinstance(x[ftr], dict): for key, val in _six.iteritems(x[ftr]): - a['{}.{}'.format(ftr, key)] = val + a["{}.{}".format(ftr, key)] = val for key, val in _six.iteritems(y[ftr]): - b['{}.{}'.format(ftr, key)] = val + b["{}.{}".format(ftr, key)] = val elif isinstance(x[ftr], (list, _array.array)): for i, val in enumerate(x[ftr]): @@ -114,7 +122,6 @@ def compute_composite_distance(distance, x, y): else: raise TypeError("Type of feature '{}' not understood.".format(ftr)) - ## Pull out the raw values for levenshtein if dist == _tc.distances.levenshtein: a = list(a.values())[0] @@ -136,10 +143,12 @@ def _validate_composite_distance(distance): raise TypeError("Input 'distance' must be a composite distance.") if len(distance) < 1: - raise ValueError("Composite distances must have a least one distance " - "component, consisting of a list of feature names, " - "a distance function (string or function handle), " - "and a weight.") + raise ValueError( + "Composite distances must have a least one distance " + "component, consisting of a list of feature names, " + "a distance function (string or function handle), " + "and a weight." + ) for d in distance: @@ -147,15 +156,19 @@ def _validate_composite_distance(distance): try: ftrs, dist, weight = d except: - raise TypeError("Elements of a composite distance function must " + - "have three items: a set of feature names (tuple or list), " + - "a distance function (string or function handle), " + - "and a weight.") + raise TypeError( + "Elements of a composite distance function must " + + "have three items: a set of feature names (tuple or list), " + + "a distance function (string or function handle), " + + "and a weight." + ) ## Validate feature names if len(ftrs) == 0: - raise ValueError("An empty list of features cannot be passed " +\ - "as part of a composite distance function.") + raise ValueError( + "An empty list of features cannot be passed " + + "as part of a composite distance function." + ) if not isinstance(ftrs, (list, tuple)): raise TypeError("Feature names must be specified in a list or tuple.") @@ -163,11 +176,12 @@ def _validate_composite_distance(distance): if not all([isinstance(x, str) for x in ftrs]): raise TypeError("Feature lists must contain only strings.") - ## Validate standard distance function - if not isinstance(dist, str) and not hasattr(dist, '__call__'): - raise ValueError("Standard distances must be the name of a distance " + - "function (string) or a distance function handle") + if not isinstance(dist, str) and not hasattr(dist, "__call__"): + raise ValueError( + "Standard distances must be the name of a distance " + + "function (string) or a distance function handle" + ) if isinstance(dist, str): try: @@ -175,16 +189,18 @@ def _validate_composite_distance(distance): except: raise ValueError("Distance '{}' not recognized".format(dist)) - ## Validate weight if not isinstance(weight, (int, float)): raise ValueError( - "The weight of each distance component must be a single " +\ - "integer or a float value.") + "The weight of each distance component must be a single " + + "integer or a float value." + ) if weight < 0: - raise ValueError("The weight on each distance component must be " + - "greater than or equal to zero.") + raise ValueError( + "The weight on each distance component must be " + + "greater than or equal to zero." + ) def _scrub_composite_distance_features(distance, feature_blacklist): @@ -229,8 +245,9 @@ def _get_composite_distance_features(distance): return list(set(reduce(iadd, [x[0] for x in distance], []))) -def build_address_distance(number=None, street=None, city=None, state=None, - zip_code=None): +def build_address_distance( + number=None, street=None, city=None, state=None, zip_code=None +): """ Construct a composite distance appropriate for matching address data. NOTE: this utility function does not guarantee that the output composite distance @@ -270,8 +287,10 @@ def build_address_distance(number=None, street=None, city=None, state=None, ## Validate inputs for param in [number, street, city, state, zip_code]: if param is not None and not isinstance(param, str): - raise TypeError("All inputs must be strings. Each parameter is " + - "intended to be the name of an SFrame column.") + raise TypeError( + "All inputs must be strings. Each parameter is " + + "intended to be the name of an SFrame column." + ) ## Figure out features for levenshtein distance. string_features = [] @@ -282,20 +301,19 @@ def build_address_distance(number=None, street=None, city=None, state=None, if zip_code: string_features.append(zip_code) - ## Compile the distance components. dist = [] if number: - dist.append([[number], 'jaccard', 1]) + dist.append([[number], "jaccard", 1]) if street: - dist.append([[street], 'jaccard', 5]) + dist.append([[street], "jaccard", 5]) if state: - dist.append([[state], 'jaccard', 5]) + dist.append([[state], "jaccard", 5]) if len(string_features) > 0: - dist.append([string_features, 'levenshtein', 1]) + dist.append([string_features, "levenshtein", 1]) return dist diff --git a/src/python/turicreate/toolkits/drawing_classifier/__init__.py b/src/python/turicreate/toolkits/drawing_classifier/__init__.py index 84cfdbecf0..01c57405f9 100644 --- a/src/python/turicreate/toolkits/drawing_classifier/__init__.py +++ b/src/python/turicreate/toolkits/drawing_classifier/__init__.py @@ -10,4 +10,4 @@ from ..image_classifier._annotate import annotate, recover_annotation from . import util -__all__ = ['create', 'DrawingClassifier', 'util', 'annotate', 'recover_annotation'] +__all__ = ["create", "DrawingClassifier", "util", "annotate", "recover_annotation"] diff --git a/src/python/turicreate/toolkits/drawing_classifier/_tf_drawing_classifier.py b/src/python/turicreate/toolkits/drawing_classifier/_tf_drawing_classifier.py index 07aaa44ba2..f4b1f37a59 100644 --- a/src/python/turicreate/toolkits/drawing_classifier/_tf_drawing_classifier.py +++ b/src/python/turicreate/toolkits/drawing_classifier/_tf_drawing_classifier.py @@ -30,7 +30,6 @@ def __init__(self, net_params, batch_size, num_classes): net_params[key] ) - self.dc_graph = _tf.Graph() self.num_classes = num_classes self.batch_size = batch_size diff --git a/src/python/turicreate/toolkits/drawing_classifier/drawing_classifier.py b/src/python/turicreate/toolkits/drawing_classifier/drawing_classifier.py index d8eec597ad..5f6c38f572 100644 --- a/src/python/turicreate/toolkits/drawing_classifier/drawing_classifier.py +++ b/src/python/turicreate/toolkits/drawing_classifier/drawing_classifier.py @@ -24,36 +24,52 @@ BITMAP_WIDTH = 28 BITMAP_HEIGHT = 28 -TRAIN_VALIDATION_SPLIT = .95 +TRAIN_VALIDATION_SPLIT = 0.95 -def _raise_error_if_not_drawing_classifier_input_sframe( - dataset, feature, target): + +def _raise_error_if_not_drawing_classifier_input_sframe(dataset, feature, target): """ Performs some sanity checks on the SFrame provided as input to `turicreate.drawing_classifier.create` and raises a ToolkitError if something in the dataset is missing or wrong. """ from turicreate.toolkits._internal_utils import _raise_error_if_not_sframe + _raise_error_if_not_sframe(dataset) if feature not in dataset.column_names(): raise _ToolkitError("Feature column '%s' does not exist" % feature) if target not in dataset.column_names(): raise _ToolkitError("Target column '%s' does not exist" % target) - if (dataset[feature].dtype != _tc.Image and dataset[feature].dtype != list): - raise _ToolkitError("Feature column must contain images" + if dataset[feature].dtype != _tc.Image and dataset[feature].dtype != list: + raise _ToolkitError( + "Feature column must contain images" + " or stroke-based drawings encoded as lists of strokes" + " where each stroke is a list of points and" - + " each point is stored as a dictionary") + + " each point is stored as a dictionary" + ) if dataset[target].dtype != int and dataset[target].dtype != str: - raise _ToolkitError("Target column contains " + str(dataset[target].dtype) + raise _ToolkitError( + "Target column contains " + + str(dataset[target].dtype) + " but it must contain strings or integers to represent" - + " labels for drawings.") + + " labels for drawings." + ) if len(dataset) == 0: raise _ToolkitError("Input Dataset is empty!") -def create(input_dataset, target, feature=None, validation_set='auto', - warm_start='auto', batch_size=256, - max_iterations=500, verbose=True, random_seed=None, **kwargs): + +def create( + input_dataset, + target, + feature=None, + validation_set="auto", + warm_start="auto", + batch_size=256, + max_iterations=500, + verbose=True, + random_seed=None, + **kwargs +): """ Create a :class:`DrawingClassifier` model. @@ -137,25 +153,31 @@ def create(input_dataset, target, feature=None, validation_set='auto', accepted_values_for_warm_start = ["auto", "quickdraw_245_v0", None] if warm_start is not None: if type(warm_start) is not str: - raise TypeError("'warm_start' must be a string or None. " + raise TypeError( + "'warm_start' must be a string or None. " + "'warm_start' can take in the following values: " - + str(accepted_values_for_warm_start)) + + str(accepted_values_for_warm_start) + ) if warm_start not in accepted_values_for_warm_start: - raise _ToolkitError("Unrecognized value for 'warm_start': " - + warm_start + ". 'warm_start' can take in the following " - + "values: " + str(accepted_values_for_warm_start)) + raise _ToolkitError( + "Unrecognized value for 'warm_start': " + + warm_start + + ". 'warm_start' can take in the following " + + "values: " + + str(accepted_values_for_warm_start) + ) # Replace 'auto' with name of current default Warm Start model. warm_start = warm_start.replace("auto", "quickdraw_245_v0") - if '_advanced_parameters' in kwargs: + if "_advanced_parameters" in kwargs: # Make sure no additional parameters are provided - new_keys = set(kwargs['_advanced_parameters'].keys()) + new_keys = set(kwargs["_advanced_parameters"].keys()) set_keys = set(params.keys()) unsupported = new_keys - set_keys if unsupported: - raise _ToolkitError('Unknown advanced parameters: {}'.format(unsupported)) + raise _ToolkitError("Unknown advanced parameters: {}".format(unsupported)) - params.update(kwargs['_advanced_parameters']) + params.update(kwargs["_advanced_parameters"]) # @TODO: Should be able to automatically choose number of iterations # based on data size: Tracked in Github Issue #1576 @@ -166,8 +188,7 @@ def create(input_dataset, target, feature=None, validation_set='auto', if feature is None: feature = _tkutl._find_only_drawing_column(input_dataset) - _raise_error_if_not_drawing_classifier_input_sframe( - input_dataset, feature, target) + _raise_error_if_not_drawing_classifier_input_sframe(input_dataset, feature, target) if batch_size is not None and not isinstance(batch_size, int): raise TypeError("'batch_size' must be an integer >= 1") @@ -179,6 +200,7 @@ def create(input_dataset, target, feature=None, validation_set='auto', raise ValueError("'max_iterations' must be >= 1") import turicreate.toolkits.libtctensorflow + model = _tc.extensions.drawing_classifier() options = dict() options["batch_size"] = batch_size @@ -192,11 +214,12 @@ def create(input_dataset, target, feature=None, validation_set='auto', pretrained_mlmodel = _pre_trained_models.DrawingClassifierPreTrainedMLModel() options["mlmodel_path"] = pretrained_mlmodel.get_model_path() if random_seed is not None: - options['random_seed'] = random_seed + options["random_seed"] = random_seed options["warm_start"] = "" if warm_start is None else warm_start model.train(input_dataset, target, feature, validation_set, options) return DrawingClassifier(model_proxy=model, name="drawing_classifier") + class DrawingClassifier(_Model): """ A trained model using C++ implementation that is ready to use for classification or export to @@ -204,6 +227,7 @@ class DrawingClassifier(_Model): This model should not be constructed directly. """ + _CPP_DRAWING_CLASSIFIER_VERSION = 1 def __init__(self, model_proxy=None, name=None): @@ -239,8 +263,7 @@ def __repr__(self): width = 40 sections, section_titles = self._get_summary_struct() - out = _tkutl._toolkit_repr_print(self, sections, section_titles, - width=width) + out = _tkutl._toolkit_repr_print(self, sections, section_titles, width=width) return out def _get_version(self): @@ -260,11 +283,14 @@ def export_coreml(self, filename): >>> model.export_coreml("MyModel.mlmodel") """ additional_user_defined_metadata = _coreml_utils._get_tc_version_info() - short_description = _coreml_utils._mlmodel_short_description('Drawing Classifier') - self.__proxy__.export_to_coreml(filename, short_description, - additional_user_defined_metadata) + short_description = _coreml_utils._mlmodel_short_description( + "Drawing Classifier" + ) + self.__proxy__.export_to_coreml( + filename, short_description, additional_user_defined_metadata + ) - def predict(self, dataset, output_type='class'): + def predict(self, dataset, output_type="class"): """ Predict on an SFrame or SArray of drawings, or on a single drawing. @@ -329,7 +355,7 @@ class as a vector. Label ordering is dictated by the ``classes`` dataset = _tc.SFrame({self.feature: dataset}) return self.__proxy__.predict(dataset, output_type) - def predict_topk(self, dataset, output_type='probability', k=3): + def predict_topk(self, dataset, output_type="probability", k=3): """ Return top-k predictions for the ``dataset``, using the trained model. Predictions are returned as an SFrame with three columns: `id`, @@ -393,7 +419,7 @@ def predict_topk(self, dataset, output_type='probability', k=3): dataset = _tc.SFrame({self.feature: dataset}) return self.__proxy__.predict_topk(dataset, output_type, k) - def evaluate(self, dataset, metric='auto'): + def evaluate(self, dataset, metric="auto"): """ Evaluate the model by making predictions of target values and comparing these to actual values. @@ -447,62 +473,110 @@ def evaluate(self, dataset, metric='auto'): del evaluation_result["prediction_class"] del evaluation_result["prediction_prob"] - predicted = _tc.SFrame({"label": class_label, "probability": probability_vector}) + predicted = _tc.SFrame( + {"label": class_label, "probability": probability_vector} + ) labels = self.classes - from .._evaluate_utils import ( + from .._evaluate_utils import ( entropy, confidence, relative_confidence, get_confusion_matrix, hclusterSort, - l2Dist + l2Dist, ) - evaluation_result['num_test_examples'] = len(dataset) - for k in ['num_classes', 'num_examples', 'training_time', 'max_iterations']: + evaluation_result["num_test_examples"] = len(dataset) + for k in ["num_classes", "num_examples", "training_time", "max_iterations"]: evaluation_result[k] = getattr(self, k) - #evaluation_result['input_image_shape'] = getattr(self, 'input_image_shape') + # evaluation_result['input_image_shape'] = getattr(self, 'input_image_shape') evaluation_result["model_name"] = "Drawing Classifier" - extended_test = dataset.add_column(predicted["probability"], 'probs') - extended_test['label'] = dataset[self.target] - - extended_test = extended_test.add_columns( [extended_test.apply(lambda d: labels[d['probs'].index(confidence(d['probs']))]), - extended_test.apply(lambda d: entropy(d['probs'])), - extended_test.apply(lambda d: confidence(d['probs'])), - extended_test.apply(lambda d: relative_confidence(d['probs']))], - ['predicted_label', 'entropy', 'confidence', 'relative_confidence']) + extended_test = dataset.add_column(predicted["probability"], "probs") + extended_test["label"] = dataset[self.target] + + extended_test = extended_test.add_columns( + [ + extended_test.apply( + lambda d: labels[d["probs"].index(confidence(d["probs"]))] + ), + extended_test.apply(lambda d: entropy(d["probs"])), + extended_test.apply(lambda d: confidence(d["probs"])), + extended_test.apply(lambda d: relative_confidence(d["probs"])), + ], + ["predicted_label", "entropy", "confidence", "relative_confidence"], + ) - extended_test = extended_test.add_column(extended_test.apply(lambda d: d['label'] == d['predicted_label']), 'correct') + extended_test = extended_test.add_column( + extended_test.apply(lambda d: d["label"] == d["predicted_label"]), "correct" + ) sf_conf_mat = get_confusion_matrix(extended_test, labels) confidence_threshold = 0.5 hesitant_threshold = 0.2 - evaluation_result['confidence_threshold'] = confidence_threshold - evaluation_result['hesitant_threshold'] = hesitant_threshold - evaluation_result['confidence_metric_for_threshold'] = 'relative_confidence' - - evaluation_result['conf_mat'] = list(sf_conf_mat) - - vectors = map(lambda l: {'name': l, 'pos':list(sf_conf_mat[sf_conf_mat['target_label']==l].sort('predicted_label')['norm_prob'])}, - labels) - evaluation_result['sorted_labels'] = hclusterSort(vectors, l2Dist)[0]['name'].split("|") - - per_l = extended_test.groupby(['label'], {'count': _tc.aggregate.COUNT, 'correct_count': _tc.aggregate.SUM('correct') }) - per_l['recall'] = per_l.apply(lambda l: l['correct_count']*1.0 / l['count']) - - per_pl = extended_test.groupby(['predicted_label'], {'predicted_count': _tc.aggregate.COUNT, 'correct_count': _tc.aggregate.SUM('correct') }) - per_pl['precision'] = per_pl.apply(lambda l: l['correct_count']*1.0 / l['predicted_count']) - per_pl = per_pl.rename({'predicted_label': 'label'}) - evaluation_result['label_metrics'] = list(per_l.join(per_pl, on='label', how='outer').select_columns(['label', 'count', 'correct_count', 'predicted_count', 'recall', 'precision'])) - evaluation_result['labels'] = labels + evaluation_result["confidence_threshold"] = confidence_threshold + evaluation_result["hesitant_threshold"] = hesitant_threshold + evaluation_result["confidence_metric_for_threshold"] = "relative_confidence" + + evaluation_result["conf_mat"] = list(sf_conf_mat) + + vectors = map( + lambda l: { + "name": l, + "pos": list( + sf_conf_mat[sf_conf_mat["target_label"] == l].sort( + "predicted_label" + )["norm_prob"] + ), + }, + labels, + ) + evaluation_result["sorted_labels"] = hclusterSort(vectors, l2Dist)[0][ + "name" + ].split("|") + + per_l = extended_test.groupby( + ["label"], + { + "count": _tc.aggregate.COUNT, + "correct_count": _tc.aggregate.SUM("correct"), + }, + ) + per_l["recall"] = per_l.apply(lambda l: l["correct_count"] * 1.0 / l["count"]) + + per_pl = extended_test.groupby( + ["predicted_label"], + { + "predicted_count": _tc.aggregate.COUNT, + "correct_count": _tc.aggregate.SUM("correct"), + }, + ) + per_pl["precision"] = per_pl.apply( + lambda l: l["correct_count"] * 1.0 / l["predicted_count"] + ) + per_pl = per_pl.rename({"predicted_label": "label"}) + evaluation_result["label_metrics"] = list( + per_l.join(per_pl, on="label", how="outer").select_columns( + [ + "label", + "count", + "correct_count", + "predicted_count", + "recall", + "precision", + ] + ) + ) + evaluation_result["labels"] = labels - extended_test = extended_test.add_row_number('__idx').rename({'label': 'target_label'}) + extended_test = extended_test.add_row_number("__idx").rename( + {"label": "target_label"} + ) - evaluation_result['test_data'] = extended_test - evaluation_result['feature'] = self.feature + evaluation_result["test_data"] = extended_test + evaluation_result["feature"] = self.feature return _Evaluation(evaluation_result) @@ -524,17 +598,17 @@ def _get_summary_struct(self): The order matches that of the 'sections' object. """ model_fields = [ - ('Number of classes', 'num_classes'), - ('Feature column', 'feature'), - ('Target column', 'target') + ("Number of classes", "num_classes"), + ("Feature column", "feature"), + ("Target column", "target"), ] training_fields = [ - ('Training Iterations', 'max_iterations'), - ('Training Accuracy', 'training_accuracy'), - ('Validation Accuracy', 'validation_accuracy'), - ('Training Time', 'training_time'), - ('Number of Examples', 'num_examples') + ("Training Iterations", "max_iterations"), + ("Training Accuracy", "training_accuracy"), + ("Validation Accuracy", "validation_accuracy"), + ("Training Time", "training_time"), + ("Number of Examples", "num_examples"), ] - section_titles = ['Schema', 'Training summary'] - return([model_fields, training_fields], section_titles) + section_titles = ["Schema", "Training summary"] + return ([model_fields, training_fields], section_titles) diff --git a/src/python/turicreate/toolkits/drawing_classifier/util/__init__.py b/src/python/turicreate/toolkits/drawing_classifier/util/__init__.py index 5ea6efdf02..ae8e2d9435 100644 --- a/src/python/turicreate/toolkits/drawing_classifier/util/__init__.py +++ b/src/python/turicreate/toolkits/drawing_classifier/util/__init__.py @@ -8,4 +8,4 @@ from __future__ import absolute_import as _ from ._visualization import draw_strokes -__all__ = ['draw_strokes'] +__all__ = ["draw_strokes"] diff --git a/src/python/turicreate/toolkits/drawing_classifier/util/_visualization.py b/src/python/turicreate/toolkits/drawing_classifier/util/_visualization.py index 3e2a4104e9..d965e147dd 100644 --- a/src/python/turicreate/toolkits/drawing_classifier/util/_visualization.py +++ b/src/python/turicreate/toolkits/drawing_classifier/util/_visualization.py @@ -7,6 +7,7 @@ from turicreate import extensions as _extensions from turicreate.toolkits._main import ToolkitError as _ToolkitError + def draw_strokes(stroke_based_drawings): """ Visualizes drawings (ground truth or predictions) by @@ -32,23 +33,29 @@ def draw_strokes(stroke_based_drawings): """ single_input = False - if (not isinstance(stroke_based_drawings, _tc.SArray) - and not isinstance(stroke_based_drawings, list)): - raise _ToolkitError("Input to draw_strokes must be of type " - + "turicreate.SArray or list (for a single stroke-based drawing)") - if (isinstance(stroke_based_drawings, _tc.SArray) - and stroke_based_drawings.dtype != list): - raise _ToolkitError("SArray input to draw_strokes must have dtype " + if not isinstance(stroke_based_drawings, _tc.SArray) and not isinstance( + stroke_based_drawings, list + ): + raise _ToolkitError( + "Input to draw_strokes must be of type " + + "turicreate.SArray or list (for a single stroke-based drawing)" + ) + if ( + isinstance(stroke_based_drawings, _tc.SArray) + and stroke_based_drawings.dtype != list + ): + raise _ToolkitError( + "SArray input to draw_strokes must have dtype " + "list. Each element in the SArray should be a list of strokes, " + "where each stroke is a list of points, " + "and each point is represented as a dictionary " - + "with two keys, \"x\" and \"y\".") + + 'with two keys, "x" and "y".' + ) if isinstance(stroke_based_drawings, list): single_input = True stroke_based_drawings = _tc.SArray([stroke_based_drawings]) sf = _tc.SFrame({"drawings": stroke_based_drawings}) - sf_with_drawings = _extensions._drawing_classifier_prepare_data( - sf, "drawings") + sf_with_drawings = _extensions._drawing_classifier_prepare_data(sf, "drawings") if single_input: return sf_with_drawings["drawings"][0] return sf_with_drawings["drawings"] diff --git a/src/python/turicreate/toolkits/evaluation.py b/src/python/turicreate/toolkits/evaluation.py index 3c55f155d0..531e120b78 100644 --- a/src/python/turicreate/toolkits/evaluation.py +++ b/src/python/turicreate/toolkits/evaluation.py @@ -29,10 +29,13 @@ from __future__ import absolute_import as _ import turicreate as _turicreate -from turicreate.toolkits._internal_utils import _raise_error_if_not_sarray,\ - _check_categorical_option_type +from turicreate.toolkits._internal_utils import ( + _raise_error_if_not_sarray, + _check_categorical_option_type, +) from turicreate.toolkits._main import ToolkitError as _ToolkitError + def _check_prob_and_prob_vector(predictions): """ Check that the predictionsa are either probabilities of prob-vectors. @@ -41,12 +44,14 @@ def _check_prob_and_prob_vector(predictions): ptype = predictions.dtype import array + if ptype not in [float, numpy.ndarray, array.array, int]: - err_msg = "Input `predictions` must be of numeric type (for binary " + err_msg = "Input `predictions` must be of numeric type (for binary " err_msg += "classification) or array (of probability vectors) for " err_msg += "multiclass classification." raise TypeError(err_msg) + def _supervised_evaluation_error_checking(targets, predictions): """ Perform basic error checking for the evaluation metrics. Check @@ -54,35 +59,46 @@ def _supervised_evaluation_error_checking(targets, predictions): """ _raise_error_if_not_sarray(targets, "targets") _raise_error_if_not_sarray(predictions, "predictions") - if (len(targets) != len(predictions)): + if len(targets) != len(predictions): raise _ToolkitError( - "Input SArrays 'targets' and 'predictions' must be of the same length.") + "Input SArrays 'targets' and 'predictions' must be of the same length." + ) + # The ignore_float_check is because of [None, None, None] being cast as float :( -def _check_same_type_not_float(targets, predictions, ignore_float_check = False): +def _check_same_type_not_float(targets, predictions, ignore_float_check=False): if not ignore_float_check: if targets.dtype == float: raise TypeError("Input `targets` cannot be an SArray of type float.") if predictions.dtype == float: raise TypeError("Input `predictions` cannot be an SArray of type float.") if targets.dtype != predictions.dtype: - raise TypeError("Inputs SArrays `targets` and `predictions` must be of the same type.") + raise TypeError( + "Inputs SArrays `targets` and `predictions` must be of the same type." + ) + def _check_target_not_float(targets): if targets.dtype == float: raise TypeError("Input `targets` cannot be an SArray of type float.") + def _check_index_map(index_map): if index_map is None: return if not isinstance(index_map, dict): - raise TypeError("Input `index_map` must be a dict mapping target label to prediction-vector index.") + raise TypeError( + "Input `index_map` must be a dict mapping target label to prediction-vector index." + ) - indices = [v for k,v in index_map.items()] + indices = [v for k, v in index_map.items()] indices.sort() if indices != list(range(len(index_map))): - raise _ToolkitError("Invalid index_map: each target label must map to a distinct index into the prediction vector.") + raise _ToolkitError( + "Invalid index_map: each target label must map to a distinct index into the prediction vector." + ) + def log_loss(targets, predictions, index_map=None): r""" @@ -234,14 +250,16 @@ class as sorted alphanumerically. Hence, for the probability vector [0.1, opts = {} if index_map is not None: - opts['index_map'] = index_map + opts["index_map"] = index_map if multiclass: - result = _turicreate.extensions._supervised_streaming_evaluator(targets, - predictions, "multiclass_logloss", opts) + result = _turicreate.extensions._supervised_streaming_evaluator( + targets, predictions, "multiclass_logloss", opts + ) else: - result = _turicreate.extensions._supervised_streaming_evaluator(targets, - predictions, "binary_logloss", opts) + result = _turicreate.extensions._supervised_streaming_evaluator( + targets, predictions, "binary_logloss", opts + ) return result @@ -284,8 +302,10 @@ def max_error(targets, predictions): """ _supervised_evaluation_error_checking(targets, predictions) - return _turicreate.extensions._supervised_streaming_evaluator(targets, - predictions, "max_error", {}) + return _turicreate.extensions._supervised_streaming_evaluator( + targets, predictions, "max_error", {} + ) + def rmse(targets, predictions): r""" @@ -332,8 +352,11 @@ def rmse(targets, predictions): """ _supervised_evaluation_error_checking(targets, predictions) - return _turicreate.extensions._supervised_streaming_evaluator(targets, - predictions, "rmse", {}) + return _turicreate.extensions._supervised_streaming_evaluator( + targets, predictions, "rmse", {} + ) + + def confusion_matrix(targets, predictions): r""" Compute the confusion matrix for classifier predictions. @@ -368,10 +391,12 @@ def confusion_matrix(targets, predictions): _supervised_evaluation_error_checking(targets, predictions) _check_same_type_not_float(targets, predictions) - return _turicreate.extensions._supervised_streaming_evaluator(targets, - predictions, "confusion_matrix_no_map", {}) + return _turicreate.extensions._supervised_streaming_evaluator( + targets, predictions, "confusion_matrix_no_map", {} + ) + -def accuracy(targets, predictions, average='micro'): +def accuracy(targets, predictions, average="micro"): r""" Compute the accuracy score; which measures the fraction of predictions made by the classifier that are exactly correct. The score lies in the range [0,1] @@ -467,11 +492,12 @@ class label and the value is the score for the corresponding class _supervised_evaluation_error_checking(targets, predictions) _check_same_type_not_float(targets, predictions) opts = {"average": average} - return _turicreate.extensions._supervised_streaming_evaluator(targets, - predictions, "flexible_accuracy", opts) + return _turicreate.extensions._supervised_streaming_evaluator( + targets, predictions, "flexible_accuracy", opts + ) -def fbeta_score(targets, predictions, beta=1.0, average='macro'): +def fbeta_score(targets, predictions, beta=1.0, average="macro"): r""" Compute the F-beta score. The F-beta score is the weighted harmonic mean of precision and recall. The score lies in the range [0,1] with 1 being ideal @@ -595,16 +621,16 @@ class label and the value is the score for the corresponding class """ _supervised_evaluation_error_checking(targets, predictions) - _check_categorical_option_type('average', average, - ['micro', 'macro', None]) + _check_categorical_option_type("average", average, ["micro", "macro", None]) _check_same_type_not_float(targets, predictions) - opts = {"beta" : beta, - "average" : average} - return _turicreate.extensions._supervised_streaming_evaluator(targets, - predictions, "fbeta_score", opts) + opts = {"beta": beta, "average": average} + return _turicreate.extensions._supervised_streaming_evaluator( + targets, predictions, "fbeta_score", opts + ) -def f1_score(targets, predictions, average='macro'): + +def f1_score(targets, predictions, average="macro"): r""" Compute the F1 score (sometimes known as the balanced F-score or F-measure). The F1 score is commonly interpreted as the average of @@ -720,9 +746,10 @@ class label and the value is the score for the corresponding class Management 45.4 (2009): 427-437. """ - return fbeta_score(targets, predictions, beta = 1.0, average = average) + return fbeta_score(targets, predictions, beta=1.0, average=average) + -def precision(targets, predictions, average='macro'): +def precision(targets, predictions, average="macro"): r""" Compute the precision score for classification tasks. The precision score @@ -831,15 +858,15 @@ class label and the value is the score for the corresponding class {0: 0.0, 1: 0.25, 2: 1.0, 3: 0.0} """ _supervised_evaluation_error_checking(targets, predictions) - _check_categorical_option_type('average', average, - ['micro', 'macro', None]) + _check_categorical_option_type("average", average, ["micro", "macro", None]) _check_same_type_not_float(targets, predictions) opts = {"average": average} - return _turicreate.extensions._supervised_streaming_evaluator(targets, - predictions, "precision", opts) + return _turicreate.extensions._supervised_streaming_evaluator( + targets, predictions, "precision", opts + ) -def recall(targets, predictions, average='macro'): +def recall(targets, predictions, average="macro"): r""" Compute the recall score for classification tasks. The recall score quantifies the ability of a classifier to predict `positive` examples. @@ -947,12 +974,13 @@ class label and the value is the score for the corresponding class {0: 0.0, 1: 0.5, 2: 1.0, 3: 0.0} """ _supervised_evaluation_error_checking(targets, predictions) - _check_categorical_option_type('average', average, - ['micro', 'macro', None]) + _check_categorical_option_type("average", average, ["micro", "macro", None]) _check_same_type_not_float(targets, predictions) opts = {"average": average} - return _turicreate.extensions._supervised_streaming_evaluator(targets, - predictions, "recall", opts) + return _turicreate.extensions._supervised_streaming_evaluator( + targets, predictions, "recall", opts + ) + def roc_curve(targets, predictions, average=None, index_map=None): r""" @@ -1134,20 +1162,21 @@ def roc_curve(targets, predictions, average=None, index_map=None): [300003 rows x 6 columns] """ _supervised_evaluation_error_checking(targets, predictions) - _check_categorical_option_type('average', average, [None]) + _check_categorical_option_type("average", average, [None]) _check_prob_and_prob_vector(predictions) _check_target_not_float(targets) _check_index_map(index_map) - opts = {"average": average, - "binary": predictions.dtype in [int, float]} + opts = {"average": average, "binary": predictions.dtype in [int, float]} if index_map is not None: - opts['index_map'] = index_map + opts["index_map"] = index_map + + return _turicreate.extensions._supervised_streaming_evaluator( + targets, predictions, "roc_curve", opts + ) - return _turicreate.extensions._supervised_streaming_evaluator(targets, - predictions, "roc_curve", opts) -def auc(targets, predictions, average='macro', index_map=None): +def auc(targets, predictions, average="macro", index_map=None): r""" Compute the area under the ROC curve for the given targets and predictions. @@ -1254,16 +1283,15 @@ class label and the value is the score for the corresponding class """ _supervised_evaluation_error_checking(targets, predictions) - _check_categorical_option_type('average', average, - ['macro', None]) + _check_categorical_option_type("average", average, ["macro", None]) _check_prob_and_prob_vector(predictions) _check_target_not_float(targets) _check_index_map(index_map) - opts = {"average": average, - "binary": predictions.dtype in [int, float]} + opts = {"average": average, "binary": predictions.dtype in [int, float]} if index_map is not None: - opts['index_map'] = index_map + opts["index_map"] = index_map - return _turicreate.extensions._supervised_streaming_evaluator(targets, - predictions, "auc", opts) + return _turicreate.extensions._supervised_streaming_evaluator( + targets, predictions, "auc", opts + ) diff --git a/src/python/turicreate/toolkits/graph_analytics/_model_base.py b/src/python/turicreate/toolkits/graph_analytics/_model_base.py index d2e69a17ea..62d46b95f5 100644 --- a/src/python/turicreate/toolkits/graph_analytics/_model_base.py +++ b/src/python/turicreate/toolkits/graph_analytics/_model_base.py @@ -17,8 +17,8 @@ import six -class GraphAnalyticsModel(CustomModel): +class GraphAnalyticsModel(CustomModel): @classmethod def _native_name(cls): return None @@ -45,7 +45,10 @@ def _get(self, field): if field in self._list_fields(): return self.__proxy__.get(field) else: - raise KeyError('Key \"%s\" not in model. Available fields are %s.' % (field, ', '.join(self._list_fields()))) + raise KeyError( + 'Key "%s" not in model. Available fields are %s.' + % (field, ", ".join(self._list_fields())) + ) @classmethod def _describe_fields(cls): @@ -54,32 +57,34 @@ def _describe_fields(cls): Fields should NOT be wrapped by _precomputed_field, if necessary """ dispatch_table = { - 'ShortestPathModel': 'sssp', - 'GraphColoringModel': 'graph_coloring', - 'PagerankModel': 'pagerank', - 'ConnectedComponentsModel': 'connected_components', - 'TriangleCountingModel': 'triangle_counting', - 'KcoreModel': 'kcore', - 'DegreeCountingModel': 'degree_count', - 'LabelPropagationModel': 'label_propagation' + "ShortestPathModel": "sssp", + "GraphColoringModel": "graph_coloring", + "PagerankModel": "pagerank", + "ConnectedComponentsModel": "connected_components", + "TriangleCountingModel": "triangle_counting", + "KcoreModel": "kcore", + "DegreeCountingModel": "degree_count", + "LabelPropagationModel": "label_propagation", } try: toolkit_name = dispatch_table[cls.__name__] toolkit = _tc.extensions._toolkits.graph.__dict__[toolkit_name] return toolkit.get_model_fields({}) except: - raise RuntimeError('Model %s does not have fields description' % cls.__name__) + raise RuntimeError( + "Model %s does not have fields description" % cls.__name__ + ) def _format(self, title, key_values): if len(key_values) == 0: return "" tbl = _PrettyTable(header=False) for k, v in six.iteritems(key_values): - tbl.add_row([k, v]) - tbl.align['Field 1'] = 'l' - tbl.align['Field 2'] = 'l' + tbl.add_row([k, v]) + tbl.align["Field 1"] = "l" + tbl.align["Field 2"] = "l" s = title + ":\n" - s += tbl.__str__() + '\n' + s += tbl.__str__() + "\n" return s def _get_summary_struct(self): @@ -101,20 +106,30 @@ def _get_summary_struct(self): """ g = self.graph - section_titles = ['Graph'] + section_titles = ["Graph"] - graph_summary = [(k, _precomputed_field(v)) for k, v in six.iteritems(g.summary())] + graph_summary = [ + (k, _precomputed_field(v)) for k, v in six.iteritems(g.summary()) + ] sections = [graph_summary] # collect other sections - results = [(k, _precomputed_field(v)) for k, v in six.iteritems(self._result_fields())] - methods = [(k, _precomputed_field(v)) for k, v in six.iteritems(self._method_fields())] + results = [ + (k, _precomputed_field(v)) for k, v in six.iteritems(self._result_fields()) + ] + methods = [ + (k, _precomputed_field(v)) for k, v in six.iteritems(self._method_fields()) + ] settings = [(k, v) for k, v in six.iteritems(self._setting_fields())] metrics = [(k, v) for k, v in six.iteritems(self._metric_fields())] - optional_sections = [('Results', results), ('Settings', settings), \ - ('Metrics', metrics), ('Methods', methods)] + optional_sections = [ + ("Results", results), + ("Settings", settings), + ("Metrics", metrics), + ("Methods", methods), + ] # if section is not empty, append to summary structure for (title, section) in optional_sections: @@ -126,16 +141,23 @@ def _get_summary_struct(self): def __repr__(self): - descriptions = [(k, _precomputed_field(v)) for k, v in six.iteritems(self._describe_fields())] + descriptions = [ + (k, _precomputed_field(v)) + for k, v in six.iteritems(self._describe_fields()) + ] (sections, section_titles) = self._get_summary_struct() non_empty_sections = [s for s in sections if len(s) > 0] - non_empty_section_titles = [section_titles[i] for i in range(len(sections)) if len(sections[i]) > 0] + non_empty_section_titles = [ + section_titles[i] for i in range(len(sections)) if len(sections[i]) > 0 + ] - non_empty_section_titles.append('Queryable Fields') + non_empty_section_titles.append("Queryable Fields") non_empty_sections.append(descriptions) - return _toolkit_repr_print(self, non_empty_sections, non_empty_section_titles, width=40) + return _toolkit_repr_print( + self, non_empty_sections, non_empty_section_titles, width=40 + ) def __str__(self): return self.__repr__() @@ -159,11 +181,11 @@ def _result_fields(self): Return results information Fields should NOT be wrapped by _precomputed_field """ - return {'graph': "SGraph. See m['graph']"} + return {"graph": "SGraph. See m['graph']"} def _metric_fields(self): """ Return model fields related to training metric Fields SHOULD be wrapped by _precomputed_field, if necessary """ - return {'training time (secs)': 'training_time'} + return {"training time (secs)": "training_time"} diff --git a/src/python/turicreate/toolkits/graph_analytics/connected_components.py b/src/python/turicreate/toolkits/graph_analytics/connected_components.py index 9ca93a8743..052ce3ba94 100644 --- a/src/python/turicreate/toolkits/graph_analytics/connected_components.py +++ b/src/python/turicreate/toolkits/graph_analytics/connected_components.py @@ -9,7 +9,9 @@ import turicreate as _tc from turicreate.data_structures.sgraph import SGraph as _SGraph -from turicreate.toolkits.graph_analytics._model_base import GraphAnalyticsModel as _ModelBase +from turicreate.toolkits.graph_analytics._model_base import ( + GraphAnalyticsModel as _ModelBase, +) class ConnectedComponentsModel(_ModelBase): @@ -43,8 +45,9 @@ class ConnectedComponentsModel(_ModelBase): -------- create """ + def __init__(self, model): - '''__init__(self)''' + """__init__(self)""" self.__proxy__ = model self.__model_name__ = self.__class__._native_name() @@ -56,16 +59,16 @@ def _native_name(cls): return "connected_components" def _get_native_state(self): - return {'model':self.__proxy__} + return {"model": self.__proxy__} @classmethod def _load_version(cls, state, version): - assert(version == 0) - return cls(state['model']) + assert version == 0 + return cls(state["model"]) def _result_fields(self): ret = super(ConnectedComponentsModel, self)._result_fields() - ret["number of connected components"] = len(self['component_size']) + ret["number of connected components"] = len(self["component_size"]) ret["component size"] = "SFrame. See m['component_size']" ret["vertex component id"] = "SFrame. See m['component_id']" return ret @@ -133,5 +136,6 @@ def create(graph, verbose=True): with QuietProgress(verbose): params = _tc.extensions._toolkits.graph.connected_components.create( - {'graph': graph.__proxy__}) - return ConnectedComponentsModel(params['model']) + {"graph": graph.__proxy__} + ) + return ConnectedComponentsModel(params["model"]) diff --git a/src/python/turicreate/toolkits/graph_analytics/degree_counting.py b/src/python/turicreate/toolkits/graph_analytics/degree_counting.py index 15ace29456..2606c98231 100644 --- a/src/python/turicreate/toolkits/graph_analytics/degree_counting.py +++ b/src/python/turicreate/toolkits/graph_analytics/degree_counting.py @@ -9,7 +9,9 @@ import turicreate as _tc from turicreate.data_structures.sgraph import SGraph as _SGraph -from turicreate.toolkits.graph_analytics._model_base import GraphAnalyticsModel as _ModelBase +from turicreate.toolkits.graph_analytics._model_base import ( + GraphAnalyticsModel as _ModelBase, +) class DegreeCountingModel(_ModelBase): @@ -35,8 +37,9 @@ class DegreeCountingModel(_ModelBase): -------- create """ + def __init__(self, model): - '''__init__(self)''' + """__init__(self)""" self.__proxy__ = model def _get_version(self): @@ -47,12 +50,13 @@ def _native_name(cls): return "degree_count" def _get_native_state(self): - return {'model':self.__proxy__} + return {"model": self.__proxy__} @classmethod def _load_version(cls, state, version): - assert(version == 0) - return cls(state['model']) + assert version == 0 + return cls(state["model"]) + def create(graph, verbose=True): """ @@ -115,5 +119,6 @@ def create(graph, verbose=True): with QuietProgress(verbose): params = _tc.extensions._toolkits.graph.degree_count.create( - {'graph': graph.__proxy__}) - return DegreeCountingModel(params['model']) + {"graph": graph.__proxy__} + ) + return DegreeCountingModel(params["model"]) diff --git a/src/python/turicreate/toolkits/graph_analytics/graph_coloring.py b/src/python/turicreate/toolkits/graph_analytics/graph_coloring.py index 154a8d9fd3..594fa2754d 100644 --- a/src/python/turicreate/toolkits/graph_analytics/graph_coloring.py +++ b/src/python/turicreate/toolkits/graph_analytics/graph_coloring.py @@ -9,7 +9,10 @@ import turicreate as _tc from turicreate.data_structures.sgraph import SGraph as _SGraph -from turicreate.toolkits.graph_analytics._model_base import GraphAnalyticsModel as _ModelBase +from turicreate.toolkits.graph_analytics._model_base import ( + GraphAnalyticsModel as _ModelBase, +) + class GraphColoringModel(_ModelBase): """ @@ -41,15 +44,16 @@ class GraphColoringModel(_ModelBase): -------- create """ + def __init__(self, model): - '''__init__(self)''' + """__init__(self)""" self.__proxy__ = model self.__model_name__ = self.__class__._native_name() def _result_fields(self): ret = super(GraphColoringModel, self)._result_fields() - ret['number of colors in the graph'] = self.num_colors - ret['vertex color id'] = "SFrame. See m.color_id" + ret["number of colors in the graph"] = self.num_colors + ret["vertex color id"] = "SFrame. See m.color_id" return ret def _get_version(self): @@ -60,12 +64,12 @@ def _native_name(cls): return "graph_coloring" def _get_native_state(self): - return {'model':self.__proxy__} + return {"model": self.__proxy__} @classmethod def _load_version(cls, state, version): - assert(version == 0) - return cls(state['model']) + assert version == 0 + return cls(state["model"]) def create(graph, verbose=True): @@ -118,9 +122,10 @@ def create(graph, verbose=True): from turicreate._cython.cy_server import QuietProgress if not isinstance(graph, _SGraph): - raise TypeError('graph input must be a SGraph object.') + raise TypeError("graph input must be a SGraph object.") with QuietProgress(verbose): params = _tc.extensions._toolkits.graph.graph_coloring.create( - {'graph': graph.__proxy__}) - return GraphColoringModel(params['model']) + {"graph": graph.__proxy__} + ) + return GraphColoringModel(params["model"]) diff --git a/src/python/turicreate/toolkits/graph_analytics/kcore.py b/src/python/turicreate/toolkits/graph_analytics/kcore.py index ee9c222bdd..245d9f1a06 100644 --- a/src/python/turicreate/toolkits/graph_analytics/kcore.py +++ b/src/python/turicreate/toolkits/graph_analytics/kcore.py @@ -9,7 +9,9 @@ import turicreate as _tc from turicreate.data_structures.sgraph import SGraph as _SGraph -from turicreate.toolkits.graph_analytics._model_base import GraphAnalyticsModel as _ModelBase +from turicreate.toolkits.graph_analytics._model_base import ( + GraphAnalyticsModel as _ModelBase, +) class KcoreModel(_ModelBase): @@ -50,8 +52,9 @@ class KcoreModel(_ModelBase): -------- create """ + def __init__(self, model): - '''__init__(self)''' + """__init__(self)""" self.__proxy__ = model self.__model_name__ = self.__class__._native_name() @@ -62,8 +65,8 @@ def _result_fields(self): def _setting_fields(self): ret = super(KcoreModel, self)._setting_fields() - ret['minimum core id assigned to any vertex'] = 'kmin' - ret['maximum core id assigned to any vertex '] = 'kmax' + ret["minimum core id assigned to any vertex"] = "kmin" + ret["maximum core id assigned to any vertex "] = "kmax" return ret def _get_version(self): @@ -74,12 +77,12 @@ def _native_name(cls): return "kcore" def _get_native_state(self): - return {'model':self.__proxy__} + return {"model": self.__proxy__} @classmethod def _load_version(cls, state, version): - assert(version == 0) - return cls(state['model']) + assert version == 0 + return cls(state["model"]) def create(graph, kmin=0, kmax=10, verbose=True): @@ -139,11 +142,11 @@ def create(graph, kmin=0, kmax=10, verbose=True): from turicreate._cython.cy_server import QuietProgress if not isinstance(graph, _SGraph): - raise TypeError('graph input must be a SGraph object.') + raise TypeError("graph input must be a SGraph object.") - opts = {'graph': graph.__proxy__, 'kmin': kmin, 'kmax': kmax} + opts = {"graph": graph.__proxy__, "kmin": kmin, "kmax": kmax} with QuietProgress(verbose): params = _tc.extensions._toolkits.graph.kcore.create(opts) - return KcoreModel(params['model']) + return KcoreModel(params["model"]) diff --git a/src/python/turicreate/toolkits/graph_analytics/label_propagation.py b/src/python/turicreate/toolkits/graph_analytics/label_propagation.py index 07374dcdb4..4194d89639 100644 --- a/src/python/turicreate/toolkits/graph_analytics/label_propagation.py +++ b/src/python/turicreate/toolkits/graph_analytics/label_propagation.py @@ -9,7 +9,9 @@ import turicreate as _tc from turicreate.data_structures.sgraph import SGraph as _SGraph -from turicreate.toolkits.graph_analytics._model_base import GraphAnalyticsModel as _ModelBase +from turicreate.toolkits.graph_analytics._model_base import ( + GraphAnalyticsModel as _ModelBase, +) from turicreate.util import _raise_error_if_not_of_type @@ -86,31 +88,31 @@ class LabelPropagationModel(_ModelBase): -------- create """ + def __init__(self, model): - '''__init__(self)''' + """__init__(self)""" self.__proxy__ = model def _result_fields(self): ret = super(LabelPropagationModel, self)._result_fields() ret["vertex label probability"] = "SFrame. See m['labels']" - ret['change in last iteration (avg. of L2)'] = self['delta'] + ret["change in last iteration (avg. of L2)"] = self["delta"] return ret def _metric_fields(self): ret = super(LabelPropagationModel, self)._metric_fields() - ret['number of iterations'] = 'num_iterations' + ret["number of iterations"] = "num_iterations" return ret def _setting_fields(self): ret = super(LabelPropagationModel, self)._setting_fields() - ret['convergence threshold (avg. of L2 norm)'] = 'threshold' - ret['treated edge as undirected'] = 'undirected' - ret['weight for self edge'] = 'self_weight' - ret['edge weight field id'] = 'weight_field' - ret['vertex label field id'] = 'label_field' + ret["convergence threshold (avg. of L2 norm)"] = "threshold" + ret["treated edge as undirected"] = "undirected" + ret["weight for self edge"] = "self_weight" + ret["edge weight field id"] = "weight_field" + ret["vertex label field id"] = "label_field" return ret - def _get_version(self): return 0 @@ -119,24 +121,26 @@ def _native_name(cls): return "label_propagation" def _get_native_state(self): - return {'model':self.__proxy__} + return {"model": self.__proxy__} @classmethod def _load_version(cls, state, version): - assert(version == 0) - return cls(state['model']) - - - -def create(graph, label_field, - threshold=1e-3, - weight_field='', - self_weight=1.0, - undirected=False, - max_iterations=None, - _single_precision=False, - _distributed='auto', - verbose=True): + assert version == 0 + return cls(state["model"]) + + +def create( + graph, + label_field, + threshold=1e-3, + weight_field="", + self_weight=1.0, + undirected=False, + max_iterations=None, + _single_precision=False, + _distributed="auto", + verbose=True, +): """ Given a weighted graph with observed class labels of a subset of vertices, infer the label probability for the unobserved vertices using the @@ -253,22 +257,24 @@ def create(graph, label_field, _raise_error_if_not_of_type(weight_field, str) if not isinstance(graph, _SGraph): - raise TypeError('graph input must be a SGraph object.') + raise TypeError("graph input must be a SGraph object.") if graph.vertices[label_field].dtype != int: - raise TypeError('label_field %s must be integer typed.' % label_field) - - opts = {'label_field': label_field, - 'threshold': threshold, - 'weight_field': weight_field, - 'self_weight': self_weight, - 'undirected': undirected, - 'max_iterations': max_iterations, - 'single_precision': _single_precision, - 'graph': graph.__proxy__} + raise TypeError("label_field %s must be integer typed." % label_field) + + opts = { + "label_field": label_field, + "threshold": threshold, + "weight_field": weight_field, + "self_weight": self_weight, + "undirected": undirected, + "max_iterations": max_iterations, + "single_precision": _single_precision, + "graph": graph.__proxy__, + } with QuietProgress(verbose): params = _tc.extensions._toolkits.graph.label_propagation.create(opts) - model = params['model'] + model = params["model"] return LabelPropagationModel(model) diff --git a/src/python/turicreate/toolkits/graph_analytics/pagerank.py b/src/python/turicreate/toolkits/graph_analytics/pagerank.py index e95c439023..15769b83c8 100644 --- a/src/python/turicreate/toolkits/graph_analytics/pagerank.py +++ b/src/python/turicreate/toolkits/graph_analytics/pagerank.py @@ -9,7 +9,9 @@ import turicreate as _tc from turicreate.data_structures.sgraph import SGraph as _SGraph -from turicreate.toolkits.graph_analytics._model_base import GraphAnalyticsModel as _ModelBase +from turicreate.toolkits.graph_analytics._model_base import ( + GraphAnalyticsModel as _ModelBase, +) class PagerankModel(_ModelBase): @@ -64,26 +66,29 @@ class PagerankModel(_ModelBase): -------- create """ + def __init__(self, model): - '''__init__(self)''' + """__init__(self)""" self.__proxy__ = model def _result_fields(self): ret = super(PagerankModel, self)._result_fields() ret["vertex pagerank"] = "SFrame. See m.pagerank" - ret['change in last iteration (L1 norm)'] = self.delta + ret["change in last iteration (L1 norm)"] = self.delta return ret def _metric_fields(self): ret = super(PagerankModel, self)._metric_fields() - ret['number of iterations'] = 'num_iterations' + ret["number of iterations"] = "num_iterations" return ret def _setting_fields(self): ret = super(PagerankModel, self)._setting_fields() - ret['probability of random jumps to any node in the graph'] = 'reset_probability' - ret['convergence threshold (L1 norm)'] = 'threshold' - ret['maximum number of iterations'] = 'max_iterations' + ret[ + "probability of random jumps to any node in the graph" + ] = "reset_probability" + ret["convergence threshold (L1 norm)"] = "threshold" + ret["maximum number of iterations"] = "max_iterations" return ret def _get_version(self): @@ -94,20 +99,23 @@ def _native_name(cls): return "pagerank" def _get_native_state(self): - return {'model':self.__proxy__} + return {"model": self.__proxy__} @classmethod def _load_version(cls, state, version): - assert(version == 0) - return cls(state['model']) - - -def create(graph, reset_probability=0.15, - threshold=1e-2, - max_iterations=20, - _single_precision=False, - _distributed='auto', - verbose=True): + assert version == 0 + return cls(state["model"]) + + +def create( + graph, + reset_probability=0.15, + threshold=1e-2, + max_iterations=20, + _single_precision=False, + _distributed="auto", + verbose=True, +): """ Compute the PageRank for each vertex in the graph. Return a model object with total PageRank as well as the PageRank value for each vertex in the @@ -177,15 +185,18 @@ def create(graph, reset_probability=0.15, from turicreate._cython.cy_server import QuietProgress if not isinstance(graph, _SGraph): - raise TypeError('graph input must be a SGraph object.') + raise TypeError("graph input must be a SGraph object.") - opts = {'threshold': threshold, 'reset_probability': reset_probability, - 'max_iterations': max_iterations, - 'single_precision': _single_precision, - 'graph': graph.__proxy__} + opts = { + "threshold": threshold, + "reset_probability": reset_probability, + "max_iterations": max_iterations, + "single_precision": _single_precision, + "graph": graph.__proxy__, + } with QuietProgress(verbose): params = _tc.extensions._toolkits.graph.pagerank.create(opts) - model = params['model'] + model = params["model"] return PagerankModel(model) diff --git a/src/python/turicreate/toolkits/graph_analytics/shortest_path.py b/src/python/turicreate/toolkits/graph_analytics/shortest_path.py index 07a496e351..98ab6b022f 100644 --- a/src/python/turicreate/toolkits/graph_analytics/shortest_path.py +++ b/src/python/turicreate/toolkits/graph_analytics/shortest_path.py @@ -9,7 +9,9 @@ import turicreate as _tc from turicreate.data_structures.sgraph import SGraph as _SGraph -from turicreate.toolkits.graph_analytics._model_base import GraphAnalyticsModel as _ModelBase +from turicreate.toolkits.graph_analytics._model_base import ( + GraphAnalyticsModel as _ModelBase, +) import copy as _copy @@ -49,8 +51,9 @@ class ShortestPathModel(_ModelBase): -------- create """ + def __init__(self, model): - '''__init__(self)''' + """__init__(self)""" self.__proxy__ = model self._path_query_table = None @@ -60,7 +63,7 @@ def _result_fields(self): Fields should NOT be wrapped by _precomputed_field """ ret = super(ShortestPathModel, self)._result_fields() - ret['vertex distance to the source vertex'] = "SFrame. m.distance" + ret["vertex distance to the source vertex"] = "SFrame. m.distance" return ret def _setting_fields(self): @@ -69,9 +72,9 @@ def _setting_fields(self): Fields SHOULD be wrapped by _precomputed_field, if necessary """ ret = super(ShortestPathModel, self)._setting_fields() - ret['source vertex id'] = 'source_vid' - ret['edge weight field id'] = 'weight_field' - ret['maximum distance between vertices'] = 'max_distance' + ret["source vertex id"] = "source_vid" + ret["edge weight field id"] = "weight_field" + ret["maximum distance between vertices"] = "max_distance" return ret def _method_fields(self): @@ -79,7 +82,7 @@ def _method_fields(self): Return model fields related to model methods Fields should NOT be wrapped by _precomputed_field """ - return {'get shortest path': 'get_path() e.g. m.get_path(vid=target_vid)'} + return {"get shortest path": "get_path() e.g. m.get_path(vid=target_vid)"} def get_path(self, vid, highlight=None): """ @@ -114,25 +117,27 @@ def get_path(self, vid, highlight=None): source_vid = self.source_vid path = [] path_query_table = self._path_query_table - if not vid in path_query_table['vid']: - raise ValueError('Destination vertex id ' + str(vid) + ' not found') + if not vid in path_query_table["vid"]: + raise ValueError("Destination vertex id " + str(vid) + " not found") - record = path_query_table[path_query_table['vid'] == vid][0] - dist = record['distance'] + record = path_query_table[path_query_table["vid"] == vid][0] + dist = record["distance"] if dist > 1e5: - raise ValueError('The distance to {} is too large to show the path.'.format(vid)) + raise ValueError( + "The distance to {} is too large to show the path.".format(vid) + ) path = [(vid, dist)] max_iter = len(path_query_table) num_iter = 0 - while record['distance'] != 0 and num_iter < max_iter: - parent_id = record['parent_row_id'] + while record["distance"] != 0 and num_iter < max_iter: + parent_id = record["parent_row_id"] assert parent_id < len(path_query_table) assert parent_id >= 0 record = path_query_table[parent_id] - path.append((record['vid'], record['distance'])) + path.append((record["vid"], record["distance"])) num_iter += 1 - assert record['vid'] == source_vid + assert record["vid"] == source_vid assert num_iter < max_iter path.reverse() return path @@ -146,14 +151,14 @@ def _generate_path_sframe(self): weight_field = self.weight_field query_table = _copy.copy(self.distance) - query_table = query_table.add_row_number('row_id') + query_table = query_table.add_row_number("row_id") g = self.graph.add_vertices(query_table) # The sequence id which a vertex is visited, initialized with 0 meaning not visited. - g.vertices['__parent__'] = -1 + g.vertices["__parent__"] = -1 weight_field = self.weight_field - if (weight_field == ""): - weight_field = '__unit_weight__' + if weight_field == "": + weight_field = "__unit_weight__" g.edges[weight_field] = 1 # Traverse the graph once and get the parent row id for each vertex @@ -165,13 +170,15 @@ def _generate_path_sframe(self): # return (src, edge, dst) # # the internal lambda appear to have some issues. - traverse_fun = lambda src, edge, dst: \ - _tc.extensions._toolkits.graph.sssp.shortest_path_traverse_function( - src, edge, dst, source_vid, weight_field) - - g = g.triple_apply(traverse_fun, ['__parent__']) - query_table = query_table.join(g.get_vertices()[['__id', '__parent__']], '__id').sort('row_id') - query_table.rename({'__parent__': 'parent_row_id', '__id': 'vid'}, inplace=True) + traverse_fun = lambda src, edge, dst: _tc.extensions._toolkits.graph.sssp.shortest_path_traverse_function( + src, edge, dst, source_vid, weight_field + ) + + g = g.triple_apply(traverse_fun, ["__parent__"]) + query_table = query_table.join( + g.get_vertices()[["__id", "__parent__"]], "__id" + ).sort("row_id") + query_table.rename({"__parent__": "parent_row_id", "__id": "vid"}, inplace=True) return query_table def _get_version(self): @@ -182,12 +189,12 @@ def _native_name(cls): return "shortest_path" def _get_native_state(self): - return {'model':self.__proxy__} + return {"model": self.__proxy__} @classmethod def _load_version(cls, state, version): - assert(version == 0) - return cls(state['model']) + assert version == 0 + return cls(state["model"]) def create(graph, source_vid, weight_field="", max_distance=1e30, verbose=True): @@ -259,13 +266,17 @@ def create(graph, source_vid, weight_field="", max_distance=1e30, verbose=True): from turicreate._cython.cy_server import QuietProgress if not isinstance(graph, _SGraph): - raise TypeError('graph input must be a SGraph object.') - - opts = {'source_vid': source_vid, 'weight_field': weight_field, - 'max_distance': max_distance, 'graph': graph.__proxy__} + raise TypeError("graph input must be a SGraph object.") + + opts = { + "source_vid": source_vid, + "weight_field": weight_field, + "max_distance": max_distance, + "graph": graph.__proxy__, + } with QuietProgress(verbose): params = _tc.extensions._toolkits.graph.sssp.create(opts) - return ShortestPathModel(params['model']) + return ShortestPathModel(params["model"]) def _compute_shortest_path(graph, source_vids, dest_vids, weight_field=""): @@ -325,4 +336,5 @@ def _compute_shortest_path(graph, source_vids, dest_vids, weight_field=""): if type(dest_vids) != list: dest_vids = [dest_vids] return _tc.extensions._toolkits.graph.sssp.all_shortest_paths( - graph, source_vids, dest_vids, weight_field) + graph, source_vids, dest_vids, weight_field + ) diff --git a/src/python/turicreate/toolkits/graph_analytics/triangle_counting.py b/src/python/turicreate/toolkits/graph_analytics/triangle_counting.py index 90e6c0916f..24306a8945 100644 --- a/src/python/turicreate/toolkits/graph_analytics/triangle_counting.py +++ b/src/python/turicreate/toolkits/graph_analytics/triangle_counting.py @@ -9,7 +9,10 @@ import turicreate as _tc from turicreate.data_structures.sgraph import SGraph as _SGraph -from turicreate.toolkits.graph_analytics._model_base import GraphAnalyticsModel as _ModelBase +from turicreate.toolkits.graph_analytics._model_base import ( + GraphAnalyticsModel as _ModelBase, +) + class TriangleCountingModel(_ModelBase): """ @@ -47,13 +50,14 @@ class TriangleCountingModel(_ModelBase): -------- create """ + def __init__(self, model): - '''__init__(self)''' + """__init__(self)""" self.__proxy__ = model def _result_fields(self): ret = super(TriangleCountingModel, self)._result_fields() - ret['total number of triangles'] = self.num_triangles + ret["total number of triangles"] = self.num_triangles ret["vertex triangle count"] = "SFrame. See m.triangle_count" return ret @@ -65,12 +69,12 @@ def _native_name(cls): return "triangle_count" def _get_native_state(self): - return {'model':self.__proxy__} + return {"model": self.__proxy__} @classmethod def _load_version(cls, state, version): - assert(version == 0) - return cls(state['model']) + assert version == 0 + return cls(state["model"]) def create(graph, verbose=True): @@ -125,9 +129,10 @@ def create(graph, verbose=True): from turicreate._cython.cy_server import QuietProgress if not isinstance(graph, _SGraph): - raise TypeError('graph input must be a SGraph object.') + raise TypeError("graph input must be a SGraph object.") with QuietProgress(verbose): params = _tc.extensions._toolkits.graph.triangle_counting.create( - {'graph': graph.__proxy__}) - return TriangleCountingModel(params['model']) + {"graph": graph.__proxy__} + ) + return TriangleCountingModel(params["model"]) diff --git a/src/python/turicreate/toolkits/image_analysis/__init__.py b/src/python/turicreate/toolkits/image_analysis/__init__.py index c495fdb082..4b31b6b920 100644 --- a/src/python/turicreate/toolkits/image_analysis/__init__.py +++ b/src/python/turicreate/toolkits/image_analysis/__init__.py @@ -7,6 +7,6 @@ from __future__ import division as _ from __future__ import absolute_import as _ -__all__ = ['image_analysis'] +__all__ = ["image_analysis"] from . import image_analysis diff --git a/src/python/turicreate/toolkits/image_analysis/image_analysis.py b/src/python/turicreate/toolkits/image_analysis/image_analysis.py index ff201d55ab..8373a3378a 100644 --- a/src/python/turicreate/toolkits/image_analysis/image_analysis.py +++ b/src/python/turicreate/toolkits/image_analysis/image_analysis.py @@ -9,7 +9,14 @@ from ...data_structures.image import Image as _Image -def load_images(url, format='auto', with_path=True, recursive=True, ignore_failure=True, random_order=False): +def load_images( + url, + format="auto", + with_path=True, + recursive=True, + ignore_failure=True, + random_order=False, +): """ Loads images from a directory. JPEG and PNG images are supported. @@ -55,8 +62,10 @@ def load_images(url, format='auto', with_path=True, recursive=True, ignore_failu ... recursive=True) """ from ... import extensions as _extensions - return _extensions.load_images(url, format, with_path, - recursive, ignore_failure, random_order) + + return _extensions.load_images( + url, format, with_path, recursive, ignore_failure, random_order + ) def _decode(image_data): @@ -65,15 +74,14 @@ def _decode(image_data): """ from ...data_structures.sarray import SArray as _SArray from ... import extensions as _extensions + if type(image_data) is _SArray: return _extensions.decode_image_sarray(image_data) elif type(image_data) is _Image: return _extensions.decode_image(image_data) - -def resize(image, width, height, channels=None, decode=False, - resample='nearest'): +def resize(image, width, height, channels=None, decode=False, resample="nearest"): """ Resizes the image or SArray of Images to a specific width, height, and number of channels. @@ -134,7 +142,7 @@ def resize(image, width, height, channels=None, decode=False, if height < 0 or width < 0: raise ValueError("Cannot resize to negative sizes") - if resample not in ('nearest', 'bilinear'): + if resample not in ("nearest", "bilinear"): raise ValueError("Unknown resample option: '%s'" % resample) from ...data_structures.sarray import SArray as _SArray @@ -143,19 +151,27 @@ def resize(image, width, height, channels=None, decode=False, if type(image) is _Image: - assert resample in ('nearest', 'bilinear') - resample_method = 0 if resample == 'nearest' else 1 + assert resample in ("nearest", "bilinear") + resample_method = 0 if resample == "nearest" else 1 if channels is None: channels = image.channels if channels <= 0: raise ValueError("cannot resize images to 0 or fewer channels") - return _extensions.resize_image(image, width, height, channels, decode, resample_method) + return _extensions.resize_image( + image, width, height, channels, decode, resample_method + ) elif type(image) is _SArray: if channels is None: channels = 3 if channels <= 0: raise ValueError("cannot resize images to 0 or fewer channels") - return image.apply(lambda x: _tc.image_analysis.resize(x, width, height, channels, decode, resample)) + return image.apply( + lambda x: _tc.image_analysis.resize( + x, width, height, channels, decode, resample + ) + ) else: - raise ValueError("Cannot call 'resize' on objects that are not either an Image or SArray of Images") + raise ValueError( + "Cannot call 'resize' on objects that are not either an Image or SArray of Images" + ) diff --git a/src/python/turicreate/toolkits/image_classifier/__init__.py b/src/python/turicreate/toolkits/image_classifier/__init__.py index 6bccd38744..282e68aced 100644 --- a/src/python/turicreate/toolkits/image_classifier/__init__.py +++ b/src/python/turicreate/toolkits/image_classifier/__init__.py @@ -10,4 +10,4 @@ from __future__ import division as _ from .image_classifier import * -from ._annotate import annotate, recover_annotation \ No newline at end of file +from ._annotate import annotate, recover_annotation diff --git a/src/python/turicreate/toolkits/image_classifier/_annotate.py b/src/python/turicreate/toolkits/image_classifier/_annotate.py index 11bc7d440f..254206b84c 100644 --- a/src/python/turicreate/toolkits/image_classifier/_annotate.py +++ b/src/python/turicreate/toolkits/image_classifier/_annotate.py @@ -16,7 +16,8 @@ import turicreate as __tc -def annotate(data, image_column=None, annotation_column='annotations'): + +def annotate(data, image_column=None, annotation_column="annotations"): """ Annotate images using a GUI assisted application. When the GUI is terminated an SFrame with the representative images and annotations is @@ -72,10 +73,10 @@ def annotate(data, image_column=None, annotation_column='annotations'): # Check Value of Column Variables if not isinstance(data, __tc.SFrame): raise TypeError('"data" must be of type SFrame.') - + # Check if Value is Empty if data.num_rows() == 0: - raise Exception('input data cannot be empty') + raise Exception("input data cannot be empty") if image_column == None: image_column = _tkutl._find_only_image_column(data) @@ -92,35 +93,33 @@ def annotate(data, image_column=None, annotation_column='annotations'): if type(annotation_column) != str: raise TypeError("'annotation_column' has to be of type 'str'") - # Check Data Structure if type(data) == __tc.data_structures.image.Image: - data = __tc.SFrame({image_column:__tc.SArray([data])}) + data = __tc.SFrame({image_column: __tc.SArray([data])}) elif type(data) == __tc.data_structures.sframe.SFrame: - if(data.shape[0] == 0): + if data.shape[0] == 0: return data if not (data[image_column].dtype == __tc.data_structures.image.Image): raise TypeError("'data[image_column]' must be an SFrame or SArray") elif type(data) == __tc.data_structures.sarray.SArray: - if(data.shape[0] == 0): + if data.shape[0] == 0: return data - data = __tc.SFrame({image_column:data}) + data = __tc.SFrame({image_column: data}) else: raise TypeError("'data' must be an SFrame or SArray") annotation_window = __tc.extensions.create_image_classification_annotation( - data, - [image_column], - annotation_column - ) + data, [image_column], annotation_column + ) with _QuietProgress(False): annotation_window.annotate(_get_client_app_path()) return annotation_window.returnAnnotations() + def recover_annotation(): """ Recover the last annotated SFrame. diff --git a/src/python/turicreate/toolkits/image_classifier/_evaluation.py b/src/python/turicreate/toolkits/image_classifier/_evaluation.py index 472be33079..3d22d64558 100644 --- a/src/python/turicreate/toolkits/image_classifier/_evaluation.py +++ b/src/python/turicreate/toolkits/image_classifier/_evaluation.py @@ -20,223 +20,324 @@ import base64 as _base64 import math as _math + class Evaluation(dict): - def __init__(self, obj = {}): - dict.__init__(self) - self._data = obj - - metrics_keys = ['f1_score', 'auc', 'recall', 'precision', 'log_loss', 'roc_curve', 'confusion_matrix', 'accuracy'] - metrics_obj = { k: v for k, v in obj.items() if k in metrics_keys } - - self.update(metrics_obj) - - def _get_eval_json(self): - evaluation_dictionary = dict() - - for key, value in _six.iteritems(self._data): - if (isinstance(value, float) or isinstance(value, int)) and _math.isnan(value): - continue - if (key == "test_data" or key == "confusion_matrix" or key == "roc_curve"): - continue - evaluation_dictionary[key] = value - - evaluation_dictionary["table_spec"] = { - "column_names": ["path", "image", "target_label", "predicted_label"], - "size": len(self._data["test_data"]), - "title": "", - "column_types": ["string", "image", "string", "string"] - } + def __init__(self, obj={}): + dict.__init__(self) + self._data = obj + + metrics_keys = [ + "f1_score", + "auc", + "recall", + "precision", + "log_loss", + "roc_curve", + "confusion_matrix", + "accuracy", + ] + metrics_obj = {k: v for k, v in obj.items() if k in metrics_keys} + + self.update(metrics_obj) + + def _get_eval_json(self): + evaluation_dictionary = dict() + + for key, value in _six.iteritems(self._data): + if (isinstance(value, float) or isinstance(value, int)) and _math.isnan( + value + ): + continue + if key == "test_data" or key == "confusion_matrix" or key == "roc_curve": + continue + evaluation_dictionary[key] = value + + evaluation_dictionary["table_spec"] = { + "column_names": ["path", "image", "target_label", "predicted_label"], + "size": len(self._data["test_data"]), + "title": "", + "column_types": ["string", "image", "string", "string"], + } - evaluation_dictionary["corrects_size"] = len(self._data["test_data"].filter_by([True], "correct")) - evaluation_dictionary["incorrects_size"] = evaluation_dictionary["table_spec"]["size"] - evaluation_dictionary["corrects_size"] + evaluation_dictionary["corrects_size"] = len( + self._data["test_data"].filter_by([True], "correct") + ) + evaluation_dictionary["incorrects_size"] = ( + evaluation_dictionary["table_spec"]["size"] + - evaluation_dictionary["corrects_size"] + ) - return str(_json.dumps({ "evaluation_spec": evaluation_dictionary }, allow_nan = False)) + return str( + _json.dumps({"evaluation_spec": evaluation_dictionary}, allow_nan=False) + ) - def explore(self): - """ + def explore(self): + """ Explore model evaluation qualitatively through a GUI assisted application. """ - if self._data["test_data"][self._data["feature"]].dtype == _tc.Image: - print("Resizing image data... ", end='') - self._data["test_data"][self._data["feature"]] = self._data["test_data"][self._data["feature"]].apply(_image_conversion) - self._data["test_data"].materialize() - print("Done.") - params = (self._get_eval_json()+"\n", self._data["test_data"], self, ) - # Suppress visualization output if 'none' target is set - from ...visualization._plot import _target - if _target == 'none': - return - _thread.start_new_thread(_start_process, params) + if self._data["test_data"][self._data["feature"]].dtype == _tc.Image: + print("Resizing image data... ", end="") + self._data["test_data"][self._data["feature"]] = self._data["test_data"][ + self._data["feature"] + ].apply(_image_conversion) + self._data["test_data"].materialize() + print("Done.") + params = ( + self._get_eval_json() + "\n", + self._data["test_data"], + self, + ) + # Suppress visualization output if 'none' target is set + from ...visualization._plot import _target + + if _target == "none": + return + _thread.start_new_thread(_start_process, params) def _get_data_spec(filters, start, length, row_type, mat_type, sframe, evaluation): - data_spec = None - sf = sframe - if row_type == "all": - - table_data = _filter_sframe(filters, "table", mat_type, sf, evaluation) - table_data_sliced = _reform_sframe(table_data[(table_data['__idx'] >= start)].head(length)) - - corrects = _filter_sframe(filters, "corrects", mat_type, sf, evaluation) - corrects_sliced = _reform_sframe(corrects[(corrects["__idx"] >= start)].head(length)) - - incorrects = _filter_sframe(filters, "incorrects", mat_type, sf, evaluation) - incorrects_sliced = _reform_sframe(incorrects[(incorrects["__idx"] >= start)].head(length)) - - data_spec = { - "data_spec": { - "table": { "data": { "values": table_data_sliced }, "size": table_data.num_rows() }, - "gallery": { - "corrects": { "data": corrects_sliced, "size": corrects.num_rows() }, - "incorrects": { "data": incorrects_sliced, "size": incorrects.num_rows() } + data_spec = None + sf = sframe + if row_type == "all": + + table_data = _filter_sframe(filters, "table", mat_type, sf, evaluation) + table_data_sliced = _reform_sframe( + table_data[(table_data["__idx"] >= start)].head(length) + ) + + corrects = _filter_sframe(filters, "corrects", mat_type, sf, evaluation) + corrects_sliced = _reform_sframe( + corrects[(corrects["__idx"] >= start)].head(length) + ) + + incorrects = _filter_sframe(filters, "incorrects", mat_type, sf, evaluation) + incorrects_sliced = _reform_sframe( + incorrects[(incorrects["__idx"] >= start)].head(length) + ) + + data_spec = { + "data_spec": { + "table": { + "data": {"values": table_data_sliced}, + "size": table_data.num_rows(), + }, + "gallery": { + "corrects": {"data": corrects_sliced, "size": corrects.num_rows()}, + "incorrects": { + "data": incorrects_sliced, + "size": incorrects.num_rows(), + }, + }, + } } - } - } - else: - sf = _filter_sframe(filters, row_type, mat_type, sf, evaluation) - list_test_data = _reform_sframe(sf[(sf['__idx'] >= start)].head(length)) - if (row_type == "table"): - data_spec = { "data_spec": { "table": { "data": { "values": list_test_data }, "size": sf.num_rows() }}} - elif (row_type == "corrects"): - data_spec = { "data_spec": { "gallery": { "corrects": { "data": list_test_data, "size": sf.num_rows() }}}} - elif (row_type == "incorrects"): - data_spec = { "data_spec": { "gallery": { "incorrects": { "data": list_test_data, "size": sf.num_rows() }}}} - - return _json.dumps(data_spec) + "\n" + else: + sf = _filter_sframe(filters, row_type, mat_type, sf, evaluation) + list_test_data = _reform_sframe(sf[(sf["__idx"] >= start)].head(length)) + if row_type == "table": + data_spec = { + "data_spec": { + "table": {"data": {"values": list_test_data}, "size": sf.num_rows()} + } + } + elif row_type == "corrects": + data_spec = { + "data_spec": { + "gallery": { + "corrects": {"data": list_test_data, "size": sf.num_rows()} + } + } + } + elif row_type == "incorrects": + data_spec = { + "data_spec": { + "gallery": { + "incorrects": {"data": list_test_data, "size": sf.num_rows()} + } + } + } + + return _json.dumps(data_spec) + "\n" + def _reform_sframe(sf): - sf_sending_data = sf.select_columns(["__idx", "path", "target_label", "predicted_label", "confidence", "relative_confidence", "entropy"]) - sf_sending_data["image"] = sf["image"] - sf_sending_data["probs"] = sf["probs"].astype(list) - return list(sf_sending_data) + sf_sending_data = sf.select_columns( + [ + "__idx", + "path", + "target_label", + "predicted_label", + "confidence", + "relative_confidence", + "entropy", + ] + ) + sf_sending_data["image"] = sf["image"] + sf_sending_data["probs"] = sf["probs"].astype(list) + return list(sf_sending_data) + def _filter_sframe(filters, row_type, mat_type, sf, evaluation): - conf_metric = evaluation._data["confidence_metric_for_threshold"] + conf_metric = evaluation._data["confidence_metric_for_threshold"] - if mat_type == "conf_wrong": - sf = sf[sf[conf_metric] > evaluation._data["confidence_threshold"]] - elif mat_type == "hesitant": - sf = sf[sf[conf_metric] < evaluation._data["hesitant_threshold"]] + if mat_type == "conf_wrong": + sf = sf[sf[conf_metric] > evaluation._data["confidence_threshold"]] + elif mat_type == "hesitant": + sf = sf[sf[conf_metric] < evaluation._data["hesitant_threshold"]] - filtered_array = None - if row_type == "corrects": - sf = sf.filter_by([True], "correct") - elif row_type == "incorrects": - sf = sf.filter_by([False], "correct") + filtered_array = None + if row_type == "corrects": + sf = sf.filter_by([True], "correct") + elif row_type == "incorrects": + sf = sf.filter_by([False], "correct") - if len(filters) == 0: - return sf + if len(filters) == 0: + return sf - for f in filters: - target_label = (f['target_label']) - predicted_label = (f['predicted_label']) - filtered_sframe = sf.filter_by([target_label], "target_label").filter_by([predicted_label], "predicted_label") - if filtered_array is None: - filtered_array = filtered_sframe - else: - filtered_array = filtered_array.append(filtered_sframe) + for f in filters: + target_label = f["target_label"] + predicted_label = f["predicted_label"] + filtered_sframe = sf.filter_by([target_label], "target_label").filter_by( + [predicted_label], "predicted_label" + ) + if filtered_array is None: + filtered_array = filtered_sframe + else: + filtered_array = filtered_array.append(filtered_sframe) + + return filtered_array - return filtered_array def __get_incorrects(label, sf, evaluation): - conf_metric = evaluation._data["confidence_metric_for_threshold"] + conf_metric = evaluation._data["confidence_metric_for_threshold"] - sf = sf.filter_by([False], "correct") + sf = sf.filter_by([False], "correct") - if sf["target_label"].dtype == int: - label = int(label) + if sf["target_label"].dtype == int: + label = int(label) - filtered_sframe = sf.filter_by([label], "target_label") - unique_predictions = list(filtered_sframe["predicted_label"].unique()) + filtered_sframe = sf.filter_by([label], "target_label") + unique_predictions = list(filtered_sframe["predicted_label"].unique()) - data = [] - for u in unique_predictions: - u_filt = filtered_sframe.filter_by([u], "predicted_label") - data.append({"label": str(u), "images": list(u_filt[evaluation._data['feature']])}) + data = [] + for u in unique_predictions: + u_filt = filtered_sframe.filter_by([u], "predicted_label") + data.append( + {"label": str(u), "images": list(u_filt[evaluation._data["feature"]])} + ) - return _json.dumps({"data_spec": { "incorrects": {"target": label, "data": data }}}) + "\n" + return ( + _json.dumps({"data_spec": {"incorrects": {"target": label, "data": data}}}) + + "\n" + ) -def __get_corrects(sf, evaluation): - conf_metric = evaluation._data["confidence_metric_for_threshold"] - sf = sf.filter_by([True], "correct") +def __get_corrects(sf, evaluation): + conf_metric = evaluation._data["confidence_metric_for_threshold"] - unique_predictions = list(sf["target_label"].unique()) + sf = sf.filter_by([True], "correct") - data = [] - for u in unique_predictions: - u_filt = sf.filter_by([u], "predicted_label") - data.append({"target": u, "images": list(u_filt[evaluation._data['feature']])}) - return _json.dumps({"data_spec": { "correct": data}}) + "\n" + unique_predictions = list(sf["target_label"].unique()) -def _process_value(value, extended_sframe, proc, evaluation): - json_value = None + data = [] + for u in unique_predictions: + u_filt = sf.filter_by([u], "predicted_label") + data.append({"target": u, "images": list(u_filt[evaluation._data["feature"]])}) + return _json.dumps({"data_spec": {"correct": data}}) + "\n" - try: - json_value = _json.loads(value) - except: - pass - if json_value != None: - if(json_value['method'] == "get_rows_eval"): - proc.stdin.write(_get_data_spec(json_value['cells'], json_value['start'], json_value['length'], json_value['row_type'], json_value['mat_type'], extended_sframe, evaluation).encode('utf-8')) - proc.stdin.flush() - - if(json_value['method'] == "get_corrects"): - proc.stdin.write(__get_corrects(extended_sframe, evaluation).encode('utf-8')) - proc.stdin.flush() +def _process_value(value, extended_sframe, proc, evaluation): + json_value = None + + try: + json_value = _json.loads(value) + except: + pass + + if json_value != None: + if json_value["method"] == "get_rows_eval": + proc.stdin.write( + _get_data_spec( + json_value["cells"], + json_value["start"], + json_value["length"], + json_value["row_type"], + json_value["mat_type"], + extended_sframe, + evaluation, + ).encode("utf-8") + ) + proc.stdin.flush() + + if json_value["method"] == "get_corrects": + proc.stdin.write( + __get_corrects(extended_sframe, evaluation).encode("utf-8") + ) + proc.stdin.flush() + + if json_value["method"] == "get_incorrects": + proc.stdin.write( + __get_incorrects( + json_value["label"], extended_sframe, evaluation + ).encode("utf-8") + ) + proc.stdin.flush() - if(json_value['method'] == "get_incorrects"): - proc.stdin.write(__get_incorrects(json_value['label'], extended_sframe, evaluation).encode('utf-8')) - proc.stdin.flush() def _start_process(process_input, extended_sframe, evaluation): - proc = __subprocess.Popen(_get_client_app_path(), stdout=__subprocess.PIPE, stdin=__subprocess.PIPE) - proc.stdin.write(process_input.encode('utf-8')) - proc.stdin.flush() + proc = __subprocess.Popen( + _get_client_app_path(), stdout=__subprocess.PIPE, stdin=__subprocess.PIPE + ) + proc.stdin.write(process_input.encode("utf-8")) + proc.stdin.flush() + + # https://docs.python.org/2/library/subprocess.html#subprocess.Popen.communicate - #https://docs.python.org/2/library/subprocess.html#subprocess.Popen.communicate + while proc.poll() == None: + value = proc.stdout.readline() + if value == "": + continue - while(proc.poll() == None): - value = proc.stdout.readline() - if value == '': - continue + _process_value(value, extended_sframe, proc, evaluation) - _process_value(value, extended_sframe, proc, evaluation) + return proc - return proc def _image_resize(image): - # resize with decode=False will produce a PNG encoded image - # (even starting with a decoded image) - # this behavior is enforced in test/unity/image_util.cxx:test_resize - # then, we can go through this function again and fall into the above case. - # while we're resizing (which implies decoding) anyway, let's take - # the opportunity to reduce the size if it's quite large. - width = image.width - height = image.height - while (width > 400): - width = width / 2 - height = height / 2 - - # if already in jpeg and the size isn't changing, force a decode/encode - # to convert to PNG - if image._format_enum == 0 and width == image.width: - image = _tc.image_analysis.resize(image, width=width, height=height, decode=True) - - # now resize with decode=False to get PNG - image = _tc.image_analysis.resize(image, width=int(width), height=int(height), decode=False) - assert(image._format_enum == 1) # png - - return image + # resize with decode=False will produce a PNG encoded image + # (even starting with a decoded image) + # this behavior is enforced in test/unity/image_util.cxx:test_resize + # then, we can go through this function again and fall into the above case. + # while we're resizing (which implies decoding) anyway, let's take + # the opportunity to reduce the size if it's quite large. + width = image.width + height = image.height + while width > 400: + width = width / 2 + height = height / 2 + + # if already in jpeg and the size isn't changing, force a decode/encode + # to convert to PNG + if image._format_enum == 0 and width == image.width: + image = _tc.image_analysis.resize( + image, width=width, height=height, decode=True + ) + + # now resize with decode=False to get PNG + image = _tc.image_analysis.resize( + image, width=int(width), height=int(height), decode=False + ) + assert image._format_enum == 1 # png + + return image + def _image_conversion(image): - result = { - "width": image.width, - "height": image.height, - "column": "image", - "format": "png" - } - - result["data"] = _base64.b64encode(_image_resize(image)._image_data) - return result + result = { + "width": image.width, + "height": image.height, + "column": "image", + "format": "png", + } + + result["data"] = _base64.b64encode(_image_resize(image)._image_data) + return result diff --git a/src/python/turicreate/toolkits/image_classifier/image_classifier.py b/src/python/turicreate/toolkits/image_classifier/image_classifier.py index 4495fd86ba..4fbcbbdf2f 100644 --- a/src/python/turicreate/toolkits/image_classifier/image_classifier.py +++ b/src/python/turicreate/toolkits/image_classifier/image_classifier.py @@ -25,25 +25,32 @@ from turicreate.toolkits import _coreml_utils _DEFAULT_SOLVER_OPTIONS = { -'convergence_threshold': 1e-2, -'step_size': 1.0, -'lbfgs_memory_level': 11, -'max_iterations': 10} - - -def create(dataset, target, feature=None, model = 'resnet-50', + "convergence_threshold": 1e-2, + "step_size": 1.0, + "lbfgs_memory_level": 11, + "max_iterations": 10, +} + + +def create( + dataset, + target, + feature=None, + model="resnet-50", l2_penalty=0.01, l1_penalty=0.0, - solver='auto', feature_rescaling=True, - convergence_threshold = _DEFAULT_SOLVER_OPTIONS['convergence_threshold'], - step_size = _DEFAULT_SOLVER_OPTIONS['step_size'], - lbfgs_memory_level = _DEFAULT_SOLVER_OPTIONS['lbfgs_memory_level'], - max_iterations = _DEFAULT_SOLVER_OPTIONS['max_iterations'], - class_weights = None, - validation_set = 'auto', + solver="auto", + feature_rescaling=True, + convergence_threshold=_DEFAULT_SOLVER_OPTIONS["convergence_threshold"], + step_size=_DEFAULT_SOLVER_OPTIONS["step_size"], + lbfgs_memory_level=_DEFAULT_SOLVER_OPTIONS["lbfgs_memory_level"], + max_iterations=_DEFAULT_SOLVER_OPTIONS["max_iterations"], + class_weights=None, + validation_set="auto", verbose=True, seed=None, - batch_size=64): + batch_size=64, +): """ Create a :class:`ImageClassifier` model. @@ -219,82 +226,98 @@ def create(dataset, target, feature=None, model = 'resnet-50', # Check model parameter allowed_models = list(_pre_trained_models.IMAGE_MODELS.keys()) - if _mac_ver() >= (10,14): - allowed_models.append('VisionFeaturePrint_Scene') - _tkutl._check_categorical_option_type('model', model, allowed_models) + if _mac_ver() >= (10, 14): + allowed_models.append("VisionFeaturePrint_Scene") + _tkutl._check_categorical_option_type("model", model, allowed_models) # Check dataset parameter if not isinstance(dataset, _tc.SFrame): raise TypeError("Unrecognized type for 'dataset'. An SFrame is expected.") if len(dataset) == 0: - raise _ToolkitError('Unable to train on empty dataset') + raise _ToolkitError("Unable to train on empty dataset") if (feature is not None) and (feature not in dataset.column_names()): raise _ToolkitError("Image feature column '%s' does not exist" % feature) if target not in dataset.column_names(): raise _ToolkitError("Target column '%s' does not exist" % target) - if(batch_size < 1): + if batch_size < 1: raise ValueError("'batch_size' must be greater than or equal to 1") - if not (isinstance(validation_set, _tc.SFrame) or validation_set == 'auto' or validation_set is None): + if not ( + isinstance(validation_set, _tc.SFrame) + or validation_set == "auto" + or validation_set is None + ): raise TypeError("Unrecognized value for 'validation_set'.") if feature is None: feature = _tkutl._find_only_image_column(dataset) - _tkutl._handle_missing_values(dataset, feature, 'training_dataset') + _tkutl._handle_missing_values(dataset, feature, "training_dataset") feature_extractor = _image_feature_extractor._create_feature_extractor(model) # Extract features - extracted_features = _tc.SFrame({ - target: dataset[target], - '__image_features__': feature_extractor.extract_features(dataset, feature, verbose=verbose, batch_size=batch_size), - }) + extracted_features = _tc.SFrame( + { + target: dataset[target], + "__image_features__": feature_extractor.extract_features( + dataset, feature, verbose=verbose, batch_size=batch_size + ), + } + ) if isinstance(validation_set, _tc.SFrame): - _tkutl._handle_missing_values(dataset, feature, 'validation_set') - extracted_features_validation = _tc.SFrame({ - target: validation_set[target], - '__image_features__': feature_extractor.extract_features(validation_set, feature, verbose=verbose, batch_size=batch_size), - }) + _tkutl._handle_missing_values(dataset, feature, "validation_set") + extracted_features_validation = _tc.SFrame( + { + target: validation_set[target], + "__image_features__": feature_extractor.extract_features( + validation_set, feature, verbose=verbose, batch_size=batch_size + ), + } + ) else: extracted_features_validation = validation_set # Train a classifier using the extracted features extracted_features[target] = dataset[target] - lr_model = _tc.logistic_classifier.create(extracted_features, - features=['__image_features__'], - target=target, - max_iterations=max_iterations, - validation_set=extracted_features_validation, - seed=seed, - verbose=verbose, l2_penalty=l2_penalty, l1_penalty=l1_penalty, - solver=solver, feature_rescaling=feature_rescaling, - convergence_threshold=convergence_threshold, - step_size=step_size, - lbfgs_memory_level=lbfgs_memory_level, - class_weights=class_weights) - + lr_model = _tc.logistic_classifier.create( + extracted_features, + features=["__image_features__"], + target=target, + max_iterations=max_iterations, + validation_set=extracted_features_validation, + seed=seed, + verbose=verbose, + l2_penalty=l2_penalty, + l1_penalty=l1_penalty, + solver=solver, + feature_rescaling=feature_rescaling, + convergence_threshold=convergence_threshold, + step_size=step_size, + lbfgs_memory_level=lbfgs_memory_level, + class_weights=class_weights, + ) # set input image shape if model in _pre_trained_models.IMAGE_MODELS: input_image_shape = _pre_trained_models.IMAGE_MODELS[model].input_image_shape - else: # model == VisionFeaturePrint_Scene + else: # model == VisionFeaturePrint_Scene input_image_shape = (3, 299, 299) # Save the model state = { - 'classifier': lr_model, - 'model': model, - 'max_iterations': max_iterations, - 'feature_extractor': feature_extractor, - 'input_image_shape': input_image_shape, - 'target': target, - 'feature': feature, - 'num_features': 1, - 'num_classes': lr_model.num_classes, - 'classes': lr_model.classes, - 'num_examples': lr_model.num_examples, - 'training_time': _time.time() - start_time, - 'training_loss': lr_model.training_loss, + "classifier": lr_model, + "model": model, + "max_iterations": max_iterations, + "feature_extractor": feature_extractor, + "input_image_shape": input_image_shape, + "target": target, + "feature": feature, + "num_features": 1, + "num_classes": lr_model.num_classes, + "classes": lr_model.classes, + "num_examples": lr_model.num_examples, + "training_time": _time.time() - start_time, + "training_loss": lr_model.training_loss, } return ImageClassifier(state) @@ -324,9 +347,9 @@ def _get_native_state(self): :py:func:`~turicreate.load_model` method. """ state = self.__proxy__.get_state() - state['classifier'] = state['classifier'].__proxy__ - del state['feature_extractor'] - del state['classes'] + state["classifier"] = state["classifier"].__proxy__ + del state["feature_extractor"] + del state["classes"] return state @classmethod @@ -336,21 +359,28 @@ def _load_version(cls, state, version): instance. """ _tkutl._model_version_check(version, cls._PYTHON_IMAGE_CLASSIFIER_VERSION) - from turicreate.toolkits.classifier.logistic_classifier import LogisticClassifier - state['classifier'] = LogisticClassifier(state['classifier']) - state['classes'] = state['classifier'].classes + from turicreate.toolkits.classifier.logistic_classifier import ( + LogisticClassifier, + ) + + state["classifier"] = LogisticClassifier(state["classifier"]) + state["classes"] = state["classifier"].classes # Correct models saved with a previous typo - if state['model'] == "VisionFeaturePrint_Screen": - state['model'] = "VisionFeaturePrint_Scene" + if state["model"] == "VisionFeaturePrint_Screen": + state["model"] = "VisionFeaturePrint_Scene" # Load pre-trained model & feature extractor - model_name = state['model'] - if model_name == "VisionFeaturePrint_Scene" and _mac_ver() < (10,14): - raise ToolkitError("Can not load model on this operating system. This model uses VisionFeaturePrint_Scene, " - "which is only supported on macOS 10.14 and higher.") - state['feature_extractor'] = _image_feature_extractor._create_feature_extractor(model_name) - state['input_image_shape'] = tuple([int(i) for i in state['input_image_shape']]) + model_name = state["model"] + if model_name == "VisionFeaturePrint_Scene" and _mac_ver() < (10, 14): + raise ToolkitError( + "Can not load model on this operating system. This model uses VisionFeaturePrint_Scene, " + "which is only supported on macOS 10.14 and higher." + ) + state["feature_extractor"] = _image_feature_extractor._create_feature_extractor( + model_name + ) + state["input_image_shape"] = tuple([int(i) for i in state["input_image_shape"]]) return ImageClassifier(state) def __str__(self): @@ -373,8 +403,7 @@ def __repr__(self): width = 40 sections, section_titles = self._get_summary_struct() - out = _tkutl._toolkit_repr_print(self, sections, section_titles, - width=width) + out = _tkutl._toolkit_repr_print(self, sections, section_titles, width=width) return out def _get_summary_struct(self): @@ -395,18 +424,18 @@ def _get_summary_struct(self): The order matches that of the 'sections' object. """ model_fields = [ - ('Number of classes', 'num_classes'), - ('Number of feature columns', 'num_features'), - ('Input image shape', 'input_image_shape'), + ("Number of classes", "num_classes"), + ("Number of feature columns", "num_features"), + ("Input image shape", "input_image_shape"), ] training_fields = [ - ('Number of examples', 'num_examples'), - ("Training loss", 'training_loss'), - ("Training time (sec)", 'training_time'), + ("Number of examples", "num_examples"), + ("Training loss", "training_loss"), + ("Training time (sec)", "training_time"), ] - section_titles = ['Schema', 'Training summary'] - return([model_fields, training_fields], section_titles) + section_titles = ["Schema", "Training summary"] + return ([model_fields, training_fields], section_titles) def _canonize_input(self, dataset): """ @@ -422,7 +451,7 @@ def _canonize_input(self, dataset): unpack = lambda x: x[0] return dataset, unpack - def predict(self, dataset, output_type='class', batch_size=64): + def predict(self, dataset, output_type="class", batch_size=64): """ Return predictions for ``dataset``, using the trained logistic regression model. Predictions can be generated as class labels, @@ -481,14 +510,18 @@ class as a vector. The probability of the first class (sorted """ if not isinstance(dataset, (_tc.SFrame, _tc.SArray, _tc.Image)): - raise TypeError('dataset must be either an SFrame, SArray or turicreate.Image') - if(batch_size < 1): + raise TypeError( + "dataset must be either an SFrame, SArray or turicreate.Image" + ) + if batch_size < 1: raise ValueError("'batch_size' must be greater than or equal to 1") dataset, unpack = self._canonize_input(dataset) extracted_features = self._extract_features(dataset, batch_size=batch_size) - return unpack(self.classifier.predict(extracted_features, output_type=output_type)) + return unpack( + self.classifier.predict(extracted_features, output_type=output_type) + ) def classify(self, dataset, batch_size=64): """ @@ -526,8 +559,10 @@ def classify(self, dataset, batch_size=64): """ if not isinstance(dataset, (_tc.SFrame, _tc.SArray, _tc.Image)): - raise TypeError('dataset must be either an SFrame, SArray or turicreate.Image') - if(batch_size < 1): + raise TypeError( + "dataset must be either an SFrame, SArray or turicreate.Image" + ) + if batch_size < 1: raise ValueError("'batch_size' must be greater than or equal to 1") dataset, unpack = self._canonize_input(dataset) @@ -591,16 +626,20 @@ def predict_topk(self, dataset, output_type="probability", k=3, batch_size=64): [35688 rows x 3 columns] """ if not isinstance(dataset, (_tc.SFrame, _tc.SArray, _tc.Image)): - raise TypeError('dataset must be either an SFrame, SArray or turicreate.Image') - if(batch_size < 1): + raise TypeError( + "dataset must be either an SFrame, SArray or turicreate.Image" + ) + if batch_size < 1: raise ValueError("'batch_size' must be greater than or equal to 1") dataset, _ = self._canonize_input(dataset) extracted_features = self._extract_features(dataset) - return self.classifier.predict_topk(extracted_features, output_type = output_type, k = k) + return self.classifier.predict_topk( + extracted_features, output_type=output_type, k=k + ) - def evaluate(self, dataset, metric='auto', verbose=True, batch_size=64): + def evaluate(self, dataset, metric="auto", verbose=True, batch_size=64): """ Evaluate the model by making predictions of target values and comparing these to actual values. @@ -653,82 +692,156 @@ def evaluate(self, dataset, metric='auto', verbose=True, batch_size=64): >>> results = model.evaluate(data) >>> print results['accuracy'] """ - if(batch_size < 1): + if batch_size < 1: raise ValueError("'batch_size' must be greater than or equal to 1") if self.target not in dataset.column_names(): raise _ToolkitError("Target column '%s' does not exist" % self.target) - extracted_features = self._extract_features(dataset, verbose=verbose, batch_size=batch_size) + extracted_features = self._extract_features( + dataset, verbose=verbose, batch_size=batch_size + ) extracted_features[self.target] = dataset[self.target] - metrics = self.classifier.evaluate(extracted_features, metric=metric, with_predictions=True) + metrics = self.classifier.evaluate( + extracted_features, metric=metric, with_predictions=True + ) predictions = metrics["predictions"]["probs"] state = self.__proxy__.get_state() labels = state["classes"] - from .._evaluate_utils import ( + from .._evaluate_utils import ( entropy, confidence, relative_confidence, get_confusion_matrix, hclusterSort, - l2Dist + l2Dist, ) - evaluation_result = {k: metrics[k] for k in ['accuracy', 'f1_score', 'log_loss', 'precision', - 'recall', 'auc', 'roc_curve', 'confusion_matrix']} - evaluation_result['num_test_examples'] = len(dataset) - for k in ['num_classes', 'num_features', 'input_image_shape', 'num_examples', 'training_loss', 'training_time', 'model', 'max_iterations']: + evaluation_result = { + k: metrics[k] + for k in [ + "accuracy", + "f1_score", + "log_loss", + "precision", + "recall", + "auc", + "roc_curve", + "confusion_matrix", + ] + } + evaluation_result["num_test_examples"] = len(dataset) + for k in [ + "num_classes", + "num_features", + "input_image_shape", + "num_examples", + "training_loss", + "training_time", + "model", + "max_iterations", + ]: evaluation_result[k] = getattr(self, k) # Extend the given test data - extended_test = dataset.add_column(predictions, 'probs') - extended_test['label'] = dataset[self.target] - extended_test = extended_test.add_columns( [extended_test.apply(lambda d: labels[d['probs'].index(confidence(d['probs']))]), - extended_test.apply(lambda d: entropy(d['probs'])), - extended_test.apply(lambda d: confidence(d['probs'])), - extended_test.apply(lambda d: relative_confidence(d['probs']))], - ['predicted_label', 'entropy', 'confidence', 'relative_confidence']) - extended_test = extended_test.add_column(extended_test.apply(lambda d: d['label'] == d['predicted_label']), 'correct') - - evaluation_result['model_name'] = state['model'] + extended_test = dataset.add_column(predictions, "probs") + extended_test["label"] = dataset[self.target] + extended_test = extended_test.add_columns( + [ + extended_test.apply( + lambda d: labels[d["probs"].index(confidence(d["probs"]))] + ), + extended_test.apply(lambda d: entropy(d["probs"])), + extended_test.apply(lambda d: confidence(d["probs"])), + extended_test.apply(lambda d: relative_confidence(d["probs"])), + ], + ["predicted_label", "entropy", "confidence", "relative_confidence"], + ) + extended_test = extended_test.add_column( + extended_test.apply(lambda d: d["label"] == d["predicted_label"]), "correct" + ) + + evaluation_result["model_name"] = state["model"] # Calculate the confusion matrix sf_conf_mat = get_confusion_matrix(extended_test, labels) confidence_threshold = 0.5 hesitant_threshold = 0.2 - evaluation_result['confidence_threshold'] = confidence_threshold - evaluation_result['hesitant_threshold'] = hesitant_threshold - evaluation_result['confidence_metric_for_threshold'] = 'relative_confidence' + evaluation_result["confidence_threshold"] = confidence_threshold + evaluation_result["hesitant_threshold"] = hesitant_threshold + evaluation_result["confidence_metric_for_threshold"] = "relative_confidence" - evaluation_result['conf_mat'] = list(sf_conf_mat) + evaluation_result["conf_mat"] = list(sf_conf_mat) # Get sorted labels (sorted by hCluster) - vectors = map(lambda l: {'name': l, 'pos':list(sf_conf_mat[sf_conf_mat['target_label']==l].sort('predicted_label')['norm_prob'])}, - labels) - evaluation_result['sorted_labels'] = hclusterSort(vectors, l2Dist)[0]['name'].split("|") + vectors = map( + lambda l: { + "name": l, + "pos": list( + sf_conf_mat[sf_conf_mat["target_label"] == l].sort( + "predicted_label" + )["norm_prob"] + ), + }, + labels, + ) + evaluation_result["sorted_labels"] = hclusterSort(vectors, l2Dist)[0][ + "name" + ].split("|") # Get recall and precision per label - per_l = extended_test.groupby(['label'], {'count': _tc.aggregate.COUNT, 'correct_count': _tc.aggregate.SUM('correct') }) - per_l['recall'] = per_l.apply(lambda l: l['correct_count']*1.0 / l['count']) - - per_pl = extended_test.groupby(['predicted_label'], {'predicted_count': _tc.aggregate.COUNT, 'correct_count': _tc.aggregate.SUM('correct') }) - per_pl['precision'] = per_pl.apply(lambda l: l['correct_count']*1.0 / l['predicted_count']) - per_pl = per_pl.rename({'predicted_label': 'label'}) - evaluation_result['label_metrics'] = list(per_l.join(per_pl, on='label', how='outer').select_columns(['label', 'count', 'correct_count', 'predicted_count', 'recall', 'precision'])) - evaluation_result['labels'] = labels + per_l = extended_test.groupby( + ["label"], + { + "count": _tc.aggregate.COUNT, + "correct_count": _tc.aggregate.SUM("correct"), + }, + ) + per_l["recall"] = per_l.apply(lambda l: l["correct_count"] * 1.0 / l["count"]) + + per_pl = extended_test.groupby( + ["predicted_label"], + { + "predicted_count": _tc.aggregate.COUNT, + "correct_count": _tc.aggregate.SUM("correct"), + }, + ) + per_pl["precision"] = per_pl.apply( + lambda l: l["correct_count"] * 1.0 / l["predicted_count"] + ) + per_pl = per_pl.rename({"predicted_label": "label"}) + evaluation_result["label_metrics"] = list( + per_l.join(per_pl, on="label", how="outer").select_columns( + [ + "label", + "count", + "correct_count", + "predicted_count", + "recall", + "precision", + ] + ) + ) + evaluation_result["labels"] = labels - extended_test = extended_test.add_row_number('__idx').rename({'label': 'target_label'}) + extended_test = extended_test.add_row_number("__idx").rename( + {"label": "target_label"} + ) - evaluation_result['test_data'] = extended_test - evaluation_result['feature'] = self.feature + evaluation_result["test_data"] = extended_test + evaluation_result["feature"] = self.feature return _Evaluation(evaluation_result) def _extract_features(self, dataset, verbose=False, batch_size=64): - return _tc.SFrame({ - '__image_features__': self.feature_extractor.extract_features(dataset, self.feature, verbose=verbose, batch_size=batch_size) - }) + return _tc.SFrame( + { + "__image_features__": self.feature_extractor.extract_features( + dataset, self.feature, verbose=verbose, batch_size=batch_size + ) + } + ) def export_coreml(self, filename): """ @@ -743,12 +856,12 @@ def export_coreml(self, filename): >>> model.export_coreml('myModel.mlmodel') """ import coremltools - # First define three internal helper functions + # First define three internal helper functions # Internal helper function def _create_vision_feature_print_scene(): - prob_name = self.target + 'Probability' + prob_name = self.target + "Probability" # # Setup the top level (pipeline classifier) spec @@ -767,7 +880,9 @@ def _create_vision_feature_print_scene(): input.name = self.feature input.type.imageType.width = 299 input.type.imageType.height = 299 - BGR_VALUE = coremltools.proto.FeatureTypes_pb2.ImageFeatureType.ColorSpace.Value('BGR') + BGR_VALUE = coremltools.proto.FeatureTypes_pb2.ImageFeatureType.ColorSpace.Value( + "BGR" + ) input.type.imageType.colorSpace = BGR_VALUE # @@ -786,7 +901,9 @@ def _create_vision_feature_print_scene(): output = scene_print.description.output.add() output.name = "output_name" - DOUBLE_ARRAY_VALUE = coremltools.proto.FeatureTypes_pb2.ArrayFeatureType.ArrayDataType.Value('DOUBLE') + DOUBLE_ARRAY_VALUE = coremltools.proto.FeatureTypes_pb2.ArrayFeatureType.ArrayDataType.Value( + "DOUBLE" + ) output.type.multiArrayType.dataType = DOUBLE_ARRAY_VALUE output.type.multiArrayType.shape.append(2048) @@ -808,7 +925,7 @@ def _create_vision_feature_print_scene(): # Softmax layer softmax = nn_spec.layers.add() softmax.name = "softmax" - softmax.softmax.MergeFromString(b'') + softmax.softmax.MergeFromString(b"") softmax.input.append("softmax_input") softmax.output.append(prob_name) @@ -825,18 +942,17 @@ def _create_vision_feature_print_scene(): label_output.name = self.target if type(self.classifier.classes[0]) == int: - prob_output.type.dictionaryType.int64KeyType.MergeFromString(b'') - label_output.type.int64Type.MergeFromString(b'') + prob_output.type.dictionaryType.int64KeyType.MergeFromString(b"") + label_output.type.int64Type.MergeFromString(b"") else: - prob_output.type.dictionaryType.stringKeyType.MergeFromString(b'') - label_output.type.stringType.MergeFromString(b'') + prob_output.type.dictionaryType.stringKeyType.MergeFromString(b"") + label_output.type.stringType.MergeFromString(b"") temp.description.predictedFeatureName = self.target temp.description.predictedProbabilitiesName = prob_name return top_spec - # Internal helper function def _update_last_two_layers(nn_spec): # Replace the softmax layer with new coeffients @@ -854,11 +970,13 @@ def _update_last_two_layers(nn_spec): del bias.floatValue[:] import numpy as np - W = np.array(coefs[coefs['index'] != None]['value'], ndmin = 2).reshape( - inputChannels, num_classes - 1, order = 'F') - b = coefs[coefs['index'] == None]['value'] + + W = np.array(coefs[coefs["index"] != None]["value"], ndmin=2).reshape( + inputChannels, num_classes - 1, order="F" + ) + b = coefs[coefs["index"] == None]["value"] Wa = np.hstack((np.zeros((inputChannels, 1)), W)) - weights.floatValue.extend(Wa.flatten(order = 'F')) + weights.floatValue.extend(Wa.flatten(order="F")) bias.floatValue.extend([0.0] + list(b)) # Internal helper function @@ -868,65 +986,72 @@ def _set_inputs_outputs_and_metadata(spec, nn_spec): probOutput = spec.description.output[0] classLabel = spec.description.output[1] - probOutput.type.dictionaryType.MergeFromString(b'') + probOutput.type.dictionaryType.MergeFromString(b"") if type(class_labels[0]) == int: - nn_spec.ClearField('int64ClassLabels') - probOutput.type.dictionaryType.int64KeyType.MergeFromString(b'') - classLabel.type.int64Type.MergeFromString(b'') + nn_spec.ClearField("int64ClassLabels") + probOutput.type.dictionaryType.int64KeyType.MergeFromString(b"") + classLabel.type.int64Type.MergeFromString(b"") del nn_spec.int64ClassLabels.vector[:] for c in class_labels: nn_spec.int64ClassLabels.vector.append(c) else: - nn_spec.ClearField('stringClassLabels') - probOutput.type.dictionaryType.stringKeyType.MergeFromString(b'') - classLabel.type.stringType.MergeFromString(b'') + nn_spec.ClearField("stringClassLabels") + probOutput.type.dictionaryType.stringKeyType.MergeFromString(b"") + classLabel.type.stringType.MergeFromString(b"") del nn_spec.stringClassLabels.vector[:] for c in class_labels: nn_spec.stringClassLabels.vector.append(c) - prob_name = self.target + 'Probability' + prob_name = self.target + "Probability" label_name = self.target old_output_name = nn_spec.layers[-1].name - coremltools.models.utils.rename_feature(spec, 'classLabel', label_name) + coremltools.models.utils.rename_feature(spec, "classLabel", label_name) coremltools.models.utils.rename_feature(spec, old_output_name, prob_name) if nn_spec.layers[-1].name == old_output_name: nn_spec.layers[-1].name = prob_name if nn_spec.labelProbabilityLayerName == old_output_name: nn_spec.labelProbabilityLayerName = prob_name - coremltools.models.utils.rename_feature(spec, 'data', self.feature) + coremltools.models.utils.rename_feature(spec, "data", self.feature) if len(nn_spec.preprocessing) > 0: nn_spec.preprocessing[0].featureName = self.feature mlmodel = coremltools.models.MLModel(spec) - model_type = 'image classifier (%s)' % self.model - mlmodel.short_description = _coreml_utils._mlmodel_short_description(model_type) - mlmodel.input_description[self.feature] = u'Input image' - mlmodel.output_description[prob_name] = 'Prediction probabilities' - mlmodel.output_description[label_name] = 'Class label of top prediction' + model_type = "image classifier (%s)" % self.model + mlmodel.short_description = _coreml_utils._mlmodel_short_description( + model_type + ) + mlmodel.input_description[self.feature] = u"Input image" + mlmodel.output_description[prob_name] = "Prediction probabilities" + mlmodel.output_description[label_name] = "Class label of top prediction" model_metadata = { - 'model': self.model, - 'target': self.target, - 'features': self.feature, - 'max_iterations': str(self.max_iterations), + "model": self.model, + "target": self.target, + "features": self.feature, + "max_iterations": str(self.max_iterations), } user_defined_metadata = model_metadata.update( - _coreml_utils._get_tc_version_info()) - _coreml_utils._set_model_metadata(mlmodel, - self.__class__.__name__, - user_defined_metadata, - version=ImageClassifier._PYTHON_IMAGE_CLASSIFIER_VERSION) + _coreml_utils._get_tc_version_info() + ) + _coreml_utils._set_model_metadata( + mlmodel, + self.__class__.__name__, + user_defined_metadata, + version=ImageClassifier._PYTHON_IMAGE_CLASSIFIER_VERSION, + ) return mlmodel # main part of the export_coreml function if self.model in _pre_trained_models.IMAGE_MODELS: ptModel = _pre_trained_models.IMAGE_MODELS[self.model]() - feature_extractor = _image_feature_extractor.TensorFlowFeatureExtractor(ptModel) + feature_extractor = _image_feature_extractor.TensorFlowFeatureExtractor( + ptModel + ) coreml_model = feature_extractor.get_coreml_model() spec = coreml_model.get_spec() nn_spec = spec.neuralNetworkClassifier - else: # model == VisionFeaturePrint_Scene + else: # model == VisionFeaturePrint_Scene spec = _create_vision_feature_print_scene() nn_spec = spec.pipelineClassifier.pipeline.models[1].neuralNetworkClassifier diff --git a/src/python/turicreate/toolkits/image_similarity/image_similarity.py b/src/python/turicreate/toolkits/image_similarity/image_similarity.py index 3f6c2c241c..7fc26563f2 100644 --- a/src/python/turicreate/toolkits/image_similarity/image_similarity.py +++ b/src/python/turicreate/toolkits/image_similarity/image_similarity.py @@ -21,8 +21,9 @@ from .. import _image_feature_extractor -def create(dataset, label = None, feature = None, model = 'resnet-50', verbose = True, - batch_size = 64): +def create( + dataset, label=None, feature=None, model="resnet-50", verbose=True, batch_size=64 +): """ Create a :class:`ImageSimilarityModel` model. @@ -103,23 +104,25 @@ def create(dataset, label = None, feature = None, model = 'resnet-50', verbose = # Check parameters allowed_models = list(_pre_trained_models.IMAGE_MODELS.keys()) - if _mac_ver() >= (10,14): - allowed_models.append('VisionFeaturePrint_Scene') + if _mac_ver() >= (10, 14): + allowed_models.append("VisionFeaturePrint_Scene") # Also, to make sure existing code doesn't break, replace incorrect name # with the correct name version if model == "VisionFeaturePrint_Screen": - print("WARNING: Correct spelling of model name is VisionFeaturePrint_Scene. VisionFeaturePrint_Screen will be removed in future releases.") + print( + "WARNING: Correct spelling of model name is VisionFeaturePrint_Scene. VisionFeaturePrint_Screen will be removed in future releases." + ) model = "VisionFeaturePrint_Scene" - _tkutl._check_categorical_option_type('model', model, allowed_models) + _tkutl._check_categorical_option_type("model", model, allowed_models) if len(dataset) == 0: - raise _ToolkitError('Unable to train on empty dataset') + raise _ToolkitError("Unable to train on empty dataset") if (label is not None) and (label not in dataset.column_names()): raise _ToolkitError("Row label column '%s' does not exist" % label) if (feature is not None) and (feature not in dataset.column_names()): raise _ToolkitError("Image feature column '%s' does not exist" % feature) - if(batch_size < 1): + if batch_size < 1: raise ValueError("'batch_size' must be greater than or equal to 1") # Set defaults @@ -129,34 +132,41 @@ def create(dataset, label = None, feature = None, model = 'resnet-50', verbose = feature_extractor = _image_feature_extractor._create_feature_extractor(model) # Extract features - extracted_features = _tc.SFrame({ - '__image_features__': feature_extractor.extract_features(dataset, feature, verbose=verbose, - batch_size=batch_size), - }) + extracted_features = _tc.SFrame( + { + "__image_features__": feature_extractor.extract_features( + dataset, feature, verbose=verbose, batch_size=batch_size + ), + } + ) # Train a similarity model using the extracted features if label is not None: extracted_features[label] = dataset[label] - nn_model = _tc.nearest_neighbors.create(extracted_features, label = label, - features = ['__image_features__'], verbose = verbose) + nn_model = _tc.nearest_neighbors.create( + extracted_features, + label=label, + features=["__image_features__"], + verbose=verbose, + ) # set input image shape if model in _pre_trained_models.IMAGE_MODELS: input_image_shape = _pre_trained_models.IMAGE_MODELS[model].input_image_shape - else: # model == VisionFeaturePrint_Scene + else: # model == VisionFeaturePrint_Scene input_image_shape = (3, 299, 299) # Save the model state = { - 'similarity_model': nn_model, - 'model': model, - 'feature_extractor': feature_extractor, - 'input_image_shape': input_image_shape, - 'label': label, - 'feature': feature, - 'num_features': 1, - 'num_examples': nn_model.num_examples, - 'training_time': _time.time() - start_time, + "similarity_model": nn_model, + "model": model, + "feature_extractor": feature_extractor, + "input_image_shape": input_image_shape, + "label": label, + "feature": feature, + "num_features": 1, + "num_examples": nn_model.num_examples, + "training_time": _time.time() - start_time, } return ImageSimilarityModel(state) @@ -199,8 +209,8 @@ def _get_native_state(self): >>> loaded_model = turicreate.load_model('my_model_file') """ state = self.__proxy__.get_state() - state['similarity_model'] = state['similarity_model'].__proxy__ - del state['feature_extractor'] + state["similarity_model"] = state["similarity_model"].__proxy__ + del state["feature_extractor"] return state @classmethod @@ -219,17 +229,22 @@ def _load_version(cls, state, version): """ _tkutl._model_version_check(version, cls._PYTHON_IMAGE_SIMILARITY_VERSION) from turicreate.toolkits.nearest_neighbors import NearestNeighborsModel - state['similarity_model'] = NearestNeighborsModel(state['similarity_model']) + + state["similarity_model"] = NearestNeighborsModel(state["similarity_model"]) # Correct models saved with a previous typo - if state['model'] == "VisionFeaturePrint_Screen": - state['model'] = "VisionFeaturePrint_Scene" - - if state['model'] == "VisionFeaturePrint_Scene" and _mac_ver() < (10,14): - raise ToolkitError("Can not load model on this operating system. This model uses VisionFeaturePrint_Scene, " - "which is only supported on macOS 10.14 and higher.") - state['feature_extractor'] = _image_feature_extractor._create_feature_extractor(state['model']) - state['input_image_shape'] = tuple([int(i) for i in state['input_image_shape']]) + if state["model"] == "VisionFeaturePrint_Screen": + state["model"] = "VisionFeaturePrint_Scene" + + if state["model"] == "VisionFeaturePrint_Scene" and _mac_ver() < (10, 14): + raise ToolkitError( + "Can not load model on this operating system. This model uses VisionFeaturePrint_Scene, " + "which is only supported on macOS 10.14 and higher." + ) + state["feature_extractor"] = _image_feature_extractor._create_feature_extractor( + state["model"] + ) + state["input_image_shape"] = tuple([int(i) for i in state["input_image_shape"]]) return ImageSimilarityModel(state) def __str__(self): @@ -252,8 +267,7 @@ def __repr__(self): width = 40 sections, section_titles = self._get_summary_struct() - out = _tkutl._toolkit_repr_print(self, sections, section_titles, - width=width) + out = _tkutl._toolkit_repr_print(self, sections, section_titles, width=width) return out def _get_summary_struct(self): @@ -274,22 +288,25 @@ def _get_summary_struct(self): The order matches that of the 'sections' object. """ model_fields = [ - ('Number of examples', 'num_examples'), - ('Number of feature columns', 'num_features'), - ('Input image shape', 'input_image_shape'), + ("Number of examples", "num_examples"), + ("Number of feature columns", "num_features"), + ("Input image shape", "input_image_shape"), ] training_fields = [ - ("Training time (sec)", 'training_time'), + ("Training time (sec)", "training_time"), ] - section_titles = ['Schema', 'Training summary'] - return([model_fields, training_fields], section_titles) + section_titles = ["Schema", "Training summary"] + return ([model_fields, training_fields], section_titles) - def _extract_features(self, dataset, verbose, batch_size = 64): - return _tc.SFrame({ - '__image_features__': self.feature_extractor.extract_features(dataset, self.feature, verbose=verbose, - batch_size=batch_size) - }) + def _extract_features(self, dataset, verbose, batch_size=64): + return _tc.SFrame( + { + "__image_features__": self.feature_extractor.extract_features( + dataset, self.feature, verbose=verbose, batch_size=batch_size + ) + } + ) def query(self, dataset, label=None, k=5, radius=None, verbose=True, batch_size=64): """ @@ -365,8 +382,10 @@ def query(self, dataset, label=None, k=5, radius=None, verbose=True, batch_size= +-------------+-----------------+----------------+------+ """ if not isinstance(dataset, (_tc.SFrame, _tc.SArray, _tc.Image)): - raise TypeError('dataset must be either an SFrame, SArray or turicreate.Image') - if(batch_size < 1): + raise TypeError( + "dataset must be either an SFrame, SArray or turicreate.Image" + ) + if batch_size < 1: raise ValueError("'batch_size' must be greater than or equal to 1") if isinstance(dataset, _tc.SArray): @@ -374,13 +393,23 @@ def query(self, dataset, label=None, k=5, radius=None, verbose=True, batch_size= elif isinstance(dataset, _tc.Image): dataset = _tc.SFrame({self.feature: [dataset]}) - extracted_features = self._extract_features(dataset, verbose=verbose, batch_size=batch_size) + extracted_features = self._extract_features( + dataset, verbose=verbose, batch_size=batch_size + ) if label is not None: extracted_features[label] = dataset[label] - return self.similarity_model.query(extracted_features, label, k, radius, verbose) - - def similarity_graph(self, k=5, radius=None, include_self_edges=False, - output_type='SGraph', verbose=True): + return self.similarity_model.query( + extracted_features, label, k, radius, verbose + ) + + def similarity_graph( + self, + k=5, + radius=None, + include_self_edges=False, + output_type="SGraph", + verbose=True, + ): """ Construct the similarity graph on the reference dataset, which is already stored in the model to find the top `k` similar images for each @@ -452,7 +481,9 @@ def similarity_graph(self, k=5, radius=None, include_self_edges=False, | 1 | 0 | 0.376430604494 | 1 | +----------+----------+----------------+------+ """ - return self.similarity_model.similarity_graph(k, radius, include_self_edges, output_type, verbose) + return self.similarity_model.similarity_graph( + k, radius, include_self_edges, output_type, verbose + ) def export_coreml(self, filename): """ @@ -507,21 +538,28 @@ def export_coreml(self, filename): import numpy as _np from copy import deepcopy import coremltools as _cmt - from coremltools.models import datatypes as _datatypes, neural_network as _neural_network + from coremltools.models import ( + datatypes as _datatypes, + neural_network as _neural_network, + ) from turicreate.toolkits import _coreml_utils # Get the reference data from the model proxy = self.similarity_model.__proxy__ - reference_data = _np.array(_tc.extensions._nearest_neighbors._nn_get_reference_data(proxy)) + reference_data = _np.array( + _tc.extensions._nearest_neighbors._nn_get_reference_data(proxy) + ) num_examples, embedding_size = reference_data.shape - output_name = 'distance' + output_name = "distance" output_features = [(output_name, _datatypes.Array(num_examples))] - if self.model != 'VisionFeaturePrint_Scene': + if self.model != "VisionFeaturePrint_Scene": # Get the Core ML spec for the feature extractor ptModel = _pre_trained_models.IMAGE_MODELS[self.model]() - feature_extractor = _image_feature_extractor.TensorFlowFeatureExtractor(ptModel) + feature_extractor = _image_feature_extractor.TensorFlowFeatureExtractor( + ptModel + ) feature_extractor_spec = feature_extractor.get_coreml_model().get_spec() input_name = feature_extractor.coreml_data_layer @@ -532,15 +570,20 @@ def export_coreml(self, filename): for l in layers: feature_extractor_spec.neuralNetwork.layers.append(l) - builder = _neural_network.NeuralNetworkBuilder(input_features, output_features, - spec=feature_extractor_spec) + builder = _neural_network.NeuralNetworkBuilder( + input_features, output_features, spec=feature_extractor_spec + ) feature_layer = feature_extractor.coreml_feature_layer - else: # self.model == VisionFeaturePrint_Scene + else: # self.model == VisionFeaturePrint_Scene # Create a pipleline that contains a VisionFeaturePrint followed by a # neural network. - BGR_VALUE = _cmt.proto.FeatureTypes_pb2.ImageFeatureType.ColorSpace.Value('BGR') - DOUBLE_ARRAY_VALUE = _cmt.proto.FeatureTypes_pb2.ArrayFeatureType.ArrayDataType.Value('DOUBLE') + BGR_VALUE = _cmt.proto.FeatureTypes_pb2.ImageFeatureType.ColorSpace.Value( + "BGR" + ) + DOUBLE_ARRAY_VALUE = _cmt.proto.FeatureTypes_pb2.ArrayFeatureType.ArrayDataType.Value( + "DOUBLE" + ) INPUT_IMAGE_SHAPE = 299 top_spec = _cmt.proto.Model_pb2.Model() @@ -570,7 +613,7 @@ def export_coreml(self, filename): input.type.imageType.height = 299 input.type.imageType.colorSpace = BGR_VALUE - feature_layer = 'VisionFeaturePrint_Scene_output' + feature_layer = "VisionFeaturePrint_Scene_output" output = scene_print.description.output.add() output.name = feature_layer output.type.multiArrayType.dataType = DOUBLE_ARRAY_VALUE @@ -578,7 +621,9 @@ def export_coreml(self, filename): # Neural network builder input_features = [(feature_layer, _datatypes.Array(2048))] - builder = _neural_network.NeuralNetworkBuilder(input_features, output_features) + builder = _neural_network.NeuralNetworkBuilder( + input_features, output_features + ) # To add the nearest neighbors model we add calculation of the euclidean # distance between the newly extracted query features (denoted by the vector u) @@ -586,31 +631,55 @@ def export_coreml(self, filename): # Calculation of sqrt((v_i-u)^2) = sqrt(v_i^2 - 2v_i*u + u^2) ensues. V = reference_data v_squared = (V * V).sum(axis=1) - builder.add_inner_product('v^2-2vu', W=-2 * V, b=v_squared, has_bias=True, - input_channels=embedding_size, output_channels=num_examples, - input_name=feature_layer, output_name='v^2-2vu') - - builder.add_unary('element_wise-u^2', mode='power', alpha=2, - input_name=feature_layer, output_name='element_wise-u^2') + builder.add_inner_product( + "v^2-2vu", + W=-2 * V, + b=v_squared, + has_bias=True, + input_channels=embedding_size, + output_channels=num_examples, + input_name=feature_layer, + output_name="v^2-2vu", + ) + + builder.add_unary( + "element_wise-u^2", + mode="power", + alpha=2, + input_name=feature_layer, + output_name="element_wise-u^2", + ) # Produce a vector of length num_examples with all values equal to u^2 - builder.add_inner_product('u^2', W=_np.ones((embedding_size, num_examples)), - b=None, has_bias=False, - input_channels=embedding_size, output_channels=num_examples, - input_name='element_wise-u^2', output_name='u^2') - - builder.add_elementwise('v^2-2vu+u^2', mode='ADD', - input_names=['v^2-2vu', 'u^2'], - output_name='v^2-2vu+u^2') + builder.add_inner_product( + "u^2", + W=_np.ones((embedding_size, num_examples)), + b=None, + has_bias=False, + input_channels=embedding_size, + output_channels=num_examples, + input_name="element_wise-u^2", + output_name="u^2", + ) + + builder.add_elementwise( + "v^2-2vu+u^2", + mode="ADD", + input_names=["v^2-2vu", "u^2"], + output_name="v^2-2vu+u^2", + ) # v^2-2vu+u^2=(v-u)^2 is non-negative but some computations on GPU may result in # small negative values. Apply RELU so we don't take the square root of negative values. - builder.add_activation('relu', non_linearity='RELU', - input_name='v^2-2vu+u^2', output_name='relu') - builder.add_unary('sqrt', mode='sqrt', input_name='relu', output_name=output_name) + builder.add_activation( + "relu", non_linearity="RELU", input_name="v^2-2vu+u^2", output_name="relu" + ) + builder.add_unary( + "sqrt", mode="sqrt", input_name="relu", output_name=output_name + ) # Finalize model - if self.model != 'VisionFeaturePrint_Scene': + if self.model != "VisionFeaturePrint_Scene": builder.set_input([input_name], [self.input_image_shape]) builder.set_output([output_name], [(num_examples,)]) _cmt.models.utils.rename_feature(builder.spec, input_name, self.feature) @@ -621,20 +690,25 @@ def export_coreml(self, filename): mlmodel = _cmt.models.MLModel(top_spec) # Add metadata - model_type = 'image similarity' + model_type = "image similarity" mlmodel.short_description = _coreml_utils._mlmodel_short_description(model_type) - mlmodel.input_description[self.feature] = u'Input image' - mlmodel.output_description[output_name] = u'Distances between the input and reference images' + mlmodel.input_description[self.feature] = u"Input image" + mlmodel.output_description[ + output_name + ] = u"Distances between the input and reference images" model_metadata = { - 'model': self.model, - 'num_examples': str(self.num_examples), + "model": self.model, + "num_examples": str(self.num_examples), } user_defined_metadata = model_metadata.update( - _coreml_utils._get_tc_version_info()) - _coreml_utils._set_model_metadata(mlmodel, - self.__class__.__name__, - user_defined_metadata, - version=ImageSimilarityModel._PYTHON_IMAGE_SIMILARITY_VERSION) + _coreml_utils._get_tc_version_info() + ) + _coreml_utils._set_model_metadata( + mlmodel, + self.__class__.__name__, + user_defined_metadata, + version=ImageSimilarityModel._PYTHON_IMAGE_SIMILARITY_VERSION, + ) mlmodel.save(filename) diff --git a/src/python/turicreate/toolkits/nearest_neighbors/_nearest_neighbors.py b/src/python/turicreate/toolkits/nearest_neighbors/_nearest_neighbors.py index 317e1b3964..7ccaeca116 100644 --- a/src/python/turicreate/toolkits/nearest_neighbors/_nearest_neighbors.py +++ b/src/python/turicreate/toolkits/nearest_neighbors/_nearest_neighbors.py @@ -29,6 +29,7 @@ import copy as _copy import six as _six + def _construct_auto_distance(feature_names, column_names, column_types, sample): """ Construct composite distance parameters based on selected features and their @@ -47,7 +48,9 @@ def _construct_auto_distance(feature_names, column_names, column_types, sample): for c in feature_names: if col_type_dict[c] == str: - composite_distance_params.append([[c], _turicreate.distances.levenshtein, 1]) + composite_distance_params.append( + [[c], _turicreate.distances.levenshtein, 1] + ) elif col_type_dict[c] == dict: composite_distance_params.append([[c], _turicreate.distances.jaccard, 1]) elif col_type_dict[c] == array.array: @@ -60,18 +63,29 @@ def _construct_auto_distance(feature_names, column_names, column_types, sample): elif col_type_dict[c] in [int, float, array.array, list]: numeric_cols.append(c) else: - raise TypeError("Unable to automatically determine a distance "+\ - "for column {}".format(c)) + raise TypeError( + "Unable to automatically determine a distance " + + "for column {}".format(c) + ) # Make the standalone numeric column distance component if len(numeric_cols) > 0: - composite_distance_params.append([numeric_cols, _turicreate.distances.euclidean, 1]) + composite_distance_params.append( + [numeric_cols, _turicreate.distances.euclidean, 1] + ) return composite_distance_params -def create(dataset, label=None, features=None, distance=None, method='auto', - verbose=True, **kwargs): +def create( + dataset, + label=None, + features=None, + distance=None, + method="auto", + verbose=True, + **kwargs +): """ Create a nearest neighbor model, which can be searched efficiently and quickly for the nearest neighbors of a query observation. If the `method` @@ -353,54 +367,58 @@ def create(dataset, label=None, features=None, distance=None, method='auto', ## Basic validation of the features input if features is not None and not isinstance(features, list): - raise TypeError("If specified, input 'features' must be a list of " + - "strings.") + raise TypeError( + "If specified, input 'features' must be a list of " + "strings." + ) ## Clean the method options and create the options dictionary - allowed_kwargs = ['leaf_size', 'num_tables', 'num_projections_per_table'] + allowed_kwargs = ["leaf_size", "num_tables", "num_projections_per_table"] _method_options = {} for k, v in kwargs.items(): if k in allowed_kwargs: _method_options[k] = v else: - raise _ToolkitError("'{}' is not a valid keyword argument".format(k) + - " for the nearest neighbors model. Please " + - "check for capitalization and other typos.") - + raise _ToolkitError( + "'{}' is not a valid keyword argument".format(k) + + " for the nearest neighbors model. Please " + + "check for capitalization and other typos." + ) ## Exclude inappropriate combinations of method an distance - if method == 'ball_tree' and (distance == 'cosine' - or distance == _turicreate.distances.cosine - or distance == _turicreate.distances.dot_product - or distance == 'transformed_dot_product' - or distance == _turicreate.distances.transformed_dot_product): - raise TypeError("The ball tree method does not work with 'cosine' " + - "or 'transformed_dot_product' distance." + - "Please use the 'brute_force' method for these distances.") - - - if method == 'lsh' and ('num_projections_per_table' not in _method_options): - if distance == 'jaccard' or distance == _turicreate.distances.jaccard: - _method_options['num_projections_per_table'] = 4 - elif distance == 'cosine' or distance == _turicreate.distances.cosine: - _method_options['num_projections_per_table'] = 16 + if method == "ball_tree" and ( + distance == "cosine" + or distance == _turicreate.distances.cosine + or distance == _turicreate.distances.dot_product + or distance == "transformed_dot_product" + or distance == _turicreate.distances.transformed_dot_product + ): + raise TypeError( + "The ball tree method does not work with 'cosine' " + + "or 'transformed_dot_product' distance." + + "Please use the 'brute_force' method for these distances." + ) + + if method == "lsh" and ("num_projections_per_table" not in _method_options): + if distance == "jaccard" or distance == _turicreate.distances.jaccard: + _method_options["num_projections_per_table"] = 4 + elif distance == "cosine" or distance == _turicreate.distances.cosine: + _method_options["num_projections_per_table"] = 16 else: - _method_options['num_projections_per_table'] = 8 + _method_options["num_projections_per_table"] = 8 ## Initial validation and processing of the label if label is None: - _label = _robust_column_name('__id', dataset.column_names()) + _label = _robust_column_name("__id", dataset.column_names()) _dataset = dataset.add_row_number(_label) else: _label = label _dataset = _copy.copy(dataset) - col_type_map = {c:_dataset[c].dtype for c in _dataset.column_names()} + col_type_map = {c: _dataset[c].dtype for c in _dataset.column_names()} _validate_row_label(_label, col_type_map) ref_labels = _dataset[_label] - ## Determine the internal list of available feature names (may still include # the row label name). if features is None: @@ -408,16 +426,16 @@ def create(dataset, label=None, features=None, distance=None, method='auto', else: _features = _copy.deepcopy(features) - ## Check if there's only one feature and it's the same as the row label. # This would also be trapped by the composite distance validation, but the # error message is not very informative for the user. free_features = set(_features).difference([_label]) if len(free_features) < 1: - raise _ToolkitError("The only available feature is the same as the " + - "row label column. Please specify features " + - "that are not also row labels.") - + raise _ToolkitError( + "The only available feature is the same as the " + + "row label column. Please specify features " + + "that are not also row labels." + ) ### Validate and preprocess the distance function ### --------------------------------------------- @@ -432,22 +450,23 @@ def create(dataset, label=None, features=None, distance=None, method='auto', distance = _copy.deepcopy(distance) # distance is a single name (except 'auto') or function handle. - elif (hasattr(distance, '__call__') or - (isinstance(distance, str) and not distance == 'auto')): + elif hasattr(distance, "__call__") or ( + isinstance(distance, str) and not distance == "auto" + ): distance = [[_features, distance, 1]] # distance is unspecified and needs to be constructed. - elif distance is None or distance == 'auto': + elif distance is None or distance == "auto": sample = _dataset.head() - distance = _construct_auto_distance(_features, - _dataset.column_names(), - _dataset.column_types(), - sample) + distance = _construct_auto_distance( + _features, _dataset.column_names(), _dataset.column_types(), sample + ) else: - raise TypeError("Input 'distance' not understood. The 'distance' " - " argument must be a string, function handle, or " + - "composite distance.") + raise TypeError( + "Input 'distance' not understood. The 'distance' " + " argument must be a string, function handle, or " + "composite distance." + ) ## Basic composite distance validation, remove the row label from all # feature lists, and convert string distance names into distance functions. @@ -457,7 +476,12 @@ def create(dataset, label=None, features=None, distance=None, method='auto', ## Raise an error if any distances are used with non-lists list_features_to_check = [] - sparse_distances = ['jaccard', 'weighted_jaccard', 'cosine', 'transformed_dot_product'] + sparse_distances = [ + "jaccard", + "weighted_jaccard", + "cosine", + "transformed_dot_product", + ] sparse_distances = [_turicreate.distances.__dict__[k] for k in sparse_distances] for d in distance: feature_names, dist, _ = d @@ -466,102 +490,123 @@ def create(dataset, label=None, features=None, distance=None, method='auto', if dist in sparse_distances: list_features_to_check.append(f) else: - raise TypeError("The chosen distance cannot currently be used " + - "on list-typed columns.") + raise TypeError( + "The chosen distance cannot currently be used " + + "on list-typed columns." + ) for f in list_features_to_check: only_str_lists = _validate_lists(_dataset[f], [str]) if not only_str_lists: - raise TypeError("Distances for sparse data, such as jaccard " + - "and weighted_jaccard, can only be used on " + - "lists containing only strings. Please modify " + - "any list features accordingly before creating " + - "the nearest neighbors model.") + raise TypeError( + "Distances for sparse data, such as jaccard " + + "and weighted_jaccard, can only be used on " + + "lists containing only strings. Please modify " + + "any list features accordingly before creating " + + "the nearest neighbors model." + ) ## Raise an error if any component has string features are in single columns for d in distance: feature_names, dist, _ = d if (len(feature_names) > 1) and (dist == _turicreate.distances.levenshtein): - raise ValueError("Levenshtein distance cannot be used with multiple " + - "columns. Please concatenate strings into a single " + - "column before creating the nearest neighbors model.") + raise ValueError( + "Levenshtein distance cannot be used with multiple " + + "columns. Please concatenate strings into a single " + + "column before creating the nearest neighbors model." + ) ## Get the union of feature names and make a clean dataset. clean_features = _get_composite_distance_features(distance) sf_clean = _tkutl._toolkits_select_columns(_dataset, clean_features) - ## Decide which method to use ## - If more than one distance component (specified either directly or # generated automatically because distance set to 'auto'), then do brute # force. if len(distance) > 1: - _method = 'brute_force' + _method = "brute_force" - if method != 'brute_force' and verbose is True: - print("Defaulting to brute force instead of ball tree because " +\ - "there are multiple distance components.") + if method != "brute_force" and verbose is True: + print( + "Defaulting to brute force instead of ball tree because " + + "there are multiple distance components." + ) else: - if method == 'auto': + if method == "auto": # get the total number of variables. Assume the number of elements in # array type columns does not change - num_variables = sum([len(x) if hasattr(x, '__iter__') else 1 - for x in _six.itervalues(sf_clean[0])]) + num_variables = sum( + [ + len(x) if hasattr(x, "__iter__") else 1 + for x in _six.itervalues(sf_clean[0]) + ] + ) # flag if all the features in the single composite are of numeric # type. - numeric_type_flag = all([x in [int, float, list, array.array] - for x in sf_clean.column_types()]) + numeric_type_flag = all( + [x in [int, float, list, array.array] for x in sf_clean.column_types()] + ) ## Conditions necessary for ball tree to work and be worth it - if ((distance[0][1] in ['euclidean', - 'manhattan', - _turicreate.distances.euclidean, - _turicreate.distances.manhattan]) - and numeric_type_flag is True - and num_variables <= 200): - - _method = 'ball_tree' + if ( + ( + distance[0][1] + in [ + "euclidean", + "manhattan", + _turicreate.distances.euclidean, + _turicreate.distances.manhattan, + ] + ) + and numeric_type_flag is True + and num_variables <= 200 + ): + + _method = "ball_tree" else: - _method = 'brute_force' + _method = "brute_force" else: _method = method - ## Pick the right model name for the method - if _method == 'ball_tree': - model_name = 'nearest_neighbors_ball_tree' + if _method == "ball_tree": + model_name = "nearest_neighbors_ball_tree" - elif _method == 'brute_force': - model_name = 'nearest_neighbors_brute_force' + elif _method == "brute_force": + model_name = "nearest_neighbors_brute_force" - elif _method == 'lsh': - model_name = 'nearest_neighbors_lsh' + elif _method == "lsh": + model_name = "nearest_neighbors_lsh" else: - raise ValueError("Method must be 'auto', 'ball_tree', 'brute_force', " + - "or 'lsh'.") - + raise ValueError( + "Method must be 'auto', 'ball_tree', 'brute_force', " + "or 'lsh'." + ) ## Package the model options opts = {} opts.update(_method_options) opts.update( - {'model_name': model_name, - 'ref_labels': ref_labels, - 'label': label, - 'sf_features': sf_clean, - 'composite_params': distance}) + { + "model_name": model_name, + "ref_labels": ref_labels, + "label": label, + "sf_features": sf_clean, + "composite_params": distance, + } + ) ## Construct the nearest neighbors model with QuietProgress(verbose): result = _turicreate.extensions._nearest_neighbors.train(opts) - model_proxy = result['model'] + model_proxy = result["model"] model = NearestNeighborsModel(model_proxy) return model @@ -582,11 +627,15 @@ class NearestNeighborsModel(_Model): def __init__(self, model_proxy): """___init__(self)""" self.__proxy__ = model_proxy - self.__name__ = 'nearest_neighbors' + self.__name__ = "nearest_neighbors" @classmethod def _native_name(cls): - return ["nearest_neighbors_ball_tree", "nearest_neighbors_brute_force", "nearest_neighbors_lsh"] + return [ + "nearest_neighbors_ball_tree", + "nearest_neighbors_brute_force", + "nearest_neighbors_lsh", + ] def __str__(self): """ @@ -618,32 +667,32 @@ def _get_summary_struct(self): """ model_fields = [ - ("Method", 'method'), - ("Number of distance components", 'num_distance_components'), - ("Number of examples", 'num_examples'), - ("Number of feature columns", 'num_features'), - ("Number of unpacked features", 'num_unpacked_features'), - ("Distance", 'distance_for_summary_struct'), - ("Total training time (seconds)", 'training_time')] - - ball_tree_fields = [ - ("Tree depth", 'tree_depth'), - ("Leaf size", 'leaf_size')] + ("Method", "method"), + ("Number of distance components", "num_distance_components"), + ("Number of examples", "num_examples"), + ("Number of feature columns", "num_features"), + ("Number of unpacked features", "num_unpacked_features"), + ("Distance", "distance_for_summary_struct"), + ("Total training time (seconds)", "training_time"), + ] + + ball_tree_fields = [("Tree depth", "tree_depth"), ("Leaf size", "leaf_size")] lsh_fields = [ - ("Number of hash tables", 'num_tables'), - ("Number of projections per table", 'num_projections_per_table')] + ("Number of hash tables", "num_tables"), + ("Number of projections per table", "num_projections_per_table"), + ] sections = [model_fields] - section_titles = ['Attributes'] + section_titles = ["Attributes"] - if (self.method == 'ball_tree'): + if self.method == "ball_tree": sections.append(ball_tree_fields) - section_titles.append('Ball Tree Attributes') + section_titles.append("Ball Tree Attributes") - if (self.method == 'lsh'): + if self.method == "lsh": sections.append(lsh_fields) - section_titles.append('LSH Attributes') + section_titles.append("LSH Attributes") return (sections, section_titles) @@ -666,7 +715,7 @@ def _list_fields(self): out : list List of fields queryable with the ``get`` method. """ - opts = {'model': self.__proxy__, 'model_name': self.__name__} + opts = {"model": self.__proxy__, "model_name": self.__name__} response = _turicreate.extensions._nearest_neighbors.list_fields(opts) return sorted(response.keys()) @@ -716,11 +765,9 @@ def _get(self, field): out Value of the requested field. """ - opts = {'model': self.__proxy__, - 'model_name': self.__name__, - 'field': field} + opts = {"model": self.__proxy__, "model_name": self.__name__, "field": field} response = _turicreate.extensions._nearest_neighbors.get_value(opts) - return response['value'] + return response["value"] def _training_stats(self): """ @@ -755,7 +802,7 @@ def _training_stats(self): 'tree_depth': 1} """ - opts = {'model': self.__proxy__, 'model_name': self.__name__} + opts = {"model": self.__proxy__, "model_name": self.__name__} return _turicreate.extensions._nearest_neighbors.training_stats(opts) def query(self, dataset, label=None, k=5, radius=None, verbose=True): @@ -871,8 +918,9 @@ def query(self, dataset, label=None, k=5, radius=None, verbose=True): else: if not label in dataset.column_names(): raise ValueError( - "Input 'label' must be a string matching the name of a " +\ - "column in the reference SFrame 'dataset'.") + "Input 'label' must be a string matching the name of a " + + "column in the reference SFrame 'dataset'." + ) if not dataset[label].dtype == str and not dataset[label].dtype == int: raise TypeError("The label column must contain integers or strings.") @@ -882,7 +930,6 @@ def query(self, dataset, label=None, k=5, radius=None, verbose=True): query_labels = dataset[label] - ## Validate neighborhood parameters 'k' and 'radius' if k is not None: if not isinstance(k, int): @@ -898,7 +945,6 @@ def query(self, dataset, label=None, k=5, radius=None, verbose=True): if radius < 0: raise ValueError("Input 'radius' must be non-negative.") - ## Set k and radius to special values to indicate 'None' if k is None: k = -1 @@ -906,20 +952,28 @@ def query(self, dataset, label=None, k=5, radius=None, verbose=True): if radius is None: radius = -1.0 - opts = {'model': self.__proxy__, - 'model_name': self.__name__, - 'features': sf_features, - 'query_labels': query_labels, - 'k': k, - 'radius': radius} + opts = { + "model": self.__proxy__, + "model_name": self.__name__, + "features": sf_features, + "query_labels": query_labels, + "k": k, + "radius": radius, + } with QuietProgress(verbose): result = _turicreate.extensions._nearest_neighbors.query(opts) - return result['neighbors'] + return result["neighbors"] - def similarity_graph(self, k=5, radius=None, include_self_edges=False, - output_type='SGraph', verbose=True): + def similarity_graph( + self, + k=5, + radius=None, + include_self_edges=False, + output_type="SGraph", + verbose=True, + ): """ Construct the similarity graph on the reference dataset, which is already stored in the model. This is conceptually very similar to @@ -1016,7 +1070,6 @@ def similarity_graph(self, k=5, radius=None, include_self_edges=False, if radius < 0: raise ValueError("Input 'radius' must be non-negative.") - ## Set k and radius to special values to indicate 'None' if k is None: k = -1 @@ -1024,22 +1077,24 @@ def similarity_graph(self, k=5, radius=None, include_self_edges=False, if radius is None: radius = -1.0 - opts = {'model': self.__proxy__, - 'model_name': self.__name__, - 'k': k, - 'radius': radius, - 'include_self_edges': include_self_edges} + opts = { + "model": self.__proxy__, + "model_name": self.__name__, + "k": k, + "radius": radius, + "include_self_edges": include_self_edges, + } with QuietProgress(verbose): result = _turicreate.extensions._nearest_neighbors.similarity_graph(opts) - knn = result['neighbors'] + knn = result["neighbors"] if output_type == "SFrame": return knn else: - sg = _SGraph(edges=knn, src_field='query_label', - dst_field='reference_label') + sg = _SGraph( + edges=knn, src_field="query_label", dst_field="reference_label" + ) return sg - diff --git a/src/python/turicreate/toolkits/object_detector/__init__.py b/src/python/turicreate/toolkits/object_detector/__init__.py index 34b0f47e3a..5fde4f0967 100644 --- a/src/python/turicreate/toolkits/object_detector/__init__.py +++ b/src/python/turicreate/toolkits/object_detector/__init__.py @@ -9,4 +9,4 @@ from .object_detector import create, ObjectDetector from . import util -__all__ = ['create', 'ObjectDetector', 'util'] +__all__ = ["create", "ObjectDetector", "util"] diff --git a/src/python/turicreate/toolkits/object_detector/_detection.py b/src/python/turicreate/toolkits/object_detector/_detection.py index 93d2993d8d..c53900cd90 100644 --- a/src/python/turicreate/toolkits/object_detector/_detection.py +++ b/src/python/turicreate/toolkits/object_detector/_detection.py @@ -11,16 +11,17 @@ def multi_range(*args): import itertools + return itertools.product(*[range(a) for a in args]) def bbox_to_ybox(bbox): """Convert from corner bounding box to center/shape""" return [ - (bbox[1] + bbox[3]) / 2, - (bbox[0] + bbox[2]) / 2, - (bbox[3] - bbox[1]), - (bbox[2] - bbox[0]), + (bbox[1] + bbox[3]) / 2, + (bbox[0] + bbox[2]) / 2, + (bbox[3] - bbox[1]), + (bbox[2] - bbox[0]), ] @@ -38,8 +39,11 @@ def intersection_over_union(bbs1, bbs2): bbs1_bc = bbs1[:, _np.newaxis] bbs2_bc = bbs2[_np.newaxis] - inter = (_np.maximum(_np.minimum(bbs1_bc[..., 2:], bbs2_bc[..., 2:]) - - _np.maximum(bbs1_bc[..., :2], bbs2_bc[..., :2]), 0)) + inter = _np.maximum( + _np.minimum(bbs1_bc[..., 2:], bbs2_bc[..., 2:]) + - _np.maximum(bbs1_bc[..., :2], bbs2_bc[..., :2]), + 0, + ) inter_area = inter[..., 0] * inter[..., 1] area1 = (bbs1_bc[..., 2] - bbs1_bc[..., 0]) * (bbs1_bc[..., 3] - bbs1_bc[..., 1]) area2 = (bbs2_bc[..., 2] - bbs2_bc[..., 0]) * (bbs2_bc[..., 3] - bbs2_bc[..., 1]) @@ -48,8 +52,9 @@ def intersection_over_union(bbs1, bbs2): # Class-independent NMS -def non_maximum_suppression(boxes, classes, scores, num_classes, threshold=0.5, - limit=None): +def non_maximum_suppression( + boxes, classes, scores, num_classes, threshold=0.5, limit=None +): np_scores = _np.array(scores) np_boxes = _np.array(boxes) np_classes = _np.array(classes) @@ -66,8 +71,8 @@ def non_maximum_suppression(boxes, classes, scores, num_classes, threshold=0.5, keep = _np.ones(len(c_scores)).astype(_np.bool) for i in range(len(c_scores)): if keep[i]: - ious = intersection_over_union(c_boxes[[i]], c_boxes[i+1:])[0] - keep[i + 1:] &= ious <= threshold + ious = intersection_over_union(c_boxes[[i]], c_boxes[i + 1 :])[0] + keep[i + 1 :] &= ious <= threshold c_scores = c_scores[keep] c_boxes = c_boxes[keep] @@ -89,8 +94,9 @@ def non_maximum_suppression(boxes, classes, scores, num_classes, threshold=0.5, return new_boxes, new_classes, new_scores -def yolo_map_to_bounding_boxes(output, anchors, confidence_threshold=0.3, - block_size=32, nms_thresh=0.5, limit=None): +def yolo_map_to_bounding_boxes( + output, anchors, confidence_threshold=0.3, block_size=32, nms_thresh=0.5, limit=None +): assert output.shape[0] == 1, "For now works on single images" num_anchors = output.shape[-2] num_classes = output.shape[-1] - 5 @@ -116,21 +122,24 @@ def yolo_map_to_bounding_boxes(output, anchors, confidence_threshold=0.3, confidence_in_class = class_score * confidence if confidence_in_class > confidence_threshold: - boxes.append([y - h/2, x - w/2, y + h/2, x + w/2]) + boxes.append([y - h / 2, x - w / 2, y + h / 2, x + w / 2]) classes.append(int(i)) scores.append(confidence_in_class) if nms_thresh is not None: boxes, classes, scores = non_maximum_suppression( - boxes, classes, scores, - num_classes=num_classes, threshold=nms_thresh, - limit=None) + boxes, + classes, + scores, + num_classes=num_classes, + threshold=nms_thresh, + limit=None, + ) return boxes, classes, scores -def yolo_boxes_to_yolo_map(gt_mxboxes, input_shape, output_shape, - num_classes, anchors): +def yolo_boxes_to_yolo_map(gt_mxboxes, input_shape, output_shape, num_classes, anchors): num_anchors = len(anchors) @@ -166,6 +175,6 @@ def yolo_boxes_to_yolo_map(gt_mxboxes, input_shape, output_shape, ymap[iy, ix, :, 3] = h ymap[iy, ix, :, 4] = 1 ymap[iy, ix, :, 5:] = 0 - ymap[iy, ix, :, 5+gt_cls] = 1 + ymap[iy, ix, :, 5 + gt_cls] = 1 return ymap diff --git a/src/python/turicreate/toolkits/object_detector/_evaluation.py b/src/python/turicreate/toolkits/object_detector/_evaluation.py index f67ae5786a..f404061871 100644 --- a/src/python/turicreate/toolkits/object_detector/_evaluation.py +++ b/src/python/turicreate/toolkits/object_detector/_evaluation.py @@ -9,42 +9,49 @@ import numpy as _np -def average_precision(predictions, - groundtruth, - class_to_index, - iou_thresholds): +def average_precision(predictions, groundtruth, class_to_index, iou_thresholds): aps = _np.zeros((len(iou_thresholds), len(class_to_index))) for classname, c in class_to_index.items(): - c_predictions = predictions[predictions['label'] == classname] - c_groundtruth = groundtruth[groundtruth['label'] == classname] + c_predictions = predictions[predictions["label"] == classname] + c_groundtruth = groundtruth[groundtruth["label"] == classname] - pred_sorted = c_predictions.sort_values('confidence', ascending=False) + pred_sorted = c_predictions.sort_values("confidence", ascending=False) num_pred = len(pred_sorted) tp = _np.zeros((len(iou_thresholds), num_pred)) fp = _np.zeros((len(iou_thresholds), num_pred)) gts_dict = {} for index, (_, row) in enumerate(pred_sorted.iterrows()): - if row['row_id'] in gts_dict: - gts = gts_dict[row['row_id']] + if row["row_id"] in gts_dict: + gts = gts_dict[row["row_id"]] else: - gts = c_groundtruth[c_groundtruth['row_id'] == row['row_id']].reset_index(drop=True) + gts = c_groundtruth[ + c_groundtruth["row_id"] == row["row_id"] + ].reset_index(drop=True) for i in range(len(iou_thresholds)): - gts['correct_%d' % i] = False - gts_dict[row['row_id']] = gts + gts["correct_%d" % i] = False + gts_dict[row["row_id"]] = gts if gts.size > 0: - x_lo = _np.maximum(gts['x'] - gts['width'] / 2, row['x'] - row['width'] / 2) - x_hi = _np.minimum(gts['x'] + gts['width'] / 2, row['x'] + row['width'] / 2) - y_lo = _np.maximum(gts['y'] - gts['height'] / 2, row['y'] - row['height'] / 2) - y_hi = _np.minimum(gts['y'] + gts['height'] / 2, row['y'] + row['height'] / 2) + x_lo = _np.maximum( + gts["x"] - gts["width"] / 2, row["x"] - row["width"] / 2 + ) + x_hi = _np.minimum( + gts["x"] + gts["width"] / 2, row["x"] + row["width"] / 2 + ) + y_lo = _np.maximum( + gts["y"] - gts["height"] / 2, row["y"] - row["height"] / 2 + ) + y_hi = _np.minimum( + gts["y"] + gts["height"] / 2, row["y"] + row["height"] / 2 + ) width = _np.maximum(x_hi - x_lo, 0) height = _np.maximum(y_hi - y_lo, 0) inter_area = width * height - pred_area = row['width'] * row['height'] - gt_area = gts['width'] * gts['height'] + pred_area = row["width"] * row["height"] + gt_area = gts["width"] * gts["height"] iou = inter_area / (pred_area + gt_area - inter_area) best_gt_index = iou.idxmax() @@ -53,7 +60,7 @@ def average_precision(predictions, best_iou = 0.0 for th_index, iou_threshold in enumerate(iou_thresholds): - col_index = gts.columns.get_loc('correct_%d' % th_index) + col_index = gts.columns.get_loc("correct_%d" % th_index) if best_iou > iou_threshold and not gts.iloc[best_gt_index, col_index]: gts.iloc[best_gt_index, col_index] = True tp[th_index, index] = 1 @@ -64,18 +71,27 @@ def average_precision(predictions, cum_tp = _np.cumsum(tp, axis=1) def pad1(x, v0, v1): - return _np.concatenate([_np.full((x.shape[0], 1), v0, dtype=_np.float64), - x, - _np.full((x.shape[0], 1), v1, dtype=_np.float64)], axis=1) + return _np.concatenate( + [ + _np.full((x.shape[0], 1), v0, dtype=_np.float64), + x, + _np.full((x.shape[0], 1), v1, dtype=_np.float64), + ], + axis=1, + ) recall = pad1(cum_tp / len(c_groundtruth), 0, 1) precision_non_monotonic = pad1(cum_tp / _np.maximum(cum_tp + cum_fp, 1), 0, 0) - precision = _np.maximum.accumulate(precision_non_monotonic[:, ::-1], axis=1)[:, ::-1] + precision = _np.maximum.accumulate(precision_non_monotonic[:, ::-1], axis=1)[ + :, ::-1 + ] rec_diff = _np.diff(recall, axis=1) - for th_index, (rec_diff_th, precision_th) in enumerate(zip(rec_diff, precision)): + for th_index, (rec_diff_th, precision_th) in enumerate( + zip(rec_diff, precision) + ): ii = _np.where(rec_diff_th > 0)[0] ap = (rec_diff_th[ii] * precision_th[ii + 1]).sum() aps[th_index, c] = ap diff --git a/src/python/turicreate/toolkits/object_detector/_sframe_loader.py b/src/python/turicreate/toolkits/object_detector/_sframe_loader.py index 2291f1863a..825761b021 100644 --- a/src/python/turicreate/toolkits/object_detector/_sframe_loader.py +++ b/src/python/turicreate/toolkits/object_detector/_sframe_loader.py @@ -13,15 +13,16 @@ from turicreate.toolkits._main import ToolkitError as _ToolkitError from ._detection import yolo_boxes_to_yolo_map as _yolo_boxes_to_yolo_map -_TMP_COL_RANDOM_ORDER = '_random_order' +_TMP_COL_RANDOM_ORDER = "_random_order" def _convert_image_to_raw(image): # Decode image and make sure it has 3 channels return _tc.image_analysis.resize(image, image.width, image.height, 3, decode=True) + def _is_rectangle_annotation(ann): - return 'type' not in ann or ann['type'] == 'rectangle' + return "type" not in ann or ann["type"] == "rectangle" def _is_valid_annotation(ann): @@ -31,12 +32,14 @@ def _is_valid_annotation(ann): # Not necessarily valid, but we bypass stricter checks (we simply do # not care about non-rectangle types) return True - return ('coordinates' in ann and - isinstance(ann['coordinates'], dict) and - set(ann['coordinates'].keys()) == {'x', 'y', 'width', 'height'} and - ann['coordinates']['width'] > 0 and - ann['coordinates']['height'] > 0 and - 'label' in ann) + return ( + "coordinates" in ann + and isinstance(ann["coordinates"], dict) + and set(ann["coordinates"].keys()) == {"x", "y", "width", "height"} + and ann["coordinates"]["width"] > 0 + and ann["coordinates"]["height"] > 0 + and "label" in ann + ) def _is_valid_annotations_list(anns): diff --git a/src/python/turicreate/toolkits/object_detector/_tf_image_augmenter.py b/src/python/turicreate/toolkits/object_detector/_tf_image_augmenter.py index c70841d6c1..4596fd0c82 100644 --- a/src/python/turicreate/toolkits/object_detector/_tf_image_augmenter.py +++ b/src/python/turicreate/toolkits/object_detector/_tf_image_augmenter.py @@ -16,27 +16,29 @@ tf.disable_v2_behavior() _DEFAULT_AUG_PARAMS = { - 'max_hue_adjust' : 0.05, - 'max_brightness' : 0.05, - 'max_contrast' : 1.25, - 'max_saturation' : 1.25, - 'skip_probability_flip' : 0.5, - 'min_aspect_ratio' : 0.8, - 'max_aspect_ratio' : 1.25, - 'min_area_fraction_crop' : 0.15, - 'max_area_fraction_crop' : 1.0, - 'min_area_fraction_pad' : 1.0, - 'max_area_fraction_pad' : 2.0, - 'max_attempts' : 50, - 'skip_probability_pad' : 0.1, - 'skip_probability_crop' : 0.1, - 'min_object_covered': 0.0, - 'min_eject_coverage': 0.5, - 'resize_method': 'turicreate', + "max_hue_adjust": 0.05, + "max_brightness": 0.05, + "max_contrast": 1.25, + "max_saturation": 1.25, + "skip_probability_flip": 0.5, + "min_aspect_ratio": 0.8, + "max_aspect_ratio": 1.25, + "min_area_fraction_crop": 0.15, + "max_area_fraction_crop": 1.0, + "min_area_fraction_pad": 1.0, + "max_area_fraction_pad": 2.0, + "max_attempts": 50, + "skip_probability_pad": 0.1, + "skip_probability_crop": 0.1, + "min_object_covered": 0.0, + "min_eject_coverage": 0.5, + "resize_method": "turicreate", } -def hue_augmenter(image, annotation, - max_hue_adjust=_DEFAULT_AUG_PARAMS["max_hue_adjust"]): + +def hue_augmenter( + image, annotation, max_hue_adjust=_DEFAULT_AUG_PARAMS["max_hue_adjust"] +): # Sample a random rotation around the color wheel. hue_adjust = 0.0 @@ -50,10 +52,14 @@ def hue_augmenter(image, annotation, # No geometry changes, so just copy the annotations. return image, annotation -def color_augmenter(image, annotation, - max_brightness=_DEFAULT_AUG_PARAMS["max_brightness"], - max_contrast=_DEFAULT_AUG_PARAMS["max_contrast"], - max_saturation=_DEFAULT_AUG_PARAMS["max_saturation"]): + +def color_augmenter( + image, + annotation, + max_brightness=_DEFAULT_AUG_PARAMS["max_brightness"], + max_contrast=_DEFAULT_AUG_PARAMS["max_contrast"], + max_saturation=_DEFAULT_AUG_PARAMS["max_saturation"], +): # Sample a random adjustment to brightness. if max_brightness is not None and max_brightness > 0: @@ -62,65 +68,81 @@ def color_augmenter(image, annotation, # Sample a random adjustment to contrast. if max_saturation is not None and max_saturation > 1.0: log_sat = np.log(max_saturation) - image = tf.image.random_saturation(image, lower=np.exp(-log_sat), upper=np.exp(log_sat)) + image = tf.image.random_saturation( + image, lower=np.exp(-log_sat), upper=np.exp(log_sat) + ) # Sample a random adjustment to saturation. if max_contrast is not None and max_contrast > 1.0: log_con = np.log(max_contrast) - image = tf.image.random_contrast(image, lower=np.exp(-log_con), upper=np.exp(log_con)) + image = tf.image.random_contrast( + image, lower=np.exp(-log_con), upper=np.exp(log_con) + ) image = tf.clip_by_value(image, 0, 1) # No geometry changes, so just copy the annotations. return image, annotation -def resize_augmenter(image, annotation, - output_shape): - resize_method = _DEFAULT_AUG_PARAMS['resize_method'] +def resize_augmenter(image, annotation, output_shape): + + resize_method = _DEFAULT_AUG_PARAMS["resize_method"] def resize_PIL_image(image, output_shape): - image *= 255. - image = image.astype('uint8') + image *= 255.0 + image = image.astype("uint8") pil_img = Image.fromarray(image) - resize_img = pil_img.resize((output_shape[1], output_shape[0]), resample=Image.BILINEAR) + resize_img = pil_img.resize( + (output_shape[1], output_shape[0]), resample=Image.BILINEAR + ) np_img = np.array(resize_img) np_img = np_img.astype(np.float32) - np_img /= 255. + np_img /= 255.0 return np_img def resize_turicreate_image(image, output_shape): - image *= 255. - image = image.astype('uint8') + image *= 255.0 + image = image.astype("uint8") FORMAT_RAW = 2 - tc_image = tc.Image(_image_data=image.tobytes(), - _width=image.shape[1], - _height=image.shape[0], - _channels=image.shape[2], - _format_enum=FORMAT_RAW, - _image_data_size=image.size) - tc_image = tc.image_analysis.resize(tc_image, output_shape[1], output_shape[0], resample='bilinear') + tc_image = tc.Image( + _image_data=image.tobytes(), + _width=image.shape[1], + _height=image.shape[0], + _channels=image.shape[2], + _format_enum=FORMAT_RAW, + _image_data_size=image.size, + ) + tc_image = tc.image_analysis.resize( + tc_image, output_shape[1], output_shape[0], resample="bilinear" + ) image = tc_image.pixel_data image = image.astype(np.float32) - image /= 255. + image /= 255.0 return image - if resize_method == 'tensorflow': + if resize_method == "tensorflow": new_height = tf.cast(output_shape[0], dtype=tf.int32) new_width = tf.cast(output_shape[1], dtype=tf.int32) # Determine the affine transform to apply and apply to the image itself. - image_scaled = tf.squeeze(tf.image.resize_bilinear( - tf.expand_dims(image, 0), [new_height, new_width]), [0]) + image_scaled = tf.squeeze( + tf.image.resize_bilinear(tf.expand_dims(image, 0), [new_height, new_width]), + [0], + ) - elif resize_method == 'PIL': - image_scaled = tf.numpy_function(func=resize_PIL_image, inp=[image, output_shape], Tout=[tf.float32]) + elif resize_method == "PIL": + image_scaled = tf.numpy_function( + func=resize_PIL_image, inp=[image, output_shape], Tout=[tf.float32] + ) - elif resize_method == 'turicreate': - image_scaled = tf.numpy_function(func=resize_turicreate_image, inp=[image, output_shape], Tout=[tf.float32]) + elif resize_method == "turicreate": + image_scaled = tf.numpy_function( + func=resize_turicreate_image, inp=[image, output_shape], Tout=[tf.float32] + ) else: - raise Exception('Non-supported resize method.') + raise Exception("Non-supported resize method.") image_clipped = tf.clip_by_value(image_scaled, 0.0, 1.0) annotation = tf.clip_by_value(annotation, 0.0, 1.0) @@ -129,7 +151,9 @@ def resize_turicreate_image(image, output_shape): return image_clipped, annotation -def horizontal_flip_augmenter(image, annotation, skip_probability=_DEFAULT_AUG_PARAMS["skip_probability_flip"]): +def horizontal_flip_augmenter( + image, annotation, skip_probability=_DEFAULT_AUG_PARAMS["skip_probability_flip"] +): if np.random.uniform(0.0, 1.0) < skip_probability: return image, annotation @@ -145,14 +169,17 @@ def horizontal_flip_augmenter(image, annotation, skip_probability=_DEFAULT_AUG_P return flipped_image, annotation -def padding_augmenter(image, - annotation, - skip_probability=_DEFAULT_AUG_PARAMS["skip_probability_pad"], - min_aspect_ratio=_DEFAULT_AUG_PARAMS["min_aspect_ratio"], - max_aspect_ratio=_DEFAULT_AUG_PARAMS["max_aspect_ratio"], - min_area_fraction=_DEFAULT_AUG_PARAMS["min_area_fraction_pad"], - max_area_fraction=_DEFAULT_AUG_PARAMS["max_area_fraction_pad"], - max_attempts=_DEFAULT_AUG_PARAMS["max_attempts"]): + +def padding_augmenter( + image, + annotation, + skip_probability=_DEFAULT_AUG_PARAMS["skip_probability_pad"], + min_aspect_ratio=_DEFAULT_AUG_PARAMS["min_aspect_ratio"], + max_aspect_ratio=_DEFAULT_AUG_PARAMS["max_aspect_ratio"], + min_area_fraction=_DEFAULT_AUG_PARAMS["min_area_fraction_pad"], + max_area_fraction=_DEFAULT_AUG_PARAMS["max_area_fraction_pad"], + max_attempts=_DEFAULT_AUG_PARAMS["max_attempts"], +): if np.random.uniform(0.0, 1.0) < skip_probability: return np.array(image), annotation @@ -176,23 +203,27 @@ def padding_augmenter(image, # The padded area must attain the minimum area fraction. # w'h' >= fhw IMPLIES ah'h' >= fhw IMPLIES h' >= sqrt(fhw/a) - min_height_from_area = np.sqrt(image_height * image_width * min_area_fraction / aspect_ratio) + min_height_from_area = np.sqrt( + image_height * image_width * min_area_fraction / aspect_ratio + ) if min_height < min_height_from_area: min_height = min_height_from_area # The padded area must not exceed the maximum area fraction. - max_height = np.sqrt(image_height * image_width * max_area_fraction / aspect_ratio) + max_height = np.sqrt( + image_height * image_width * max_area_fraction / aspect_ratio + ) if min_height >= max_height: break # We did not find a compatible aspect ratio. Just return the original data. - if (min_height > max_height): + if min_height > max_height: return np.array(image), annotation # Sample a final size, given the sampled aspect ratio and range of heights. padded_height = np.random.uniform(min_height, max_height) - padded_width = padded_height * aspect_ratio; + padded_width = padded_height * aspect_ratio # Sample the offset of the source image inside the padded image. x_offset = np.random.uniform(0.0, (padded_width - image_width)) @@ -203,18 +234,18 @@ def padding_augmenter(image, after_padding_height = padded_height - image_height - y_offset # Pad the image - npad = ((int(y_offset), int(after_padding_height)), (int(x_offset), int(after_padding_width)), (0, 0)) - padded_image = np.pad(image, pad_width=npad, mode='constant', constant_values=0.5) + npad = ( + (int(y_offset), int(after_padding_height)), + (int(x_offset), int(after_padding_width)), + (0, 0), + ) + padded_image = np.pad(image, pad_width=npad, mode="constant", constant_values=0.5) ty = float(y_offset) tx = float(x_offset) # Transformation matrix for the annotations - transformation_matrix = np.array([ - [1.0, 0.0, ty], - [0.0, 1.0, tx], - [0.0, 0.0, 1.0] - ]) + transformation_matrix = np.array([[1.0, 0.0, ty], [0.0, 1.0, tx], [0.0, 0.0, 1.0]]) # Use transformation matrix to augment annotations formatted_annotation = [] @@ -224,7 +255,9 @@ def padding_augmenter(image, confidence = aug[5:6] if not np.any(bounds): - formatted_annotation.append(np.concatenate([identifier, np.array([0, 0, 0, 0]), confidence])) + formatted_annotation.append( + np.concatenate([identifier, np.array([0, 0, 0, 0]), confidence]) + ) continue width = bounds[2] @@ -237,7 +270,13 @@ def padding_augmenter(image, augmentation_coordinates = np.array([y1, x1, y2, x2], dtype=np.float32) - v = np.concatenate([augmentation_coordinates.reshape((2, 2)), np.ones((2, 1), dtype=np.float32)], axis=1) + v = np.concatenate( + [ + augmentation_coordinates.reshape((2, 2)), + np.ones((2, 1), dtype=np.float32), + ], + axis=1, + ) transposed_v = np.dot(v, np.transpose(transformation_matrix)) t_intersection = np.squeeze(transposed_v[:, :2].reshape(-1, 4)) @@ -250,23 +289,30 @@ def padding_augmenter(image, # Normalize the elements to the cropped width and height ele_1 = t_intersection[1] / padded_width ele_2 = t_intersection[0] / padded_height - ele_3 = (t_intersection[3] - t_intersection[1]) /padded_width + ele_3 = (t_intersection[3] - t_intersection[1]) / padded_width ele_4 = (t_intersection[2] - t_intersection[0]) / padded_height - formatted_annotation.append(np.concatenate([identifier, np.array([ele_1, ele_2, ele_3, ele_4]), confidence])) + formatted_annotation.append( + np.concatenate( + [identifier, np.array([ele_1, ele_2, ele_3, ele_4]), confidence] + ) + ) return np.array(padded_image), np.array(formatted_annotation, dtype=np.float32) -def crop_augmenter(image, - annotation, - skip_probability=_DEFAULT_AUG_PARAMS["skip_probability_crop"], - min_aspect_ratio=_DEFAULT_AUG_PARAMS["min_aspect_ratio"], - max_aspect_ratio=_DEFAULT_AUG_PARAMS["max_aspect_ratio"], - min_area_fraction=_DEFAULT_AUG_PARAMS["min_area_fraction_crop"], - max_area_fraction=_DEFAULT_AUG_PARAMS["max_area_fraction_crop"], - min_object_covered=_DEFAULT_AUG_PARAMS["min_object_covered"], - max_attempts=_DEFAULT_AUG_PARAMS["max_attempts"], - min_eject_coverage=_DEFAULT_AUG_PARAMS["min_eject_coverage"]): + +def crop_augmenter( + image, + annotation, + skip_probability=_DEFAULT_AUG_PARAMS["skip_probability_crop"], + min_aspect_ratio=_DEFAULT_AUG_PARAMS["min_aspect_ratio"], + max_aspect_ratio=_DEFAULT_AUG_PARAMS["max_aspect_ratio"], + min_area_fraction=_DEFAULT_AUG_PARAMS["min_area_fraction_crop"], + max_area_fraction=_DEFAULT_AUG_PARAMS["max_area_fraction_crop"], + min_object_covered=_DEFAULT_AUG_PARAMS["min_object_covered"], + max_attempts=_DEFAULT_AUG_PARAMS["max_attempts"], + min_eject_coverage=_DEFAULT_AUG_PARAMS["min_eject_coverage"], +): if np.random.uniform(0.0, 1.0) < skip_probability: return np.array(image), annotation @@ -296,22 +342,25 @@ def crop_augmenter(image, max_height = max_height_from_width # The cropped area must not exceed the maximum area fraction. - max_height_from_area = np.sqrt(image_height * image_width * max_area_fraction / aspect_ratio) + max_height_from_area = np.sqrt( + image_height * image_width * max_area_fraction / aspect_ratio + ) if max_height > max_height_from_area: max_height = max_height_from_area # The padded area must attain the minimum area fraction. - min_height = np.sqrt(image_height * image_width * min_area_fraction / aspect_ratio) + min_height = np.sqrt( + image_height * image_width * min_area_fraction / aspect_ratio + ) # If the range is empty, then crops with the sampled aspect ratio cannot # satisfy the area constraint. if min_height > max_height: continue - # Sample a position for the crop, constrained to lie within the image. cropped_height = np.random.uniform(min_height, max_height) - cropped_width = cropped_height * aspect_ratio; + cropped_width = cropped_height * aspect_ratio x_offset = np.random.uniform(0.0, (image_width - cropped_width)) y_offset = np.random.uniform(0.0, (image_height - cropped_height)) @@ -338,7 +387,12 @@ def crop_augmenter(image, y2 = (bounds[1] + height) * image_height # This tests whether the crop bounds are out of the annotated bounds, if not it returns an empty annotation - if crop_bounds_x1 < x2 and crop_bounds_y1 < y2 and x1 < crop_bounds_x2 and y1 < crop_bounds_y2: + if ( + crop_bounds_x1 < x2 + and crop_bounds_y1 < y2 + and x1 < crop_bounds_x2 + and y1 < crop_bounds_y2 + ): x_bounds = [x1, x2, x_offset, x_offset + cropped_width] y_bounds = [y1, y2, y_offset, y_offset + cropped_height] @@ -348,9 +402,13 @@ def crop_augmenter(image, x_pairs = x_bounds[1:3] y_pairs = y_bounds[1:3] - intersection = np.array([y_pairs[0], x_pairs[0], y_pairs[1], x_pairs[1]]) + intersection = np.array( + [y_pairs[0], x_pairs[0], y_pairs[1], x_pairs[1]] + ) - intersection_area = (intersection[3] - intersection[1]) * (intersection[2] - intersection[0]) + intersection_area = (intersection[3] - intersection[1]) * ( + intersection[2] - intersection[0] + ) annotation_area = (y2 - y1) * (x2 - x1) area_coverage = intersection_area / annotation_area @@ -360,38 +418,62 @@ def crop_augmenter(image, is_min_object_covered = False break - # If the area coverage is greater the min_eject_coverage, then actually keep the annotation if area_coverage >= min_eject_coverage: # Transformation matrix for the annotations - transformation_matrix = np.array([ - [1.0, 0.0, -y_offset], - [0.0, 1.0, -x_offset], - [0.0, 0.0, 1.0] - ]) - - v = np.concatenate([intersection.reshape((2, 2)), np.ones((2, 1), dtype=np.float32)], axis=1) + transformation_matrix = np.array( + [[1.0, 0.0, -y_offset], [0.0, 1.0, -x_offset], [0.0, 0.0, 1.0]] + ) + + v = np.concatenate( + [ + intersection.reshape((2, 2)), + np.ones((2, 1), dtype=np.float32), + ], + axis=1, + ) transposed_v = np.dot(v, np.transpose(transformation_matrix)) t_intersection = np.squeeze(transposed_v[:, :2].reshape(-1, 4)) # Sort the points top, left, bottom, right if t_intersection[0] > t_intersection[2]: - t_intersection[0], t_intersection[2] = t_intersection[2], t_intersection[0] + t_intersection[0], t_intersection[2] = ( + t_intersection[2], + t_intersection[0], + ) if t_intersection[1] > t_intersection[3]: - t_intersection[1], t_intersection[3] = t_intersection[3], t_intersection[1] + t_intersection[1], t_intersection[3] = ( + t_intersection[3], + t_intersection[1], + ) # Normalize the elements to the cropped width and height ele_1 = t_intersection[1] / cropped_width ele_2 = t_intersection[0] / cropped_height - ele_3 = (t_intersection[3] - t_intersection[1]) /cropped_width + ele_3 = (t_intersection[3] - t_intersection[1]) / cropped_width ele_4 = (t_intersection[2] - t_intersection[0]) / cropped_height - formatted_annotation.append(np.concatenate([identifier, np.array([ele_1, ele_2, ele_3, ele_4]), confidence])) + formatted_annotation.append( + np.concatenate( + [ + identifier, + np.array([ele_1, ele_2, ele_3, ele_4]), + confidence, + ] + ) + ) else: - formatted_annotation.append(np.concatenate([identifier, np.array([0.0, 0.0, 0.0, 0.0]), confidence])) + formatted_annotation.append( + np.concatenate( + [identifier, np.array([0.0, 0.0, 0.0, 0.0]), confidence] + ) + ) else: - formatted_annotation.append(np.concatenate([identifier, np.array([0.0, 0.0, 0.0, 0.0]), confidence])) - + formatted_annotation.append( + np.concatenate( + [identifier, np.array([0.0, 0.0, 0.0, 0.0]), confidence] + ) + ) if not is_min_object_covered: continue @@ -407,52 +489,73 @@ def crop_augmenter(image, return np.array(image), annotation + def complete_augmenter(img_tf, ann_tf, output_height, output_width): - img_tf, ann_tf = tf.numpy_function(func=crop_augmenter, inp=[img_tf, ann_tf], Tout=[tf.float32, tf.float32]) - img_tf, ann_tf = tf.numpy_function(func=padding_augmenter, inp=[img_tf, ann_tf], Tout=[tf.float32, tf.float32]) - img_tf, ann_tf = tf.numpy_function(func=horizontal_flip_augmenter, inp=[img_tf, ann_tf], Tout=[tf.float32, tf.float32]) + img_tf, ann_tf = tf.numpy_function( + func=crop_augmenter, inp=[img_tf, ann_tf], Tout=[tf.float32, tf.float32] + ) + img_tf, ann_tf = tf.numpy_function( + func=padding_augmenter, inp=[img_tf, ann_tf], Tout=[tf.float32, tf.float32] + ) + img_tf, ann_tf = tf.numpy_function( + func=horizontal_flip_augmenter, + inp=[img_tf, ann_tf], + Tout=[tf.float32, tf.float32], + ) img_tf, ann_tf = color_augmenter(img_tf, ann_tf) img_tf, ann_tf = hue_augmenter(img_tf, ann_tf) img_tf, ann_tf = resize_augmenter(img_tf, ann_tf, (output_height, output_width)) return img_tf, ann_tf - class DataAugmenter(object): def __init__(self, output_height, output_width, batch_size, resize_only): self.batch_size = batch_size self.graph = tf.Graph() self.resize_only = resize_only with self.graph.as_default(): - self.img_tf = [tf.placeholder(tf.float32, [None, None, 3]) for x in range(0, self.batch_size )] - self.ann_tf = [tf.placeholder(tf.float32, [None, 6]) for x in range(0, self.batch_size )] + self.img_tf = [ + tf.placeholder(tf.float32, [None, None, 3]) + for x in range(0, self.batch_size) + ] + self.ann_tf = [ + tf.placeholder(tf.float32, [None, 6]) for x in range(0, self.batch_size) + ] self.resize_op_batch = [] for i in range(0, self.batch_size): if resize_only: - aug_img_tf, aug_ann_tf = resize_augmenter(self.img_tf[i], self.ann_tf[i], (output_height, output_width)) + aug_img_tf, aug_ann_tf = resize_augmenter( + self.img_tf[i], self.ann_tf[i], (output_height, output_width) + ) self.resize_op_batch.append([aug_img_tf, aug_ann_tf]) else: - aug_img_tf, aug_ann_tf = complete_augmenter(self.img_tf[i], self.ann_tf[i], output_height, output_width) + aug_img_tf, aug_ann_tf = complete_augmenter( + self.img_tf[i], self.ann_tf[i], output_height, output_width + ) self.resize_op_batch.append([aug_img_tf, aug_ann_tf]) def get_augmented_data(self, images, annotations): with tf.Session(graph=self.graph) as session: feed_dict = dict() - graph_op = self.resize_op_batch[0:len(images)] + graph_op = self.resize_op_batch[0 : len(images)] for i in range(0, len(images)): - feed_dict[self.img_tf[i]] = _utils.convert_shared_float_array_to_numpy(images[i]) + feed_dict[self.img_tf[i]] = _utils.convert_shared_float_array_to_numpy( + images[i] + ) if self.resize_only: feed_dict[self.ann_tf[i]] = self.batch_size * [np.zeros(6)] else: - feed_dict[self.ann_tf[i]] = _utils.convert_shared_float_array_to_numpy(annotations[i]) + feed_dict[ + self.ann_tf[i] + ] = _utils.convert_shared_float_array_to_numpy(annotations[i]) aug_output = session.run(graph_op, feed_dict=feed_dict) processed_images = [] processed_annotations = [] for o in aug_output: processed_images.append(o[0]) - processed_annotations.append(np.ascontiguousarray(o[1], dtype=np.float32)) + processed_annotations.append( + np.ascontiguousarray(o[1], dtype=np.float32) + ) processed_images = np.array(processed_images, dtype=np.float32) processed_images = np.ascontiguousarray(processed_images, dtype=np.float32) return (processed_images, processed_annotations) - - diff --git a/src/python/turicreate/toolkits/object_detector/_tf_model_architecture.py b/src/python/turicreate/toolkits/object_detector/_tf_model_architecture.py index 4a6bb21c14..b8162872ca 100644 --- a/src/python/turicreate/toolkits/object_detector/_tf_model_architecture.py +++ b/src/python/turicreate/toolkits/object_detector/_tf_model_architecture.py @@ -12,31 +12,55 @@ from .._tf_model import TensorFlowModel import turicreate.toolkits._tf_utils as _utils import tensorflow.compat.v1 as _tf + _tf.disable_v2_behavior() -class ODTensorFlowModel(TensorFlowModel): - def __init__(self, input_h, input_w, batch_size, output_size, out_h, out_w, init_weights, config): +class ODTensorFlowModel(TensorFlowModel): + def __init__( + self, + input_h, + input_w, + batch_size, + output_size, + out_h, + out_w, + init_weights, + config, + ): self.gpu_policy = _utils.TensorFlowGPUPolicy() self.gpu_policy.start() - # Converting incoming weights from shared_float_array to numpy for key in init_weights.keys(): - init_weights[key] = _utils.convert_shared_float_array_to_numpy(init_weights[key]) + init_weights[key] = _utils.convert_shared_float_array_to_numpy( + init_weights[key] + ) self.od_graph = _tf.Graph() self.config = config self.batch_size = batch_size self.grid_shape = [out_h, out_w] - self.num_classes = int(_utils.convert_shared_float_array_to_numpy(config['num_classes'])) + self.num_classes = int( + _utils.convert_shared_float_array_to_numpy(config["num_classes"]) + ) self.anchors = [ - (1.0, 2.0), (1.0, 1.0), (2.0, 1.0), - (2.0, 4.0), (2.0, 2.0), (4.0, 2.0), - (4.0, 8.0), (4.0, 4.0), (8.0, 4.0), - (8.0, 16.0), (8.0, 8.0), (16.0, 8.0), - (16.0, 32.0), (16.0, 16.0), (32.0, 16.0), + (1.0, 2.0), + (1.0, 1.0), + (2.0, 1.0), + (2.0, 4.0), + (2.0, 2.0), + (4.0, 2.0), + (4.0, 8.0), + (4.0, 4.0), + (8.0, 4.0), + (8.0, 16.0), + (8.0, 8.0), + (16.0, 8.0), + (16.0, 32.0), + (16.0, 16.0), + (32.0, 16.0), ] self.num_anchors = len(self.anchors) self.output_size = output_size @@ -49,34 +73,59 @@ def init_object_detector_graph(self, input_h, input_w, init_weights): self.is_train = _tf.placeholder(_tf.bool) # Set flag for training or val # Create placeholders for image and labels - self.images = _tf.placeholder(_tf.float32, [self.batch_size, input_h, - input_w, 3], name='images') - self.labels = _tf.placeholder(_tf.float32, - [self.batch_size, self.grid_shape[0], self.grid_shape[1], - self.num_anchors, self.num_classes + 5], - name='labels') + self.images = _tf.placeholder( + _tf.float32, [self.batch_size, input_h, input_w, 3], name="images" + ) + self.labels = _tf.placeholder( + _tf.float32, + [ + self.batch_size, + self.grid_shape[0], + self.grid_shape[1], + self.num_anchors, + self.num_classes + 5, + ], + name="labels", + ) self.tf_model = self.tiny_yolo(inputs=self.images, output_size=self.output_size) - self.global_step = _tf.Variable(0, trainable=False, - name="global_step") + self.global_step = _tf.Variable(0, trainable=False, name="global_step") self.loss = self.loss_layer(self.tf_model, self.labels) - self.base_lr = _utils.convert_shared_float_array_to_numpy(self.config['learning_rate']) - self.num_iterations = int(_utils.convert_shared_float_array_to_numpy(self.config['num_iterations'])) - self.init_steps = [self.num_iterations // 2, 3 * self.num_iterations // 4, self.num_iterations] - self.lrs = [_np.float32(self.base_lr * 10 ** (-i)) for i, step in enumerate(self.init_steps)] + self.base_lr = _utils.convert_shared_float_array_to_numpy( + self.config["learning_rate"] + ) + self.num_iterations = int( + _utils.convert_shared_float_array_to_numpy(self.config["num_iterations"]) + ) + self.init_steps = [ + self.num_iterations // 2, + 3 * self.num_iterations // 4, + self.num_iterations, + ] + self.lrs = [ + _np.float32(self.base_lr * 10 ** (-i)) + for i, step in enumerate(self.init_steps) + ] self.steps_tf = self.init_steps[:-1] - self.lr = _tf.train.piecewise_constant(self.global_step, self.steps_tf, self.lrs) + self.lr = _tf.train.piecewise_constant( + self.global_step, self.steps_tf, self.lrs + ) # TODO: Evaluate method to update lr in set_learning_rate() self.opt = _tf.train.MomentumOptimizer(self.lr, momentum=0.9) - self.clip_value = _utils.convert_shared_float_array_to_numpy(self.config.get('gradient_clipping')) + self.clip_value = _utils.convert_shared_float_array_to_numpy( + self.config.get("gradient_clipping") + ) grads_and_vars = self.opt.compute_gradients(self.loss) - clipped_gradients = [(self.ClipIfNotNone(g, self.clip_value), v) for g, v in grads_and_vars] - self.train_op = self.opt.apply_gradients(clipped_gradients, global_step=self.global_step) - + clipped_gradients = [ + (self.ClipIfNotNone(g, self.clip_value), v) for g, v in grads_and_vars + ] + self.train_op = self.opt.apply_gradients( + clipped_gradients, global_step=self.global_step + ) self.sess.run(_tf.global_variables_initializer()) self.sess.run(_tf.local_variables_initializer()) @@ -99,13 +148,21 @@ def load_weights(self, tf_net_params): """ for keys in tf_net_params: if tf_net_params[keys].ndim == 1: - self.sess.run(_tf.assign(_tf.get_default_graph().get_tensor_by_name(keys+":0"), - tf_net_params[keys])) + self.sess.run( + _tf.assign( + _tf.get_default_graph().get_tensor_by_name(keys + ":0"), + tf_net_params[keys], + ) + ) elif tf_net_params[keys].ndim == 4: # Converting from [output_channels, input_channels, filter_height, filter_width] to # [filter_height, filter_width, input_channels, output_channels] - self.sess.run(_tf.assign(_tf.get_default_graph().get_tensor_by_name(keys+":0"), - tf_net_params[keys].transpose(2, 3, 1, 0))) + self.sess.run( + _tf.assign( + _tf.get_default_graph().get_tensor_by_name(keys + ":0"), + tf_net_params[keys].transpose(2, 3, 1, 0), + ) + ) else: continue @@ -117,12 +174,9 @@ def ClipIfNotNone(self, grad, clip_value): return grad return _tf.clip_by_value(grad, -clip_value, clip_value) - def batch_norm_wrapper(self, - inputs, - batch_name, - is_training=True, - epsilon=1e-05, - decay=0.9): + def batch_norm_wrapper( + self, inputs, batch_name, is_training=True, epsilon=1e-05, decay=0.9 + ): """ Layer to handle batch norm training and inference @@ -147,29 +201,54 @@ def batch_norm_wrapper(self, """ dim_of_x = inputs.get_shape()[-1] - shadow_mean = _tf.Variable(_tf.zeros(shape=[dim_of_x], dtype='float32'), - name=batch_name + 'running_mean', trainable=False) - - shadow_var = _tf.Variable(_tf.ones(shape=[dim_of_x], dtype='float32'), - name=batch_name + 'running_var', trainable=False) + shadow_mean = _tf.Variable( + _tf.zeros(shape=[dim_of_x], dtype="float32"), + name=batch_name + "running_mean", + trainable=False, + ) + + shadow_var = _tf.Variable( + _tf.ones(shape=[dim_of_x], dtype="float32"), + name=batch_name + "running_var", + trainable=False, + ) axes = list(range(len(inputs.get_shape()) - 1)) # Calculate mean and variance for a batch - batch_mean, batch_var = _tf.nn.moments(inputs, axes, name='moments') + batch_mean, batch_var = _tf.nn.moments(inputs, axes, name="moments") def mean_var_update(): - with _tf.control_dependencies([_tf.assign(shadow_mean, _tf.multiply(shadow_mean, decay) - + _tf.multiply(batch_mean, 1. - decay)), - _tf.assign(shadow_var, _tf.multiply(shadow_var, decay) - + _tf.multiply(batch_var, 1. - decay))]): + with _tf.control_dependencies( + [ + _tf.assign( + shadow_mean, + _tf.multiply(shadow_mean, decay) + + _tf.multiply(batch_mean, 1.0 - decay), + ), + _tf.assign( + shadow_var, + _tf.multiply(shadow_var, decay) + + _tf.multiply(batch_var, 1.0 - decay), + ), + ] + ): return _tf.identity(batch_mean), _tf.identity(batch_var) - mean, variance = _tf.cond(_tf.cast(is_training, _tf.bool), mean_var_update, - lambda:(_tf.identity(shadow_mean), _tf.identity(shadow_var))) - beta = _tf.Variable(_tf.zeros(shape=dim_of_x, dtype='float32'), - name=batch_name + 'beta', trainable=True) #Offset/Shift - gamma = _tf.Variable(_tf.ones(shape=dim_of_x, dtype='float32'), - name=batch_name + 'gamma', trainable=True) #Scale + mean, variance = _tf.cond( + _tf.cast(is_training, _tf.bool), + mean_var_update, + lambda: (_tf.identity(shadow_mean), _tf.identity(shadow_var)), + ) + beta = _tf.Variable( + _tf.zeros(shape=dim_of_x, dtype="float32"), + name=batch_name + "beta", + trainable=True, + ) # Offset/Shift + gamma = _tf.Variable( + _tf.ones(shape=dim_of_x, dtype="float32"), + name=batch_name + "gamma", + trainable=True, + ) # Scale return _tf.nn.batch_normalization(inputs, mean, variance, beta, gamma, epsilon) @@ -195,24 +274,28 @@ def conv_layer(self, inputs, shape, name, batch_name, batch_norm=True): conv: TensorFlow Tensor Return result from combining conv, batch norm and leaky ReLU or conv and bias as needed """ - weight = _tf.Variable(_tf.random.truncated_normal(shape, stddev=0.1), trainable=True, name=name + 'weight') + weight = _tf.Variable( + _tf.random.truncated_normal(shape, stddev=0.1), + trainable=True, + name=name + "weight", + ) - conv = _tf.nn.conv2d(inputs, weight, strides=[1, 1, 1, 1], padding='SAME', name=name) + conv = _tf.nn.conv2d( + inputs, weight, strides=[1, 1, 1, 1], padding="SAME", name=name + ) if batch_norm: - conv = self.batch_norm_wrapper(conv, - batch_name, - is_training = self.is_train) + conv = self.batch_norm_wrapper(conv, batch_name, is_training=self.is_train) alpha = 0.1 conv = _tf.maximum(alpha * conv, conv) else: - bias = _tf.Variable(_tf.constant(0.1, shape=[shape[3]]), name=name + 'bias') + bias = _tf.Variable(_tf.constant(0.1, shape=[shape[3]]), name=name + "bias") conv = _tf.add(conv, bias) return conv - def pooling_layer(self, inputs, pool_size, strides, padding, name='1_pool'): + def pooling_layer(self, inputs, pool_size, strides, padding, name="1_pool"): """ Define pooling layer @@ -233,7 +316,9 @@ def pooling_layer(self, inputs, pool_size, strides, padding, name='1_pool'): Return pooling layer """ - pool = _tf.nn.max_pool2d(inputs, ksize=pool_size, strides=strides, padding=padding, name=name) + pool = _tf.nn.max_pool2d( + inputs, ksize=pool_size, strides=strides, padding=padding, name=name + ) return pool def tiny_yolo(self, inputs, output_size=125): @@ -257,25 +342,52 @@ def tiny_yolo(self, inputs, output_size=125): filter_sizes = [16, 32, 64, 128, 256, 512, 1024, 1024] for idx, f in enumerate(filter_sizes, 1): - batch_name = 'batchnorm%d_' % (idx - 1) + batch_name = "batchnorm%d_" % (idx - 1) if idx == 1: - net = self.conv_layer(inputs, [3, 3, 3, f], name='conv%d_' % (idx - 1), - batch_name=batch_name, batch_norm=True) + net = self.conv_layer( + inputs, + [3, 3, 3, f], + name="conv%d_" % (idx - 1), + batch_name=batch_name, + batch_norm=True, + ) else: - net = self.conv_layer(net, [3, 3, filter_sizes[idx - 2], filter_sizes[idx - 1]], - name='conv%d_' % (idx - 1), batch_name=batch_name, batch_norm=True) + net = self.conv_layer( + net, + [3, 3, filter_sizes[idx - 2], filter_sizes[idx - 1]], + name="conv%d_" % (idx - 1), + batch_name=batch_name, + batch_norm=True, + ) if idx < 7: if idx < 6: strides = [1, 2, 2, 1] - net = self.pooling_layer(net, pool_size=[1, 2, 2, 1], strides=strides, padding='VALID', name='pool%d_' % idx) + net = self.pooling_layer( + net, + pool_size=[1, 2, 2, 1], + strides=strides, + padding="VALID", + name="pool%d_" % idx, + ) else: strides = [1, 1, 1, 1] - net = self.pooling_layer(net, pool_size=[1, 2, 2, 1], strides=strides, padding='SAME', name='pool%d_' % idx) + net = self.pooling_layer( + net, + pool_size=[1, 2, 2, 1], + strides=strides, + padding="SAME", + name="pool%d_" % idx, + ) if output_size is not None: - net = self.conv_layer(net, [1, 1, filter_sizes[idx - 1], output_size], - name='conv8_', batch_name=None, batch_norm=False) + net = self.conv_layer( + net, + [1, 1, filter_sizes[idx - 1], output_size], + name="conv8_", + batch_name=None, + batch_norm=False, + ) return net @@ -298,15 +410,28 @@ def loss_layer(self, predict, labels): POS_IOU = 0.7 NEG_IOU = 0.3 - rescore = int(_utils.convert_shared_float_array_to_numpy(self.config.get('od_rescore'))) - lmb_coord_xy = _utils.convert_shared_float_array_to_numpy(self.config.get('lmb_coord_xy')) - lmb_coord_wh = _utils.convert_shared_float_array_to_numpy(self.config.get('lmb_coord_wh')) - lmb_obj = _utils.convert_shared_float_array_to_numpy(self.config.get('lmb_obj')) - lmb_noobj = _utils.convert_shared_float_array_to_numpy(self.config.get('lmb_noobj')) - lmb_class = _utils.convert_shared_float_array_to_numpy(self.config.get('lmb_class')) + rescore = int( + _utils.convert_shared_float_array_to_numpy(self.config.get("od_rescore")) + ) + lmb_coord_xy = _utils.convert_shared_float_array_to_numpy( + self.config.get("lmb_coord_xy") + ) + lmb_coord_wh = _utils.convert_shared_float_array_to_numpy( + self.config.get("lmb_coord_wh") + ) + lmb_obj = _utils.convert_shared_float_array_to_numpy(self.config.get("lmb_obj")) + lmb_noobj = _utils.convert_shared_float_array_to_numpy( + self.config.get("lmb_noobj") + ) + lmb_class = _utils.convert_shared_float_array_to_numpy( + self.config.get("lmb_class") + ) # Prediction values from model on the images - ypred = _tf.reshape(predict, [-1] + list(self.grid_shape) + [self.num_anchors, 5 + self.num_classes]) + ypred = _tf.reshape( + predict, + [-1] + list(self.grid_shape) + [self.num_anchors, 5 + self.num_classes], + ) raw_xy = ypred[..., 0:2] raw_wh = ypred[..., 2:4] raw_conf = ypred[..., 4] @@ -343,22 +468,40 @@ def loss_layer(self, predict, labels): iou = inter_area / (area + gt_area - inter_area) active_iou = c_iou - cond_gt = _tf.cast(_tf.equal(gt_conf, _tf.constant(1.0)), dtype=_tf.float32) max_iou = _tf.reduce_max(active_iou, 3, keepdims=True) - cond_max = _tf.cast(_tf.equal(active_iou ,max_iou), dtype=_tf.float32) + cond_max = _tf.cast(_tf.equal(active_iou, max_iou), dtype=_tf.float32) cond_above = c_iou > POS_IOU - cond_logical_or = _tf.cast(_tf.math.logical_or(_tf.cast(cond_max, dtype=_tf.bool), _tf.cast(cond_above, dtype=_tf.bool)), dtype=_tf.float32) - cond_obj = _tf.cast(_tf.math.logical_and(_tf.cast(cond_gt, dtype=_tf.bool), _tf.cast(cond_logical_or, dtype=_tf.bool)), dtype=_tf.float32) + cond_logical_or = _tf.cast( + _tf.math.logical_or( + _tf.cast(cond_max, dtype=_tf.bool), _tf.cast(cond_above, dtype=_tf.bool) + ), + dtype=_tf.float32, + ) + cond_obj = _tf.cast( + _tf.math.logical_and( + _tf.cast(cond_gt, dtype=_tf.bool), + _tf.cast(cond_logical_or, dtype=_tf.bool), + ), + dtype=_tf.float32, + ) kr_obj_ij = _tf.stop_gradient(cond_obj) cond_below = c_iou < NEG_IOU - cond_logical_not = _tf.cast(_tf.math.logical_not(_tf.cast(cond_obj, dtype=_tf.bool)), dtype=_tf.float32) - cond_noobj = _tf.cast(_tf.math.logical_and(_tf.cast(cond_below, dtype=_tf.bool), _tf.cast(cond_logical_not, dtype=_tf.bool)), dtype=_tf.float32) + cond_logical_not = _tf.cast( + _tf.math.logical_not(_tf.cast(cond_obj, dtype=_tf.bool)), dtype=_tf.float32 + ) + cond_noobj = _tf.cast( + _tf.math.logical_and( + _tf.cast(cond_below, dtype=_tf.bool), + _tf.cast(cond_logical_not, dtype=_tf.bool), + ), + dtype=_tf.float32, + ) kr_noobj_ij = _tf.stop_gradient(cond_noobj) @@ -378,10 +521,33 @@ def loss_layer(self, predict, labels): obj_w = _tf.math.add(obj_w_obj, obj_w_noobj) - loss_xy = lmb_coord_xy * _tf.reduce_sum(kr_obj_ij_plus1 * _tf.square(gt_xy - xy)) / eps_count - loss_wh = _tf.losses.huber_loss(labels=gt_raw_wh, predictions=raw_wh, weights=lmb_coord_wh * kr_obj_ij_plus1, delta= 1.0) - loss_conf = scale_conf * _tf.reduce_sum(obj_w * _tf.nn.sigmoid_cross_entropy_with_logits(labels=obj_gt_conf, logits=raw_conf)) - loss_cls = lmb_class * _tf.reduce_sum(kr_obj_ij * _tf.nn.softmax_cross_entropy_with_logits_v2(labels=gt_class, logits=class_scores)) / eps_count + loss_xy = ( + lmb_coord_xy + * _tf.reduce_sum(kr_obj_ij_plus1 * _tf.square(gt_xy - xy)) + / eps_count + ) + loss_wh = _tf.losses.huber_loss( + labels=gt_raw_wh, + predictions=raw_wh, + weights=lmb_coord_wh * kr_obj_ij_plus1, + delta=1.0, + ) + loss_conf = scale_conf * _tf.reduce_sum( + obj_w + * _tf.nn.sigmoid_cross_entropy_with_logits( + labels=obj_gt_conf, logits=raw_conf + ) + ) + loss_cls = ( + lmb_class + * _tf.reduce_sum( + kr_obj_ij + * _tf.nn.softmax_cross_entropy_with_logits_v2( + labels=gt_class, logits=class_scores + ) + ) + / eps_count + ) losses = [loss_xy, loss_wh, loss_conf, loss_cls] loss = _tf.add_n(losses) @@ -405,13 +571,24 @@ def train(self, feed_dict): """ for key in feed_dict.keys(): feed_dict[key] = _utils.convert_shared_float_array_to_numpy(feed_dict[key]) - feed_dict['labels'] = feed_dict['labels'].reshape(self.batch_size, self.grid_shape[0], self.grid_shape[1],self.num_anchors, self.num_classes + 5) - - _, loss_batch = self.sess.run([self.train_op, self.loss], feed_dict={self.images: feed_dict['input'], - self.labels: feed_dict['labels'], - self.is_train: True}) + feed_dict["labels"] = feed_dict["labels"].reshape( + self.batch_size, + self.grid_shape[0], + self.grid_shape[1], + self.num_anchors, + self.num_classes + 5, + ) + + _, loss_batch = self.sess.run( + [self.train_op, self.loss], + feed_dict={ + self.images: feed_dict["input"], + self.labels: feed_dict["labels"], + self.is_train: True, + }, + ) result = {} - result['loss'] = _np.array([loss_batch]) + result["loss"] = _np.array([loss_batch]) return result def predict(self, feed_dict): @@ -431,11 +608,14 @@ def predict(self, feed_dict): for key in feed_dict.keys(): feed_dict[key] = _utils.convert_shared_float_array_to_numpy(feed_dict[key]) - output = self.sess.run([self.tf_model], feed_dict={self.images: feed_dict['input'], self.is_train: False}) + output = self.sess.run( + [self.tf_model], + feed_dict={self.images: feed_dict["input"], self.is_train: False}, + ) # TODO: Include self.labels: feed_dict['label'] to handle labels from validation set result = {} - result['output'] = _np.array(output[0]) + result["output"] = _np.array(output[0]) return result def export_weights(self): @@ -455,13 +635,19 @@ def export_weights(self): tvars_vals = self.sess.run(tvars) for var, val in zip(tvars, tvars_vals): if val.ndim == 1: - tf_export_params.update( - {var.name.replace(":0", ""): val}) + tf_export_params.update({var.name.replace(":0", ""): val}) elif val.ndim == 4: tf_export_params.update( - {var.name.replace(":0", ""): _utils.convert_conv2d_tf_to_coreml(val)}) + { + var.name.replace(":0", ""): _utils.convert_conv2d_tf_to_coreml( + val + ) + } + ) for layer_name in tf_export_params.keys(): - tf_export_params[layer_name] = _np.ascontiguousarray(tf_export_params[layer_name]) + tf_export_params[layer_name] = _np.ascontiguousarray( + tf_export_params[layer_name] + ) return tf_export_params diff --git a/src/python/turicreate/toolkits/object_detector/object_detector.py b/src/python/turicreate/toolkits/object_detector/object_detector.py index e0267087e5..a59868206f 100644 --- a/src/python/turicreate/toolkits/object_detector/object_detector.py +++ b/src/python/turicreate/toolkits/object_detector/object_detector.py @@ -24,28 +24,33 @@ import turicreate.toolkits._internal_utils as _tkutl from turicreate.toolkits import _coreml_utils from turicreate.toolkits._model import PythonProxy as _PythonProxy -from turicreate.toolkits._internal_utils import (_raise_error_if_not_sframe, - _numeric_param_check_range, - _raise_error_if_not_iterable) +from turicreate.toolkits._internal_utils import ( + _raise_error_if_not_sframe, + _numeric_param_check_range, + _raise_error_if_not_iterable, +) from turicreate import config as _tc_config from turicreate.toolkits._main import ToolkitError as _ToolkitError from .. import _pre_trained_models from ._evaluation import average_precision as _average_precision -from .._mps_utils import (use_mps as _use_mps, - mps_device_memory_limit as _mps_device_memory_limit, - MpsGraphAPI as _MpsGraphAPI, - MpsGraphNetworkType as _MpsGraphNetworkType, - MpsGraphMode as _MpsGraphMode) - - -def _get_mps_od_net(input_image_shape, batch_size, output_size, anchors, - config, weights={}): +from .._mps_utils import ( + use_mps as _use_mps, + mps_device_memory_limit as _mps_device_memory_limit, + MpsGraphAPI as _MpsGraphAPI, + MpsGraphNetworkType as _MpsGraphNetworkType, + MpsGraphMode as _MpsGraphMode, +) + + +def _get_mps_od_net( + input_image_shape, batch_size, output_size, anchors, config, weights={} +): """ Initializes an MpsGraphAPI for object detection. """ network = _MpsGraphAPI(network_id=_MpsGraphNetworkType.kODGraphNet) - c_in, h_in, w_in = input_image_shape + c_in, h_in, w_in = input_image_shape c_out = output_size h_out = h_in // 32 w_out = w_in // 32 @@ -54,8 +59,17 @@ def _get_mps_od_net(input_image_shape, batch_size, output_size, anchors, h_view = h_in w_view = w_in - network.init(batch_size, c_in, h_in, w_in, c_out, h_out, w_out, - weights=weights, config=config) + network.init( + batch_size, + c_in, + h_in, + w_in, + c_out, + h_out, + w_out, + weights=weights, + config=config, + ) return network @@ -65,7 +79,7 @@ def _seconds_as_string(seconds): """ Returns seconds as a human-friendly string, e.g. '1d 4h 47m 41s' """ - TIME_UNITS = [('s', 60), ('m', 60), ('h', 24), ('d', None)] + TIME_UNITS = [("s", 60), ("m", 60), ("h", 24), ("d", None)] unit_strings = [] cur = max(int(seconds), 1) for suffix, size in TIME_UNITS: @@ -74,12 +88,14 @@ def _seconds_as_string(seconds): else: rest = cur if rest > 0: - unit_strings.insert(0, '%d%s' % (rest, suffix)) - return ' '.join(unit_strings) + unit_strings.insert(0, "%d%s" % (rest, suffix)) + return " ".join(unit_strings) -def _raise_error_if_not_detection_sframe(dataset, feature, annotations, require_annotations): - _raise_error_if_not_sframe(dataset, 'datset') +def _raise_error_if_not_detection_sframe( + dataset, feature, annotations, require_annotations +): + _raise_error_if_not_sframe(dataset, "datset") if feature not in dataset.column_names(): raise _ToolkitError("Feature column '%s' does not exist" % feature) if dataset[feature].dtype != _tc.Image: @@ -92,9 +108,18 @@ def _raise_error_if_not_detection_sframe(dataset, feature, annotations, require_ raise _ToolkitError("Annotations column must be of type dict or list") -def create(dataset, annotations=None, feature=None, model='darknet-yolo', - classes=None, batch_size=0, max_iterations=0, - verbose=True, grid_shape=[13, 13], **kwargs): +def create( + dataset, + annotations=None, + feature=None, + model="darknet-yolo", + classes=None, + batch_size=0, + max_iterations=0, + verbose=True, + grid_shape=[13, 13], + **kwargs +): """ Create a :class:`ObjectDetector` model. @@ -180,85 +205,97 @@ def create(dataset, annotations=None, feature=None, model='darknet-yolo', _raise_error_if_not_sframe(dataset, "dataset") if len(dataset) == 0: - raise _ToolkitError('Unable to train on empty dataset') + raise _ToolkitError("Unable to train on empty dataset") - _numeric_param_check_range('max_iterations', max_iterations, 0, _six.MAXSIZE) + _numeric_param_check_range("max_iterations", max_iterations, 0, _six.MAXSIZE) start_time = _time.time() - supported_detectors = ['darknet-yolo'] + supported_detectors = ["darknet-yolo"] if feature is None: feature = _tkutl._find_only_image_column(dataset) if verbose: print("Using '%s' as feature column" % feature) if annotations is None: - annotations = _tkutl._find_only_column_of_type(dataset, - target_type=[list, dict], - type_name='list', - col_name='annotations') + annotations = _tkutl._find_only_column_of_type( + dataset, target_type=[list, dict], type_name="list", col_name="annotations" + ) if verbose: print("Using '%s' as annotations column" % annotations) - _raise_error_if_not_detection_sframe(dataset, feature, annotations, - require_annotations=True) - _tkutl._handle_missing_values(dataset, feature, 'dataset') + _raise_error_if_not_detection_sframe( + dataset, feature, annotations, require_annotations=True + ) + _tkutl._handle_missing_values(dataset, feature, "dataset") is_annotations_list = dataset[annotations].dtype == list - _tkutl._check_categorical_option_type('model', model, - supported_detectors) + _tkutl._check_categorical_option_type("model", model, supported_detectors) - base_model = model.split('-', 1)[0] + base_model = model.split("-", 1)[0] ref_model = _pre_trained_models.OBJECT_DETECTION_BASE_MODELS[base_model]() - pretrained_model = _pre_trained_models.OBJECT_DETECTION_BASE_MODELS['darknet_mlmodel']() + pretrained_model = _pre_trained_models.OBJECT_DETECTION_BASE_MODELS[ + "darknet_mlmodel" + ]() pretrained_model_path = pretrained_model.get_model_path() params = { - 'anchors': [ - (1.0, 2.0), (1.0, 1.0), (2.0, 1.0), - (2.0, 4.0), (2.0, 2.0), (4.0, 2.0), - (4.0, 8.0), (4.0, 4.0), (8.0, 4.0), - (8.0, 16.0), (8.0, 8.0), (16.0, 8.0), - (16.0, 32.0), (16.0, 16.0), (32.0, 16.0), + "anchors": [ + (1.0, 2.0), + (1.0, 1.0), + (2.0, 1.0), + (2.0, 4.0), + (2.0, 2.0), + (4.0, 2.0), + (4.0, 8.0), + (4.0, 4.0), + (8.0, 4.0), + (8.0, 16.0), + (8.0, 8.0), + (16.0, 8.0), + (16.0, 32.0), + (16.0, 16.0), + (32.0, 16.0), ], - 'grid_shape': grid_shape, - 'aug_resize': 0, - 'aug_rand_crop': 0.9, - 'aug_rand_pad': 0.9, - 'aug_rand_gray': 0.0, - 'aug_aspect_ratio': 1.25, - 'aug_hue': 0.05, - 'aug_brightness': 0.05, - 'aug_saturation': 0.05, - 'aug_contrast': 0.05, - 'aug_horizontal_flip': True, - 'aug_min_object_covered': 0, - 'aug_min_eject_coverage': 0.5, - 'aug_area_range': (.15, 2), - 'aug_pca_noise': 0.0, - 'aug_max_attempts': 20, - 'aug_inter_method': 2, - 'lmb_coord_xy': 10.0, - 'lmb_coord_wh': 10.0, - 'lmb_obj': 100.0, - 'lmb_noobj': 5.0, - 'lmb_class': 2.0, - 'non_maximum_suppression_threshold': 0.45, - 'rescore': True, - 'clip_gradients': 0.025, - 'weight_decay': 0.0005, - 'sgd_momentum': 0.9, - 'learning_rate': 1.0e-3, - 'shuffle': True, - 'mps_loss_mult': 8, + "grid_shape": grid_shape, + "aug_resize": 0, + "aug_rand_crop": 0.9, + "aug_rand_pad": 0.9, + "aug_rand_gray": 0.0, + "aug_aspect_ratio": 1.25, + "aug_hue": 0.05, + "aug_brightness": 0.05, + "aug_saturation": 0.05, + "aug_contrast": 0.05, + "aug_horizontal_flip": True, + "aug_min_object_covered": 0, + "aug_min_eject_coverage": 0.5, + "aug_area_range": (0.15, 2), + "aug_pca_noise": 0.0, + "aug_max_attempts": 20, + "aug_inter_method": 2, + "lmb_coord_xy": 10.0, + "lmb_coord_wh": 10.0, + "lmb_obj": 100.0, + "lmb_noobj": 5.0, + "lmb_class": 2.0, + "non_maximum_suppression_threshold": 0.45, + "rescore": True, + "clip_gradients": 0.025, + "weight_decay": 0.0005, + "sgd_momentum": 0.9, + "learning_rate": 1.0e-3, + "shuffle": True, + "mps_loss_mult": 8, # This large buffer size (8 batches) is an attempt to mitigate against # the SFrame shuffle operation that can occur after each epoch. - 'io_thread_buffer_size': 8, - 'mlmodel_path': pretrained_model_path, + "io_thread_buffer_size": 8, + "mlmodel_path": pretrained_model_path, } - #create tensorflow model here + # create tensorflow model here import turicreate.toolkits.libtctensorflow + if classes == None: classes = [] @@ -266,29 +303,35 @@ def create(dataset, annotations=None, feature=None, model='darknet-yolo', _raise_error_if_not_iterable(grid_shape) grid_shape = [int(x) for x in grid_shape] - assert(len(grid_shape) == 2) + assert len(grid_shape) == 2 tf_config = { - 'grid_height': params['grid_shape'][0], - 'grid_width': params['grid_shape'][1], - 'mlmodel_path' : params['mlmodel_path'], - 'classes' : classes, - 'compute_final_metrics' : False, - 'verbose' : verbose + "grid_height": params["grid_shape"][0], + "grid_width": params["grid_shape"][1], + "mlmodel_path": params["mlmodel_path"], + "classes": classes, + "compute_final_metrics": False, + "verbose": verbose, } # If batch_size or max_iterations = 0, they will be automatically # generated in C++. if batch_size > 0: - tf_config['batch_size'] = batch_size + tf_config["batch_size"] = batch_size if max_iterations > 0: - tf_config['max_iterations'] = max_iterations + tf_config["max_iterations"] = max_iterations model = _tc.extensions.object_detector() - model.train(data=dataset, annotations_column_name=annotations, image_column_name=feature, options=tf_config) + model.train( + data=dataset, + annotations_column_name=annotations, + image_column_name=feature, + options=tf_config, + ) return ObjectDetector(model_proxy=model, name="object_detector") + class ObjectDetector(_Model): """ A trained model using C++ implementation that is ready to use for classification @@ -296,6 +339,7 @@ class ObjectDetector(_Model): This model should not be constructed directly. """ + _CPP_OBJECT_DETECTOR_VERSION = 1 def __init__(self, model_proxy=None, name=None): @@ -325,17 +369,19 @@ def __repr__(self): width = 40 sections, section_titles = self._get_summary_struct() - out = _tkutl._toolkit_repr_print(self, sections, section_titles, - width=width) + out = _tkutl._toolkit_repr_print(self, sections, section_titles, width=width) return out def _get_version(self): return self._CPP_OBJECT_DETECTOR_VERSION - def export_coreml(self, filename, - include_non_maximum_suppression = True, - iou_threshold = None, - confidence_threshold = None): + def export_coreml( + self, + filename, + include_non_maximum_suppression=True, + iou_threshold=None, + confidence_threshold=None, + ): """ Save the model in Core ML format. The Core ML model takes an image of fixed size as input and produces two output arrays: `confidence` and @@ -395,20 +441,22 @@ def export_coreml(self, filename, >>> model.export_coreml('detector.mlmodel') """ options = {} - options['include_non_maximum_suppression'] = include_non_maximum_suppression - options['version'] = self._get_version() + options["include_non_maximum_suppression"] = include_non_maximum_suppression + options["version"] = self._get_version() if confidence_threshold is not None: - options['confidence_threshold'] = confidence_threshold + options["confidence_threshold"] = confidence_threshold if iou_threshold is not None: - options['iou_threshold'] = iou_threshold + options["iou_threshold"] = iou_threshold additional_user_defined_metadata = _coreml_utils._get_tc_version_info() - short_description = _coreml_utils._mlmodel_short_description('Object Detector') - - self.__proxy__.export_to_coreml(filename, short_description, - additional_user_defined_metadata, options) + short_description = _coreml_utils._mlmodel_short_description("Object Detector") + self.__proxy__.export_to_coreml( + filename, short_description, additional_user_defined_metadata, options + ) - def predict(self, dataset, confidence_threshold=0.25, iou_threshold=0.45, verbose=True): + def predict( + self, dataset, confidence_threshold=0.25, iou_threshold=0.45, verbose=True + ): """ Predict object instances in an SFrame of images. @@ -455,15 +503,24 @@ def predict(self, dataset, confidence_threshold=0.25, iou_threshold=0.45, verbos # Visualize predictions by generating a new column of marked up images >>> data['image_pred'] = turicreate.object_detector.util.draw_bounding_boxes(data['image'], data['predictions']) """ - _numeric_param_check_range('confidence_threshold', confidence_threshold, 0.0, 1.0) - _numeric_param_check_range('iou_threshold', iou_threshold, 0.0, 1.0) + _numeric_param_check_range( + "confidence_threshold", confidence_threshold, 0.0, 1.0 + ) + _numeric_param_check_range("iou_threshold", iou_threshold, 0.0, 1.0) options = {} options["confidence_threshold"] = confidence_threshold options["iou_threshold"] = iou_threshold options["verbose"] = verbose return self.__proxy__.predict(dataset, options) - def evaluate(self, dataset, metric='auto', output_type='dict', confidence_threshold = 0.001, iou_threshold = 0.45): + def evaluate( + self, + dataset, + metric="auto", + output_type="dict", + confidence_threshold=0.001, + iou_threshold=0.45, + ): """ Evaluate the model by making predictions and comparing these to ground truth bounding box annotations. @@ -508,8 +565,10 @@ def evaluate(self, dataset, metric='auto', output_type='dict', confidence_thresh >>> print('mAP: {:.1%}'.format(results['mean_average_precision'])) mAP: 43.2% """ - _numeric_param_check_range('confidence_threshold', confidence_threshold, 0.0, 1.0) - _numeric_param_check_range('iou_threshold', iou_threshold, 0.0, 1.0) + _numeric_param_check_range( + "confidence_threshold", confidence_threshold, 0.0, 1.0 + ) + _numeric_param_check_range("iou_threshold", iou_threshold, 0.0, 1.0) options = {} options["confidence_threshold"] = confidence_threshold options["iou_threshold"] = iou_threshold @@ -533,18 +592,18 @@ def _get_summary_struct(self): The order matches that of the 'sections' object. """ model_fields = [ - ('Model', 'model'), - ('Number of classes', 'num_classes'), - ('Input image shape', 'input_image_shape') + ("Model", "model"), + ("Number of classes", "num_classes"), + ("Input image shape", "input_image_shape"), ] training_fields = [ - ('Training time', '_training_time_as_string'), - ('Training epochs', 'training_epochs'), - ('Training iterations', 'training_iterations'), - ('Number of examples (images)', 'num_examples'), - ('Number of bounding boxes (instances)', 'num_bounding_boxes'), - ('Final loss (specific to model)', 'training_loss'), + ("Training time", "_training_time_as_string"), + ("Training epochs", "training_epochs"), + ("Training iterations", "training_iterations"), + ("Number of examples (images)", "num_examples"), + ("Number of bounding boxes (instances)", "num_bounding_boxes"), + ("Final loss (specific to model)", "training_loss"), ] - section_titles = ['Schema', 'Training summary'] - return([model_fields, training_fields], section_titles) + section_titles = ["Schema", "Training summary"] + return ([model_fields, training_fields], section_titles) diff --git a/src/python/turicreate/toolkits/object_detector/util/__init__.py b/src/python/turicreate/toolkits/object_detector/util/__init__.py index 2e180644ec..1c1c5e1792 100644 --- a/src/python/turicreate/toolkits/object_detector/util/__init__.py +++ b/src/python/turicreate/toolkits/object_detector/util/__init__.py @@ -9,4 +9,4 @@ from ._output_formats import stack_annotations, unstack_annotations from ._visualization import draw_bounding_boxes -__all__ = ['stack_annotations', 'unstack_annotations', 'draw_bounding_boxes'] +__all__ = ["stack_annotations", "unstack_annotations", "draw_bounding_boxes"] diff --git a/src/python/turicreate/toolkits/object_detector/util/_output_formats.py b/src/python/turicreate/toolkits/object_detector/util/_output_formats.py index 26d57b141d..201874047e 100644 --- a/src/python/turicreate/toolkits/object_detector/util/_output_formats.py +++ b/src/python/turicreate/toolkits/object_detector/util/_output_formats.py @@ -7,8 +7,10 @@ from __future__ import division as _ from __future__ import absolute_import as _ import turicreate as _tc -from turicreate.toolkits._internal_utils import (_raise_error_if_not_sframe, - _raise_error_if_not_sarray) +from turicreate.toolkits._internal_utils import ( + _raise_error_if_not_sframe, + _raise_error_if_not_sarray, +) def stack_annotations(annotations_sarray): @@ -51,15 +53,15 @@ def stack_annotations(annotations_sarray): +--------+------------+-------+-------+-------+-------+--------+ [3 rows x 7 columns] """ - _raise_error_if_not_sarray(annotations_sarray, variable_name='annotations_sarray') - sf = _tc.SFrame({'annotations': annotations_sarray}).add_row_number('row_id') - sf = sf.stack('annotations', new_column_name='annotations', drop_na=True) + _raise_error_if_not_sarray(annotations_sarray, variable_name="annotations_sarray") + sf = _tc.SFrame({"annotations": annotations_sarray}).add_row_number("row_id") + sf = sf.stack("annotations", new_column_name="annotations", drop_na=True) if len(sf) == 0: - cols = ['row_id', 'confidence', 'label', 'height', 'width', 'x', 'y'] + cols = ["row_id", "confidence", "label", "height", "width", "x", "y"] return _tc.SFrame({k: [] for k in cols}) - sf = sf.unpack('annotations', column_name_prefix='') - sf = sf.unpack('coordinates', column_name_prefix='') - del sf['type'] + sf = sf.unpack("annotations", column_name_prefix="") + sf = sf.unpack("coordinates", column_name_prefix="") + del sf["type"] return sf @@ -117,32 +119,34 @@ def unstack_annotations(annotations_sframe, num_rows=None): """ _raise_error_if_not_sframe(annotations_sframe, variable_name="annotations_sframe") - cols = ['label', 'type', 'coordinates'] - has_confidence = 'confidence' in annotations_sframe.column_names() + cols = ["label", "type", "coordinates"] + has_confidence = "confidence" in annotations_sframe.column_names() if has_confidence: - cols.append('confidence') + cols.append("confidence") if num_rows is None: if len(annotations_sframe) == 0: num_rows = 0 else: - num_rows = annotations_sframe['row_id'].max() + 1 + num_rows = annotations_sframe["row_id"].max() + 1 sf = annotations_sframe - sf['type'] = 'rectangle' - sf = sf.pack_columns(['x', 'y', 'width', 'height'], dtype=dict, - new_column_name='coordinates') - sf = sf.pack_columns(cols, dtype=dict, new_column_name='ann') - sf = sf.unstack('ann', new_column_name='annotations') - sf_all_ids = _tc.SFrame({'row_id': range(num_rows)}) - sf = sf.join(sf_all_ids, on='row_id', how='right') - sf = sf.fillna('annotations', []) - sf = sf.sort('row_id') - - annotations_sarray = sf['annotations'] + sf["type"] = "rectangle" + sf = sf.pack_columns( + ["x", "y", "width", "height"], dtype=dict, new_column_name="coordinates" + ) + sf = sf.pack_columns(cols, dtype=dict, new_column_name="ann") + sf = sf.unstack("ann", new_column_name="annotations") + sf_all_ids = _tc.SFrame({"row_id": range(num_rows)}) + sf = sf.join(sf_all_ids, on="row_id", how="right") + sf = sf.fillna("annotations", []) + sf = sf.sort("row_id") + + annotations_sarray = sf["annotations"] # Sort the confidences again, since the unstack does not preserve the order if has_confidence: annotations_sarray = annotations_sarray.apply( - lambda x: sorted(x, key=lambda ann: ann['confidence'], reverse=True), - dtype=list) + lambda x: sorted(x, key=lambda ann: ann["confidence"], reverse=True), + dtype=list, + ) return annotations_sarray diff --git a/src/python/turicreate/toolkits/object_detector/util/_visualization.py b/src/python/turicreate/toolkits/object_detector/util/_visualization.py index 69decb0187..2c9022a5c2 100644 --- a/src/python/turicreate/toolkits/object_detector/util/_visualization.py +++ b/src/python/turicreate/toolkits/object_detector/util/_visualization.py @@ -21,61 +21,166 @@ def _string_hash(s): COLOR_NAMES = [ - 'AliceBlue', 'Chartreuse', 'Aqua', 'Aquamarine', 'Azure', 'Beige', - 'Bisque', 'BlanchedAlmond', 'BlueViolet', 'BurlyWood', 'CadetBlue', - 'AntiqueWhite', 'Chocolate', 'Coral', 'CornflowerBlue', 'Cornsilk', - 'Crimson', 'Cyan', 'DarkCyan', 'DarkGoldenRod', 'DarkGrey', 'DarkKhaki', - 'DarkOrange', 'DarkOrchid', 'DarkSalmon', 'DarkSeaGreen', 'DarkTurquoise', - 'DarkViolet', 'DeepPink', 'DeepSkyBlue', 'DodgerBlue', 'FireBrick', - 'FloralWhite', 'ForestGreen', 'Fuchsia', 'Gainsboro', 'GhostWhite', 'Gold', - 'GoldenRod', 'Salmon', 'Tan', 'HoneyDew', 'HotPink', 'IndianRed', 'Ivory', - 'Khaki', 'Lavender', 'LavenderBlush', 'LawnGreen', 'LemonChiffon', - 'LightBlue', 'LightCoral', 'LightCyan', 'LightGoldenRodYellow', - 'LightGray', 'LightGrey', 'LightGreen', 'LightPink', 'LightSalmon', - 'LightSeaGreen', 'LightSkyBlue', 'LightSlateGray', 'LightSlateGrey', - 'LightSteelBlue', 'LightYellow', 'Lime', 'LimeGreen', 'Linen', 'Magenta', - 'MediumAquaMarine', 'MediumOrchid', 'MediumPurple', 'MediumSeaGreen', - 'MediumSlateBlue', 'MediumSpringGreen', 'MediumTurquoise', - 'MediumVioletRed', 'MintCream', 'MistyRose', 'Moccasin', 'NavajoWhite', - 'OldLace', 'Olive', 'OliveDrab', 'Orange', 'OrangeRed', 'Orchid', - 'PaleGoldenRod', 'PaleGreen', 'PaleTurquoise', 'PaleVioletRed', - 'PapayaWhip', 'PeachPuff', 'Peru', 'Pink', 'Plum', 'PowderBlue', 'Purple', - 'Red', 'RosyBrown', 'RoyalBlue', 'SaddleBrown', 'Green', 'SandyBrown', - 'SeaGreen', 'SeaShell', 'Sienna', 'Silver', 'SkyBlue', 'SlateBlue', - 'SlateGray', 'SlateGrey', 'Snow', 'SpringGreen', 'SteelBlue', - 'GreenYellow', 'Teal', 'Thistle', 'Tomato', 'Turquoise', 'Violet', 'Wheat', - 'White', 'WhiteSmoke', 'Yellow', 'YellowGreen' + "AliceBlue", + "Chartreuse", + "Aqua", + "Aquamarine", + "Azure", + "Beige", + "Bisque", + "BlanchedAlmond", + "BlueViolet", + "BurlyWood", + "CadetBlue", + "AntiqueWhite", + "Chocolate", + "Coral", + "CornflowerBlue", + "Cornsilk", + "Crimson", + "Cyan", + "DarkCyan", + "DarkGoldenRod", + "DarkGrey", + "DarkKhaki", + "DarkOrange", + "DarkOrchid", + "DarkSalmon", + "DarkSeaGreen", + "DarkTurquoise", + "DarkViolet", + "DeepPink", + "DeepSkyBlue", + "DodgerBlue", + "FireBrick", + "FloralWhite", + "ForestGreen", + "Fuchsia", + "Gainsboro", + "GhostWhite", + "Gold", + "GoldenRod", + "Salmon", + "Tan", + "HoneyDew", + "HotPink", + "IndianRed", + "Ivory", + "Khaki", + "Lavender", + "LavenderBlush", + "LawnGreen", + "LemonChiffon", + "LightBlue", + "LightCoral", + "LightCyan", + "LightGoldenRodYellow", + "LightGray", + "LightGrey", + "LightGreen", + "LightPink", + "LightSalmon", + "LightSeaGreen", + "LightSkyBlue", + "LightSlateGray", + "LightSlateGrey", + "LightSteelBlue", + "LightYellow", + "Lime", + "LimeGreen", + "Linen", + "Magenta", + "MediumAquaMarine", + "MediumOrchid", + "MediumPurple", + "MediumSeaGreen", + "MediumSlateBlue", + "MediumSpringGreen", + "MediumTurquoise", + "MediumVioletRed", + "MintCream", + "MistyRose", + "Moccasin", + "NavajoWhite", + "OldLace", + "Olive", + "OliveDrab", + "Orange", + "OrangeRed", + "Orchid", + "PaleGoldenRod", + "PaleGreen", + "PaleTurquoise", + "PaleVioletRed", + "PapayaWhip", + "PeachPuff", + "Peru", + "Pink", + "Plum", + "PowderBlue", + "Purple", + "Red", + "RosyBrown", + "RoyalBlue", + "SaddleBrown", + "Green", + "SandyBrown", + "SeaGreen", + "SeaShell", + "Sienna", + "Silver", + "SkyBlue", + "SlateBlue", + "SlateGray", + "SlateGrey", + "Snow", + "SpringGreen", + "SteelBlue", + "GreenYellow", + "Teal", + "Thistle", + "Tomato", + "Turquoise", + "Violet", + "Wheat", + "White", + "WhiteSmoke", + "Yellow", + "YellowGreen", ] def _annotate_image(pil_image, anns, confidence_threshold): from PIL import ImageDraw, ImageFont + draw = ImageDraw.Draw(pil_image) font = ImageFont.load_default() BUF = 2 # Reverse, to print the highest confidence on top for ann in reversed(anns): - if 'confidence' in ann and ann['confidence'] < confidence_threshold: + if "confidence" in ann and ann["confidence"] < confidence_threshold: continue - if 'label' in ann: - color = COLOR_NAMES[_string_hash(ann['label']) % len(COLOR_NAMES)] + if "label" in ann: + color = COLOR_NAMES[_string_hash(ann["label"]) % len(COLOR_NAMES)] else: - color = 'White' - - left = ann['coordinates']['x'] - ann['coordinates']['width'] / 2 - top = ann['coordinates']['y'] - ann['coordinates']['height'] / 2 - right = ann['coordinates']['x'] + ann['coordinates']['width'] / 2 - bottom = ann['coordinates']['y'] + ann['coordinates']['height'] / 2 + color = "White" - draw.line([(left, top), (left, bottom), (right, bottom), - (right, top), (left, top)], width=4, fill=color) + left = ann["coordinates"]["x"] - ann["coordinates"]["width"] / 2 + top = ann["coordinates"]["y"] - ann["coordinates"]["height"] / 2 + right = ann["coordinates"]["x"] + ann["coordinates"]["width"] / 2 + bottom = ann["coordinates"]["y"] + ann["coordinates"]["height"] / 2 + draw.line( + [(left, top), (left, bottom), (right, bottom), (right, top), (left, top)], + width=4, + fill=color, + ) - if 'confidence' in ann: - text = '{} {:.0%}'.format(ann['label'], ann['confidence']) + if "confidence" in ann: + text = "{} {:.0%}".format(ann["label"], ann["confidence"]) else: - text = ann['label'] + text = ann["label"] width, height = font.getsize(text) @@ -83,13 +188,15 @@ def _annotate_image(pil_image, anns, confidence_threshold): label_top = bottom + height + 2 * BUF else: label_top = top - draw.rectangle([(left - 1, label_top - height - 2 * BUF), - (left + width + 2 * BUF, label_top)], fill=color) + draw.rectangle( + [ + (left - 1, label_top - height - 2 * BUF), + (left + width + 2 * BUF, label_top), + ], + fill=color, + ) - draw.text((left + BUF, label_top - height - BUF), - text, - fill='black', - font=font) + draw.text((left + BUF, label_top - height - BUF), text, fill="black", font=font) def draw_bounding_boxes(images, annotations, confidence_threshold=0): @@ -124,12 +231,13 @@ def draw_bounding_boxes(images, annotations, confidence_threshold=0): -------- unstack_annotations """ - _numeric_param_check_range('confidence_threshold', confidence_threshold, 0.0, 1.0) + _numeric_param_check_range("confidence_threshold", confidence_threshold, 0.0, 1.0) from PIL import Image + def draw_single_image(row): - image = row['image'] - anns = row['annotations'] - row_number = row['id'] + image = row["image"] + anns = row["annotations"] + row_number = row["id"] if anns == None: anns = [] elif type(anns) == dict: @@ -142,23 +250,29 @@ def draw_single_image(row): # Grayscale image, reshape image shape image = image.reshape(image.shape[0], image.shape[1], 1) FORMAT_RAW = 2 - annotated_image = _tc.Image(_image_data=image.tobytes(), - _width=image.shape[1], - _height=image.shape[0], - _channels=image.shape[2], - _format_enum=FORMAT_RAW, - _image_data_size=image.size) + annotated_image = _tc.Image( + _image_data=image.tobytes(), + _width=image.shape[1], + _height=image.shape[0], + _channels=image.shape[2], + _format_enum=FORMAT_RAW, + _image_data_size=image.size, + ) except Exception as e: if row_number == -1: # indication that it was a single image and not an SFrame raise _ToolkitError(e) - raise _ToolkitError("Received exception at row " + str(row_number) + ": " + str(e)) + raise _ToolkitError( + "Received exception at row " + str(row_number) + ": " + str(e) + ) return annotated_image if isinstance(images, _tc.Image) and isinstance(annotations, list): - return draw_single_image({'image': images, 'annotations': annotations, 'id': -1}) + return draw_single_image( + {"image": images, "annotations": annotations, "id": -1} + ) else: - sf = _tc.SFrame({'image': images, 'annotations': annotations}) + sf = _tc.SFrame({"image": images, "annotations": annotations}) sf = sf.add_row_number() annotated_images = sf.apply(draw_single_image) return annotated_images diff --git a/src/python/turicreate/toolkits/one_shot_object_detector/__init__.py b/src/python/turicreate/toolkits/one_shot_object_detector/__init__.py index 6650ab4c11..cfa3d3c8af 100644 --- a/src/python/turicreate/toolkits/one_shot_object_detector/__init__.py +++ b/src/python/turicreate/toolkits/one_shot_object_detector/__init__.py @@ -8,4 +8,4 @@ from __future__ import absolute_import as _ from .one_shot_object_detector import create, OneShotObjectDetector -__all__ = ['create', 'OneShotObjectDetector', 'util'] +__all__ = ["create", "OneShotObjectDetector", "util"] diff --git a/src/python/turicreate/toolkits/one_shot_object_detector/one_shot_object_detector.py b/src/python/turicreate/toolkits/one_shot_object_detector/one_shot_object_detector.py index e780e9da53..cfac14a995 100644 --- a/src/python/turicreate/toolkits/one_shot_object_detector/one_shot_object_detector.py +++ b/src/python/turicreate/toolkits/one_shot_object_detector/one_shot_object_detector.py @@ -9,11 +9,18 @@ from turicreate import extensions as _extensions from turicreate.toolkits._model import CustomModel as _CustomModel from turicreate.toolkits._model import PythonProxy as _PythonProxy -from turicreate.toolkits.object_detector.object_detector import ObjectDetector as _ObjectDetector -from turicreate.toolkits.one_shot_object_detector.util._augmentation import preview_synthetic_training_data as _preview_synthetic_training_data +from turicreate.toolkits.object_detector.object_detector import ( + ObjectDetector as _ObjectDetector, +) +from turicreate.toolkits.one_shot_object_detector.util._augmentation import ( + preview_synthetic_training_data as _preview_synthetic_training_data, +) import turicreate.toolkits._internal_utils as _tkutl -def create(data, target, backgrounds=None, batch_size=0, max_iterations=0, verbose=True): + +def create( + data, target, backgrounds=None, batch_size=0, max_iterations=0, verbose=True +): """ Create a :class:`OneShotObjectDetector` model. Note: The One Shot Object Detector is currently in beta. @@ -56,12 +63,13 @@ def create(data, target, backgrounds=None, batch_size=0, max_iterations=0, verbo """ if not isinstance(data, _tc.SFrame) and not isinstance(data, _tc.Image): raise TypeError("'data' must be of type SFrame or Image.") - augmented_data = _preview_synthetic_training_data( - data, target, backgrounds) - model = _tc.object_detector.create(augmented_data, - batch_size=batch_size, - max_iterations=max_iterations, - verbose=verbose) + augmented_data = _preview_synthetic_training_data(data, target, backgrounds) + model = _tc.object_detector.create( + augmented_data, + batch_size=batch_size, + max_iterations=max_iterations, + verbose=verbose, + ) if isinstance(data, _tc.SFrame): num_starter_images = len(data) else: @@ -83,13 +91,16 @@ class OneShotObjectDetector(_CustomModel): This model should not be constructed directly. """ + _PYTHON_ONE_SHOT_OBJECT_DETECTOR_VERSION = 1 def __init__(self, state): # We use PythonProxy here so that we get tab completion self.__proxy__ = _PythonProxy(state) - def predict(self, dataset, confidence_threshold=0.25, iou_threshold=0.45, verbose=True): + def predict( + self, dataset, confidence_threshold=0.25, iou_threshold=0.45, verbose=True + ): """ Predict object instances in an SFrame of images. @@ -139,13 +150,20 @@ def predict(self, dataset, confidence_threshold=0.25, iou_threshold=0.45, verbos >>> predictions_with_bounding_boxes.explore() """ - return self.__proxy__['detector'].predict( + return self.__proxy__["detector"].predict( dataset=dataset, confidence_threshold=confidence_threshold, iou_threshold=iou_threshold, - verbose=verbose) - - def export_coreml(self, filename, include_non_maximum_suppression=True, iou_threshold=None, confidence_threshold=None): + verbose=verbose, + ) + + def export_coreml( + self, + filename, + include_non_maximum_suppression=True, + iou_threshold=None, + confidence_threshold=None, + ): """ Save the model in Core ML format. The Core ML model takes an image of fixed size as input and produces two output arrays: `confidence` and @@ -207,23 +225,26 @@ def export_coreml(self, filename, include_non_maximum_suppression=True, iou_thre from turicreate.toolkits import _coreml_utils additional_user_defined_metadata = _coreml_utils._get_tc_version_info() - short_description = _coreml_utils._mlmodel_short_description('Object Detector') + short_description = _coreml_utils._mlmodel_short_description("Object Detector") options = { - 'include_non_maximum_suppression': include_non_maximum_suppression, + "include_non_maximum_suppression": include_non_maximum_suppression, } - options['version'] = self._PYTHON_ONE_SHOT_OBJECT_DETECTOR_VERSION + options["version"] = self._PYTHON_ONE_SHOT_OBJECT_DETECTOR_VERSION if confidence_threshold is not None: - options['confidence_threshold'] = confidence_threshold + options["confidence_threshold"] = confidence_threshold if iou_threshold is not None: - options['iou_threshold'] = iou_threshold + options["iou_threshold"] = iou_threshold additional_user_defined_metadata = _coreml_utils._get_tc_version_info() - short_description = _coreml_utils._mlmodel_short_description('One Shot Object Detector') - self.__proxy__['detector'].__proxy__.export_to_coreml(filename, - short_description, additional_user_defined_metadata, options) + short_description = _coreml_utils._mlmodel_short_description( + "One Shot Object Detector" + ) + self.__proxy__["detector"].__proxy__.export_to_coreml( + filename, short_description, additional_user_defined_metadata, options + ) def _get_version(self): return self._PYTHON_ONE_SHOT_OBJECT_DETECTOR_VERSION @@ -239,16 +260,17 @@ def _get_native_state(self): # We don't know how to serialize a Python class, hence we need to # reduce the detector to the proxy object before saving it. - state['detector'] = {'detector_model':state['detector'].__proxy__} + state["detector"] = {"detector_model": state["detector"].__proxy__} return state @classmethod def _load_version(cls, state, version): - assert(version == cls._PYTHON_ONE_SHOT_OBJECT_DETECTOR_VERSION) + assert version == cls._PYTHON_ONE_SHOT_OBJECT_DETECTOR_VERSION # we need to undo what we did at save and turn the proxy object # back into a Python class - state['detector'] = _ObjectDetector._load_version( - state['detector'], state["_detector_version"]) + state["detector"] = _ObjectDetector._load_version( + state["detector"], state["_detector_version"] + ) return OneShotObjectDetector(state) def __str__(self): @@ -270,13 +292,14 @@ def __repr__(self): width = 40 sections, section_titles = self._get_summary_struct() - detector = self.__proxy__['detector'] + detector = self.__proxy__["detector"] out = _tkutl._toolkit_repr_print( detector, sections, section_titles, width=width, - class_name='OneShotObjectDetector') + class_name="OneShotObjectDetector", + ) return out def summary(self, output=None): @@ -302,15 +325,18 @@ def summary(self, output=None): -------- >>> m.summary() """ - from turicreate.toolkits._internal_utils import _toolkit_serialize_summary_struct + from turicreate.toolkits._internal_utils import ( + _toolkit_serialize_summary_struct, + ) - if output is None or output == 'stdout': + if output is None or output == "stdout": pass - elif (output == 'str'): + elif output == "str": return self.__repr__() - elif output == 'dict': - return _toolkit_serialize_summary_struct( self.__proxy__['detector'], \ - *self._get_summary_struct() ) + elif output == "dict": + return _toolkit_serialize_summary_struct( + self.__proxy__["detector"], *self._get_summary_struct() + ) try: print(self.__repr__()) except: @@ -334,22 +360,19 @@ def _get_summary_struct(self): The order matches that of the 'sections' object. """ model_fields = [ - ('Number of classes', 'num_classes'), - ('Input image shape', 'input_image_shape'), + ("Number of classes", "num_classes"), + ("Input image shape", "input_image_shape"), ] data_fields = [ - ('Number of synthetically generated examples', 'num_examples'), - ('Number of synthetically generated bounding boxes', 'num_bounding_boxes'), + ("Number of synthetically generated examples", "num_examples"), + ("Number of synthetically generated bounding boxes", "num_bounding_boxes"), ] training_fields = [ - ('Training time', '_training_time_as_string'), - ('Training iterations', 'training_iterations'), - ('Training epochs', 'training_epochs'), - ('Final loss (specific to model)', 'training_loss'), + ("Training time", "_training_time_as_string"), + ("Training iterations", "training_iterations"), + ("Training epochs", "training_epochs"), + ("Final loss (specific to model)", "training_loss"), ] - section_titles = [ - 'Model summary', - 'Synthetic data summary', - 'Training summary'] - return([model_fields, data_fields, training_fields], section_titles) + section_titles = ["Model summary", "Synthetic data summary", "Training summary"] + return ([model_fields, data_fields, training_fields], section_titles) diff --git a/src/python/turicreate/toolkits/one_shot_object_detector/util/__init__.py b/src/python/turicreate/toolkits/one_shot_object_detector/util/__init__.py index b0aaee9fcd..e4fb315dcf 100644 --- a/src/python/turicreate/toolkits/one_shot_object_detector/util/__init__.py +++ b/src/python/turicreate/toolkits/one_shot_object_detector/util/__init__.py @@ -9,4 +9,4 @@ from ._augmentation import preview_synthetic_training_data from ._visualization import draw_bounding_boxes -__all__ = ['preview_synthetic_training_data', 'draw_bounding_boxes'] +__all__ = ["preview_synthetic_training_data", "draw_bounding_boxes"] diff --git a/src/python/turicreate/toolkits/one_shot_object_detector/util/_augmentation.py b/src/python/turicreate/toolkits/one_shot_object_detector/util/_augmentation.py index 2f7ca08c2f..770fe88044 100644 --- a/src/python/turicreate/toolkits/one_shot_object_detector/util/_augmentation.py +++ b/src/python/turicreate/toolkits/one_shot_object_detector/util/_augmentation.py @@ -8,15 +8,16 @@ import turicreate as _tc from turicreate import extensions as _extensions import turicreate.toolkits._internal_utils as _tkutl -from turicreate.toolkits.one_shot_object_detector.util._error_handling import check_one_shot_input +from turicreate.toolkits.one_shot_object_detector.util._error_handling import ( + check_one_shot_input, +) from turicreate.toolkits import _data_zoo -def preview_synthetic_training_data(data, - target, - backgrounds=None, - verbose=True, - **kwargs): + +def preview_synthetic_training_data( + data, target, backgrounds=None, verbose=True, **kwargs +): """ A utility function to visualize the synthetically generated data. @@ -40,31 +41,31 @@ def preview_synthetic_training_data(data, out : SFrame An SFrame of sythetically generated annotated training data. """ - dataset_to_augment, image_column_name, target_column_name = check_one_shot_input(data, target, backgrounds) - _tkutl._handle_missing_values(dataset_to_augment, image_column_name, 'dataset') + dataset_to_augment, image_column_name, target_column_name = check_one_shot_input( + data, target, backgrounds + ) + _tkutl._handle_missing_values(dataset_to_augment, image_column_name, "dataset") one_shot_model = _extensions.one_shot_object_detector() - seed = kwargs["seed"] if "seed" in kwargs else _random.randint(0, 2**32 - 1) + seed = kwargs["seed"] if "seed" in kwargs else _random.randint(0, 2 ** 32 - 1) if backgrounds is None: backgrounds_downloader = _data_zoo.OneShotObjectDetectorBackgroundData() backgrounds = backgrounds_downloader.get_backgrounds() # We resize the background dimensions by half along each axis to reduce # the disk footprint during augmentation, and also reduce the time - # taken to synthesize data. - backgrounds = backgrounds.apply(lambda im: _tc.image_analysis.resize( - im, - int(im.width/2), - int(im.height/2), - im.channels - )) + # taken to synthesize data. + backgrounds = backgrounds.apply( + lambda im: _tc.image_analysis.resize( + im, int(im.width / 2), int(im.height / 2), im.channels + ) + ) # Option arguments to pass in to C++ Object Detector, if we use it: # {'mlmodel_path':'darknet.mlmodel', 'max_iterations' : 25} - options_for_augmentation = { - "seed": seed, - "verbose": verbose - } - augmented_data = one_shot_model.augment(dataset_to_augment, - image_column_name, - target_column_name, - backgrounds, - options_for_augmentation) + options_for_augmentation = {"seed": seed, "verbose": verbose} + augmented_data = one_shot_model.augment( + dataset_to_augment, + image_column_name, + target_column_name, + backgrounds, + options_for_augmentation, + ) return augmented_data diff --git a/src/python/turicreate/toolkits/one_shot_object_detector/util/_error_handling.py b/src/python/turicreate/toolkits/one_shot_object_detector/util/_error_handling.py index 7cab9002cb..e715fa6b17 100644 --- a/src/python/turicreate/toolkits/one_shot_object_detector/util/_error_handling.py +++ b/src/python/turicreate/toolkits/one_shot_object_detector/util/_error_handling.py @@ -8,11 +8,12 @@ import turicreate.toolkits._internal_utils as _tkutl from turicreate.toolkits._main import ToolkitError as _ToolkitError + def check_one_shot_input(data, target, backgrounds): - if backgrounds is not None and not(isinstance(backgrounds, _tc.SArray)): + if backgrounds is not None and not (isinstance(backgrounds, _tc.SArray)): raise TypeError("'backgrounds' must be None or an SArray.") - if (isinstance(backgrounds, _tc.SArray) and len(backgrounds) == 0): - raise _ToolkitError('Unable to train with no background images') + if isinstance(backgrounds, _tc.SArray) and len(backgrounds) == 0: + raise _ToolkitError("Unable to train with no background images") if not isinstance(target, str): raise TypeError("'target' must be of type string.") if isinstance(data, _tc.SFrame): @@ -23,8 +24,9 @@ def check_one_shot_input(data, target, backgrounds): elif isinstance(data, _tc.Image): image_column_name = "image" target_column_name = "target" - dataset_to_augment = _tc.SFrame({image_column_name: [data], - target_column_name: [target]}) + dataset_to_augment = _tc.SFrame( + {image_column_name: [data], target_column_name: [target]} + ) else: raise TypeError("'data' must be of type SFrame or Image.") return dataset_to_augment, image_column_name, target_column_name diff --git a/src/python/turicreate/toolkits/one_shot_object_detector/util/_visualization.py b/src/python/turicreate/toolkits/one_shot_object_detector/util/_visualization.py index faec5989cf..6d5e648efd 100644 --- a/src/python/turicreate/toolkits/one_shot_object_detector/util/_visualization.py +++ b/src/python/turicreate/toolkits/one_shot_object_detector/util/_visualization.py @@ -1,4 +1,7 @@ -from turicreate.toolkits.object_detector.util._visualization import draw_bounding_boxes as _draw_bounding_boxes +from turicreate.toolkits.object_detector.util._visualization import ( + draw_bounding_boxes as _draw_bounding_boxes, +) + def draw_bounding_boxes(images, annotations, confidence_threshold=0): """ diff --git a/src/python/turicreate/toolkits/recommender/__init__.py b/src/python/turicreate/toolkits/recommender/__init__.py index 775fc4a160..751e968759 100644 --- a/src/python/turicreate/toolkits/recommender/__init__.py +++ b/src/python/turicreate/toolkits/recommender/__init__.py @@ -84,12 +84,14 @@ from __future__ import division as _ from __future__ import absolute_import as _ -__all__ = ['popularity_recommender', - 'factorization_recommender', - 'ranking_factorization_recommender', - 'item_similarity_recommender', - 'create', - 'util'] +__all__ = [ + "popularity_recommender", + "factorization_recommender", + "ranking_factorization_recommender", + "item_similarity_recommender", + "create", + "util", +] from . import popularity_recommender from . import factorization_recommender diff --git a/src/python/turicreate/toolkits/recommender/factorization_recommender.py b/src/python/turicreate/toolkits/recommender/factorization_recommender.py index 3de7cbdd25..3f52b8e570 100644 --- a/src/python/turicreate/toolkits/recommender/factorization_recommender.py +++ b/src/python/turicreate/toolkits/recommender/factorization_recommender.py @@ -13,23 +13,29 @@ from turicreate.toolkits._model import _get_default_options_wrapper import turicreate as _turicreate from turicreate.toolkits.recommender.util import _Recommender -from turicreate.data_structures.sframe import SFrame as _SFrame - -def create(observation_data, - user_id='user_id', item_id='item_id', target=None, - user_data=None, item_data=None, - num_factors=8, - regularization=1e-8, - linear_regularization=1e-10, - side_data_factorization=True, - nmf=False, - binary_target=False, - max_iterations=50, - sgd_step_size=0, - random_seed=0, - solver = 'auto', - verbose=True, - **kwargs): +from turicreate.data_structures.sframe import SFrame as _SFrame + + +def create( + observation_data, + user_id="user_id", + item_id="item_id", + target=None, + user_data=None, + item_data=None, + num_factors=8, + regularization=1e-8, + linear_regularization=1e-10, + side_data_factorization=True, + nmf=False, + binary_target=False, + max_iterations=50, + sgd_step_size=0, + random_seed=0, + solver="auto", + verbose=True, + **kwargs +): """Create a FactorizationRecommender that learns latent factors for each user and item and uses them to make rating predictions. This includes both standard matrix factorization as well as factorization machines models @@ -186,8 +192,9 @@ def create(observation_data, """ from turicreate._cython.cy_server import QuietProgress + if not (isinstance(observation_data, _SFrame)): - raise TypeError('observation_data input must be a SFrame') + raise TypeError("observation_data input must be a SFrame") opts = {} model_proxy = _turicreate.extensions.factorization_recommender() model_proxy.init_options(opts) @@ -197,22 +204,22 @@ def create(observation_data, if item_data is None: item_data = _turicreate.SFrame() - opts = {'user_id' : user_id, - 'item_id' : item_id, - 'target' : target, - 'random_seed' : random_seed, - 'num_factors' : num_factors, - 'regularization' : regularization, - 'linear_regularization' : linear_regularization, - 'binary_target' : binary_target, - 'max_iterations' : max_iterations, - 'sgd_step_size' : sgd_step_size, - 'solver' : solver, - 'side_data_factorization' : side_data_factorization, - + opts = { + "user_id": user_id, + "item_id": item_id, + "target": target, + "random_seed": random_seed, + "num_factors": num_factors, + "regularization": regularization, + "linear_regularization": linear_regularization, + "binary_target": binary_target, + "max_iterations": max_iterations, + "sgd_step_size": sgd_step_size, + "solver": solver, + "side_data_factorization": side_data_factorization, # has no effect in the c++ end; ignore. - - 'nmf' : nmf} + "nmf": nmf, + } if kwargs: try: @@ -222,21 +229,24 @@ def create(observation_data, bad_arguments = set(kwargs.keys()).difference(possible_args) if bad_arguments: - raise TypeError("Bad Keyword Arguments: " + ', '.join(bad_arguments)) + raise TypeError("Bad Keyword Arguments: " + ", ".join(bad_arguments)) opts.update(kwargs) - extra_data = {"nearest_items" : _turicreate.SFrame()} + extra_data = {"nearest_items": _turicreate.SFrame()} with QuietProgress(verbose): model_proxy.train(observation_data, user_data, item_data, opts, extra_data) return FactorizationRecommender(model_proxy) + _get_default_options = _get_default_options_wrapper( - 'factorization_recommender', - 'recommender.factorization_recommender', - 'FactorizationRecommender') + "factorization_recommender", + "recommender.factorization_recommender", + "FactorizationRecommender", +) + class FactorizationRecommender(_Recommender): r""" @@ -399,10 +409,9 @@ class FactorizationRecommender(_Recommender): """ def __init__(self, model_proxy): - '''__init__(self)''' + """__init__(self)""" self.__proxy__ = model_proxy - @classmethod def _native_name(cls): return "factorization_recommender" diff --git a/src/python/turicreate/toolkits/recommender/item_content_recommender.py b/src/python/turicreate/toolkits/recommender/item_content_recommender.py index 25d9423d09..ba9a3c263b 100644 --- a/src/python/turicreate/toolkits/recommender/item_content_recommender.py +++ b/src/python/turicreate/toolkits/recommender/item_content_recommender.py @@ -16,13 +16,18 @@ from turicreate.toolkits.recommender.util import _Recommender -def create(item_data, item_id, - observation_data = None, - user_id = None, target = None, - weights = 'auto', - similarity_metrics = 'auto', - item_data_transform = 'auto', - max_item_neighborhood_size = 64, verbose=True): +def create( + item_data, + item_id, + observation_data=None, + user_id=None, + target=None, + weights="auto", + similarity_metrics="auto", + item_data_transform="auto", + max_item_neighborhood_size=64, + verbose=True, +): """Create a content-based recommender model in which the similarity between the items recommended is determined by the content of @@ -129,27 +134,37 @@ def create(item_data, item_id, # item_data is correct type if not isinstance(item_data, _SFrame) or item_data.num_rows() == 0: - raise TypeError("`item_data` argument must be a non-empty SFrame giving item data to use for similarities.") + raise TypeError( + "`item_data` argument must be a non-empty SFrame giving item data to use for similarities." + ) # Error checking on column names item_columns = set(item_data.column_names()) if item_id not in item_columns: - raise ValueError("Item column given as 'item_id = %s', but this is not found in `item_data` SFrame." - % item_id) + raise ValueError( + "Item column given as 'item_id = %s', but this is not found in `item_data` SFrame." + % item_id + ) # Now, get the set ready to test for other argument issues. item_columns.remove(item_id) - if weights != 'auto': + if weights != "auto": if type(weights) is not dict: - raise TypeError("`weights` parameter must be 'auto' or a dictionary of column " - "names in `item_data` to weight values.") - - bad_columns = [col_name for col_name in item_columns if col_name not in item_columns] + raise TypeError( + "`weights` parameter must be 'auto' or a dictionary of column " + "names in `item_data` to weight values." + ) + + bad_columns = [ + col_name for col_name in item_columns if col_name not in item_columns + ] if bad_columns: - raise ValueError("Columns %s given in weights, but these are not found in item_data." - % ', '.join(bad_columns)) + raise ValueError( + "Columns %s given in weights, but these are not found in item_data." + % ", ".join(bad_columns) + ) # Now, set any columns not given in the weights column to be # weight 0. @@ -160,11 +175,17 @@ def create(item_data, item_id, # Now, check the feature transformer stuff. # Pass it through a feature transformer. - if item_data_transform == 'auto': - item_data_transform = _turicreate.toolkits._feature_engineering.AutoVectorizer(excluded_features = [item_id]) - - if not isinstance(item_data_transform, _turicreate.toolkits._feature_engineering.TransformerBase): - raise TypeError("item_data_transform must be 'auto' or a valid feature_engineering transformer instance.") + if item_data_transform == "auto": + item_data_transform = _turicreate.toolkits._feature_engineering.AutoVectorizer( + excluded_features=[item_id] + ) + + if not isinstance( + item_data_transform, _turicreate.toolkits._feature_engineering.TransformerBase + ): + raise TypeError( + "item_data_transform must be 'auto' or a valid feature_engineering transformer instance." + ) # Transform the input data. item_data = item_data_transform.fit_transform(item_data) @@ -176,9 +197,11 @@ def create(item_data, item_id, for c in item_columns: if item_data[c].dtype is str: - item_data[c] = item_data[c].apply(lambda s: {s : 1}) + item_data[c] = item_data[c].apply(lambda s: {s: 1}) elif item_data[c].dtype in [float, int]: - item_data[c] = (item_data[c] - item_data[c].mean()) / max(item_data[c].std(), 1e-8) + item_data[c] = (item_data[c] - item_data[c].mean()) / max( + item_data[c].std(), 1e-8 + ) gaussian_kernel_metrics.add(c) if verbose: @@ -204,7 +227,9 @@ def create(item_data, item_id, # back if necessary. empty_user = _turicreate.SArray([], dtype=str) empty_item = _turicreate.SArray([], dtype=item_data[item_id].dtype) - observation_data = _turicreate.SFrame( {user_id : empty_user, item_id : empty_item} ) + observation_data = _turicreate.SFrame( + {user_id: empty_user, item_id: empty_item} + ) # Now, work out stuff for the observation_data component normalization_factor = 1 @@ -215,44 +240,66 @@ def create(item_data, item_id, if weights == "auto": # TODO: automatically tune this. - weights = {col_name : 1 for col_name in item_data.column_names() if col_name != item_id} + weights = { + col_name: 1 + for col_name in item_data.column_names() + if col_name != item_id + } # Use the abs value here in case users pass in weights with negative values. normalization_factor = sum(abs(v) for v in weights.values()) if normalization_factor == 0: raise ValueError("Weights cannot all be set to 0.") - distance = [([col_name], ("gaussian_kernel" if col_name in gaussian_kernel_metrics else "cosine"), weight) - for col_name, weight in weights.items()] + distance = [ + ( + [col_name], + ( + "gaussian_kernel" + if col_name in gaussian_kernel_metrics + else "cosine" + ), + weight, + ) + for col_name, weight in weights.items() + ] else: distance = "cosine" # Now, build the nearest neighbors model: - nn = _turicreate.nearest_neighbors.create(item_data, label=item_id, distance = distance, verbose = verbose) - graph = nn.query(item_data, label = item_id, k=max_item_neighborhood_size, verbose = verbose) - graph = graph.rename({"query_label" : item_id, - "reference_label" : "similar", - "distance" : "score"}, inplace=True) + nn = _turicreate.nearest_neighbors.create( + item_data, label=item_id, distance=distance, verbose=verbose + ) + graph = nn.query( + item_data, label=item_id, k=max_item_neighborhood_size, verbose=verbose + ) + graph = graph.rename( + {"query_label": item_id, "reference_label": "similar", "distance": "score"}, + inplace=True, + ) def process_weights(x): return max(-1, min(1, 1 - x / normalization_factor)) graph["score"] = graph["score"].apply(process_weights) - opts = {'user_id': user_id, - 'item_id': item_id, - 'target': target, - 'similarity_type' : "cosine", - 'max_item_neighborhood_size' : max_item_neighborhood_size} + opts = { + "user_id": user_id, + "item_id": item_id, + "target": target, + "similarity_type": "cosine", + "max_item_neighborhood_size": max_item_neighborhood_size, + } user_data = _turicreate.SFrame() - extra_data = {"nearest_items" : graph} + extra_data = {"nearest_items": graph} with QuietProgress(verbose): model_proxy.train(observation_data, user_data, item_data, opts, extra_data) return ItemContentRecommender(model_proxy) + class ItemContentRecommender(_Recommender): """A recommender based on the similarity between item content rather using user interaction patterns to compute similarity. @@ -281,7 +328,7 @@ class ItemContentRecommender(_Recommender): """ def __init__(self, model_proxy): - '''__init__(self)''' + """__init__(self)""" self.__proxy__ = model_proxy @classmethod diff --git a/src/python/turicreate/toolkits/recommender/item_similarity_recommender.py b/src/python/turicreate/toolkits/recommender/item_similarity_recommender.py index cfd9930727..75b3bee7b2 100644 --- a/src/python/turicreate/toolkits/recommender/item_similarity_recommender.py +++ b/src/python/turicreate/toolkits/recommender/item_similarity_recommender.py @@ -15,16 +15,22 @@ from turicreate.toolkits._model import _get_default_options_wrapper from turicreate.data_structures.sframe import SFrame as _SFrame -def create(observation_data, - user_id='user_id', item_id='item_id', target=None, - user_data=None, item_data=None, - nearest_items=None, - similarity_type='jaccard', - threshold=0.001, - only_top_k=64, - verbose=True, - target_memory_usage = 8*1024*1024*1024, - **kwargs): + +def create( + observation_data, + user_id="user_id", + item_id="item_id", + target=None, + user_data=None, + item_data=None, + nearest_items=None, + similarity_type="jaccard", + threshold=0.001, + only_top_k=64, + verbose=True, + target_memory_usage=8 * 1024 * 1024 * 1024, + **kwargs +): """ Create a recommender that uses item-item similarities based on users in common. @@ -191,8 +197,9 @@ def create(observation_data, """ from turicreate._cython.cy_server import QuietProgress + if not (isinstance(observation_data, _SFrame)): - raise TypeError('observation_data input must be a SFrame') + raise TypeError("observation_data input must be a SFrame") opts = {} model_proxy = _turicreate.extensions.item_similarity() model_proxy.init_options(opts) @@ -204,16 +211,17 @@ def create(observation_data, if nearest_items is None: nearest_items = _turicreate.SFrame() - opts = {'user_id': user_id, - 'item_id': item_id, - 'target': target, - 'similarity_type': similarity_type, - 'threshold': threshold, - 'target_memory_usage' : float(target_memory_usage), - 'max_item_neighborhood_size': only_top_k} + opts = { + "user_id": user_id, + "item_id": item_id, + "target": target, + "similarity_type": similarity_type, + "threshold": threshold, + "target_memory_usage": float(target_memory_usage), + "max_item_neighborhood_size": only_top_k, + } - - extra_data = {"nearest_items" : nearest_items} + extra_data = {"nearest_items": nearest_items} if kwargs: try: @@ -223,11 +231,11 @@ def create(observation_data, bad_arguments = set(kwargs.keys()).difference(possible_args) if bad_arguments: - raise TypeError("Bad Keyword Arguments: " + ', '.join(bad_arguments)) + raise TypeError("Bad Keyword Arguments: " + ", ".join(bad_arguments)) opts.update(kwargs) - extra_data = {"nearest_items" : nearest_items} + extra_data = {"nearest_items": nearest_items} opts.update(kwargs) with QuietProgress(verbose): @@ -237,9 +245,9 @@ def create(observation_data, _get_default_options = _get_default_options_wrapper( - 'item_similarity', - 'recommender.item_similarity', - 'ItemSimilarityRecommender') + "item_similarity", "recommender.item_similarity", "ItemSimilarityRecommender" +) + class ItemSimilarityRecommender(_Recommender): """ @@ -340,7 +348,7 @@ class ItemSimilarityRecommender(_Recommender): """ def __init__(self, model_proxy): - '''__init__(self)''' + """__init__(self)""" self.__proxy__ = model_proxy @classmethod diff --git a/src/python/turicreate/toolkits/recommender/popularity_recommender.py b/src/python/turicreate/toolkits/recommender/popularity_recommender.py index 6378d5c408..55293550af 100644 --- a/src/python/turicreate/toolkits/recommender/popularity_recommender.py +++ b/src/python/turicreate/toolkits/recommender/popularity_recommender.py @@ -13,11 +13,17 @@ from turicreate.toolkits.recommender.util import _Recommender from turicreate.data_structures.sframe import SFrame as _SFrame -def create(observation_data, - user_id='user_id', item_id='item_id', target=None, - user_data=None, item_data=None, - random_seed=0, - verbose=True): + +def create( + observation_data, + user_id="user_id", + item_id="item_id", + target=None, + user_data=None, + item_data=None, + random_seed=0, + verbose=True, +): """ Create a model that makes recommendations using item popularity. When no target column is provided, the popularity is determined by the number of @@ -80,8 +86,9 @@ def create(observation_data, PopularityRecommender """ from turicreate._cython.cy_server import QuietProgress + if not (isinstance(observation_data, _SFrame)): - raise TypeError('observation_data input must be a SFrame') + raise TypeError("observation_data input must be a SFrame") opts = {} model_proxy = _turicreate.extensions.popularity() model_proxy.init_options(opts) @@ -91,17 +98,15 @@ def create(observation_data, if item_data is None: item_data = _turicreate.SFrame() - opts = {'user_id': user_id, - 'item_id': item_id, - 'target': target, - 'random_seed': 1} + opts = {"user_id": user_id, "item_id": item_id, "target": target, "random_seed": 1} - extra_data = {"nearest_items" : _turicreate.SFrame()} + extra_data = {"nearest_items": _turicreate.SFrame()} with QuietProgress(verbose): model_proxy.train(observation_data, user_data, item_data, opts, extra_data) return PopularityRecommender(model_proxy) + class PopularityRecommender(_Recommender): """ The Popularity Model ranks an item according to its overall popularity. @@ -128,7 +133,7 @@ class PopularityRecommender(_Recommender): """ def __init__(self, model_proxy): - '''__init__(self)''' + """__init__(self)""" self.__proxy__ = model_proxy @classmethod diff --git a/src/python/turicreate/toolkits/recommender/ranking_factorization_recommender.py b/src/python/turicreate/toolkits/recommender/ranking_factorization_recommender.py index ca98e51b2b..399921554a 100644 --- a/src/python/turicreate/toolkits/recommender/ranking_factorization_recommender.py +++ b/src/python/turicreate/toolkits/recommender/ranking_factorization_recommender.py @@ -17,23 +17,29 @@ from turicreate.toolkits._model import _get_default_options_wrapper from turicreate.data_structures.sframe import SFrame as _SFrame -def create(observation_data, - user_id='user_id', item_id='item_id', target=None, - user_data=None, item_data=None, - num_factors=32, - regularization=1e-9, - linear_regularization=1e-9, - side_data_factorization=True, - ranking_regularization=0.25, - unobserved_rating_value=None, - num_sampled_negative_examples=4, - max_iterations=25, - sgd_step_size=0, - random_seed=0, - binary_target = False, - solver = 'auto', - verbose=True, - **kwargs): + +def create( + observation_data, + user_id="user_id", + item_id="item_id", + target=None, + user_data=None, + item_data=None, + num_factors=32, + regularization=1e-9, + linear_regularization=1e-9, + side_data_factorization=True, + ranking_regularization=0.25, + unobserved_rating_value=None, + num_sampled_negative_examples=4, + max_iterations=25, + sgd_step_size=0, + random_seed=0, + binary_target=False, + solver="auto", + verbose=True, + **kwargs +): """Create a RankingFactorizationRecommender that learns latent factors for each user and item and uses them to make rating predictions. @@ -218,8 +224,9 @@ def create(observation_data, """ from turicreate._cython.cy_server import QuietProgress + if not (isinstance(observation_data, _SFrame)): - raise TypeError('observation_data input must be a SFrame') + raise TypeError("observation_data input must be a SFrame") opts = {} model_proxy = _turicreate.extensions.ranking_factorization_recommender() model_proxy.init_options(opts) @@ -232,22 +239,23 @@ def create(observation_data, if target is None: binary_target = True - opts = {'user_id' : user_id, - 'item_id' : item_id, - 'target' : target, - 'random_seed' : random_seed, - 'num_factors' : num_factors, - 'regularization' : regularization, - 'linear_regularization' : linear_regularization, - 'ranking_regularization' : ranking_regularization, - 'binary_target' : binary_target, - 'max_iterations' : max_iterations, - 'side_data_factorization' : side_data_factorization, - 'num_sampled_negative_examples' : num_sampled_negative_examples, - 'solver' : solver, - - # Has no effect here. - 'sgd_step_size' : sgd_step_size} + opts = { + "user_id": user_id, + "item_id": item_id, + "target": target, + "random_seed": random_seed, + "num_factors": num_factors, + "regularization": regularization, + "linear_regularization": linear_regularization, + "ranking_regularization": ranking_regularization, + "binary_target": binary_target, + "max_iterations": max_iterations, + "side_data_factorization": side_data_factorization, + "num_sampled_negative_examples": num_sampled_negative_examples, + "solver": solver, + # Has no effect here. + "sgd_step_size": sgd_step_size, + } if unobserved_rating_value is not None: opts["unobserved_rating_value"] = unobserved_rating_value @@ -260,20 +268,23 @@ def create(observation_data, bad_arguments = set(kwargs.keys()).difference(possible_args) if bad_arguments: - raise TypeError("Bad Keyword Arguments: " + ', '.join(bad_arguments)) + raise TypeError("Bad Keyword Arguments: " + ", ".join(bad_arguments)) opts.update(kwargs) - extra_data = {"nearest_items" : _turicreate.SFrame()} + extra_data = {"nearest_items": _turicreate.SFrame()} with QuietProgress(verbose): model_proxy.train(observation_data, user_data, item_data, opts, extra_data) return RankingFactorizationRecommender(model_proxy) + _get_default_options = _get_default_options_wrapper( - 'ranking_factorization_recommender', - 'recommender.RankingFactorizationRecommender', - 'RankingFactorizationRecommender') + "ranking_factorization_recommender", + "recommender.RankingFactorizationRecommender", + "RankingFactorizationRecommender", +) + class RankingFactorizationRecommender(_Recommender): r""" @@ -545,7 +556,7 @@ class RankingFactorizationRecommender(_Recommender): """ def __init__(self, model_proxy): - '''__init__(self)''' + """__init__(self)""" self.__proxy__ = model_proxy @classmethod diff --git a/src/python/turicreate/toolkits/recommender/util.py b/src/python/turicreate/toolkits/recommender/util.py index 5294196d7e..5d1ca2d5b6 100644 --- a/src/python/turicreate/toolkits/recommender/util.py +++ b/src/python/turicreate/toolkits/recommender/util.py @@ -15,18 +15,23 @@ from turicreate.toolkits import _coreml_utils from turicreate.toolkits._main import ToolkitError as _ToolkitError from turicreate.toolkits._model import Model as _Model -from turicreate.toolkits._internal_utils import _toolkit_repr_print, \ - _precomputed_field +from turicreate.toolkits._internal_utils import _toolkit_repr_print, _precomputed_field import turicreate.aggregate as _Aggregate from turicreate.data_structures.sarray import SArray as _SArray from turicreate.data_structures.sframe import SFrame as _SFrame from turicreate._deps import numpy as _numpy, HAS_NUMPY as _HAS_NUMPY -def _create(observation_data, - user_id='user_id', item_id='item_id', target=None, - user_data=None, item_data=None, - ranking=True, - verbose=True): + +def _create( + observation_data, + user_id="user_id", + item_id="item_id", + target=None, + user_data=None, + item_data=None, + ranking=True, + verbose=True, +): """ A unified interface for training recommender models. Based on simple characteristics of the data, a type of model is selected and trained. The @@ -125,46 +130,47 @@ def _create(observation_data, """ if not (isinstance(observation_data, _SFrame)): - raise TypeError('observation_data input must be a SFrame') + raise TypeError("observation_data input must be a SFrame") side_data = (user_data is not None) or (item_data is not None) if user_data is not None: if not isinstance(user_data, _SFrame): - raise TypeError('Provided user_data must be an SFrame.') + raise TypeError("Provided user_data must be an SFrame.") if item_data is not None: if not isinstance(item_data, _SFrame): - raise TypeError('Provided item_data must be an SFrame.') + raise TypeError("Provided item_data must be an SFrame.") if target is None: if ranking: if side_data: - method = 'ranking_factorization_recommender' + method = "ranking_factorization_recommender" else: - method = 'item_similarity' + method = "item_similarity" else: if side_data: - method = 'ranking_factorization_recommender' + method = "ranking_factorization_recommender" else: - method = 'item_similarity' + method = "item_similarity" else: if ranking: if side_data: - method = 'ranking_factorization_recommender' + method = "ranking_factorization_recommender" else: - method = 'ranking_factorization_recommender' + method = "ranking_factorization_recommender" else: if side_data: - method = 'factorization_recommender' + method = "factorization_recommender" else: - method = 'factorization_recommender' - - opts = {'observation_data': observation_data, - 'user_id': user_id, - 'item_id': item_id, - 'target': target, - 'user_data': user_data, - 'item_data': item_data} + method = "factorization_recommender" + opts = { + "observation_data": observation_data, + "user_id": user_id, + "item_id": item_id, + "target": target, + "user_data": user_data, + "item_data": item_data, + } if method == "item_similarity": return _turicreate.recommender.item_similarity_recommender.create(**opts) @@ -175,13 +181,19 @@ def _create(observation_data, else: raise RuntimeError("Provided method not recognized.") -def compare_models(dataset, models, model_names=None, user_sample=1.0, - metric='auto', - target=None, - exclude_known_for_precision_recall=True, - make_plot=False, - verbose=True, - **kwargs): + +def compare_models( + dataset, + models, + model_names=None, + user_sample=1.0, + metric="auto", + target=None, + exclude_known_for_precision_recall=True, + make_plot=False, + verbose=True, + **kwargs +): """ Compare the prediction or recommendation performance of recommender models on a common test dataset. @@ -283,15 +295,19 @@ def compare_models(dataset, models, model_names=None, user_sample=1.0, num_models = len(models) if model_names is None: - model_names = ['M' + str(i) for i in range(len(models))] + model_names = ["M" + str(i) for i in range(len(models))] if num_models < 1: - raise ValueError("Must pass in at least one recommender model to \ - evaluate") + raise ValueError( + "Must pass in at least one recommender model to \ + evaluate" + ) if model_names is not None and len(model_names) != num_models: - raise ValueError("Must pass in the same number of model names as \ - models") + raise ValueError( + "Must pass in the same number of model names as \ + models" + ) # if we are asked to sample the users, come up with a list of unique users if user_sample < 1.0: @@ -316,22 +332,22 @@ def compare_models(dataset, models, model_names=None, user_sample=1.0, results = [] for (m, mname) in zip(models, model_names): if verbose: - print('PROGRESS: Evaluate model %s' % mname) - r = m.evaluate(dataset_subset, - metric, - exclude_known_for_precision_recall, - target, - verbose=verbose, - cutoffs=list(range(1,11,1))+list(range(11,50,5)), - **kwargs) + print("PROGRESS: Evaluate model %s" % mname) + r = m.evaluate( + dataset_subset, + metric, + exclude_known_for_precision_recall, + target, + verbose=verbose, + cutoffs=list(range(1, 11, 1)) + list(range(11, 50, 5)), + **kwargs + ) results.append(r) return results -def precision_recall_by_user(observed_user_items, - recommendations, - cutoffs=[10]): +def precision_recall_by_user(observed_user_items, recommendations, cutoffs=[10]): """ Compute precision and recall at a given cutoff for each user. In information retrieval terms, precision represents the ratio of relevant, retrieved items @@ -403,37 +419,45 @@ def precision_recall_by_user(observed_user_items, user_id = recommendations.column_names()[0] item_id = recommendations.column_names()[1] - assert observed_user_items.num_rows() > 0, \ - "Evaluating precision and recall requires a non-empty " + \ - "observed_user_items." - assert user_id in observed_user_items.column_names(), \ - "User column required in observed_user_items." - assert item_id in observed_user_items.column_names(), \ - "Item column required in observed_user_items." - assert observed_user_items[user_id].dtype == \ - recommendations[user_id].dtype, \ - "The user column in the two provided SFrames must have the same type." - assert observed_user_items[item_id].dtype == \ - recommendations[item_id].dtype, \ - "The user column in the two provided SFrames must have the same type." - - cutoffs = _array.array('f', cutoffs) - - opts = {'data': observed_user_items, - 'recommendations': recommendations, - 'cutoffs': cutoffs} - - response = _turicreate.toolkits._main.run('evaluation_precision_recall_by_user', opts) - sf = _SFrame(None, _proxy=response['pr']) - return sf.sort([user_id, 'cutoff']) - - -def random_split_by_user(dataset, - user_id='user_id', - item_id='item_id', - max_num_users=1000, - item_test_proportion=.2, - random_seed=0): + assert observed_user_items.num_rows() > 0, ( + "Evaluating precision and recall requires a non-empty " + "observed_user_items." + ) + assert ( + user_id in observed_user_items.column_names() + ), "User column required in observed_user_items." + assert ( + item_id in observed_user_items.column_names() + ), "Item column required in observed_user_items." + assert ( + observed_user_items[user_id].dtype == recommendations[user_id].dtype + ), "The user column in the two provided SFrames must have the same type." + assert ( + observed_user_items[item_id].dtype == recommendations[item_id].dtype + ), "The user column in the two provided SFrames must have the same type." + + cutoffs = _array.array("f", cutoffs) + + opts = { + "data": observed_user_items, + "recommendations": recommendations, + "cutoffs": cutoffs, + } + + response = _turicreate.toolkits._main.run( + "evaluation_precision_recall_by_user", opts + ) + sf = _SFrame(None, _proxy=response["pr"]) + return sf.sort([user_id, "cutoff"]) + + +def random_split_by_user( + dataset, + user_id="user_id", + item_id="item_id", + max_num_users=1000, + item_test_proportion=0.2, + random_seed=0, +): """Create a recommender-friendly train-test split of the provided data set. The test dataset is generated by first choosing `max_num_users` out of the @@ -482,23 +506,27 @@ def random_split_by_user(dataset, """ - assert user_id in dataset.column_names(), \ - 'Provided user column "{0}" not found in data set.'.format(user_id) - assert item_id in dataset.column_names(), \ - 'Provided item column "{0}" not found in data set.'.format(item_id) + assert ( + user_id in dataset.column_names() + ), 'Provided user column "{0}" not found in data set.'.format(user_id) + assert ( + item_id in dataset.column_names() + ), 'Provided item column "{0}" not found in data set.'.format(item_id) - if max_num_users == 'all': + if max_num_users == "all": max_num_users = None if random_seed is None: import time - random_seed = int(hash("%20f" % time.time()) % 2**63) - response = _turicreate.extensions._recsys.train_test_split(dataset, user_id, item_id, - max_num_users, item_test_proportion, random_seed) + random_seed = int(hash("%20f" % time.time()) % 2 ** 63) + + response = _turicreate.extensions._recsys.train_test_split( + dataset, user_id, item_id, max_num_users, item_test_proportion, random_seed + ) - train = response['train'] - test = response['test'] + train = response["train"] + test = response["test"] return train, test @@ -513,8 +541,6 @@ class _Recommender(_Model): def _native_name(cls): return None - - def _list_fields(self): """ Get the current settings of the model. The keys depend on the type of @@ -527,7 +553,7 @@ def _list_fields(self): """ response = self.__proxy__.list_fields() - return [s for s in response['value'] if not s.startswith("_")] + return [s for s in response["value"] if not s.startswith("_")] def _get(self, field): """ @@ -593,7 +619,6 @@ def get_num_users_per_item(self): response = self.__proxy__.get_num_users_per_item() return response - def __str__(self): """ Returns the type of model. @@ -631,9 +656,7 @@ def _get_summary_struct(self): sections = [] observation_columns = set(self.observation_data_column_names) - not_needed = set([self.user_id, - self.item_id, - self.target]) + not_needed = set([self.user_id, self.item_id, self.target]) num_obs_fields = len(observation_columns.difference(not_needed)) user_features = self.user_side_data_column_names @@ -641,44 +664,42 @@ def _get_summary_struct(self): section_titles.append("Schema") schema_fields = [ - ('User ID', 'user_id'), - ('Item ID', 'item_id'), - ('Target', 'target'), - ('Additional observation features', _precomputed_field(num_obs_fields)), - ('User side features', _precomputed_field(user_features)), - ('Item side features', _precomputed_field(item_features))] + ("User ID", "user_id"), + ("Item ID", "item_id"), + ("Target", "target"), + ("Additional observation features", _precomputed_field(num_obs_fields)), + ("User side features", _precomputed_field(user_features)), + ("Item side features", _precomputed_field(item_features)), + ] sections.append(schema_fields) data_fields = [ - ('Number of observations', 'num_observations'), - ('Number of users', 'num_users'), - ('Number of items', 'num_items')] + ("Number of observations", "num_observations"), + ("Number of users", "num_users"), + ("Number of items", "num_items"), + ] section_titles.append("Statistics") sections.append(data_fields) - training_fields = [ - ('Training time', 'training_time')] + training_fields = [("Training time", "training_time")] - if 'data_load_elapsed_time' in stats: - training_fields.append(('Data load time', - 'data_load_elapsed_time')) - if 'validation_metrics_elapsed_time' in stats: - training_fields.append(('Validation metrics time', - 'validation_metrics_elapsed_time')) + if "data_load_elapsed_time" in stats: + training_fields.append(("Data load time", "data_load_elapsed_time")) + if "validation_metrics_elapsed_time" in stats: + training_fields.append( + ("Validation metrics time", "validation_metrics_elapsed_time") + ) section_titles.append("Training summary") sections.append(training_fields) # Remove any options that should not be shown under "Settings" - to_ignore = ['random_seed', - 'user_id', - 'item_id', - 'target'] + to_ignore = ["random_seed", "user_id", "item_id", "target"] for k in to_ignore: if k in options: del options[k] - def add_ordered_options(name, ordered_options, additional = []): + def add_ordered_options(name, ordered_options, additional=[]): option_fields = [] for k, v in additional: @@ -706,10 +727,14 @@ def add_ordered_options(name, ordered_options, additional = []): "nmf", "max_iterations", "similarity_type", - "training_method"] + "training_method", + ] - add_ordered_options("Model Parameters", model_parameter_options, - [("Model class", self.__class__.__name__)]) + add_ordered_options( + "Model Parameters", + model_parameter_options, + [("Model class", self.__class__.__name__)], + ) # Regularization type options regularization_options = [ @@ -720,7 +745,8 @@ def add_ordered_options(name, ordered_options, additional = []): "unobserved_rating_value", "num_sampled_negative_examples", "ials_confidence_scaling_type", - "ials_confidence_scaling_factor"] + "ials_confidence_scaling_factor", + ] add_ordered_options("Regularization Settings", regularization_options) @@ -740,7 +766,8 @@ def add_ordered_options(name, ordered_options, additional = []): "adagrad_momentum_weighting", "num_tempering_iterations", "tempering_regularization_start_value", - "track_exact_loss"] + "track_exact_loss", + ] add_ordered_options("Optimization Settings", optimization_settings) @@ -800,18 +827,24 @@ def __prepare_dataset_parameter(self, dataset): # Translate the dataset argument into the proper type if not isinstance(dataset, _SFrame): + def raise_dataset_type_exception(): - raise TypeError("The dataset parameter must be either an SFrame, " - "or a dictionary of (str : list) or (str : value).") + raise TypeError( + "The dataset parameter must be either an SFrame, " + "or a dictionary of (str : list) or (str : value)." + ) if type(dataset) is dict: if not all(type(k) is str for k in _six.iterkeys(dataset)): raise_dataset_type_exception() - if all(type(v) in (list, tuple, _array.array) for v in _six.itervalues(dataset)): + if all( + type(v) in (list, tuple, _array.array) + for v in _six.itervalues(dataset) + ): dataset = _SFrame(dataset) else: - dataset = _SFrame({k : [v] for k, v in _six.iteritems(dataset)}) + dataset = _SFrame({k: [v] for k, v in _six.iteritems(dataset)}) else: raise_dataset_type_exception() @@ -826,13 +859,16 @@ def _get_data_schema(self): if not hasattr(self, "_data_schema"): response = self.__proxy__.get_data_schema() - self._data_schema = {k : _turicreate._cython.cy_flexible_type.pytype_from_type_name(v) - for k, v in response["schema"].items()} + self._data_schema = { + k: _turicreate._cython.cy_flexible_type.pytype_from_type_name(v) + for k, v in response["schema"].items() + } return self._data_schema - def predict(self, dataset, - new_observation_data=None, new_user_data=None, new_item_data=None): + def predict( + self, dataset, new_observation_data=None, new_user_data=None, new_item_data=None + ): """ Return a score prediction for the user ids and item ids in the provided data set. @@ -883,21 +919,26 @@ def predict(self, dataset, if new_item_data is None: new_item_data = _SFrame() - dataset = self.__prepare_dataset_parameter(dataset) def check_type(arg, arg_name, required_type, allowed_types): if not isinstance(arg, required_type): - raise TypeError("Parameter " + arg_name + " must be of type(s) " - + (", ".join(allowed_types)) - + "; Type '" + str(type(arg)) + "' not recognized.") + raise TypeError( + "Parameter " + + arg_name + + " must be of type(s) " + + (", ".join(allowed_types)) + + "; Type '" + + str(type(arg)) + + "' not recognized." + ) check_type(new_observation_data, "new_observation_data", _SFrame, ["SFrame"]) check_type(new_user_data, "new_user_data", _SFrame, ["SFrame"]) check_type(new_item_data, "new_item_data", _SFrame, ["SFrame"]) response = self.__proxy__.predict(dataset, new_user_data, new_item_data) - return response['prediction'] + return response["prediction"] def get_similar_items(self, items=None, k=10, verbose=False): """ @@ -953,9 +994,15 @@ def get_similar_items(self, items=None, k=10, verbose=False): def check_type(arg, arg_name, required_type, allowed_types): if not isinstance(arg, required_type): - raise TypeError("Parameter " + arg_name + " must be of type(s) " - + (", ".join(allowed_types) ) - + "; Type '" + str(type(arg)) + "' not recognized.") + raise TypeError( + "Parameter " + + arg_name + + " must be of type(s) " + + (", ".join(allowed_types)) + + "; Type '" + + str(type(arg)) + + "' not recognized." + ) check_type(items, "items", _SArray, ["SArray", "list"]) check_type(k, "k", int, ["int"]) @@ -1012,9 +1059,15 @@ def get_similar_users(self, users=None, k=10): def check_type(arg, arg_name, required_type, allowed_types): if not isinstance(arg, required_type): - raise TypeError("Parameter " + arg_name + " must be of type(s) " - + (", ".join(allowed_types) ) - + "; Type '" + str(type(arg)) + "' not recognized.") + raise TypeError( + "Parameter " + + arg_name + + " must be of type(s) " + + (", ".join(allowed_types)) + + "; Type '" + + str(type(arg)) + + "' not recognized." + ) check_type(users, "users", _SArray, ["SArray", "list"]) check_type(k, "k", int, ["int"]) @@ -1022,11 +1075,20 @@ def check_type(arg, arg_name, required_type, allowed_types): response = self.__proxy__.get_similar_users(users, k, get_all_users) return response - - def recommend(self, users=None, k=10, exclude=None, items=None, - new_observation_data=None, new_user_data=None, new_item_data=None, - exclude_known=True, diversity=0, random_seed=None, - verbose=True): + def recommend( + self, + users=None, + k=10, + exclude=None, + items=None, + new_observation_data=None, + new_user_data=None, + new_item_data=None, + exclude_known=True, + diversity=0, + random_seed=None, + verbose=True, + ): """ Recommend the ``k`` highest scored items for each user. @@ -1159,18 +1221,22 @@ def recommend(self, users=None, k=10, exclude=None, items=None, if new_item_data is None: new_item_data = __null_sframe - if isinstance(users, list) or (_HAS_NUMPY and isinstance(users, _numpy.ndarray)): + if isinstance(users, list) or ( + _HAS_NUMPY and isinstance(users, _numpy.ndarray) + ): users = _SArray(users) # allow to take a list of dictionaries of the form [{'user_id':1,'time':10}] etc. if users.dtype == dict: - users = users.unpack(column_name_prefix='') + users = users.unpack(column_name_prefix="") if isinstance(users, _SArray): users = _SFrame({user_id: users}) - if isinstance(items, list) or (_HAS_NUMPY and isinstance(items, _numpy.ndarray)): - items = _SArray(items, dtype = item_type) + if isinstance(items, list) or ( + _HAS_NUMPY and isinstance(items, _numpy.ndarray) + ): + items = _SArray(items, dtype=item_type) if isinstance(items, _SArray): items = _SFrame({item_id: items}) @@ -1178,13 +1244,23 @@ def recommend(self, users=None, k=10, exclude=None, items=None, # Check type of incoming data. def check_type(arg, arg_name, required_type, allowed_types): if not isinstance(arg, required_type): - raise TypeError("Parameter " + arg_name + " must be of type(s) " - + (", ".join(allowed_types)) - + "; Type '" + str(type(arg)) + "' not recognized.") - - check_type(users, "users", _SFrame, ["SArray", "list", "SFrame", "numpy.ndarray"]) + raise TypeError( + "Parameter " + + arg_name + + " must be of type(s) " + + (", ".join(allowed_types)) + + "; Type '" + + str(type(arg)) + + "' not recognized." + ) + + check_type( + users, "users", _SFrame, ["SArray", "list", "SFrame", "numpy.ndarray"] + ) check_type(exclude, "exclude", _SFrame, ["SFrame"]) - check_type(items, "items", _SFrame, ["SFrame", "SArray", "list", "numpy.ndarray"]) + check_type( + items, "items", _SFrame, ["SFrame", "SArray", "list", "numpy.ndarray"] + ) check_type(new_observation_data, "new_observation_data", _SFrame, ["SFrame"]) check_type(new_user_data, "new_user_data", _SFrame, ["SFrame"]) check_type(new_item_data, "new_item_data", _SFrame, ["SFrame"]) @@ -1210,42 +1286,54 @@ def check_type(arg, arg_name, required_type, allowed_types): # everything back and forth to that to preserve type. if new_observation_data.num_rows() == 0: - raise ValueError("When users are not specified with the model, " - "new_observation_data must be set in order to make recommendations.") + raise ValueError( + "When users are not specified with the model, " + "new_observation_data must be set in order to make recommendations." + ) - new_observation_data[user_id] = new_observation_data[user_id].astype(user_type) + new_observation_data[user_id] = new_observation_data[user_id].astype( + user_type + ) else: - print("WARNING: No users specified to model at creation time, so " - "calling recommend() for all users returns empty SFrame.") + print( + "WARNING: No users specified to model at creation time, so " + "calling recommend() for all users returns empty SFrame." + ) # Cast to the appropriate type if necessary. if users.num_rows() != 0: try: user_column = users[user_id] except RuntimeError: - raise _ToolkitError("User column '%s' not present in input user data." % user_id) + raise _ToolkitError( + "User column '%s' not present in input user data." % user_id + ) if cast_user_to_string_type: assert new_observation_data.num_rows() != 0 original_user_type = user_column.dtype users[user_id] = user_column.astype(str) - user_type=str + user_type = str elif user_column.dtype != user_type: users[user_id] = user_column.astype(user_type) # Cast user specified in exclude to the appropriate type if necessary. - if user_id in exclude.column_names() and exclude[user_id].dtype!=user_type: - exclude[user_id] = exclude[user_id].astype(user_type) + if user_id in exclude.column_names() and exclude[user_id].dtype != user_type: + exclude[user_id] = exclude[user_id].astype(user_type) try: diversity = float(diversity) except Exception: - raise TypeError("Parameter diversity must be a floating point value equal to or larger than 0.") + raise TypeError( + "Parameter diversity must be a floating point value equal to or larger than 0." + ) if diversity < 0: - raise TypeError("Parameter diversity must be a floating point value equal to or larger than 0.") + raise TypeError( + "Parameter diversity must be a floating point value equal to or larger than 0." + ) if random_seed is None: random_seed = hash("%.20f" % _time.time()) @@ -1256,8 +1344,18 @@ def check_type(arg, arg_name, required_type, allowed_types): raise TypeError("random_seed must be integer.") with QuietProgress(verbose): - recs = self.__proxy__.recommend(users, exclude, items, new_observation_data, new_user_data, - new_item_data, exclude_known, k, diversity, random_seed) + recs = self.__proxy__.recommend( + users, + exclude, + items, + new_observation_data, + new_user_data, + new_item_data, + exclude_known, + k, + diversity, + random_seed, + ) if cast_user_to_string_type: recs[user_id] = recs[user_id].astype(original_user_type) @@ -1265,10 +1363,18 @@ def check_type(arg, arg_name, required_type, allowed_types): return recs def recommend_from_interactions( - self, observed_items, k=10, exclude=None, items=None, - new_user_data=None, new_item_data=None, - exclude_known=True, diversity=0, random_seed=None, - verbose=True): + self, + observed_items, + k=10, + exclude=None, + items=None, + new_user_data=None, + new_item_data=None, + exclude_known=True, + diversity=0, + random_seed=None, + verbose=True, + ): """ Recommend the ``k`` highest scored items based on the interactions given in `observed_items.` @@ -1366,17 +1472,19 @@ def recommend_from_interactions( item_type = column_types[item_id] if not hasattr(self, "_implicit_user_name"): - self._implicit_user_name = None #("implicit-user-%s" -# % hashlib.md5("%0.20f" % time.time()).hexdigest()[:12]) + self._implicit_user_name = None # ("implicit-user-%s" + # % hashlib.md5("%0.20f" % time.time()).hexdigest()[:12]) if isinstance(observed_items, list): - observed_items = _SArray(observed_items, dtype = item_type) + observed_items = _SArray(observed_items, dtype=item_type) if isinstance(observed_items, _SArray): - observed_items = _SFrame({self.item_id : observed_items}) + observed_items = _SFrame({self.item_id: observed_items}) if not isinstance(observed_items, _SFrame): - raise TypeError("observed_items must be a list or SArray of items, or an SFrame of items " - "and optionally ratings or other interaction information.") + raise TypeError( + "observed_items must be a list or SArray of items, or an SFrame of items " + "and optionally ratings or other interaction information." + ) # Don't modify the user's argument (if it's an SFrame). observed_items = observed_items.copy() @@ -1387,12 +1495,14 @@ def recommend_from_interactions( if user_id in observed_items.column_names(): main_user_value = observed_items[user_id][0] if (observed_items[user_id] != main_user_value).any(): - raise ValueError("To recommend items for more than one user, use `recommend()` and " - "supply new interactions using new_observation_data.") - users = _SArray([main_user_value], dtype = user_type) + raise ValueError( + "To recommend items for more than one user, use `recommend()` and " + "supply new interactions using new_observation_data." + ) + users = _SArray([main_user_value], dtype=user_type) else: - users = _SArray([self._implicit_user_name], dtype = user_type) + users = _SArray([self._implicit_user_name], dtype=user_type) observed_items[user_id] = self._implicit_user_name if observed_items[user_id].dtype != user_type: @@ -1401,33 +1511,40 @@ def recommend_from_interactions( # Check the rest of the arguments. if exclude is not None: if isinstance(exclude, list): - exclude = _SArray(exclude, dtype = item_type) + exclude = _SArray(exclude, dtype=item_type) if isinstance(exclude, _SArray): - exclude = _SFrame({item_id : exclude}) + exclude = _SFrame({item_id: exclude}) if user_id not in exclude.column_names(): exclude[user_id] = self._implicit_user_name exclude[user_id] = exclude[user_id].astype(user_type) recommendations = self.recommend( - users = users, - new_observation_data = observed_items, - exclude = exclude, - k = k, - items = items, - new_user_data = new_user_data, - new_item_data = new_item_data, - exclude_known = exclude_known, - diversity = diversity, - random_seed = random_seed, - verbose = verbose) + users=users, + new_observation_data=observed_items, + exclude=exclude, + k=k, + items=items, + new_user_data=new_user_data, + new_item_data=new_item_data, + exclude_known=exclude_known, + diversity=diversity, + random_seed=random_seed, + verbose=verbose, + ) del recommendations[user_id] return recommendations - def evaluate_precision_recall(self, dataset, cutoffs=list(range(1,11,1))+list(range(11,50,5)), - skip_set=None, exclude_known=True, - verbose=True, **kwargs): + def evaluate_precision_recall( + self, + dataset, + cutoffs=list(range(1, 11, 1)) + list(range(11, 50, 5)), + skip_set=None, + exclude_known=True, + verbose=True, + **kwargs + ): """ Compute a model's precision and recall scores for a particular dataset. @@ -1479,10 +1596,11 @@ def evaluate_precision_recall(self, dataset, cutoffs=list(range(1,11,1))+list(ra user_column = self.user_id item_column = self.item_id - assert user_column in dataset.column_names() and \ - item_column in dataset.column_names(), \ - 'Provided data set must have a column pertaining to user ids and \ - item ids, similar to what we had during training.' + assert ( + user_column in dataset.column_names() + and item_column in dataset.column_names() + ), "Provided data set must have a column pertaining to user ids and \ + item ids, similar to what we had during training." dataset = self.__prepare_dataset_parameter(dataset) @@ -1490,21 +1608,30 @@ def evaluate_precision_recall(self, dataset, cutoffs=list(range(1,11,1))+list(ra dataset = dataset[[self.user_id, self.item_id]] - recs = self.recommend(users=users, k=max(cutoffs), exclude=skip_set, - exclude_known=exclude_known, - verbose=verbose, - **kwargs) + recs = self.recommend( + users=users, + k=max(cutoffs), + exclude=skip_set, + exclude_known=exclude_known, + verbose=verbose, + **kwargs + ) - precision_recall_by_user = self.__proxy__.precision_recall_by_user(dataset, recs, cutoffs) + precision_recall_by_user = self.__proxy__.precision_recall_by_user( + dataset, recs, cutoffs + ) - ret = {'precision_recall_by_user': precision_recall_by_user} + ret = {"precision_recall_by_user": precision_recall_by_user} pr_agg = precision_recall_by_user.groupby( - 'cutoff', - operations={'precision' : _Aggregate.MEAN('precision'), - 'recall' : _Aggregate.MEAN('recall')}) - - pr_agg = pr_agg[['cutoff', 'precision', 'recall']] + "cutoff", + operations={ + "precision": _Aggregate.MEAN("precision"), + "recall": _Aggregate.MEAN("recall"), + }, + ) + + pr_agg = pr_agg[["cutoff", "precision", "recall"]] ret["precision_recall_overall"] = pr_agg.sort("cutoff") return ret @@ -1542,38 +1669,55 @@ def evaluate_rmse(self, dataset, target): turicreate.evaluation.rmse """ - assert target in dataset.column_names(), \ - 'Provided dataset must contain a target column with the same \ - name as the target used during training.' + assert ( + target in dataset.column_names() + ), "Provided dataset must contain a target column with the same \ + name as the target used during training." y = dataset[target] yhat = self.predict(dataset) user_column = self.user_id item_column = self.item_id - assert user_column in dataset.column_names() and \ - item_column in dataset.column_names(), \ - 'Provided data set must have a column pertaining to user ids and \ - item ids, similar to what we had during training.' + assert ( + user_column in dataset.column_names() + and item_column in dataset.column_names() + ), "Provided data set must have a column pertaining to user ids and \ + item ids, similar to what we had during training." result = dataset[[user_column, item_column]] - result['sq_error'] = (y - yhat) * (y - yhat) - rmse_by_user = result.groupby(user_column, - {'rmse':_turicreate.aggregate.AVG('sq_error'), - 'count':_turicreate.aggregate.COUNT}) - rmse_by_user['rmse'] = rmse_by_user['rmse'].apply(lambda x: x**.5) - rmse_by_item = result.groupby(item_column, - {'rmse':_turicreate.aggregate.AVG('sq_error'), - 'count':_turicreate.aggregate.COUNT}) - rmse_by_item['rmse'] = rmse_by_item['rmse'].apply(lambda x: x**.5) - overall_rmse = result['sq_error'].mean() ** .5 - - return {'rmse_by_user': rmse_by_user, - 'rmse_by_item': rmse_by_item, - 'rmse_overall': overall_rmse} - - def evaluate(self, dataset, metric='auto', - exclude_known_for_precision_recall=True, - target=None, - verbose=True, **kwargs): + result["sq_error"] = (y - yhat) * (y - yhat) + rmse_by_user = result.groupby( + user_column, + { + "rmse": _turicreate.aggregate.AVG("sq_error"), + "count": _turicreate.aggregate.COUNT, + }, + ) + rmse_by_user["rmse"] = rmse_by_user["rmse"].apply(lambda x: x ** 0.5) + rmse_by_item = result.groupby( + item_column, + { + "rmse": _turicreate.aggregate.AVG("sq_error"), + "count": _turicreate.aggregate.COUNT, + }, + ) + rmse_by_item["rmse"] = rmse_by_item["rmse"].apply(lambda x: x ** 0.5) + overall_rmse = result["sq_error"].mean() ** 0.5 + + return { + "rmse_by_user": rmse_by_user, + "rmse_by_item": rmse_by_item, + "rmse_overall": overall_rmse, + } + + def evaluate( + self, + dataset, + metric="auto", + exclude_known_for_precision_recall=True, + target=None, + verbose=True, + **kwargs + ): r""" Evaluate the model's ability to make rating predictions or recommendations. @@ -1661,38 +1805,53 @@ def evaluate(self, dataset, metric='auto', dataset = self.__prepare_dataset_parameter(dataset) # If the model does not have a target column, compute prec-recall. - if metric in ['precision_recall', 'auto']: - results = self.evaluate_precision_recall(dataset, - exclude_known=exclude_known_for_precision_recall, - verbose=verbose, - **kwargs) + if metric in ["precision_recall", "auto"]: + results = self.evaluate_precision_recall( + dataset, + exclude_known=exclude_known_for_precision_recall, + verbose=verbose, + **kwargs + ) ret.update(results) if verbose: print("\nPrecision and recall summary statistics by cutoff") - print(results['precision_recall_by_user'].groupby('cutoff', \ - {'mean_precision': _turicreate.aggregate.AVG('precision'), - 'mean_recall': _turicreate.aggregate.AVG('recall')}).topk('cutoff', reverse=True)) - if metric in ['rmse', 'auto']: + print( + results["precision_recall_by_user"] + .groupby( + "cutoff", + { + "mean_precision": _turicreate.aggregate.AVG("precision"), + "mean_recall": _turicreate.aggregate.AVG("recall"), + }, + ) + .topk("cutoff", reverse=True) + ) + if metric in ["rmse", "auto"]: if target is None: target = self.target if target is None or target == "": - _logging.warning("Model trained without a target. Skipping RMSE computation.") + _logging.warning( + "Model trained without a target. Skipping RMSE computation." + ) else: results = self.evaluate_rmse(dataset, target) ret.update(results) if verbose: - print("\nOverall RMSE:", results['rmse_overall']) + print("\nOverall RMSE:", results["rmse_overall"]) print("\nPer User RMSE (best)") - print(results['rmse_by_user'].topk('rmse', 1, reverse=True)) + print(results["rmse_by_user"].topk("rmse", 1, reverse=True)) print("\nPer User RMSE (worst)") - print(results['rmse_by_user'].topk('rmse', 1)) + print(results["rmse_by_user"].topk("rmse", 1)) print("\nPer Item RMSE (best)") - print(results['rmse_by_item'].topk('rmse', 1, reverse=True)) + print(results["rmse_by_item"].topk("rmse", 1, reverse=True)) print("\nPer Item RMSE (worst)") - print(results['rmse_by_item'].topk('rmse', 1)) - if metric not in ['rmse', 'precision_recall', 'auto']: - raise ValueError('Unknown evaluation metric %s, supported metrics are [\"rmse\", \"precision_recall\"]' % metric) + print(results["rmse_by_item"].topk("rmse", 1)) + if metric not in ["rmse", "precision_recall", "auto"]: + raise ValueError( + 'Unknown evaluation metric %s, supported metrics are ["rmse", "precision_recall"]' + % metric + ) return ret @@ -1705,6 +1864,7 @@ def _get_popularity_baseline(self): response = self.__proxy__.get_popularity_baseline() from .popularity_recommender import PopularityRecommender + return PopularityRecommender(response) def _get_item_intersection_info(self, item_pairs): @@ -1730,16 +1890,24 @@ def _get_item_intersection_info(self, item_pairs): if type(item_pairs) is list: if not all(type(t) in [list, tuple] and len(t) == 2 for t in item_pairs): - raise TypeError("item_pairs must be 2-column SFrame of two item " - "columns, or a list of (item_1, item_2) tuples. ") + raise TypeError( + "item_pairs must be 2-column SFrame of two item " + "columns, or a list of (item_1, item_2) tuples. " + ) item_name = self.item_id - item_pairs = _turicreate.SFrame({item_name + "_1" : [v1 for v1, v2 in item_pairs], - item_name + "_2" : [v2 for v1, v2 in item_pairs]}) + item_pairs = _turicreate.SFrame( + { + item_name + "_1": [v1 for v1, v2 in item_pairs], + item_name + "_2": [v2 for v1, v2 in item_pairs], + } + ) if not isinstance(item_pairs, _turicreate.SFrame): - raise TypeError("item_pairs must be 2-column SFrame of two item " - "columns, or a list of (item_1, item_2) tuples. ") + raise TypeError( + "item_pairs must be 2-column SFrame of two item " + "columns, or a list of (item_1, item_2) tuples. " + ) response = self.__proxy__.get_item_intersection_info(item_pairs) return response @@ -1757,10 +1925,12 @@ def export_coreml(self, filename): -------- >>> model.export_coreml('myModel.mlmodel') """ - print('This model is exported as a custom Core ML model. In order to use it in your\n' - 'application, you must also include "libRecommender.dylib". For additional\n' - 'details see:\n' - 'https://apple.github.io/turicreate/docs/userguide/recommender/coreml-deployment.html') + print( + "This model is exported as a custom Core ML model. In order to use it in your\n" + 'application, you must also include "libRecommender.dylib". For additional\n' + "details see:\n" + "https://apple.github.io/turicreate/docs/userguide/recommender/coreml-deployment.html" + ) additional_user_defined_metadata = _coreml_utils._get_tc_version_info() self.__proxy__.export_to_coreml(filename, additional_user_defined_metadata) diff --git a/src/python/turicreate/toolkits/regression/_regression.py b/src/python/turicreate/toolkits/regression/_regression.py index 50dd63aea5..cec0921d66 100644 --- a/src/python/turicreate/toolkits/regression/_regression.py +++ b/src/python/turicreate/toolkits/regression/_regression.py @@ -11,8 +11,8 @@ from turicreate.toolkits._internal_utils import _validate_data from turicreate._cython.cy_server import QuietProgress -def create(dataset, target, features=None, validation_set = 'auto', - verbose=True): + +def create(dataset, target, features=None, validation_set="auto", verbose=True): """ Automatically create a suitable regression model based on the provided training data. @@ -105,12 +105,12 @@ def create(dataset, target, features=None, validation_set = 'auto', """ - dataset, validation_set = _validate_data(dataset, target, features, - validation_set) + dataset, validation_set = _validate_data(dataset, target, features, validation_set) if validation_set is None: validation_set = _turicreate.SFrame() model_proxy = _turicreate.extensions.create_automatic_regression_model( - dataset, target, validation_set, {}) + dataset, target, validation_set, {} + ) return _sl.wrap_model_proxy(model_proxy) diff --git a/src/python/turicreate/toolkits/regression/boosted_trees_regression.py b/src/python/turicreate/toolkits/regression/boosted_trees_regression.py index cec2f311ee..693c6ff281 100644 --- a/src/python/turicreate/toolkits/regression/boosted_trees_regression.py +++ b/src/python/turicreate/toolkits/regression/boosted_trees_regression.py @@ -10,7 +10,9 @@ from __future__ import division as _ from __future__ import absolute_import as _ import turicreate as _turicreate -from turicreate.toolkits._supervised_learning import SupervisedLearningModel as _SupervisedLearningModel +from turicreate.toolkits._supervised_learning import ( + SupervisedLearningModel as _SupervisedLearningModel, +) import turicreate.toolkits._supervised_learning as _sl import turicreate.toolkits._main as _toolkits_main from turicreate.toolkits._internal_utils import _toolkit_repr_print @@ -22,12 +24,28 @@ from turicreate.util import _make_internal_url -_BOOSTED_TREES_MODEL_PARAMS_KEYS = ['step_size', 'max_depth', -'max_iterations', 'min_child_weight', 'min_loss_reduction', 'row_subsample'] -_BOOSTED_TREE_TRAINING_PARAMS_KEYS = ['objective', 'training_time', -'training_error', 'validation_error', 'evaluation_metric'] -_BOOSTED_TREE_TRAINING_DATA_PARAMS_KEYS = ['target', 'features', -'num_features', 'num_examples', 'num_validation_examples'] +_BOOSTED_TREES_MODEL_PARAMS_KEYS = [ + "step_size", + "max_depth", + "max_iterations", + "min_child_weight", + "min_loss_reduction", + "row_subsample", +] +_BOOSTED_TREE_TRAINING_PARAMS_KEYS = [ + "objective", + "training_time", + "training_error", + "validation_error", + "evaluation_metric", +] +_BOOSTED_TREE_TRAINING_DATA_PARAMS_KEYS = [ + "target", + "features", + "num_features", + "num_examples", + "num_validation_examples", +] class BoostedTreesRegression(_SupervisedLearningModel, _TreeModelMixin): @@ -54,6 +72,7 @@ class BoostedTreesRegression(_SupervisedLearningModel, _TreeModelMixin): -------- create """ + def __init__(self, proxy): """__init__(self)""" self.__proxy__ = proxy @@ -149,7 +168,7 @@ def _get(self, field): """ return super(BoostedTreesRegression, self)._get(field) - def evaluate(self, dataset, metric='auto', missing_value_action='auto'): + def evaluate(self, dataset, metric="auto", missing_value_action="auto"): """ Evaluate the model on the given dataset. @@ -194,11 +213,10 @@ def evaluate(self, dataset, metric='auto', missing_value_action='auto'): >>> results = model.evaluate(test_data, 'rmse') """ - _raise_error_evaluation_metric_is_valid( - metric, ['auto', 'rmse', 'max_error']) - return super(BoostedTreesRegression, self).evaluate(dataset, - missing_value_action=missing_value_action, - metric=metric) + _raise_error_evaluation_metric_is_valid(metric, ["auto", "rmse", "max_error"]) + return super(BoostedTreesRegression, self).evaluate( + dataset, missing_value_action=missing_value_action, metric=metric + ) def export_coreml(self, filename): """ @@ -214,16 +232,18 @@ def export_coreml(self, filename): >>> model.export_coreml("MyModel.mlmodel") """ from turicreate.toolkits import _coreml_utils + display_name = "boosted trees regression" short_description = _coreml_utils._mlmodel_short_description(display_name) - context = {"mode" : "regression", - "model_type" : "boosted_trees", - "class": self.__class__.__name__, - "short_description": short_description, - } + context = { + "mode": "regression", + "model_type": "boosted_trees", + "class": self.__class__.__name__, + "short_description": short_description, + } self._export_coreml_impl(filename, context) - def predict(self, dataset, missing_value_action='auto'): + def predict(self, dataset, missing_value_action="auto"): """ Predict the target column of the given dataset. @@ -263,21 +283,28 @@ def predict(self, dataset, missing_value_action='auto'): -------- >>> m.predict(testdata) """ - return super(BoostedTreesRegression, self).predict(dataset, output_type='margin', - missing_value_action=missing_value_action) - - - -def create(dataset, target, - features=None, max_iterations=10, - validation_set='auto', - max_depth=6, step_size=0.3, - min_loss_reduction=0.0, min_child_weight=0.1, - row_subsample=1.0, column_subsample=1.0, - verbose=True, - random_seed = None, - metric = 'auto', - **kwargs): + return super(BoostedTreesRegression, self).predict( + dataset, output_type="margin", missing_value_action=missing_value_action + ) + + +def create( + dataset, + target, + features=None, + max_iterations=10, + validation_set="auto", + max_depth=6, + step_size=0.3, + min_loss_reduction=0.0, + min_child_weight=0.1, + row_subsample=1.0, + column_subsample=1.0, + verbose=True, + random_seed=None, + metric="auto", + **kwargs +): """ Create a :class:`~turicreate.boosted_trees_regression.BoostedTreesRegression` to predict a scalar target variable using one or more features. In addition to standard @@ -420,25 +447,31 @@ def create(dataset, target, """ if random_seed is not None: - kwargs['random_seed'] = random_seed - if 'model_checkpoint_path' in kwargs: - kwargs['model_checkpoint_path'] = _make_internal_url(kwargs['model_checkpoint_path']) - if 'resume_from_checkpoint' in kwargs: - kwargs['resume_from_checkpoint'] = _make_internal_url(kwargs['resume_from_checkpoint']) - - model = _sl.create(dataset = dataset, - target = target, - features = features, - model_name = 'boosted_trees_regression', - max_iterations = max_iterations, - validation_set = validation_set, - max_depth = max_depth, - step_size = step_size, - min_loss_reduction = min_loss_reduction, - min_child_weight = min_child_weight, - row_subsample = row_subsample, - column_subsample = column_subsample, - verbose = verbose, - metric = metric, - **kwargs) + kwargs["random_seed"] = random_seed + if "model_checkpoint_path" in kwargs: + kwargs["model_checkpoint_path"] = _make_internal_url( + kwargs["model_checkpoint_path"] + ) + if "resume_from_checkpoint" in kwargs: + kwargs["resume_from_checkpoint"] = _make_internal_url( + kwargs["resume_from_checkpoint"] + ) + + model = _sl.create( + dataset=dataset, + target=target, + features=features, + model_name="boosted_trees_regression", + max_iterations=max_iterations, + validation_set=validation_set, + max_depth=max_depth, + step_size=step_size, + min_loss_reduction=min_loss_reduction, + min_child_weight=min_child_weight, + row_subsample=row_subsample, + column_subsample=column_subsample, + verbose=verbose, + metric=metric, + **kwargs + ) return BoostedTreesRegression(model.__proxy__) diff --git a/src/python/turicreate/toolkits/regression/decision_tree_regression.py b/src/python/turicreate/toolkits/regression/decision_tree_regression.py index 99b79a5698..58aa972f87 100644 --- a/src/python/turicreate/toolkits/regression/decision_tree_regression.py +++ b/src/python/turicreate/toolkits/regression/decision_tree_regression.py @@ -10,7 +10,9 @@ from __future__ import division as _ from __future__ import absolute_import as _ -from turicreate.toolkits._supervised_learning import SupervisedLearningModel as _SupervisedLearningModel +from turicreate.toolkits._supervised_learning import ( + SupervisedLearningModel as _SupervisedLearningModel, +) import turicreate.toolkits._supervised_learning as _sl from turicreate.toolkits._internal_utils import _toolkit_repr_print from turicreate.toolkits._internal_utils import _raise_error_evaluation_metric_is_valid @@ -20,13 +22,25 @@ from turicreate.toolkits._internal_utils import _map_unity_proxy_to_object - -_DECISION_TREE_MODEL_PARAMS_KEYS = ['max_depth', 'min_child_weight', -'min_loss_reduction'] -_DECISION_TREE_TRAINING_PARAMS_KEYS = ['objective', 'training_time', -'training_error', 'validation_error', 'evaluation_metric'] -_DECISION_TREE_TRAINING_DATA_PARAMS_KEYS = ['target', 'features', -'num_features', 'num_examples', 'num_validation_examples'] +_DECISION_TREE_MODEL_PARAMS_KEYS = [ + "max_depth", + "min_child_weight", + "min_loss_reduction", +] +_DECISION_TREE_TRAINING_PARAMS_KEYS = [ + "objective", + "training_time", + "training_error", + "validation_error", + "evaluation_metric", +] +_DECISION_TREE_TRAINING_DATA_PARAMS_KEYS = [ + "target", + "features", + "num_features", + "num_examples", + "num_validation_examples", +] class DecisionTreeRegression(_SupervisedLearningModel, _TreeModelMixin): @@ -53,6 +67,7 @@ class DecisionTreeRegression(_SupervisedLearningModel, _TreeModelMixin): -------- create """ + def __init__(self, proxy): """__init__(self)""" self.__proxy__ = proxy @@ -91,17 +106,19 @@ def _get_summary_struct(self): The order matches that of the 'sections' object. """ data_fields = [ - ('Number of examples', 'num_examples'), - ('Number of feature columns', 'num_features'), - ('Number of unpacked features', 'num_unpacked_features')] + ("Number of examples", "num_examples"), + ("Number of feature columns", "num_features"), + ("Number of unpacked features", "num_unpacked_features"), + ] training_fields = [ - ("Max tree depth", 'max_depth'), - ("Train RMSE", 'training_rmse'), - ("Validation RMSE", 'validation_rmse'), - ("Training time (sec)", 'training_time')] + ("Max tree depth", "max_depth"), + ("Train RMSE", "training_rmse"), + ("Validation RMSE", "validation_rmse"), + ("Training time (sec)", "training_time"), + ] - return ( [data_fields, training_fields], ['Schema', 'Settings']) + return ([data_fields, training_fields], ["Schema", "Settings"]) def __repr__(self): """ @@ -173,7 +190,7 @@ def _get(self, field): """ return super(DecisionTreeRegression, self)._get(field) - def evaluate(self, dataset, metric='auto', missing_value_action='auto'): + def evaluate(self, dataset, metric="auto", missing_value_action="auto"): """ Evaluate the model on the given dataset. @@ -218,11 +235,10 @@ def evaluate(self, dataset, metric='auto', missing_value_action='auto'): >>> results = model.evaluate(test_data, 'rmse') """ - _raise_error_evaluation_metric_is_valid( - metric, ['auto', 'rmse', 'max_error']) - return super(DecisionTreeRegression, self).evaluate(dataset, - missing_value_action=missing_value_action, - metric=metric) + _raise_error_evaluation_metric_is_valid(metric, ["auto", "rmse", "max_error"]) + return super(DecisionTreeRegression, self).evaluate( + dataset, missing_value_action=missing_value_action, metric=metric + ) def export_coreml(self, filename): """ @@ -238,16 +254,18 @@ def export_coreml(self, filename): >>> model.export_coreml("MyModel.mlmodel") """ from turicreate.toolkits import _coreml_utils + display_name = "decision tree regression" short_description = _coreml_utils._mlmodel_short_description(display_name) - context = {"mode" : "regression", - "model_type" : "decision_tree", - "class": self.__class__.__name__, - "short_description": short_description, - } + context = { + "mode": "regression", + "model_type": "decision_tree", + "class": self.__class__.__name__, + "short_description": short_description, + } self._export_coreml_impl(filename, context) - def predict(self, dataset, missing_value_action='auto'): + def predict(self, dataset, missing_value_action="auto"): """ Predict the target column of the given dataset. @@ -287,19 +305,24 @@ def predict(self, dataset, missing_value_action='auto'): -------- >>> m.predict(testdata) """ - return super(DecisionTreeRegression, self).predict(dataset, output_type='margin', - missing_value_action=missing_value_action) - - -def create(dataset, target, - features=None, - validation_set='auto', - max_depth=6, - min_loss_reduction=0.0, min_child_weight=0.1, - verbose=True, - random_seed = None, - metric = 'auto', - **kwargs): + return super(DecisionTreeRegression, self).predict( + dataset, output_type="margin", missing_value_action=missing_value_action + ) + + +def create( + dataset, + target, + features=None, + validation_set="auto", + max_depth=6, + min_loss_reduction=0.0, + min_child_weight=0.1, + verbose=True, + random_seed=None, + metric="auto", + **kwargs +): """ Create a :class:`~turicreate.decision_tree_regression.DecisionTreeRegression` to predict a scalar target variable using one or more features. In addition to standard @@ -396,15 +419,18 @@ def create(dataset, target, """ if random_seed is not None: - kwargs['random_seed'] = random_seed - - model = _sl.create(dataset = dataset, - target = target, - features = features, - model_name = 'decision_tree_regression', - validation_set = validation_set, - max_depth = max_depth, - min_loss_reduction = min_loss_reduction, - min_child_weight = min_child_weight, - verbose = verbose, **kwargs) + kwargs["random_seed"] = random_seed + + model = _sl.create( + dataset=dataset, + target=target, + features=features, + model_name="decision_tree_regression", + validation_set=validation_set, + max_depth=max_depth, + min_loss_reduction=min_loss_reduction, + min_child_weight=min_child_weight, + verbose=verbose, + **kwargs + ) return DecisionTreeRegression(model.__proxy__) diff --git a/src/python/turicreate/toolkits/regression/linear_regression.py b/src/python/turicreate/toolkits/regression/linear_regression.py index b99579bbe4..d8a06bd0eb 100644 --- a/src/python/turicreate/toolkits/regression/linear_regression.py +++ b/src/python/turicreate/toolkits/regression/linear_regression.py @@ -11,27 +11,39 @@ from __future__ import absolute_import as _ import turicreate.toolkits._supervised_learning as _sl -from turicreate.toolkits._supervised_learning import SupervisedLearningModel as \ - _SupervisedLearningModel -from turicreate.toolkits._internal_utils import _toolkit_repr_print, \ - _toolkit_get_topk_bottomk, \ - _summarize_coefficients, \ - _raise_error_evaluation_metric_is_valid +from turicreate.toolkits._supervised_learning import ( + SupervisedLearningModel as _SupervisedLearningModel, +) +from turicreate.toolkits._internal_utils import ( + _toolkit_repr_print, + _toolkit_get_topk_bottomk, + _summarize_coefficients, + _raise_error_evaluation_metric_is_valid, +) _DEFAULT_SOLVER_OPTIONS = { -'convergence_threshold': 1e-2, -'step_size': 1.0, -'lbfgs_memory_level': 11, -'max_iterations': 10} - -def create(dataset, target, features=None, l2_penalty=1e-2, l1_penalty=0.0, - solver='auto', feature_rescaling=True, - convergence_threshold = _DEFAULT_SOLVER_OPTIONS['convergence_threshold'], - step_size = _DEFAULT_SOLVER_OPTIONS['step_size'], - lbfgs_memory_level = _DEFAULT_SOLVER_OPTIONS['lbfgs_memory_level'], - max_iterations = _DEFAULT_SOLVER_OPTIONS['max_iterations'], - validation_set = "auto", - verbose=True): + "convergence_threshold": 1e-2, + "step_size": 1.0, + "lbfgs_memory_level": 11, + "max_iterations": 10, +} + + +def create( + dataset, + target, + features=None, + l2_penalty=1e-2, + l1_penalty=0.0, + solver="auto", + feature_rescaling=True, + convergence_threshold=_DEFAULT_SOLVER_OPTIONS["convergence_threshold"], + step_size=_DEFAULT_SOLVER_OPTIONS["step_size"], + lbfgs_memory_level=_DEFAULT_SOLVER_OPTIONS["lbfgs_memory_level"], + max_iterations=_DEFAULT_SOLVER_OPTIONS["max_iterations"], + validation_set="auto", + verbose=True, +): """ Create a :class:`~turicreate.linear_regression.LinearRegression` to @@ -271,15 +283,22 @@ def create(dataset, target, features=None, l2_penalty=1e-2, l1_penalty=0.0, model_name = "regression_linear_regression" solver = solver.lower() - model = _sl.create(dataset, target, model_name, features=features, - validation_set = validation_set, - solver = solver, verbose = verbose, - l2_penalty=l2_penalty, l1_penalty = l1_penalty, - feature_rescaling = feature_rescaling, - convergence_threshold = convergence_threshold, - step_size = step_size, - lbfgs_memory_level = lbfgs_memory_level, - max_iterations = max_iterations) + model = _sl.create( + dataset, + target, + model_name, + features=features, + validation_set=validation_set, + solver=solver, + verbose=verbose, + l2_penalty=l2_penalty, + l1_penalty=l1_penalty, + feature_rescaling=feature_rescaling, + convergence_threshold=convergence_threshold, + step_size=step_size, + lbfgs_memory_level=lbfgs_memory_level, + max_iterations=max_iterations, + ) return LinearRegression(model.__proxy__) @@ -339,6 +358,7 @@ class LinearRegression(_SupervisedLearningModel): create """ + def __init__(self, model_proxy): self.__proxy__ = model_proxy self.__name__ = self.__class__._native_name() @@ -378,35 +398,36 @@ def _get_summary_struct(self): The order matches that of the 'sections' object. """ model_fields = [ - ('Number of coefficients', 'num_coefficients'), - ('Number of examples', 'num_examples'), - ('Number of feature columns', 'num_features'), - ('Number of unpacked features', 'num_unpacked_features')] + ("Number of coefficients", "num_coefficients"), + ("Number of examples", "num_examples"), + ("Number of feature columns", "num_features"), + ("Number of unpacked features", "num_unpacked_features"), + ] - hyperparam_fields = [ - ("L1 penalty", 'l1_penalty'), - ("L2 penalty", 'l2_penalty')] + hyperparam_fields = [("L1 penalty", "l1_penalty"), ("L2 penalty", "l2_penalty")] solver_fields = [ - ("Solver", 'solver'), - ("Solver iterations", 'training_iterations'), - ("Solver status", 'training_solver_status'), - ("Training time (sec)", 'training_time')] + ("Solver", "solver"), + ("Solver iterations", "training_iterations"), + ("Solver status", "training_solver_status"), + ("Training time (sec)", "training_time"), + ] training_fields = [ - ("Residual sum of squares", 'training_loss'), - ("Training RMSE", 'training_rmse')] + ("Residual sum of squares", "training_loss"), + ("Training RMSE", "training_rmse"), + ] coefs = self.coefficients - top_coefs, bottom_coefs = _toolkit_get_topk_bottomk(coefs,k=5) + top_coefs, bottom_coefs = _toolkit_get_topk_bottomk(coefs, k=5) - (coefs_list, titles_list) = _summarize_coefficients(top_coefs, \ - bottom_coefs) + (coefs_list, titles_list) = _summarize_coefficients(top_coefs, bottom_coefs) - return ([model_fields, hyperparam_fields, - solver_fields, training_fields] + coefs_list, \ - [ 'Schema', 'Hyperparameters', \ - 'Training Summary', 'Settings' ] + titles_list ) + return ( + [model_fields, hyperparam_fields, solver_fields, training_fields] + + coefs_list, + ["Schema", "Hyperparameters", "Training Summary", "Settings"] + titles_list, + ) def __repr__(self): """ @@ -438,12 +459,14 @@ def export_coreml(self, filename): """ from turicreate.extensions import _linear_regression_export_as_model_asset from turicreate.toolkits import _coreml_utils + display_name = "linear regression" short_description = _coreml_utils._mlmodel_short_description(display_name) - context = {"class": self.__class__.__name__, - "short_description": short_description, - } - context['user_defined'] = _coreml_utils._get_tc_version_info() + context = { + "class": self.__class__.__name__, + "short_description": short_description, + } + context["user_defined"] = _coreml_utils._get_tc_version_info() _linear_regression_export_as_model_asset(self.__proxy__, filename, context) def _get(self, field): @@ -512,7 +535,7 @@ def _get(self, field): """ return super(LinearRegression, self)._get(field) - def predict(self, dataset, missing_value_action='auto'): + def predict(self, dataset, missing_value_action="auto"): """ Return target value predictions for ``dataset``, using the trained linear regression model. This method can be used to get fitted values @@ -557,10 +580,11 @@ def predict(self, dataset, missing_value_action='auto'): >>> results = model.predict(data) """ - return super(LinearRegression, self).predict(dataset, missing_value_action=missing_value_action) - + return super(LinearRegression, self).predict( + dataset, missing_value_action=missing_value_action + ) - def evaluate(self, dataset, metric='auto', missing_value_action='auto'): + def evaluate(self, dataset, metric="auto", missing_value_action="auto"): r"""Evaluate the model by making target value predictions and comparing to actual values. @@ -625,7 +649,7 @@ def evaluate(self, dataset, metric='auto', missing_value_action='auto'): >>> results = model.evaluate(data) """ - _raise_error_evaluation_metric_is_valid(metric, - ['auto', 'rmse', 'max_error']) - return super(LinearRegression, self).evaluate(dataset, missing_value_action=missing_value_action, - metric=metric) + _raise_error_evaluation_metric_is_valid(metric, ["auto", "rmse", "max_error"]) + return super(LinearRegression, self).evaluate( + dataset, missing_value_action=missing_value_action, metric=metric + ) diff --git a/src/python/turicreate/toolkits/regression/random_forest_regression.py b/src/python/turicreate/toolkits/regression/random_forest_regression.py index 7dd64bf3f6..d62e5b0801 100644 --- a/src/python/turicreate/toolkits/regression/random_forest_regression.py +++ b/src/python/turicreate/toolkits/regression/random_forest_regression.py @@ -10,7 +10,9 @@ from __future__ import division as _ from __future__ import absolute_import as _ import turicreate as _turicreate -from turicreate.toolkits._supervised_learning import SupervisedLearningModel as _SupervisedLearningModel +from turicreate.toolkits._supervised_learning import ( + SupervisedLearningModel as _SupervisedLearningModel, +) import turicreate.toolkits._supervised_learning as _sl from turicreate.toolkits._internal_utils import _toolkit_repr_print from turicreate.toolkits._internal_utils import _raise_error_evaluation_metric_is_valid @@ -20,12 +22,27 @@ import logging as _logging from turicreate.util import _make_internal_url -_RANDOM_FOREST_MODEL_PARAMS_KEYS = ['max_depth', - 'min_child_weight', 'min_loss_reduction', 'row_subsample'] -_RANDOM_FOREST_TRAINING_PARAMS_KEYS = ['objective', 'training_time', -'training_error', 'validation_error', 'evaluation_metric'] -_RANDOM_FOREST_TRAINING_DATA_PARAMS_KEYS = ['target', 'features', -'num_features', 'num_examples', 'num_validation_examples'] +_RANDOM_FOREST_MODEL_PARAMS_KEYS = [ + "max_depth", + "min_child_weight", + "min_loss_reduction", + "row_subsample", +] +_RANDOM_FOREST_TRAINING_PARAMS_KEYS = [ + "objective", + "training_time", + "training_error", + "validation_error", + "evaluation_metric", +] +_RANDOM_FOREST_TRAINING_DATA_PARAMS_KEYS = [ + "target", + "features", + "num_features", + "num_examples", + "num_validation_examples", +] + class RandomForestRegression(_SupervisedLearningModel, _TreeModelMixin): """ @@ -51,6 +68,7 @@ class RandomForestRegression(_SupervisedLearningModel, _TreeModelMixin): -------- create """ + def __init__(self, proxy): """__init__(self)""" self.__proxy__ = proxy @@ -60,7 +78,6 @@ def __init__(self, proxy): def _native_name(cls): return "random_forest_regression" - def __str__(self): """ Return a string description of the model to the ``print`` method. @@ -90,19 +107,21 @@ def _get_summary_struct(self): The order matches that of the 'sections' object. """ data_fields = [ - ('Number of examples', 'num_examples'), - ('Number of feature columns', 'num_features'), - ('Number of unpacked features', 'num_unpacked_features')] + ("Number of examples", "num_examples"), + ("Number of feature columns", "num_features"), + ("Number of unpacked features", "num_unpacked_features"), + ] training_fields = [ - ("Maximum number of iterations", 'max_iterations'), - ("Number of trees", 'num_trees'), - ("Max tree depth", 'max_depth'), - ("Train RMSE", 'training_rmse'), - ("Validation RMSE", 'validation_rmse'), - ("Training time (sec)", 'training_time')] + ("Maximum number of iterations", "max_iterations"), + ("Number of trees", "num_trees"), + ("Max tree depth", "max_depth"), + ("Train RMSE", "training_rmse"), + ("Validation RMSE", "validation_rmse"), + ("Training time (sec)", "training_time"), + ] - return ( [data_fields, training_fields], ['Schema', 'Settings']) + return ([data_fields, training_fields], ["Schema", "Settings"]) def __repr__(self): """ @@ -176,7 +195,7 @@ def _get(self, field): """ return super(RandomForestRegression, self)._get(field) - def evaluate(self, dataset, metric='auto', missing_value_action='auto'): + def evaluate(self, dataset, metric="auto", missing_value_action="auto"): """ Evaluate the model on the given dataset. @@ -221,13 +240,12 @@ def evaluate(self, dataset, metric='auto', missing_value_action='auto'): >>> results = model.evaluate(test_data, 'rmse') """ - _raise_error_evaluation_metric_is_valid( - metric, ['auto', 'rmse', 'max_error']) - return super(RandomForestRegression, self).evaluate(dataset, - missing_value_action=missing_value_action, - metric=metric) + _raise_error_evaluation_metric_is_valid(metric, ["auto", "rmse", "max_error"]) + return super(RandomForestRegression, self).evaluate( + dataset, missing_value_action=missing_value_action, metric=metric + ) - def predict(self, dataset, missing_value_action='auto'): + def predict(self, dataset, missing_value_action="auto"): """ Predict the target column of the given dataset. @@ -267,10 +285,9 @@ def predict(self, dataset, missing_value_action='auto'): -------- >>> m.predict(testdata) """ - return super(RandomForestRegression, self).predict(dataset, - output_type='margin', - missing_value_action=missing_value_action) - + return super(RandomForestRegression, self).predict( + dataset, output_type="margin", missing_value_action=missing_value_action + ) def export_coreml(self, filename): """ @@ -286,22 +303,29 @@ def export_coreml(self, filename): >>> model.export_coreml("MyModel.mlmodel") """ from turicreate.toolkits import _coreml_utils + display_name = "random forest regression" short_description = _coreml_utils._mlmodel_short_description(display_name) - context = {"mode" : "regression", - "model_type" : "random_forest", - "class": self.__class__.__name__, - "short_description": short_description} + context = { + "mode": "regression", + "model_type": "random_forest", + "class": self.__class__.__name__, + "short_description": short_description, + } self._export_coreml_impl(filename, context) -def create(dataset, target, - features=None, - max_iterations=10, - validation_set='auto', - verbose=True, - random_seed = None, - metric = 'auto', - **kwargs): + +def create( + dataset, + target, + features=None, + max_iterations=10, + validation_set="auto", + verbose=True, + random_seed=None, + metric="auto", + **kwargs +): """ Create a :class:`~turicreate.random_forest_regression.RandomForestRegression` to predict a scalar target variable using one or more features. In addition to standard @@ -438,26 +462,34 @@ def create(dataset, target, """ if random_seed is not None: - kwargs['random_seed'] = random_seed - if 'model_checkpoint_path' in kwargs: - kwargs['model_checkpoint_path'] = _make_internal_url(kwargs['model_checkpoint_path']) - if 'resume_from_checkpoint' in kwargs: - kwargs['resume_from_checkpoint'] = _make_internal_url(kwargs['resume_from_checkpoint']) - if 'num_trees' in kwargs: + kwargs["random_seed"] = random_seed + if "model_checkpoint_path" in kwargs: + kwargs["model_checkpoint_path"] = _make_internal_url( + kwargs["model_checkpoint_path"] + ) + if "resume_from_checkpoint" in kwargs: + kwargs["resume_from_checkpoint"] = _make_internal_url( + kwargs["resume_from_checkpoint"] + ) + if "num_trees" in kwargs: logger = _logging.getLogger(__name__) - logger.warning("The `num_trees` keyword argument is deprecated. Please " - "use the `max_iterations` argument instead. Any value provided " - "for `num_trees` will be used in place of `max_iterations`.") - max_iterations = kwargs['num_trees'] - del kwargs['num_trees'] - - model = _sl.create(dataset = dataset, - target = target, - features = features, - model_name = 'random_forest_regression', - max_iterations = max_iterations, - validation_set = validation_set, - verbose = verbose, - metric = metric, - **kwargs) + logger.warning( + "The `num_trees` keyword argument is deprecated. Please " + "use the `max_iterations` argument instead. Any value provided " + "for `num_trees` will be used in place of `max_iterations`." + ) + max_iterations = kwargs["num_trees"] + del kwargs["num_trees"] + + model = _sl.create( + dataset=dataset, + target=target, + features=features, + model_name="random_forest_regression", + max_iterations=max_iterations, + validation_set=validation_set, + verbose=verbose, + metric=metric, + **kwargs + ) return RandomForestRegression(model.__proxy__) diff --git a/src/python/turicreate/toolkits/sound_classifier/_audio_feature_extractor.py b/src/python/turicreate/toolkits/sound_classifier/_audio_feature_extractor.py index 8e48b4d976..262a0ed541 100644 --- a/src/python/turicreate/toolkits/sound_classifier/_audio_feature_extractor.py +++ b/src/python/turicreate/toolkits/sound_classifier/_audio_feature_extractor.py @@ -3,8 +3,10 @@ from coremltools.models import MLModel import numpy as _np from tensorflow import keras as _keras + # Suppresses verbosity to only errors import turicreate.toolkits._tf_utils as _utils + _utils.suppress_tensorflow_warnings() import turicreate as _tc @@ -15,30 +17,33 @@ # We need to disable this here to match behavior in the rest of TuriCreate from tensorflow.compat.v1 import disable_v2_behavior + disable_v2_behavior() VGGish_instance = None + + def _get_feature_extractor(model_name): global VGGish_instance - assert model_name == 'VGGish' + assert model_name == "VGGish" if VGGish_instance is None: VGGish_instance = VGGishFeatureExtractor() return VGGish_instance class VGGishFeatureExtractor(object): - name = 'VGGish' + name = "VGGish" output_length = 12288 input_sample_rate = SAMPLE_RATE @staticmethod def _preprocess_data(audio_data, verbose=True): - ''' + """ Preprocess each example, breaking it up into frames. Returns two numpy arrays: preprocessed frame and their indexes - ''' + """ from .vggish_input import waveform_to_examples last_progress_update = _time.time() @@ -48,8 +53,8 @@ def _preprocess_data(audio_data, verbose=True): # https://github.com/apple/turicreate/issues/1216 preprocessed_data, audio_data_index = [], [] for i, audio_dict in enumerate(audio_data): - scaled_data = audio_dict['data'] / 32768.0 - data = waveform_to_examples(scaled_data, audio_dict['sample_rate']) + scaled_data = audio_dict["data"] / 32768.0 + data = waveform_to_examples(scaled_data, audio_dict["sample_rate"]) for j in data: preprocessed_data.append([j]) @@ -64,7 +69,11 @@ def _preprocess_data(audio_data, verbose=True): last_progress_update = _time.time() if progress_header_printed: - print("Preprocessed {} of {} examples\n".format(len(audio_data), len(audio_data))) + print( + "Preprocessed {} of {} examples\n".format( + len(audio_data), len(audio_data) + ) + ) return _np.asarray(preprocessed_data), audio_data_index def __init__(self): @@ -74,14 +83,15 @@ def __init__(self): if self.mac_ver < (10, 14): # Use TensorFlow/Keras import turicreate.toolkits._tf_utils as _utils + self.gpu_policy = _utils.TensorFlowGPUPolicy() self.gpu_policy.start() - model_path = vggish_model_file.get_model_path(format='tensorflow') + model_path = vggish_model_file.get_model_path(format="tensorflow") self.vggish_model = _keras.models.load_model(model_path) else: # Use Core ML - model_path = vggish_model_file.get_model_path(format='coreml') + model_path = vggish_model_file.get_model_path(format="coreml") self.vggish_model = MLModel(model_path) def __del__(self): @@ -121,16 +131,20 @@ def _extract_features(self, preprocessed_data, verbose=True): print("Extracted {} of {}".format(i, len(preprocessed_data))) last_progress_update = _time.time() if progress_header_printed: - print("Extracted {} of {}\n".format(len(preprocessed_data), len(preprocessed_data))) + print( + "Extracted {} of {}\n".format( + len(preprocessed_data), len(preprocessed_data) + ) + ) else: # Use Core ML for i, cur_example in enumerate(preprocessed_data): for cur_frame in cur_example: - x = {'input1': _np.asarray([cur_frame])} + x = {"input1": _np.asarray([cur_frame])} y = self.vggish_model.predict(x) - deep_features.append(y['output1']) + deep_features.append(y["output1"]) # If `verbose` is set, print an progress update about every 20s if verbose and _time.time() - last_progress_update >= 20: @@ -140,29 +154,37 @@ def _extract_features(self, preprocessed_data, verbose=True): print("Extracted {} of {}".format(i, len(preprocessed_data))) last_progress_update = _time.time() if progress_header_printed: - print("Extracted {} of {}\n".format(len(preprocessed_data), len(preprocessed_data))) + print( + "Extracted {} of {}\n".format( + len(preprocessed_data), len(preprocessed_data) + ) + ) return deep_features.close() def get_deep_features(self, audio_data, verbose): - ''' + """ Performs both audio preprocessing and VGGish deep feature extraction. - ''' + """ preprocessed_data, row_ids = self._preprocess_data(audio_data, verbose) deep_features = self._extract_features(preprocessed_data, verbose) - output = _tc.SFrame({'deep features': deep_features, 'row id': row_ids}) - output = output.unstack('deep features') + output = _tc.SFrame({"deep features": deep_features, "row id": row_ids}) + output = output.unstack("deep features") max_row_id = len(audio_data) - missing_ids = set(range(max_row_id)) - set(output['row id'].unique()) + missing_ids = set(range(max_row_id)) - set(output["row id"].unique()) if len(missing_ids) != 0: - empty_rows = _tc.SFrame({'List of deep features': [ [] for _ in range(len(missing_ids)) ], - 'row id': missing_ids}) + empty_rows = _tc.SFrame( + { + "List of deep features": [[] for _ in range(len(missing_ids))], + "row id": missing_ids, + } + ) output = output.append(empty_rows) - output = output.sort('row id') - return output['List of deep features'] + output = output.sort("row id") + return output["List of deep features"] def get_spec(self): """ @@ -172,5 +194,5 @@ def get_spec(self): return self.vggish_model.get_spec() else: vggish_model_file = VGGish() - coreml_model_path = vggish_model_file.get_model_path(format='coreml') + coreml_model_path = vggish_model_file.get_model_path(format="coreml") return MLModel(coreml_model_path).get_spec() diff --git a/src/python/turicreate/toolkits/sound_classifier/_tf_sound_classifier.py b/src/python/turicreate/toolkits/sound_classifier/_tf_sound_classifier.py index df2d0cb20c..28d8f8dca5 100644 --- a/src/python/turicreate/toolkits/sound_classifier/_tf_sound_classifier.py +++ b/src/python/turicreate/toolkits/sound_classifier/_tf_sound_classifier.py @@ -8,13 +8,15 @@ from __future__ import absolute_import as _ from .._tf_model import TensorFlowModel import turicreate.toolkits._tf_utils as _utils + # Suppresses verbosity to only errors _utils.suppress_tensorflow_warnings() import tensorflow.compat.v1 as _tf + _tf.disable_v2_behavior() -class SoundClassifierTensorFlowModel(TensorFlowModel): +class SoundClassifierTensorFlowModel(TensorFlowModel): def __init__(self, num_inputs, num_classes, custom_layer_sizes): """ Defines the TensorFlow model, loss, optimisation and accuracy. @@ -47,32 +49,42 @@ def init_sound_classifier_graph(self, num_inputs, custom_layer_sizes): # Create variables for customized layers for i, cur_layer_size in enumerate(custom_layer_sizes): - weight_name = 'sound_dense{}_weight'.format(i) - bias_name = 'sound_dense{}_bias'.format(i) - self.names_of_layers.append('dense{}'.format(i)) + weight_name = "sound_dense{}_weight".format(i) + bias_name = "sound_dense{}_bias".format(i) + self.names_of_layers.append("dense{}".format(i)) out_units = cur_layer_size - if i==0: + if i == 0: in_units = num_inputs - weights[weight_name] = _tf.Variable(initializer([in_units, out_units]), name=weight_name) + weights[weight_name] = _tf.Variable( + initializer([in_units, out_units]), name=weight_name + ) biases[bias_name] = _tf.Variable(initializer([out_units]), name=bias_name) in_units = out_units - i+=1 - weight_name = 'sound_dense{}_weight'.format(i) - bias_name = 'sound_dense{}_bias'.format(i) - self.names_of_layers.append('dense{}'.format(i)) - weights[weight_name] = _tf.Variable(initializer([in_units, self.num_classes]), name=weight_name) - biases[bias_name] = _tf.Variable(initializer([self.num_classes]), name=bias_name) + i += 1 + weight_name = "sound_dense{}_weight".format(i) + bias_name = "sound_dense{}_bias".format(i) + self.names_of_layers.append("dense{}".format(i)) + weights[weight_name] = _tf.Variable( + initializer([in_units, self.num_classes]), name=weight_name + ) + biases[bias_name] = _tf.Variable( + initializer([self.num_classes]), name=bias_name + ) # Add customized layers for i in range(len(weights.keys())): - weight_name = 'sound_dense{}_weight'.format(i) - bias_name = 'sound_dense{}_bias'.format(i) - if i==0: - curr_dense = _tf.nn.xw_plus_b(self.x, weights=weights[weight_name], biases=biases[bias_name]) + weight_name = "sound_dense{}_weight".format(i) + bias_name = "sound_dense{}_bias".format(i) + if i == 0: + curr_dense = _tf.nn.xw_plus_b( + self.x, weights=weights[weight_name], biases=biases[bias_name] + ) else: - curr_dense = _tf.nn.xw_plus_b(curr_dense, weights=weights[weight_name], biases=biases[bias_name]) - if i==(len(weights.keys())-1): + curr_dense = _tf.nn.xw_plus_b( + curr_dense, weights=weights[weight_name], biases=biases[bias_name] + ) + if i == (len(weights.keys()) - 1): out = _tf.nn.softmax(curr_dense) else: curr_dense = _tf.nn.relu(curr_dense) @@ -80,47 +92,59 @@ def init_sound_classifier_graph(self, num_inputs, custom_layer_sizes): self.predictions = out # Loss - self.cost = _tf.reduce_mean(_tf.nn.softmax_cross_entropy_with_logits_v2(logits=curr_dense, - labels=self.y)) + self.cost = _tf.reduce_mean( + _tf.nn.softmax_cross_entropy_with_logits_v2( + logits=curr_dense, labels=self.y + ) + ) # Optimizer - self.optimizer = _tf.train.MomentumOptimizer(learning_rate=0.01, momentum=0.9, - use_nesterov=True).minimize(self.cost) + self.optimizer = _tf.train.MomentumOptimizer( + learning_rate=0.01, momentum=0.9, use_nesterov=True + ).minimize(self.cost) # Predictions - correct_prediction = _tf.equal(_tf.argmax(self.predictions, 1), _tf.argmax(self.y, 1)) + correct_prediction = _tf.equal( + _tf.argmax(self.predictions, 1), _tf.argmax(self.y, 1) + ) self.accuracy = _tf.reduce_mean(_tf.cast(correct_prediction, "float")) # Set variables to their initial values self.sess.run(_tf.global_variables_initializer()) - def train(self, data, label): data_shape = data.shape[0] - _, final_train_loss, final_train_accuracy = self.sess.run([self.optimizer, self.cost, self.accuracy], - feed_dict={ - self.x: data.reshape((data_shape, 12288)), - self.y: _tf.keras.utils.to_categorical(label, self.num_classes).reshape((data_shape, self.num_classes)) - }) - result = {'accuracy' : final_train_accuracy, 'loss' : final_train_loss} + _, final_train_loss, final_train_accuracy = self.sess.run( + [self.optimizer, self.cost, self.accuracy], + feed_dict={ + self.x: data.reshape((data_shape, 12288)), + self.y: _tf.keras.utils.to_categorical(label, self.num_classes).reshape( + (data_shape, self.num_classes) + ), + }, + ) + result = {"accuracy": final_train_accuracy, "loss": final_train_loss} return result def evaluate(self, data, label): data_shape = data.shape[0] - pred_probs, final_accuracy = self.sess.run([self.predictions, self.accuracy], - feed_dict={ - self.x: data.reshape((data_shape, 12288)), - self.y: _tf.keras.utils.to_categorical(label, self.num_classes).reshape((data_shape, self.num_classes)) - }) - result = {'accuracy' : final_accuracy, 'predictions' : pred_probs} + pred_probs, final_accuracy = self.sess.run( + [self.predictions, self.accuracy], + feed_dict={ + self.x: data.reshape((data_shape, 12288)), + self.y: _tf.keras.utils.to_categorical(label, self.num_classes).reshape( + (data_shape, self.num_classes) + ), + }, + ) + result = {"accuracy": final_accuracy, "predictions": pred_probs} return result def predict(self, data): data_shape = data.shape[0] - pred_probs = self.sess.run(self.predictions, - feed_dict={ - self.x: data.reshape((data_shape, 12288)) - }) + pred_probs = self.sess.run( + self.predictions, feed_dict={self.x: data.reshape((data_shape, 12288))} + ) return pred_probs def export_weights(self): @@ -148,15 +172,15 @@ def export_weights(self): layers = [] for i, name in enumerate(self.names_of_layers): - weight_name = 'sound_{}_weight:0'.format(name) - bias_name = 'sound_{}_bias:0'.format(name) - layer={} - layer['weight'] = layer_dict[weight_name].transpose(1, 0) - layer['bias'] = layer_dict[bias_name] - if i==(len(self.names_of_layers)-1): - layer['act'] = None + weight_name = "sound_{}_weight:0".format(name) + bias_name = "sound_{}_bias:0".format(name) + layer = {} + layer["weight"] = layer_dict[weight_name].transpose(1, 0) + layer["bias"] = layer_dict[bias_name] + if i == (len(self.names_of_layers) - 1): + layer["act"] = None else: - layer['act'] = 'relu' + layer["act"] = "relu" layers.append(layer) return layers @@ -180,13 +204,13 @@ def get_weights(self): shapes = {} for var, val in zip(layer_names, layer_weights): layer_name = var.name[:-2] - if 'bias' in layer_name: + if "bias" in layer_name: data[layer_name] = val else: data[layer_name] = val.transpose(1, 0) shapes[layer_name] = val.shape[::-1] - return {'data': data, 'shapes': shapes} + return {"data": data, "shapes": shapes} def load_weights(self, net_params): """ @@ -195,14 +219,24 @@ def load_weights(self, net_params): need to be transposed to match TF format. """ - layers = net_params['data'].keys() + layers = net_params["data"].keys() for layer_name in layers: new_layer_name = layer_name.replace("custom", "sound") - if 'bias' in layer_name: - self.sess.run(_tf.assign(self.sc_graph.get_tensor_by_name(new_layer_name+":0"), - net_params['data'][layer_name])) + if "bias" in layer_name: + self.sess.run( + _tf.assign( + self.sc_graph.get_tensor_by_name(new_layer_name + ":0"), + net_params["data"][layer_name], + ) + ) else: - curr_shape = [int(x) for x in net_params['shapes'][layer_name]] - self.sess.run(_tf.assign(self.sc_graph.get_tensor_by_name(new_layer_name+":0"), - net_params['data'][layer_name].reshape(curr_shape).transpose(1,0))) + curr_shape = [int(x) for x in net_params["shapes"][layer_name]] + self.sess.run( + _tf.assign( + self.sc_graph.get_tensor_by_name(new_layer_name + ":0"), + net_params["data"][layer_name] + .reshape(curr_shape) + .transpose(1, 0), + ) + ) diff --git a/src/python/turicreate/toolkits/sound_classifier/mel_features.py b/src/python/turicreate/toolkits/sound_classifier/mel_features.py index ac58fb5427..9ebef889e0 100644 --- a/src/python/turicreate/toolkits/sound_classifier/mel_features.py +++ b/src/python/turicreate/toolkits/sound_classifier/mel_features.py @@ -19,7 +19,7 @@ def frame(data, window_length, hop_length): - """Convert array into a sequence of successive possibly overlapping frames. + """Convert array into a sequence of successive possibly overlapping frames. An n-dimensional array of shape (num_samples, ...) is converted into an (n+1)-D array of shape (num_frames, window_length, ...), where each frame @@ -38,15 +38,15 @@ def frame(data, window_length, hop_length): (N+1)-D np.array with as many rows as there are complete frames that can be extracted. """ - num_samples = data.shape[0] - num_frames = 1 + int(np.floor((num_samples - window_length) / hop_length)) - shape = (num_frames, window_length) + data.shape[1:] - strides = (data.strides[0] * hop_length,) + data.strides - return np.lib.stride_tricks.as_strided(data, shape=shape, strides=strides) + num_samples = data.shape[0] + num_frames = 1 + int(np.floor((num_samples - window_length) / hop_length)) + shape = (num_frames, window_length) + data.shape[1:] + strides = (data.strides[0] * hop_length,) + data.strides + return np.lib.stride_tricks.as_strided(data, shape=shape, strides=strides) def periodic_hann(window_length): - """Calculate a "periodic" Hann window. + """Calculate a "periodic" Hann window. The classic Hann window is defined as a raised cosine that starts and ends on zero, and where every value appears twice, except the middle @@ -64,14 +64,11 @@ def periodic_hann(window_length): Returns: A 1D np.array containing the periodic hann window. """ - return 0.5 - (0.5 * np.cos(2 * np.pi / window_length * - np.arange(window_length))) + return 0.5 - (0.5 * np.cos(2 * np.pi / window_length * np.arange(window_length))) -def stft_magnitude(signal, fft_length, - hop_length=None, - window_length=None): - """Calculate the short-time Fourier transform magnitude. +def stft_magnitude(signal, fft_length, hop_length=None, window_length=None): + """Calculate the short-time Fourier transform magnitude. Args: signal: 1D np.array of the input time-domain signal. @@ -83,13 +80,13 @@ def stft_magnitude(signal, fft_length, 2D np.array where each row contains the magnitudes of the fft_length/2+1 unique values of the FFT for the corresponding frame of input samples. """ - frames = frame(signal, window_length, hop_length) - # Apply frame window to each frame. We use a periodic Hann (cosine of period - # window_length) instead of the symmetric Hann of np.hanning (period - # window_length-1). - window = periodic_hann(window_length) - windowed_frames = frames * window - return np.abs(np.fft.rfft(windowed_frames, int(fft_length))) + frames = frame(signal, window_length, hop_length) + # Apply frame window to each frame. We use a periodic Hann (cosine of period + # window_length) instead of the symmetric Hann of np.hanning (period + # window_length-1). + window = periodic_hann(window_length) + windowed_frames = frames * window + return np.abs(np.fft.rfft(windowed_frames, int(fft_length))) # Mel spectrum constants and functions. @@ -98,7 +95,7 @@ def stft_magnitude(signal, fft_length, def hertz_to_mel(frequencies_hertz): - """Convert frequencies to mel scale using HTK formula. + """Convert frequencies to mel scale using HTK formula. Args: frequencies_hertz: Scalar or np.array of frequencies in hertz. @@ -107,16 +104,19 @@ def hertz_to_mel(frequencies_hertz): Object of same size as frequencies_hertz containing corresponding values on the mel scale. """ - return _MEL_HIGH_FREQUENCY_Q * np.log( - 1.0 + (frequencies_hertz / _MEL_BREAK_FREQUENCY_HERTZ)) + return _MEL_HIGH_FREQUENCY_Q * np.log( + 1.0 + (frequencies_hertz / _MEL_BREAK_FREQUENCY_HERTZ) + ) -def spectrogram_to_mel_matrix(num_mel_bins=20, - num_spectrogram_bins=129, - audio_sample_rate=8000, - lower_edge_hertz=125.0, - upper_edge_hertz=3800.0): - """Return a matrix that can post-multiply spectrogram rows to make mel. +def spectrogram_to_mel_matrix( + num_mel_bins=20, + num_spectrogram_bins=129, + audio_sample_rate=8000, + lower_edge_hertz=125.0, + upper_edge_hertz=3800.0, +): + """Return a matrix that can post-multiply spectrogram rows to make mel. Returns a np.array matrix A that can be used to post-multiply a matrix S of spectrogram values (STFT magnitudes) arranged as frames x bins to generate a @@ -152,50 +152,58 @@ def spectrogram_to_mel_matrix(num_mel_bins=20, Raises: ValueError: if frequency edges are incorrectly ordered or out of range. """ - nyquist_hertz = audio_sample_rate / 2. - if lower_edge_hertz < 0.0: - raise ValueError("lower_edge_hertz %.1f must be >= 0" % lower_edge_hertz) - if lower_edge_hertz >= upper_edge_hertz: - raise ValueError("lower_edge_hertz %.1f >= upper_edge_hertz %.1f" % - (lower_edge_hertz, upper_edge_hertz)) - if upper_edge_hertz > nyquist_hertz: - raise ValueError("upper_edge_hertz %.1f is greater than Nyquist %.1f" % - (upper_edge_hertz, nyquist_hertz)) - spectrogram_bins_hertz = np.linspace(0.0, nyquist_hertz, num_spectrogram_bins) - spectrogram_bins_mel = hertz_to_mel(spectrogram_bins_hertz) - # The i'th mel band (starting from i=1) has center frequency - # band_edges_mel[i], lower edge band_edges_mel[i-1], and higher edge - # band_edges_mel[i+1]. Thus, we need num_mel_bins + 2 values in - # the band_edges_mel arrays. - band_edges_mel = np.linspace(hertz_to_mel(lower_edge_hertz), - hertz_to_mel(upper_edge_hertz), num_mel_bins + 2) - # Matrix to post-multiply feature arrays whose rows are num_spectrogram_bins - # of spectrogram values. - mel_weights_matrix = np.empty((num_spectrogram_bins, num_mel_bins)) - for i in range(num_mel_bins): - lower_edge_mel, center_mel, upper_edge_mel = band_edges_mel[i:i + 3] - # Calculate lower and upper slopes for every spectrogram bin. - # Line segments are linear in the *mel* domain, not hertz. - lower_slope = ((spectrogram_bins_mel - lower_edge_mel) / - (center_mel - lower_edge_mel)) - upper_slope = ((upper_edge_mel - spectrogram_bins_mel) / - (upper_edge_mel - center_mel)) - # .. then intersect them with each other and zero. - mel_weights_matrix[:, i] = np.maximum(0.0, np.minimum(lower_slope, - upper_slope)) - # HTK excludes the spectrogram DC bin; make sure it always gets a zero - # coefficient. - mel_weights_matrix[0, :] = 0.0 - return mel_weights_matrix - - -def log_mel_spectrogram(data, - audio_sample_rate=8000, - log_offset=0.0, - window_length_secs=0.025, - hop_length_secs=0.010, - **kwargs): - """Convert waveform to a log magnitude mel-frequency spectrogram. + nyquist_hertz = audio_sample_rate / 2.0 + if lower_edge_hertz < 0.0: + raise ValueError("lower_edge_hertz %.1f must be >= 0" % lower_edge_hertz) + if lower_edge_hertz >= upper_edge_hertz: + raise ValueError( + "lower_edge_hertz %.1f >= upper_edge_hertz %.1f" + % (lower_edge_hertz, upper_edge_hertz) + ) + if upper_edge_hertz > nyquist_hertz: + raise ValueError( + "upper_edge_hertz %.1f is greater than Nyquist %.1f" + % (upper_edge_hertz, nyquist_hertz) + ) + spectrogram_bins_hertz = np.linspace(0.0, nyquist_hertz, num_spectrogram_bins) + spectrogram_bins_mel = hertz_to_mel(spectrogram_bins_hertz) + # The i'th mel band (starting from i=1) has center frequency + # band_edges_mel[i], lower edge band_edges_mel[i-1], and higher edge + # band_edges_mel[i+1]. Thus, we need num_mel_bins + 2 values in + # the band_edges_mel arrays. + band_edges_mel = np.linspace( + hertz_to_mel(lower_edge_hertz), hertz_to_mel(upper_edge_hertz), num_mel_bins + 2 + ) + # Matrix to post-multiply feature arrays whose rows are num_spectrogram_bins + # of spectrogram values. + mel_weights_matrix = np.empty((num_spectrogram_bins, num_mel_bins)) + for i in range(num_mel_bins): + lower_edge_mel, center_mel, upper_edge_mel = band_edges_mel[i : i + 3] + # Calculate lower and upper slopes for every spectrogram bin. + # Line segments are linear in the *mel* domain, not hertz. + lower_slope = (spectrogram_bins_mel - lower_edge_mel) / ( + center_mel - lower_edge_mel + ) + upper_slope = (upper_edge_mel - spectrogram_bins_mel) / ( + upper_edge_mel - center_mel + ) + # .. then intersect them with each other and zero. + mel_weights_matrix[:, i] = np.maximum(0.0, np.minimum(lower_slope, upper_slope)) + # HTK excludes the spectrogram DC bin; make sure it always gets a zero + # coefficient. + mel_weights_matrix[0, :] = 0.0 + return mel_weights_matrix + + +def log_mel_spectrogram( + data, + audio_sample_rate=8000, + log_offset=0.0, + window_length_secs=0.025, + hop_length_secs=0.010, + **kwargs +): + """Convert waveform to a log magnitude mel-frequency spectrogram. Args: data: 1D np.array of waveform data. @@ -209,15 +217,21 @@ def log_mel_spectrogram(data, 2D np.array of (num_frames, num_mel_bins) consisting of log mel filterbank magnitudes for successive frames. """ - window_length_samples = int(round(audio_sample_rate * window_length_secs)) - hop_length_samples = int(round(audio_sample_rate * hop_length_secs)) - fft_length = 2 ** int(np.ceil(np.log(window_length_samples) / np.log(2.0))) - spectrogram = stft_magnitude( - data, - fft_length=fft_length, - hop_length=hop_length_samples, - window_length=window_length_samples) - mel_spectrogram = np.dot(spectrogram, spectrogram_to_mel_matrix( - num_spectrogram_bins=spectrogram.shape[1], - audio_sample_rate=audio_sample_rate, **kwargs)) - return np.log(mel_spectrogram + log_offset) + window_length_samples = int(round(audio_sample_rate * window_length_secs)) + hop_length_samples = int(round(audio_sample_rate * hop_length_secs)) + fft_length = 2 ** int(np.ceil(np.log(window_length_samples) / np.log(2.0))) + spectrogram = stft_magnitude( + data, + fft_length=fft_length, + hop_length=hop_length_samples, + window_length=window_length_samples, + ) + mel_spectrogram = np.dot( + spectrogram, + spectrogram_to_mel_matrix( + num_spectrogram_bins=spectrogram.shape[1], + audio_sample_rate=audio_sample_rate, + **kwargs + ), + ) + return np.log(mel_spectrogram + log_offset) diff --git a/src/python/turicreate/toolkits/sound_classifier/sound_classifier.py b/src/python/turicreate/toolkits/sound_classifier/sound_classifier.py index 9afd0effcc..8f3dcb693f 100644 --- a/src/python/turicreate/toolkits/sound_classifier/sound_classifier.py +++ b/src/python/turicreate/toolkits/sound_classifier/sound_classifier.py @@ -22,33 +22,38 @@ from turicreate.toolkits._model import PythonProxy as _PythonProxy from turicreate.toolkits import _coreml_utils + class _DataIterator(object): - ''' + """ Defines a common interface around TensorFlow DataSet.from_tensor_slices - ''' + """ - ''' + """ Creates a new instance wrapping numpy data and labels. If `label` is provided, `label.shape[0]` must match `data.shape[0]`. - ''' + """ + def __init__(self, data, label=None, batch_size=1, shuffle=False): raise NotImplementedError - ''' + """ Returns an iterator that yields a sequence of tuples, comprising a batch of `data` values and a batch of `label` values (if provided). - ''' + """ + def __iter__(self): raise NotImplementedError - ''' + """ Ensures that the next iteration through the dataset starts from the beginning. - ''' + """ + def reset(self): raise NotImplementedError + class _TFDataIterator(_DataIterator): def __init__(self, data, label=None, batch_size=1, shuffle=False): import tensorflow as tf @@ -72,6 +77,7 @@ def reset(self): # pass through the data. pass + class _NumPyDataIterator(_DataIterator): def __init__(self, data, label=None, batch_size=1, shuffle=False): @@ -87,10 +93,18 @@ def __iter__(self): def __next__(self): if self.batch_idx < self.num_batches: - data = self.data_slices[0][self.batch_size*self.batch_idx:self.batch_size*(self.batch_idx+1)] + data = self.data_slices[0][ + self.batch_size + * self.batch_idx : self.batch_size + * (self.batch_idx + 1) + ] label = None - if len(self.data_slices)>1: - label = self.data_slices[1][self.batch_size*self.batch_idx:self.batch_size*(self.batch_idx+1)] + if len(self.data_slices) > 1: + label = self.data_slices[1][ + self.batch_size + * self.batch_idx : self.batch_size + * (self.batch_idx + 1) + ] self.batch_idx += 1 return (data, label) if label is not None else (data,) else: @@ -102,44 +116,50 @@ def next(self): def reset(self): self.batch_idx = 0 + def _create_data_iterator(data, label=None, batch_size=1, shuffle=False): - return _NumPyDataIterator(data, label=label, batch_size=batch_size, - shuffle=shuffle) + return _NumPyDataIterator(data, label=label, batch_size=batch_size, shuffle=shuffle) + class _Accuracy(object): - ''' + """ Defines a common interface around TensorFlow accuracy metrics. - ''' + """ def __init__(self): raise NotImplementedError - ''' + """ Tallies the results from a single batch of predictions. The predictions are expected to contain (possibly unnormalized) class probabilities. Replacing the last axis of the predictions with the argmax should yield a shape matching the ground truth. - ''' + """ + def update(self, ground_truth, predicted): raise NotImplementedError - ''' + """ Removes all tallied results so far. - ''' + """ + def reset(self): raise NotImplementedError - ''' + """ Computes the accuracy for all the results tallied so far. - ''' + """ + def get(self): raise NotImplementedError + class _TFAccuracy(_Accuracy): def __init__(self): import tensorflow as tf + self.impl = tf.keras.metrics.Accuracy() def update(self, ground_truth, predicted): @@ -152,8 +172,8 @@ def reset(self): def get(self): return self.impl.result() -class _NumPyAccuracy(_Accuracy): +class _NumPyAccuracy(_Accuracy): def __init__(self): self.cumulative_acc = 0.0 self.num_examples = 0.0 @@ -161,18 +181,22 @@ def __init__(self): def update(self, ground_truth, predicted): self.num_examples += len(predicted) predicted = _np.argmax(predicted, axis=-1) - self.cumulative_acc += sum([1 for x,y in zip(ground_truth, predicted) if x==y]) + self.cumulative_acc += sum( + [1 for x, y in zip(ground_truth, predicted) if x == y] + ) def reset(self): self.cumulative_acc = 0.0 self.num_examples = 0.0 def get(self): - return self.cumulative_acc/self.num_examples + return self.cumulative_acc / self.num_examples + def _get_accuracy_metric(): return _NumPyAccuracy() + def _is_deep_feature_sarray(sa): if not isinstance(sa, _tc.SArray): return False @@ -186,17 +210,19 @@ def _is_deep_feature_sarray(sa): return False return True + def _is_audio_data_sarray(sa): if not isinstance(sa, _tc.SArray): return False if sa.dtype != dict: return False - if set(sa[0].keys()) != {'sample_rate', 'data'}: + if set(sa[0].keys()) != {"sample_rate", "data"}: return False return True + def get_deep_features(audio_data, verbose=True): - ''' + """ Calculates the deep features used by the Sound Classifier. Internally the Sound Classifier calculates deep features for both model @@ -216,22 +242,29 @@ def get_deep_features(audio_data, verbose=True): >>> train, test = my_audio_data.random_split(.8) >>> model = tc.sound_classifier.create(train, 'label', 'deep_features') >>> predictions = model.predict(test) - ''' + """ from ._audio_feature_extractor import _get_feature_extractor if not _is_audio_data_sarray(audio_data): raise TypeError("Input must be audio data") - feature_extractor_name = 'VGGish' + feature_extractor_name = "VGGish" feature_extractor = _get_feature_extractor(feature_extractor_name) return feature_extractor.get_deep_features(audio_data, verbose=verbose) -def create(dataset, target, feature, max_iterations=10, - custom_layer_sizes=[100, 100], verbose=True, - validation_set='auto', batch_size=64): - ''' +def create( + dataset, + target, + feature, + max_iterations=10, + custom_layer_sizes=[100, 100], + verbose=True, + validation_set="auto", + batch_size=64, +): + """ Creates a :class:`SoundClassifier` model. Parameters @@ -279,7 +312,7 @@ def create(dataset, target, feature, max_iterations=10, batch_size : int, optional If you are getting memory errors, try decreasing this value. If you have a powerful computer, increasing this value may improve performance. - ''' + """ import time from ._audio_feature_extractor import _get_feature_extractor @@ -289,47 +322,64 @@ def create(dataset, target, feature, max_iterations=10, raise TypeError('"dataset" must be of type SFrame.') # check parameters if len(dataset) == 0: - raise _ToolkitError('Unable to train on empty dataset') + raise _ToolkitError("Unable to train on empty dataset") if feature not in dataset.column_names(): raise _ToolkitError("Audio feature column '%s' does not exist" % feature) - if not _is_deep_feature_sarray(dataset[feature]) and not _is_audio_data_sarray(dataset[feature]): + if not _is_deep_feature_sarray(dataset[feature]) and not _is_audio_data_sarray( + dataset[feature] + ): raise _ToolkitError("'%s' column is not audio data." % feature) if target not in dataset.column_names(): raise _ToolkitError("Target column '%s' does not exist" % target) - if not _tc.util._is_non_string_iterable(custom_layer_sizes) or len(custom_layer_sizes) == 0: + if ( + not _tc.util._is_non_string_iterable(custom_layer_sizes) + or len(custom_layer_sizes) == 0 + ): raise _ToolkitError("'custom_layer_sizes' must be a non-empty list.") for i in custom_layer_sizes: if not isinstance(i, int): raise _ToolkitError("'custom_layer_sizes' must contain only integers.") if not i >= 1: raise _ToolkitError("'custom_layer_sizes' must contain integers >= 1.") - if not (isinstance(validation_set, _tc.SFrame) or validation_set == 'auto' or validation_set is None): + if not ( + isinstance(validation_set, _tc.SFrame) + or validation_set == "auto" + or validation_set is None + ): raise TypeError("Unrecognized value for 'validation_set'") if isinstance(validation_set, _tc.SFrame): - if feature not in validation_set.column_names() or target not in validation_set.column_names(): - raise ValueError("The 'validation_set' SFrame must be in the same format as the 'dataset'") + if ( + feature not in validation_set.column_names() + or target not in validation_set.column_names() + ): + raise ValueError( + "The 'validation_set' SFrame must be in the same format as the 'dataset'" + ) if not isinstance(batch_size, int): raise TypeError("'batch_size' must be of type int.") if batch_size < 1: - raise ValueError('\'batch_size\' must be greater than or equal to 1') + raise ValueError("'batch_size' must be greater than or equal to 1") if not isinstance(max_iterations, int): raise TypeError("'max_iterations' must be type int.") - _tk_utils._numeric_param_check_range('max_iterations', max_iterations, 1, _six.MAXSIZE) - + _tk_utils._numeric_param_check_range( + "max_iterations", max_iterations, 1, _six.MAXSIZE + ) classes = list(dataset[target].unique().sort()) num_labels = len(classes) if num_labels <= 1: - raise ValueError('The number of classes must be greater than one.') - feature_extractor_name = 'VGGish' + raise ValueError("The number of classes must be greater than one.") + feature_extractor_name = "VGGish" feature_extractor = _get_feature_extractor(feature_extractor_name) class_label_to_id = {l: i for i, l in enumerate(classes)} # create the validation set - if not isinstance(validation_set, _tc.SFrame) and validation_set == 'auto': + if not isinstance(validation_set, _tc.SFrame) and validation_set == "auto": if len(dataset) >= 100: - print ( "Creating a validation set from 5 percent of training data. This may take a while.\n" - "\tYou can set ``validation_set=None`` to disable validation tracking.\n") + print( + "Creating a validation set from 5 percent of training data. This may take a while.\n" + "\tYou can set ``validation_set=None`` to disable validation tracking.\n" + ) dataset, validation_set = dataset.random_split(0.95, exact=True) else: validation_set = None @@ -342,38 +392,58 @@ def create(dataset, target, feature, max_iterations=10, # do the preprocess and VGGish feature extraction train_deep_features = get_deep_features(dataset[feature], verbose=verbose) - train_data = _tc.SFrame({'deep features': train_deep_features, 'labels': encoded_target}) - train_data = train_data.stack('deep features', new_column_name='deep features') - train_data, missing_ids = train_data.dropna_split(columns=['deep features']) + train_data = _tc.SFrame( + {"deep features": train_deep_features, "labels": encoded_target} + ) + train_data = train_data.stack("deep features", new_column_name="deep features") + train_data, missing_ids = train_data.dropna_split(columns=["deep features"]) training_batch_size = min(len(train_data), batch_size) - train_data = _create_data_iterator(train_data['deep features'].to_numpy(), - train_data['labels'].to_numpy(), - batch_size=training_batch_size, - shuffle=True) + train_data = _create_data_iterator( + train_data["deep features"].to_numpy(), + train_data["labels"].to_numpy(), + batch_size=training_batch_size, + shuffle=True, + ) if len(missing_ids) > 0: - _logging.warning("Dropping %d examples which are less than 975ms in length." % len(missing_ids)) + _logging.warning( + "Dropping %d examples which are less than 975ms in length." + % len(missing_ids) + ) if validation_set is not None: if verbose: print("Preparing validation set") - validation_encoded_target = validation_set[target].apply(lambda x: class_label_to_id[x]) + validation_encoded_target = validation_set[target].apply( + lambda x: class_label_to_id[x] + ) if _is_deep_feature_sarray(validation_set[feature]): validation_deep_features = validation_set[feature] else: - validation_deep_features = get_deep_features(validation_set[feature], verbose=verbose) + validation_deep_features = get_deep_features( + validation_set[feature], verbose=verbose + ) - validation_data = _tc.SFrame({'deep features': validation_deep_features, 'labels': validation_encoded_target}) - validation_data = validation_data.stack('deep features', new_column_name='deep features') - validation_data = validation_data.dropna(columns=['deep features']) + validation_data = _tc.SFrame( + { + "deep features": validation_deep_features, + "labels": validation_encoded_target, + } + ) + validation_data = validation_data.stack( + "deep features", new_column_name="deep features" + ) + validation_data = validation_data.dropna(columns=["deep features"]) validation_batch_size = min(len(validation_data), batch_size) - validation_data = _create_data_iterator(validation_data['deep features'].to_numpy(), - validation_data['labels'].to_numpy(), - batch_size=validation_batch_size) + validation_data = _create_data_iterator( + validation_data["deep features"].to_numpy(), + validation_data["labels"].to_numpy(), + batch_size=validation_batch_size, + ) else: validation_data = [] @@ -384,18 +454,19 @@ def create(dataset, target, feature, max_iterations=10, if verbose: print("\nTraining a custom neural network -") - from ._tf_sound_classifier import SoundClassifierTensorFlowModel - custom_NN = SoundClassifierTensorFlowModel(feature_extractor.output_length, num_labels, custom_layer_sizes) + custom_NN = SoundClassifierTensorFlowModel( + feature_extractor.output_length, num_labels, custom_layer_sizes + ) if verbose: # Setup progress table - row_ids = ['iteration', 'train_accuracy', 'time'] - row_display_names = ['Iteration', 'Training Accuracy', 'Elapsed Time'] + row_ids = ["iteration", "train_accuracy", "time"] + row_display_names = ["Iteration", "Training Accuracy", "Elapsed Time"] if validation_data: - row_ids.insert(2, 'validation_accuracy') - row_display_names.insert(2, 'Validation Accuracy (%)') + row_ids.insert(2, "validation_accuracy") + row_display_names.insert(2, "Validation Accuracy (%)") table_printer = _tc.util._ProgressTablePrinter(row_ids, row_display_names) for i in range(max_iterations): @@ -418,32 +489,31 @@ def create(dataset, target, feature, max_iterations=10, # Get metrics, print progress table train_accuracy = train_metric.get() train_metric.reset() - printed_row_values = {'iteration': i+1, 'train_accuracy': train_accuracy} + printed_row_values = {"iteration": i + 1, "train_accuracy": train_accuracy} if validation_data: validation_accuracy = validation_metric.get() - printed_row_values['validation_accuracy'] = validation_accuracy + printed_row_values["validation_accuracy"] = validation_accuracy validation_metric.reset() validation_data.reset() if verbose: - printed_row_values['time'] = time.time()-start_time + printed_row_values["time"] = time.time() - start_time table_printer.print_row(**printed_row_values) - state = { - '_class_label_to_id': class_label_to_id, - '_custom_classifier': custom_NN, - '_feature_extractor': feature_extractor, - '_id_to_class_label': {v: k for k, v in class_label_to_id.items()}, - 'classes': classes, - 'custom_layer_sizes': custom_layer_sizes, - 'feature': feature, - 'feature_extractor_name': feature_extractor.name, - 'num_classes': num_labels, - 'num_examples': len(dataset), - 'target': target, - 'training_accuracy': train_accuracy, - 'training_time': time.time() - start_time, - 'validation_accuracy': validation_accuracy if validation_data else None, + "_class_label_to_id": class_label_to_id, + "_custom_classifier": custom_NN, + "_feature_extractor": feature_extractor, + "_id_to_class_label": {v: k for k, v in class_label_to_id.items()}, + "classes": classes, + "custom_layer_sizes": custom_layer_sizes, + "feature": feature, + "feature_extractor_name": feature_extractor.name, + "num_classes": num_labels, + "num_examples": len(dataset), + "target": target, + "training_accuracy": train_accuracy, + "training_time": time.time() - start_time, + "validation_accuracy": validation_accuracy if validation_data else None, } return SoundClassifier(state) @@ -458,6 +528,7 @@ class SoundClassifier(_CustomModel): ---------- create """ + _PYTHON_SOUND_CLASSIFIER_VERSION = 1 def __init__(self, state): @@ -477,9 +548,9 @@ def _get_native_state(self): """ state = self.__proxy__.get_state() - del state['_feature_extractor'] + del state["_feature_extractor"] - state['_custom_classifier'] = state['_custom_classifier'].get_weights() + state["_custom_classifier"] = state["_custom_classifier"].get_weights() return state @@ -490,23 +561,28 @@ def _load_version(cls, state, version): """ from ._audio_feature_extractor import _get_feature_extractor - state['_feature_extractor'] = _get_feature_extractor(state['feature_extractor_name']) + state["_feature_extractor"] = _get_feature_extractor( + state["feature_extractor_name"] + ) # Load the custom nerual network - num_classes = state['num_classes'] - num_inputs = state['_feature_extractor'].output_length - if 'custom_layer_sizes' in state: + num_classes = state["num_classes"] + num_inputs = state["_feature_extractor"].output_length + if "custom_layer_sizes" in state: # These are deserialized as floats - custom_layer_sizes = list(map(int, state['custom_layer_sizes'])) + custom_layer_sizes = list(map(int, state["custom_layer_sizes"])) else: # Default value, was not part of state for only Turi Create 5.4 custom_layer_sizes = [100, 100] - state['custom_layer_sizes'] = custom_layer_sizes + state["custom_layer_sizes"] = custom_layer_sizes from ._tf_sound_classifier import SoundClassifierTensorFlowModel - custom_NN = SoundClassifierTensorFlowModel(num_inputs, num_classes, custom_layer_sizes) - custom_NN.load_weights(state['_custom_classifier']) - state['_custom_classifier'] = custom_NN + + custom_NN = SoundClassifierTensorFlowModel( + num_inputs, num_classes, custom_layer_sizes + ) + custom_NN.load_weights(state["_custom_classifier"]) + state["_custom_classifier"] = custom_NN return SoundClassifier(state) @@ -531,8 +607,7 @@ def __repr__(self): width = 40 sections, section_titles = self._get_summary_struct() - out = tkutl._toolkit_repr_print(self, sections, section_titles, - width=width) + out = tkutl._toolkit_repr_print(self, sections, section_titles, width=width) return out def _get_summary_struct(self): @@ -553,19 +628,19 @@ def _get_summary_struct(self): The order matches that of the 'sections' object. """ model_fields = [ - ('Number of classes', 'num_classes'), - ('Number of training examples', 'num_examples'), - ('Custom layer sizes', 'custom_layer_sizes'), + ("Number of classes", "num_classes"), + ("Number of training examples", "num_examples"), + ("Custom layer sizes", "custom_layer_sizes"), ] training_fields = [ - ('Number of examples', 'num_examples'), - ("Training accuracy", 'training_accuracy'), - ("Validation accuracy", 'validation_accuracy'), - ("Training time (sec)", 'training_time'), + ("Number of examples", "num_examples"), + ("Training accuracy", "training_accuracy"), + ("Validation accuracy", "validation_accuracy"), + ("Training time (sec)", "training_time"), ] - section_titles = ['Schema', 'Training Summary'] - return([model_fields, training_fields], section_titles) + section_titles = ["Schema", "Training Summary"] + return ([model_fields, training_fields], section_titles) def classify(self, dataset, verbose=True, batch_size=64): """ @@ -600,16 +675,22 @@ def classify(self, dataset, verbose=True, batch_size=64): ---------- >>> classes = model.classify(data) """ - prob_vector = self.predict(dataset, output_type='probability_vector', - verbose=verbose, batch_size=batch_size) + prob_vector = self.predict( + dataset, + output_type="probability_vector", + verbose=verbose, + batch_size=batch_size, + ) id_to_label = self._id_to_class_label - return _tc.SFrame({ - 'class': prob_vector.apply(lambda v: id_to_label[_np.argmax(v)]), - 'probability': prob_vector.apply(_np.max) - }) + return _tc.SFrame( + { + "class": prob_vector.apply(lambda v: id_to_label[_np.argmax(v)]), + "probability": prob_vector.apply(_np.max), + } + ) - def evaluate(self, dataset, metric='auto', verbose=True, batch_size=64): + def evaluate(self, dataset, metric="auto", verbose=True, batch_size=64): """ Evaluate the model by making predictions of target values and comparing these to actual values. @@ -665,14 +746,23 @@ def evaluate(self, dataset, metric='auto', verbose=True, batch_size=64): # parameter checking if not isinstance(dataset, _tc.SFrame): - raise TypeError('\'dataset\' parameter must be an SFrame') - - avail_metrics = ['accuracy', 'auc', 'precision', 'recall', - 'f1_score', 'log_loss', 'confusion_matrix', 'roc_curve'] + raise TypeError("'dataset' parameter must be an SFrame") + + avail_metrics = [ + "accuracy", + "auc", + "precision", + "recall", + "f1_score", + "log_loss", + "confusion_matrix", + "roc_curve", + ] _tk_utils._check_categorical_option_type( - 'metric', metric, avail_metrics + ['auto']) + "metric", metric, avail_metrics + ["auto"] + ) - if metric == 'auto': + if metric == "auto": metrics = avail_metrics else: metrics = [metric] @@ -681,44 +771,72 @@ def evaluate(self, dataset, metric='auto', verbose=True, batch_size=64): deep_features = dataset[self.feature] else: deep_features = get_deep_features(dataset[self.feature], verbose=verbose) - data = _tc.SFrame({'deep features': deep_features}) + data = _tc.SFrame({"deep features": deep_features}) data = data.add_row_number() - missing_ids = data.filter_by([[]], 'deep features')['id'] + missing_ids = data.filter_by([[]], "deep features")["id"] if len(missing_ids) > 0: - data = data.filter_by([[]], 'deep features', exclude=True) + data = data.filter_by([[]], "deep features", exclude=True) # Remove the labels for entries without deep features - _logging.warning("Dropping %d examples which are less than 975ms in length." % len(missing_ids)) + _logging.warning( + "Dropping %d examples which are less than 975ms in length." + % len(missing_ids) + ) labels = dataset[[self.target]].add_row_number() - labels = data.join(labels, how='left')[self.target] + labels = data.join(labels, how="left")[self.target] else: labels = dataset[self.target] - assert(len(labels) == len(data)) - - if any([m in metrics for m in ('roc_curve', 'log_loss', 'auc')]): - probs = self.predict(data['deep features'], output_type='probability_vector', - verbose=verbose, batch_size=batch_size) - if any([m in metrics for m in ('accuracy', 'precision', 'recall', 'f1_score', 'confusion_matrix')]): - classes = self.predict(data['deep features'], output_type='class', - verbose=verbose, batch_size=batch_size) + assert len(labels) == len(data) + + if any([m in metrics for m in ("roc_curve", "log_loss", "auc")]): + probs = self.predict( + data["deep features"], + output_type="probability_vector", + verbose=verbose, + batch_size=batch_size, + ) + if any( + [ + m in metrics + for m in ( + "accuracy", + "precision", + "recall", + "f1_score", + "confusion_matrix", + ) + ] + ): + classes = self.predict( + data["deep features"], + output_type="class", + verbose=verbose, + batch_size=batch_size, + ) ret = {} - if 'accuracy' in metrics: - ret['accuracy'] = evaluation.accuracy(labels, classes) - if 'auc' in metrics: - ret['auc'] = evaluation.auc(labels, probs, index_map=self._class_label_to_id) - if 'precision' in metrics: - ret['precision'] = evaluation.precision(labels, classes) - if 'recall' in metrics: - ret['recall'] = evaluation.recall(labels, classes) - if 'f1_score' in metrics: - ret['f1_score'] = evaluation.f1_score(labels, classes) - if 'log_loss' in metrics: - ret['log_loss'] = evaluation.log_loss(labels, probs, index_map=self._class_label_to_id) - if 'confusion_matrix' in metrics: - ret['confusion_matrix'] = evaluation.confusion_matrix(labels, classes) - if 'roc_curve' in metrics: - ret['roc_curve'] = evaluation.roc_curve(labels, probs, index_map=self._class_label_to_id) + if "accuracy" in metrics: + ret["accuracy"] = evaluation.accuracy(labels, classes) + if "auc" in metrics: + ret["auc"] = evaluation.auc( + labels, probs, index_map=self._class_label_to_id + ) + if "precision" in metrics: + ret["precision"] = evaluation.precision(labels, classes) + if "recall" in metrics: + ret["recall"] = evaluation.recall(labels, classes) + if "f1_score" in metrics: + ret["f1_score"] = evaluation.f1_score(labels, classes) + if "log_loss" in metrics: + ret["log_loss"] = evaluation.log_loss( + labels, probs, index_map=self._class_label_to_id + ) + if "confusion_matrix" in metrics: + ret["confusion_matrix"] = evaluation.confusion_matrix(labels, classes) + if "roc_curve" in metrics: + ret["roc_curve"] = evaluation.roc_curve( + labels, probs, index_map=self._class_label_to_id + ) return ret @@ -736,17 +854,20 @@ def export_coreml(self, filename): """ import coremltools from coremltools.proto.FeatureTypes_pb2 import ArrayFeatureType - prob_name = self.target + 'Probability' + + prob_name = self.target + "Probability" def get_custom_model_spec(): from coremltools.models.neural_network import NeuralNetworkBuilder from coremltools.models.datatypes import Array - input_name = 'output1' + input_name = "output1" input_length = self._feature_extractor.output_length - builder = NeuralNetworkBuilder([(input_name, Array(input_length,))], - [(prob_name, Array(self.num_classes, ))], - 'classifier') + builder = NeuralNetworkBuilder( + [(input_name, Array(input_length,))], + [(prob_name, Array(self.num_classes,))], + "classifier", + ) layer_counter = [0] builder.set_input([input_name], [(input_length,)]) @@ -755,33 +876,39 @@ def next_layer_name(): return "layer_%d" % layer_counter[0] for i, cur_layer in enumerate(self._custom_classifier.export_weights()): - W = cur_layer['weight'] + W = cur_layer["weight"] nC, nB = W.shape - Wb = cur_layer['bias'] + Wb = cur_layer["bias"] output_name = next_layer_name() - builder.add_inner_product(name="inner_product_"+str(i), - W=W, - b=Wb, - input_channels=nB, - output_channels=nC, - has_bias=True, - input_name=input_name, - output_name=output_name) + builder.add_inner_product( + name="inner_product_" + str(i), + W=W, + b=Wb, + input_channels=nB, + output_channels=nC, + has_bias=True, + input_name=input_name, + output_name=output_name, + ) input_name = output_name - if cur_layer['act']: + if cur_layer["act"]: output_name = next_layer_name() - builder.add_activation("activation"+str(i), 'RELU', input_name, output_name) + builder.add_activation( + "activation" + str(i), "RELU", input_name, output_name + ) input_name = output_name - builder.add_softmax('softmax', input_name, prob_name) - builder.set_class_labels(self.classes, predicted_feature_name = self.target, - prediction_blob = prob_name) + builder.add_softmax("softmax", input_name, prob_name) + builder.set_class_labels( + self.classes, + predicted_feature_name=self.target, + prediction_blob=prob_name, + ) return builder.spec - top_level_spec = coremltools.proto.Model_pb2.Model() top_level_spec.specificationVersion = 3 @@ -790,7 +917,9 @@ def next_layer_name(): input = desc.input.add() input.name = self.feature assert type(self.feature) is str - input.type.multiArrayType.dataType = ArrayFeatureType.ArrayDataType.Value('FLOAT32') + input.type.multiArrayType.dataType = ArrayFeatureType.ArrayDataType.Value( + "FLOAT32" + ) input.type.multiArrayType.shape.append(15600) # Set outputs @@ -802,28 +931,30 @@ def next_layer_name(): desc.predictedProbabilitiesName = prob_name if type(self.classes[0]) == int: # Class labels are ints - prob_output.type.dictionaryType.int64KeyType.MergeFromString(b'') - label_output.type.int64Type.MergeFromString(b'') - else: # Class are strings - prob_output.type.dictionaryType.stringKeyType.MergeFromString(b'') - label_output.type.stringType.MergeFromString(b'') + prob_output.type.dictionaryType.int64KeyType.MergeFromString(b"") + label_output.type.int64Type.MergeFromString(b"") + else: # Class are strings + prob_output.type.dictionaryType.stringKeyType.MergeFromString(b"") + label_output.type.stringType.MergeFromString(b"") # Set metadata user_metadata = desc.metadata.userDefined - user_metadata['sampleRate'] = str(self._feature_extractor.input_sample_rate) + user_metadata["sampleRate"] = str(self._feature_extractor.input_sample_rate) pipeline = top_level_spec.pipelineClassifier.pipeline # Add the preprocessing model preprocessing_model = pipeline.models.add() - preprocessing_model.customModel.className = 'TCSoundClassifierPreprocessing' + preprocessing_model.customModel.className = "TCSoundClassifierPreprocessing" preprocessing_model.specificationVersion = 3 preprocessing_input = preprocessing_model.description.input.add() preprocessing_input.CopyFrom(input) preprocessed_output = preprocessing_model.description.output.add() - preprocessed_output.name = 'preprocessed_data' - preprocessed_output.type.multiArrayType.dataType = ArrayFeatureType.ArrayDataType.Value('DOUBLE') + preprocessed_output.name = "preprocessed_data" + preprocessed_output.type.multiArrayType.dataType = ArrayFeatureType.ArrayDataType.Value( + "DOUBLE" + ) preprocessed_output.type.multiArrayType.shape.append(1) preprocessed_output.type.multiArrayType.shape.append(96) preprocessed_output.type.multiArrayType.shape.append(64) @@ -840,29 +971,32 @@ def next_layer_name(): # Set key type for the probability dictionary prob_output_type = pipeline.models[-1].description.output[0].type.dictionaryType if type(self.classes[0]) == int: - prob_output_type.int64KeyType.MergeFromString(b'') - else: # String labels - prob_output_type.stringKeyType.MergeFromString(b'') + prob_output_type.int64KeyType.MergeFromString(b"") + else: # String labels + prob_output_type.stringKeyType.MergeFromString(b"") mlmodel = coremltools.models.MLModel(top_level_spec) - model_type = 'sound classifier' + model_type = "sound classifier" mlmodel.short_description = _coreml_utils._mlmodel_short_description(model_type) - mlmodel.input_description[self.feature] = u'Input audio features' - mlmodel.output_description[prob_name] = 'Prediction probabilities' - mlmodel.output_description[self.target] = 'Class label of top prediction' + mlmodel.input_description[self.feature] = u"Input audio features" + mlmodel.output_description[prob_name] = "Prediction probabilities" + mlmodel.output_description[self.target] = "Class label of top prediction" model_metadata = { - 'target': self.target, - 'feature': self.feature, + "target": self.target, + "feature": self.feature, } user_defined_metadata = model_metadata.update( - _coreml_utils._get_tc_version_info()) - _coreml_utils._set_model_metadata(mlmodel, - self.__class__.__name__, - user_defined_metadata, - version=SoundClassifier._PYTHON_SOUND_CLASSIFIER_VERSION) + _coreml_utils._get_tc_version_info() + ) + _coreml_utils._set_model_metadata( + mlmodel, + self.__class__.__name__, + user_defined_metadata, + version=SoundClassifier._PYTHON_SOUND_CLASSIFIER_VERSION, + ) mlmodel.save(filename) - def predict(self, dataset, output_type='class', verbose=True, batch_size=64): + def predict(self, dataset, output_type="class", verbose=True, batch_size=64): """ Return predictions for ``dataset``. Predictions can be generated as class labels or probabilities. @@ -911,27 +1045,37 @@ class as a vector. Label ordering is dictated by the ``classes`` """ if not isinstance(dataset, (_tc.SFrame, _tc.SArray, dict)): - raise TypeError('\'dataset\' parameter must be either an SFrame, SArray or dictionary') + raise TypeError( + "'dataset' parameter must be either an SFrame, SArray or dictionary" + ) if isinstance(dataset, dict): - if(set(dataset.keys()) != {'sample_rate', 'data'}): - raise ValueError('\'dataset\' parameter is a dictionary but does not appear to be audio data.') + if set(dataset.keys()) != {"sample_rate", "data"}: + raise ValueError( + "'dataset' parameter is a dictionary but does not appear to be audio data." + ) dataset = _tc.SArray([dataset]) elif isinstance(dataset, _tc.SFrame): dataset = dataset[self.feature] if not _is_deep_feature_sarray(dataset) and not _is_audio_data_sarray(dataset): - raise ValueError('\'dataset\' must be either audio data or audio deep features.') - - if output_type not in ('probability', 'probability_vector', 'class'): - raise ValueError('\'dataset\' parameter must be either an SFrame, SArray or dictionary') - if output_type == 'probability' and self.num_classes != 2: - raise _ToolkitError('Output type \'probability\' is only supported for binary' - ' classification. For multi-class classification, use' - ' predict_topk() instead.') + raise ValueError( + "'dataset' must be either audio data or audio deep features." + ) + + if output_type not in ("probability", "probability_vector", "class"): + raise ValueError( + "'dataset' parameter must be either an SFrame, SArray or dictionary" + ) + if output_type == "probability" and self.num_classes != 2: + raise _ToolkitError( + "Output type 'probability' is only supported for binary" + " classification. For multi-class classification, use" + " predict_topk() instead." + ) if not isinstance(batch_size, int): raise TypeError("'batch_size' must be of type int.") - if(batch_size < 1): + if batch_size < 1: raise ValueError("'batch_size' must be greater than or equal to 1") if _is_deep_feature_sarray(dataset): @@ -939,48 +1083,67 @@ class as a vector. Label ordering is dictated by the ``classes`` else: deep_features = get_deep_features(dataset, verbose=verbose) - deep_features = _tc.SFrame({'deep features': deep_features}) + deep_features = _tc.SFrame({"deep features": deep_features}) deep_features = deep_features.add_row_number() - deep_features = deep_features.stack('deep features', new_column_name='deep features') - deep_features, missing_ids = deep_features.dropna_split(columns=['deep features']) + deep_features = deep_features.stack( + "deep features", new_column_name="deep features" + ) + deep_features, missing_ids = deep_features.dropna_split( + columns=["deep features"] + ) if len(missing_ids) > 0: - _logging.warning("Unable to make predictions for %d examples because they are less than 975ms in length." - % len(missing_ids)) + _logging.warning( + "Unable to make predictions for %d examples because they are less than 975ms in length." + % len(missing_ids) + ) if batch_size > len(deep_features): batch_size = len(deep_features) y = [] - for data, in _create_data_iterator(deep_features['deep features'].to_numpy(), None, batch_size=batch_size): + for (data,) in _create_data_iterator( + deep_features["deep features"].to_numpy(), None, batch_size=batch_size + ): y += self._custom_classifier.predict(data).tolist() - assert(len(y) == len(deep_features)) + assert len(y) == len(deep_features) # Combine predictions from multiple frames - sf = _tc.SFrame({'predictions': y, 'id': deep_features['id']}) - probabilities_sum = sf.groupby('id', {'prob_sum': _tc.aggregate.SUM('predictions')}) + sf = _tc.SFrame({"predictions": y, "id": deep_features["id"]}) + probabilities_sum = sf.groupby( + "id", {"prob_sum": _tc.aggregate.SUM("predictions")} + ) - if output_type == 'class': - predicted_ids = probabilities_sum['prob_sum'].apply(lambda x: _np.argmax(x)) + if output_type == "class": + predicted_ids = probabilities_sum["prob_sum"].apply(lambda x: _np.argmax(x)) mappings = self._id_to_class_label - probabilities_sum['results'] = predicted_ids.apply(lambda x: mappings[x]) + probabilities_sum["results"] = predicted_ids.apply(lambda x: mappings[x]) else: - assert output_type in ('probability', 'probability_vector') - frame_per_example_count = sf.groupby('id', _tc.aggregate.COUNT()) + assert output_type in ("probability", "probability_vector") + frame_per_example_count = sf.groupby("id", _tc.aggregate.COUNT()) probabilities_sum = probabilities_sum.join(frame_per_example_count) - probabilities_sum['results'] = probabilities_sum.apply(lambda row: [i / row['Count'] for i in row['prob_sum']]) + probabilities_sum["results"] = probabilities_sum.apply( + lambda row: [i / row["Count"] for i in row["prob_sum"]] + ) if len(missing_ids) > 0: - output_type = probabilities_sum['results'].dtype - missing_predictions = _tc.SFrame({'id': missing_ids['id'], - 'results': _tc.SArray([ None ] * len(missing_ids), dtype=output_type) - }) - probabilities_sum = probabilities_sum[['id', 'results']].append(missing_predictions) + output_type = probabilities_sum["results"].dtype + missing_predictions = _tc.SFrame( + { + "id": missing_ids["id"], + "results": _tc.SArray([None] * len(missing_ids), dtype=output_type), + } + ) + probabilities_sum = probabilities_sum[["id", "results"]].append( + missing_predictions + ) - probabilities_sum = probabilities_sum.sort('id') - return probabilities_sum['results'] + probabilities_sum = probabilities_sum.sort("id") + return probabilities_sum["results"] - def predict_topk(self, dataset, output_type='probability', k=3, verbose=True, batch_size=64): + def predict_topk( + self, dataset, output_type="probability", k=3, verbose=True, batch_size=64 + ): """ Return top-k predictions for the ``dataset``. Predictions are returned as an SFrame with three columns: `id`, @@ -1041,25 +1204,33 @@ def predict_topk(self, dataset, output_type='probability', k=3, verbose=True, ba """ if not isinstance(k, int): raise TypeError("'k' must be of type int.") - _tk_utils._numeric_param_check_range('k', k, 1, _six.MAXSIZE) - prob_vector = self.predict(dataset, output_type='probability_vector', - verbose=verbose, batch_size=batch_size) + _tk_utils._numeric_param_check_range("k", k, 1, _six.MAXSIZE) + prob_vector = self.predict( + dataset, + output_type="probability_vector", + verbose=verbose, + batch_size=batch_size, + ) id_to_label = self._id_to_class_label - if output_type == 'probability': - results = prob_vector.apply(lambda p: [ - {'class': id_to_label[i], 'probability': p[i]} - for i in reversed(_np.argsort(p)[-k:])] + if output_type == "probability": + results = prob_vector.apply( + lambda p: [ + {"class": id_to_label[i], "probability": p[i]} + for i in reversed(_np.argsort(p)[-k:]) + ] ) else: - assert(output_type == 'rank') - results = prob_vector.apply(lambda p: [ - {'class': id_to_label[i], 'rank': rank} - for rank, i in enumerate(reversed(_np.argsort(p)[-k:]))] + assert output_type == "rank" + results = prob_vector.apply( + lambda p: [ + {"class": id_to_label[i], "rank": rank} + for rank, i in enumerate(reversed(_np.argsort(p)[-k:])) + ] ) - results = _tc.SFrame({'X': results}) + results = _tc.SFrame({"X": results}) results = results.add_row_number() - results = results.stack('X', new_column_name='X') - results = results.unpack('X', column_name_prefix='') + results = results.stack("X", new_column_name="X") + results = results.unpack("X", column_name_prefix="") return results diff --git a/src/python/turicreate/toolkits/sound_classifier/vggish_input.py b/src/python/turicreate/toolkits/sound_classifier/vggish_input.py index 5f406524a2..3a4c489b3e 100644 --- a/src/python/turicreate/toolkits/sound_classifier/vggish_input.py +++ b/src/python/turicreate/toolkits/sound_classifier/vggish_input.py @@ -22,7 +22,7 @@ def waveform_to_examples(data, sample_rate): - """Converts audio waveform into an array of examples for VGGish. + """Converts audio waveform into an array of examples for VGGish. Args: data: np.array of either one dimension (mono) or two dimensions @@ -38,41 +38,43 @@ def waveform_to_examples(data, sample_rate): bands, where the frame length is vggish_params.STFT_HOP_LENGTH_SECONDS. """ - import resampy - - # Convert to mono. - if len(data.shape) > 1: - data = np.mean(data, axis=1) - # Resample to the rate assumed by VGGish. - if sample_rate != vggish_params.SAMPLE_RATE: - data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE) - - # Compute log mel spectrogram features. - log_mel = mel_features.log_mel_spectrogram( - data, - audio_sample_rate=vggish_params.SAMPLE_RATE, - log_offset=vggish_params.LOG_OFFSET, - window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS, - hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS, - num_mel_bins=vggish_params.NUM_MEL_BINS, - lower_edge_hertz=vggish_params.MEL_MIN_HZ, - upper_edge_hertz=vggish_params.MEL_MAX_HZ) - - # Frame features into examples. - features_sample_rate = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS - example_window_length = int(round( - vggish_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate)) - example_hop_length = int(round( - vggish_params.EXAMPLE_HOP_SECONDS * features_sample_rate)) - log_mel_examples = mel_features.frame( - log_mel, - window_length=example_window_length, - hop_length=example_hop_length) - return log_mel_examples + import resampy + + # Convert to mono. + if len(data.shape) > 1: + data = np.mean(data, axis=1) + # Resample to the rate assumed by VGGish. + if sample_rate != vggish_params.SAMPLE_RATE: + data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE) + + # Compute log mel spectrogram features. + log_mel = mel_features.log_mel_spectrogram( + data, + audio_sample_rate=vggish_params.SAMPLE_RATE, + log_offset=vggish_params.LOG_OFFSET, + window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS, + hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS, + num_mel_bins=vggish_params.NUM_MEL_BINS, + lower_edge_hertz=vggish_params.MEL_MIN_HZ, + upper_edge_hertz=vggish_params.MEL_MAX_HZ, + ) + + # Frame features into examples. + features_sample_rate = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS + example_window_length = int( + round(vggish_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate) + ) + example_hop_length = int( + round(vggish_params.EXAMPLE_HOP_SECONDS * features_sample_rate) + ) + log_mel_examples = mel_features.frame( + log_mel, window_length=example_window_length, hop_length=example_hop_length + ) + return log_mel_examples def wavfile_to_examples(wav_file): - """Convenience wrapper around waveform_to_examples() for a common WAV format. + """Convenience wrapper around waveform_to_examples() for a common WAV format. Args: wav_file: String path to a file, or a file-like object. The file @@ -81,8 +83,9 @@ def wavfile_to_examples(wav_file): Returns: See waveform_to_examples. """ - from scipy.io import wavfile - sr, wav_data = wavfile.read(wav_file) - assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype - samples = wav_data / 32768.0 # Convert to [-1.0, +1.0] - return waveform_to_examples(samples, sr) + from scipy.io import wavfile + + sr, wav_data = wavfile.read(wav_file) + assert wav_data.dtype == np.int16, "Bad sample type: %r" % wav_data.dtype + samples = wav_data / 32768.0 # Convert to [-1.0, +1.0] + return waveform_to_examples(samples, sr) diff --git a/src/python/turicreate/toolkits/sound_classifier/vggish_params.py b/src/python/turicreate/toolkits/sound_classifier/vggish_params.py index a38ce26c9d..24551290cb 100644 --- a/src/python/turicreate/toolkits/sound_classifier/vggish_params.py +++ b/src/python/turicreate/toolkits/sound_classifier/vggish_params.py @@ -32,11 +32,11 @@ MEL_MAX_HZ = 7500 LOG_OFFSET = 0.01 # Offset used for stabilized log of input mel-spectrogram. EXAMPLE_WINDOW_SECONDS = 0.96 # Each example contains 96 10ms frames -EXAMPLE_HOP_SECONDS = 0.96 # with zero overlap. +EXAMPLE_HOP_SECONDS = 0.96 # with zero overlap. # Parameters used for embedding postprocessing. -PCA_EIGEN_VECTORS_NAME = 'pca_eigen_vectors' -PCA_MEANS_NAME = 'pca_means' +PCA_EIGEN_VECTORS_NAME = "pca_eigen_vectors" +PCA_MEANS_NAME = "pca_means" QUANTIZE_MIN_VAL = -2.0 QUANTIZE_MAX_VAL = +2.0 @@ -46,8 +46,8 @@ ADAM_EPSILON = 1e-8 # Epsilon for the Adam optimizer. # Names of ops, tensors, and features. -INPUT_OP_NAME = 'vggish/input_features' -INPUT_TENSOR_NAME = INPUT_OP_NAME + ':0' -OUTPUT_OP_NAME = 'vggish/embedding' -OUTPUT_TENSOR_NAME = OUTPUT_OP_NAME + ':0' -AUDIO_EMBEDDING_FEATURE_NAME = 'audio_embedding' +INPUT_OP_NAME = "vggish/input_features" +INPUT_TENSOR_NAME = INPUT_OP_NAME + ":0" +OUTPUT_OP_NAME = "vggish/embedding" +OUTPUT_TENSOR_NAME = OUTPUT_OP_NAME + ":0" +AUDIO_EMBEDDING_FEATURE_NAME = "audio_embedding" diff --git a/src/python/turicreate/toolkits/style_transfer/__init__.py b/src/python/turicreate/toolkits/style_transfer/__init__.py index 7b94acb003..baccdc61f5 100644 --- a/src/python/turicreate/toolkits/style_transfer/__init__.py +++ b/src/python/turicreate/toolkits/style_transfer/__init__.py @@ -8,4 +8,4 @@ from __future__ import absolute_import as _ from .style_transfer import create, StyleTransfer -__all__ = ['create', 'StyleTransfer'] +__all__ = ["create", "StyleTransfer"] diff --git a/src/python/turicreate/toolkits/style_transfer/_sframe_loader.py b/src/python/turicreate/toolkits/style_transfer/_sframe_loader.py index 71c32cede5..e5c36cf001 100644 --- a/src/python/turicreate/toolkits/style_transfer/_sframe_loader.py +++ b/src/python/turicreate/toolkits/style_transfer/_sframe_loader.py @@ -10,8 +10,9 @@ import turicreate as _tc from turicreate.toolkits._main import ToolkitError as _ToolkitError -_TMP_COL_PREP_IMAGE = '_prepared_image' -_TMP_COL_RANDOM_ORDER = '_random_order' +_TMP_COL_PREP_IMAGE = "_prepared_image" +_TMP_COL_RANDOM_ORDER = "_random_order" + def _resize_if_too_large(image, max_shape): width_f = image.width / max_shape[1] @@ -28,12 +29,13 @@ def _resize_if_too_large(image, max_shape): height = min(height, max_shape[0]) # Decode image and make sure it has 3 channels - return _tc.image_analysis.resize(image, width, height, 3, decode=True, - resample='bilinear') + return _tc.image_analysis.resize( + image, width, height, 3, decode=True, resample="bilinear" + ) def _stretch_resize(image, shape): height, width = shape - return _tc.image_analysis.resize(image, width, height, 3, decode=True, - resample='bilinear') - + return _tc.image_analysis.resize( + image, width, height, 3, decode=True, resample="bilinear" + ) diff --git a/src/python/turicreate/toolkits/style_transfer/_tf_model_architecture.py b/src/python/turicreate/toolkits/style_transfer/_tf_model_architecture.py index 132ffddf4f..e5da3dda0d 100644 --- a/src/python/turicreate/toolkits/style_transfer/_tf_model_architecture.py +++ b/src/python/turicreate/toolkits/style_transfer/_tf_model_architecture.py @@ -575,7 +575,7 @@ def __init__(self, config, net_params): self.st_graph = _tf.Graph() self._batch_size = 1 self._finetune_all_params = True - self._define_training_graph = bool(config['st_training']) + self._define_training_graph = bool(config["st_training"]) self.sess = _tf.Session(graph=self.st_graph) with self.st_graph.as_default(): @@ -589,7 +589,7 @@ def init_style_transfer_graph(self, net_params): self.tf_style = _tf.placeholder(dtype=_tf.float32, shape=[None, 256, 256, 3]) self.tf_index = _tf.placeholder(dtype=_tf.int64, shape=[self.batch_size]) - self.__define_graph(); + self.__define_graph() init = _tf.global_variables_initializer() self.sess.run(init) @@ -619,7 +619,7 @@ def batch_size(self): def batch_size(self, batch_size): self._batch_size = batch_size with self.st_graph.as_default(): - self.tf_index = _tf.placeholder(dtype = _tf.int32, shape = [batch_size]) + self.tf_index = _tf.placeholder(dtype=_tf.int32, shape=[batch_size]) self.__define_graph() def train(self, feed_dict): diff --git a/src/python/turicreate/toolkits/style_transfer/_utils.py b/src/python/turicreate/toolkits/style_transfer/_utils.py index b853ec136b..fdc601f3cf 100644 --- a/src/python/turicreate/toolkits/style_transfer/_utils.py +++ b/src/python/turicreate/toolkits/style_transfer/_utils.py @@ -11,7 +11,7 @@ def _seconds_as_string(seconds): """ Returns seconds as a human-friendly string, e.g. '1d 4h 47m 41s' """ - TIME_UNITS = [('s', 60), ('m', 60), ('h', 24), ('d', None)] + TIME_UNITS = [("s", 60), ("m", 60), ("h", 24), ("d", None)] unit_strings = [] cur = max(int(seconds), 1) for suffix, size in TIME_UNITS: @@ -20,5 +20,5 @@ def _seconds_as_string(seconds): else: rest = cur if rest > 0: - unit_strings.insert(0, '%d%s' % (rest, suffix)) - return ' '.join(unit_strings) + unit_strings.insert(0, "%d%s" % (rest, suffix)) + return " ".join(unit_strings) diff --git a/src/python/turicreate/toolkits/style_transfer/style_transfer.py b/src/python/turicreate/toolkits/style_transfer/style_transfer.py index 07cdd209ac..3350beb392 100644 --- a/src/python/turicreate/toolkits/style_transfer/style_transfer.py +++ b/src/python/turicreate/toolkits/style_transfer/style_transfer.py @@ -22,19 +22,21 @@ import numpy as _np import math as _math import six as _six -from .._mps_utils import (use_mps as _use_mps, - mps_device_memory_limit as _mps_device_memory_limit, - MpsGraphAPI as _MpsGraphAPI, - MpsStyleGraphAPI as _MpsStyleGraphAPI, - MpsGraphNetworkType as _MpsGraphNetworkType, - MpsGraphMode as _MpsGraphMode) - -def _get_mps_st_net(input_image_shape, batch_size, output_size, - config, weights={}): +from .._mps_utils import ( + use_mps as _use_mps, + mps_device_memory_limit as _mps_device_memory_limit, + MpsGraphAPI as _MpsGraphAPI, + MpsStyleGraphAPI as _MpsStyleGraphAPI, + MpsGraphNetworkType as _MpsGraphNetworkType, + MpsGraphMode as _MpsGraphMode, +) + + +def _get_mps_st_net(input_image_shape, batch_size, output_size, config, weights={}): """ Initializes an MpsGraphAPI for style transfer. """ - c_in, h_in, w_in = input_image_shape + c_in, h_in, w_in = input_image_shape c_out = output_size[0] h_out = h_in @@ -44,13 +46,31 @@ def _get_mps_st_net(input_image_shape, batch_size, output_size, h_view = h_in w_view = w_in - network = _MpsStyleGraphAPI(batch_size, c_in, h_in, w_in, c_out, h_out, - w_out, weights=weights, config=config) + network = _MpsStyleGraphAPI( + batch_size, + c_in, + h_in, + w_in, + c_out, + h_out, + w_out, + weights=weights, + config=config, + ) return network -def create(style_dataset, content_dataset, style_feature=None, - content_feature=None, max_iterations=None, model='resnet-16', - verbose=True, batch_size = 1, **kwargs): + +def create( + style_dataset, + content_dataset, + style_feature=None, + content_feature=None, + max_iterations=None, + model="resnet-16", + verbose=True, + batch_size=1, + **kwargs +): """ Create a :class:`StyleTransfer` model. @@ -128,70 +148,78 @@ def create(style_dataset, content_dataset, style_feature=None, raise _ToolkitError("style_dataset SFrame cannot be empty") if len(content_dataset) == 0: raise _ToolkitError("content_dataset SFrame cannot be empty") - if(batch_size < 1): + if batch_size < 1: raise _ToolkitError("'batch_size' must be greater than or equal to 1") - if max_iterations is not None and (not isinstance(max_iterations, int) or max_iterations < 0): - raise _ToolkitError("'max_iterations' must be an integer greater than or equal to 0") + if max_iterations is not None and ( + not isinstance(max_iterations, int) or max_iterations < 0 + ): + raise _ToolkitError( + "'max_iterations' must be an integer greater than or equal to 0" + ) if style_feature is None: style_feature = _tkutl._find_only_image_column(style_dataset) - + if content_feature is None: content_feature = _tkutl._find_only_image_column(content_dataset) if verbose: - print("Using '{}' in style_dataset as feature column and using " - "'{}' in content_dataset as feature column".format(style_feature, content_feature)) + print( + "Using '{}' in style_dataset as feature column and using " + "'{}' in content_dataset as feature column".format( + style_feature, content_feature + ) + ) _raise_error_if_not_training_sframe(style_dataset, style_feature) _raise_error_if_not_training_sframe(content_dataset, content_feature) - _tkutl._handle_missing_values(style_dataset, style_feature, 'style_dataset') - _tkutl._handle_missing_values(content_dataset, content_feature, 'content_dataset') + _tkutl._handle_missing_values(style_dataset, style_feature, "style_dataset") + _tkutl._handle_missing_values(content_dataset, content_feature, "content_dataset") params = { - 'batch_size': batch_size, - 'vgg16_content_loss_layer': 2, # conv3_3 layer - 'lr': 0.001, - 'content_loss_mult': 1.0, - 'style_loss_mult': [1e-4, 1e-4, 1e-4, 1e-4], # conv 1-4 layers - 'finetune_all_params': True, - 'pretrained_weights': False, - 'print_loss_breakdown': False, - 'input_shape': (256, 256), - 'training_content_loader_type': 'stretch', - 'use_augmentation': False, - 'sequential_image_processing': False, + "batch_size": batch_size, + "vgg16_content_loss_layer": 2, # conv3_3 layer + "lr": 0.001, + "content_loss_mult": 1.0, + "style_loss_mult": [1e-4, 1e-4, 1e-4, 1e-4], # conv 1-4 layers + "finetune_all_params": True, + "pretrained_weights": False, + "print_loss_breakdown": False, + "input_shape": (256, 256), + "training_content_loader_type": "stretch", + "use_augmentation": False, + "sequential_image_processing": False, # Only used if use_augmentaion is True - 'aug_resize': 0, - 'aug_min_object_covered': 0, - 'aug_rand_crop': 0.9, - 'aug_rand_pad': 0.9, - 'aug_rand_gray': 0.0, - 'aug_aspect_ratio': 1.25, - 'aug_hue': 0.05, - 'aug_brightness': 0.05, - 'aug_saturation': 0.05, - 'aug_contrast': 0.05, - 'aug_horizontal_flip': True, - 'aug_area_range': (.05, 1.5), - 'aug_pca_noise': 0.0, - 'aug_max_attempts': 20, - 'aug_inter_method': 2, - 'checkpoint': False, - 'checkpoint_prefix': 'style_transfer', - 'checkpoint_increment': 1000 + "aug_resize": 0, + "aug_min_object_covered": 0, + "aug_rand_crop": 0.9, + "aug_rand_pad": 0.9, + "aug_rand_gray": 0.0, + "aug_aspect_ratio": 1.25, + "aug_hue": 0.05, + "aug_brightness": 0.05, + "aug_saturation": 0.05, + "aug_contrast": 0.05, + "aug_horizontal_flip": True, + "aug_area_range": (0.05, 1.5), + "aug_pca_noise": 0.0, + "aug_max_attempts": 20, + "aug_inter_method": 2, + "checkpoint": False, + "checkpoint_prefix": "style_transfer", + "checkpoint_increment": 1000, } - if '_advanced_parameters' in kwargs: + if "_advanced_parameters" in kwargs: # Make sure no additional parameters are provided - new_keys = set(kwargs['_advanced_parameters'].keys()) + new_keys = set(kwargs["_advanced_parameters"].keys()) set_keys = set(params.keys()) unsupported = new_keys - set_keys if unsupported: - raise _ToolkitError('Unknown advanced parameters: {}'.format(unsupported)) + raise _ToolkitError("Unknown advanced parameters: {}".format(unsupported)) - params.update(kwargs['_advanced_parameters']) + params.update(kwargs["_advanced_parameters"]) - name = 'style_transfer' + name = "style_transfer" import turicreate as _turicreate @@ -199,36 +227,39 @@ def create(style_dataset, content_dataset, style_feature=None, import turicreate.toolkits.libtctensorflow model = _turicreate.extensions.style_transfer() - pretrained_resnet_model = _pre_trained_models.STYLE_TRANSFER_BASE_MODELS['resnet-16']() - pretrained_vgg16_model = _pre_trained_models.STYLE_TRANSFER_BASE_MODELS['Vgg16']() + pretrained_resnet_model = _pre_trained_models.STYLE_TRANSFER_BASE_MODELS[ + "resnet-16" + ]() + pretrained_vgg16_model = _pre_trained_models.STYLE_TRANSFER_BASE_MODELS["Vgg16"]() options = {} - options['image_height'] = params['input_shape'][0] - options['image_width'] = params['input_shape'][1] - options['content_feature'] = content_feature - options['style_feature'] = style_feature + options["image_height"] = params["input_shape"][0] + options["image_width"] = params["input_shape"][1] + options["content_feature"] = content_feature + options["style_feature"] = style_feature if verbose is not None: - options['verbose'] = verbose + options["verbose"] = verbose else: - options['verbose'] = False + options["verbose"] = False if batch_size is not None: - options['batch_size'] = batch_size + options["batch_size"] = batch_size if max_iterations is not None: - options['max_iterations'] = max_iterations - options['num_styles'] = len(style_dataset) - options['resnet_mlmodel_path'] = pretrained_resnet_model.get_model_path('coreml') - options['vgg_mlmodel_path'] = pretrained_vgg16_model.get_model_path('coreml') + options["max_iterations"] = max_iterations + options["num_styles"] = len(style_dataset) + options["resnet_mlmodel_path"] = pretrained_resnet_model.get_model_path("coreml") + options["vgg_mlmodel_path"] = pretrained_vgg16_model.get_model_path("coreml") model.train(style_dataset[style_feature], content_dataset[content_feature], options) return StyleTransfer(model_proxy=model, name=name) + def _raise_error_if_not_training_sframe(dataset, context_column): - _raise_error_if_not_sframe(dataset, 'datset') + _raise_error_if_not_sframe(dataset, "datset") if context_column not in dataset.column_names(): - raise _ToolkitError("Context Image column '%s' does not exist" - % context_column) + raise _ToolkitError("Context Image column '%s' does not exist" % context_column) if dataset[context_column].dtype != _tc.Image: raise _ToolkitError("Context Image column must contain images") + class StyleTransfer(_Model): """ A trained model using C++ implementation that is ready to use for classification or export to @@ -236,6 +267,7 @@ class StyleTransfer(_Model): This model should not be constructed directly. """ + _CPP_STYLE_TRANSFER_VERSION = 1 def __init__(self, model_proxy=None, name=None): @@ -264,14 +296,15 @@ def __repr__(self): """ width = 40 sections, section_titles = self._get_summary_struct() - out = _tkutl._toolkit_repr_print(self, sections, section_titles, - width=width) + out = _tkutl._toolkit_repr_print(self, sections, section_titles, width=width) return out def _get_version(self): return self._CPP_STYLE_TRANSFER_VERSION - def export_coreml(self, filename, image_shape=(256, 256), include_flexible_shape=True): + def export_coreml( + self, filename, image_shape=(256, 256), include_flexible_shape=True + ): """ Save the model in Core ML format. The Core ML model takes an image of fixed size, and a style index inputs and produces an output @@ -298,17 +331,17 @@ def export_coreml(self, filename, image_shape=(256, 256), include_flexible_shape >>> model.export_coreml('StyleTransfer.mlmodel') """ options = {} - options['image_width'] = image_shape[1] - options['image_height'] = image_shape[0] - options['include_flexible_shape'] = include_flexible_shape + options["image_width"] = image_shape[1] + options["image_height"] = image_shape[0] + options["include_flexible_shape"] = include_flexible_shape additional_user_defined_metadata = _coreml_utils._get_tc_version_info() - short_description = _coreml_utils._mlmodel_short_description('Style Transfer') - - self.__proxy__.export_to_coreml(filename, short_description, - additional_user_defined_metadata, options) + short_description = _coreml_utils._mlmodel_short_description("Style Transfer") + self.__proxy__.export_to_coreml( + filename, short_description, additional_user_defined_metadata, options + ) - def stylize(self, images, style=None, verbose=True, max_size=800, batch_size = 4): + def stylize(self, images, style=None, verbose=True, max_size=800, batch_size=4): """ Stylize an SFrame of Images given a style index or a list of styles. @@ -382,31 +415,35 @@ def stylize(self, images, style=None, verbose=True, max_size=800, batch_size = 4 [8 rows x 3 columns] """ if not isinstance(images, (_tc.SFrame, _tc.SArray, _tc.Image)): - raise TypeError('"image" parameter must be of type SFrame, SArray or turicreate.Image.') + raise TypeError( + '"image" parameter must be of type SFrame, SArray or turicreate.Image.' + ) if isinstance(images, (_tc.SFrame, _tc.SArray)) and len(images) == 0: raise _ToolkitError('"image" parameter cannot be empty') if style is not None and not isinstance(style, (int, list)): raise TypeError('"style" must parameter must be a None, int or a list') if not isinstance(max_size, int): raise TypeError('"max_size" must parameter must be an int') - if (max_size < 1): + if max_size < 1: raise _ToolkitError("'max_size' must be greater than or equal to 1") if not isinstance(batch_size, int): raise TypeError('"batch_size" must parameter must be an int') - if (batch_size < 1): + if batch_size < 1: raise _ToolkitError("'batch_size' must be greater than or equal to 1") options = {} - options['style_idx'] = style - options['verbose'] = verbose - options['max_size'] = max_size - options['batch_size'] = batch_size + options["style_idx"] = style + options["verbose"] = verbose + options["max_size"] = max_size + options["batch_size"] = batch_size if isinstance(style, list) or style is None: if isinstance(images, _tc.SFrame): image_feature = _tkutl._find_only_image_column(images) stylized_images = self.__proxy__.predict(images[image_feature], options) - stylized_images = stylized_images.rename({'stylized_image' : 'stylized_' + str(image_feature)}) + stylized_images = stylized_images.rename( + {"stylized_image": "stylized_" + str(image_feature)} + ) return stylized_images return self.__proxy__.predict(images, options) else: @@ -415,7 +452,9 @@ def stylize(self, images, style=None, verbose=True, max_size=800, batch_size = 4 raise _ToolkitError("SFrame cannot be empty") image_feature = _tkutl._find_only_image_column(images) stylized_images = self.__proxy__.predict(images[image_feature], options) - stylized_images = stylized_images.rename({'stylized_image' : 'stylized_' + str(image_feature)}) + stylized_images = stylized_images.rename( + {"stylized_image": "stylized_" + str(image_feature)} + ) return stylized_images elif isinstance(images, (_tc.Image)): stylized_images = self.__proxy__.predict(images, options) @@ -477,18 +516,18 @@ def _get_summary_struct(self): The order matches that of the 'sections' object. """ model_fields = [ - ('Model', 'model'), - ('Number of unique styles', 'num_styles'), + ("Model", "model"), + ("Number of unique styles", "num_styles"), ] training_fields = [ - ('Training time', '_training_time_as_string'), - ('Training epochs', 'training_epochs'), - ('Training iterations', 'training_iterations'), - ('Number of style images', 'num_styles'), - ('Number of content images', 'num_content_images'), - ('Final loss', 'training_loss'), + ("Training time", "_training_time_as_string"), + ("Training epochs", "training_epochs"), + ("Training iterations", "training_iterations"), + ("Number of style images", "num_styles"), + ("Number of content images", "num_content_images"), + ("Final loss", "training_loss"), ] - section_titles = ['Schema', 'Training summary'] - return([model_fields, training_fields], section_titles) + section_titles = ["Schema", "Training summary"] + return ([model_fields, training_fields], section_titles) diff --git a/src/python/turicreate/toolkits/text_analytics/__init__.py b/src/python/turicreate/toolkits/text_analytics/__init__.py index e182e7c84f..8abb8657a8 100644 --- a/src/python/turicreate/toolkits/text_analytics/__init__.py +++ b/src/python/turicreate/toolkits/text_analytics/__init__.py @@ -22,15 +22,38 @@ from __future__ import division as _ from __future__ import absolute_import as _ -__all__ = ['tf_idf', 'bm25', 'stop_words', 'count_words', - 'count_ngrams', 'random_split', 'parse_sparse', - 'parse_docword', 'tokenize', 'drop_words','split_by_sentence', - 'extract_parts_of_speech'] +__all__ = [ + "tf_idf", + "bm25", + "stop_words", + "count_words", + "count_ngrams", + "random_split", + "parse_sparse", + "parse_docword", + "tokenize", + "drop_words", + "split_by_sentence", + "extract_parts_of_speech", +] + + def __dir__(): - return ['tf_idf', 'bm25', 'stop_words', 'count_words', - 'count_ngrams', 'random_split', 'parse_sparse', - 'parse_docword', 'tokenize', 'drop_words', 'split_by_sentence', - 'extract_parts_of_speech'] + return [ + "tf_idf", + "bm25", + "stop_words", + "count_words", + "count_ngrams", + "random_split", + "parse_sparse", + "parse_docword", + "tokenize", + "drop_words", + "split_by_sentence", + "extract_parts_of_speech", + ] + from ._util import tf_idf from ._util import bm25 diff --git a/src/python/turicreate/toolkits/text_analytics/_util.py b/src/python/turicreate/toolkits/text_analytics/_util.py index 07bbeacbdf..edac732927 100644 --- a/src/python/turicreate/toolkits/text_analytics/_util.py +++ b/src/python/turicreate/toolkits/text_analytics/_util.py @@ -15,10 +15,46 @@ import turicreate.toolkits._feature_engineering as _feature_engineering -DEFAULT_DELIMITERS = ["\r", "\v", "\n", "\f", "\t", " ", "!", "#", "$", "%", - "&", "'", "\"", "(", ")", "*", "+", ",", "-", ".", "/", - ":", ";", "<", "=", ">", "?", "@", "[", "\\", "]", "^", - "_", "`", "{", "|", "}", "~"] +DEFAULT_DELIMITERS = [ + "\r", + "\v", + "\n", + "\f", + "\t", + " ", + "!", + "#", + "$", + "%", + "&", + "'", + '"', + "(", + ")", + "*", + "+", + ",", + "-", + ".", + "/", + ":", + ";", + "<", + "=", + ">", + "?", + "@", + "[", + "\\", + "]", + "^", + "_", + "`", + "{", + "|", + "}", + "~", +] def count_words(text, to_lower=True, delimiters=DEFAULT_DELIMITERS): @@ -107,18 +143,27 @@ def count_words(text, to_lower=True, delimiters=DEFAULT_DELIMITERS): _raise_error_if_not_sarray(text, "text") ## Compute word counts - sf = _turicreate.SFrame({'docs': text}) - fe = _feature_engineering.WordCounter(features='docs', - to_lower=to_lower, - delimiters=delimiters, - output_column_prefix=None) + sf = _turicreate.SFrame({"docs": text}) + fe = _feature_engineering.WordCounter( + features="docs", + to_lower=to_lower, + delimiters=delimiters, + output_column_prefix=None, + ) output_sf = fe.fit_transform(sf) - return output_sf['docs'] + return output_sf["docs"] -def count_ngrams(text, n=2, method="word", to_lower=True, - delimiters=DEFAULT_DELIMITERS, - ignore_punct=True, ignore_space=True): + +def count_ngrams( + text, + n=2, + method="word", + to_lower=True, + delimiters=DEFAULT_DELIMITERS, + ignore_punct=True, + ignore_space=True, +): """ Return an SArray of ``dict`` type where each element contains the count for each of the n-grams that appear in the corresponding input element. @@ -229,18 +274,20 @@ def count_ngrams(text, n=2, method="word", to_lower=True, _raise_error_if_not_sarray(text, "text") # Compute ngrams counts - sf = _turicreate.SFrame({'docs': text}) - fe = _feature_engineering.NGramCounter(features='docs', - n=n, - method=method, - to_lower=to_lower, - delimiters=delimiters, - ignore_punct=ignore_punct, - ignore_space=ignore_space, - output_column_prefix=None) + sf = _turicreate.SFrame({"docs": text}) + fe = _feature_engineering.NGramCounter( + features="docs", + n=n, + method=method, + to_lower=to_lower, + delimiters=delimiters, + ignore_punct=ignore_punct, + ignore_space=ignore_space, + output_column_prefix=None, + ) output_sf = fe.fit_transform(sf) - return output_sf['docs'] + return output_sf["docs"] def tf_idf(text): @@ -289,15 +336,16 @@ def tf_idf(text): if len(text) == 0: return _turicreate.SArray() - dataset = _turicreate.SFrame({'docs': text}) - scores = _feature_engineering.TFIDF('docs').fit_transform(dataset) + dataset = _turicreate.SFrame({"docs": text}) + scores = _feature_engineering.TFIDF("docs").fit_transform(dataset) - return scores['docs'] + return scores["docs"] -def drop_words(text, threshold=2, to_lower=True, delimiters=DEFAULT_DELIMITERS, - stop_words=None): - ''' +def drop_words( + text, threshold=2, to_lower=True, delimiters=DEFAULT_DELIMITERS, stop_words=None +): + """ Remove words that occur below a certain number of times in an SArray. This is a common method of cleaning text before it is used, and can increase the quality and explainability of the models learned on the transformed data. @@ -393,21 +441,24 @@ def drop_words(text, threshold=2, to_lower=True, delimiters=DEFAULT_DELIMITERS, dtype: list Rows: 2 [['one', 'one'], ['a dog', 'a dog']] - ''' + """ _raise_error_if_not_sarray(text, "text") ## Compute word counts - sf = _turicreate.SFrame({'docs': text}) - fe = _feature_engineering.RareWordTrimmer(features='docs', - threshold=threshold, - to_lower=to_lower, - delimiters=delimiters, - stopwords=stop_words, - output_column_prefix=None) + sf = _turicreate.SFrame({"docs": text}) + fe = _feature_engineering.RareWordTrimmer( + features="docs", + threshold=threshold, + to_lower=to_lower, + delimiters=delimiters, + stopwords=stop_words, + output_column_prefix=None, + ) tokens = fe.fit_transform(sf) - return tokens['docs'] + return tokens["docs"] + def tokenize(text, to_lower=False, delimiters=DEFAULT_DELIMITERS): """ @@ -468,16 +519,19 @@ def tokenize(text, to_lower=False, delimiters=DEFAULT_DELIMITERS): _raise_error_if_not_sarray(text, "text") ## Compute word counts - sf = _turicreate.SFrame({'docs': text}) - fe = _feature_engineering.Tokenizer(features='docs', - to_lower=to_lower, - delimiters=delimiters, - output_column_prefix=None) + sf = _turicreate.SFrame({"docs": text}) + fe = _feature_engineering.Tokenizer( + features="docs", + to_lower=to_lower, + delimiters=delimiters, + output_column_prefix=None, + ) tokens = fe.fit_transform(sf) - return tokens['docs'] + return tokens["docs"] + -def bm25(dataset, query, k1=1.5, b=.75): +def bm25(dataset, query, k1=1.5, b=0.75): """ For a given query and set of documents, compute the BM25 score for each document. If we have a query with words q_1, ..., q_n the BM25 score for @@ -552,10 +606,12 @@ def bm25(dataset, query, k1=1.5, b=.75): """ if type(dataset) != _turicreate.SArray: - raise TypeError('bm25 requires an SArray of dict, list, or str type'+\ - ', where each dictionary whose keys are words and whose values' + \ - ' are word frequency.') - sf = _SFrame({'docs' : dataset}) + raise TypeError( + "bm25 requires an SArray of dict, list, or str type" + + ", where each dictionary whose keys are words and whose values" + + " are word frequency." + ) + sf = _SFrame({"docs": dataset}) if type(query) is dict: # For backwards compatibility query = list(query.keys()) @@ -564,28 +620,34 @@ def bm25(dataset, query, k1=1.5, b=.75): if type(query) is set: query = list(query) if type(query) is not list: - raise TypeError('The query must either be an SArray of str type, '+\ - ' a list of strings, or a set of strings.') + raise TypeError( + "The query must either be an SArray of str type, " + + " a list of strings, or a set of strings." + ) # Calculate BM25 - sf = sf.add_row_number('doc_id') - sf = sf.dropna('docs') # Drop missing documents - scores = _feature_engineering.BM25('docs',query, k1, b, output_column_name = 'bm25').fit_transform(sf) + sf = sf.add_row_number("doc_id") + sf = sf.dropna("docs") # Drop missing documents + scores = _feature_engineering.BM25( + "docs", query, k1, b, output_column_name="bm25" + ).fit_transform(sf) # Find documents with query words - if scores['docs'].dtype is dict: - scores['doc_terms'] = scores['docs'].dict_keys() - elif scores['docs'].dtype is list: - scores['doc_terms'] = scores['docs'].apply(lambda x: list(set(x))) - elif scores['docs'].dtype is str: - scores['doc_terms'] = count_words(scores['docs']).dict_keys() + if scores["docs"].dtype is dict: + scores["doc_terms"] = scores["docs"].dict_keys() + elif scores["docs"].dtype is list: + scores["doc_terms"] = scores["docs"].apply(lambda x: list(set(x))) + elif scores["docs"].dtype is str: + scores["doc_terms"] = count_words(scores["docs"]).dict_keys() else: # This should never occur (handled by BM25) - raise TypeError('bm25 requires an SArray of dict, list, or str type') - scores['doc_counts'] = scores['doc_terms'].apply(lambda x: len([word for word in query if word in x])) - scores = scores[scores['doc_counts'] > 0] # Drop documents without query word - scores = scores.select_columns(['doc_id','bm25']) + raise TypeError("bm25 requires an SArray of dict, list, or str type") + scores["doc_counts"] = scores["doc_terms"].apply( + lambda x: len([word for word in query if word in x]) + ) + scores = scores[scores["doc_counts"] > 0] # Drop documents without query word + scores = scores.select_columns(["doc_id", "bm25"]) return scores @@ -634,23 +696,24 @@ def parse_sparse(filename, vocab_filename): >>> vocab = 'https://static.turi.com/datasets/text/ap.vocab.txt' >>> docs = turicreate.text_analytics.parse_sparse(file, vocab) """ - vocab = _turicreate.SFrame.read_csv(vocab_filename, header=None)['X1'] + vocab = _turicreate.SFrame.read_csv(vocab_filename, header=None)["X1"] vocab = list(vocab) docs = _turicreate.SFrame.read_csv(filename, header=None) # Remove first word - docs = docs['X1'].apply(lambda x: x.split(' ')[1:]) + docs = docs["X1"].apply(lambda x: x.split(" ")[1:]) # Helper function that checks whether we get too large a word id def get_word(word_id): - assert int(word_id) < len(vocab), \ - "Text data contains integers that are larger than the \ + assert int(word_id) < len( + vocab + ), "Text data contains integers that are larger than the \ size of the provided vocabulary." return vocab[word_id] def make_dict(pairs): - pairs = [z.split(':') for z in pairs] + pairs = [z.split(":") for z in pairs] ret = {} for k, v in pairs: ret[get_word(int(k))] = int(v) @@ -702,21 +765,21 @@ def parse_docword(filename, vocab_filename): >>> vocab = 'https://static.turi.com/datasets/text/vocab.nips.txt') >>> docs = turicreate.text_analytics.parse_docword(textfile, vocab) """ - vocab = _turicreate.SFrame.read_csv(vocab_filename, header=None)['X1'] + vocab = _turicreate.SFrame.read_csv(vocab_filename, header=None)["X1"] vocab = list(vocab) sf = _turicreate.SFrame.read_csv(filename, header=False) sf = sf[3:] - sf['X2'] = sf['X1'].apply(lambda x: [int(z) for z in x.split(' ')]) - del sf['X1'] - sf = sf.unpack('X2', column_name_prefix='', column_types=[int,int,int]) - docs = sf.unstack(['1', '2'], 'bow').sort('0')['bow'] - docs = docs.apply(lambda x: {vocab[k-1]:v for (k, v) in six.iteritems(x)}) + sf["X2"] = sf["X1"].apply(lambda x: [int(z) for z in x.split(" ")]) + del sf["X1"] + sf = sf.unpack("X2", column_name_prefix="", column_types=[int, int, int]) + docs = sf.unstack(["1", "2"], "bow").sort("0")["bow"] + docs = docs.apply(lambda x: {vocab[k - 1]: v for (k, v) in six.iteritems(x)}) return docs -def random_split(dataset, prob=.5): +def random_split(dataset, prob=0.5): """ Utility for performing a random split for text data that is already in bag-of-words format. For each (word, count) pair in a particular element, @@ -752,8 +815,9 @@ def grab_values(x, train=True): ix = 0 else: ix = 1 - return dict([(key, value[ix]) for key, value in six.iteritems(x) \ - if value[ix] != 0]) + return dict( + [(key, value[ix]) for key, value in six.iteritems(x) if value[ix] != 0] + ) def word_count_split(n, p): num_in_test = 0 @@ -765,20 +829,26 @@ def word_count_split(n, p): # Get an SArray where each word has a 2 element list containing # the count that will be for the training set and the count that will # be assigned to the test set. - data = dataset.apply(lambda x: dict([(key, word_count_split(int(value), prob)) \ - for key, value in six.iteritems(x)])) + data = dataset.apply( + lambda x: dict( + [ + (key, word_count_split(int(value), prob)) + for key, value in six.iteritems(x) + ] + ) + ) # Materialize the data set data.materialize() # Grab respective counts for each data set train = data.apply(lambda x: grab_values(x, train=True)) - test = data.apply(lambda x: grab_values(x, train=False)) + test = data.apply(lambda x: grab_values(x, train=False)) return train, test -def stop_words(lang='en'): +def stop_words(lang="en"): """ Get common words that are often removed during preprocessing of text data, i.e. "stop words". Currently only English stop words are provided. @@ -803,26 +873,553 @@ def stop_words(lang='en'): Rows: 1 [{'entertained': 1}] """ - if lang=='en' or lang=='english': - return set(['a', 'able', 'about', 'above', 'according', 'accordingly', 'across', 'actually', 'after', 'afterwards', 'again', 'against', 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', 'around', 'as', 'aside', 'ask', 'asking', 'associated', 'at', 'available', 'away', 'awfully', 'b', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'best', 'better', 'between', 'beyond', 'both', 'brief', 'but', 'by', 'c', 'came', 'can', 'cannot', 'cant', 'cause', 'causes', 'certain', 'certainly', 'changes', 'clearly', 'co', 'com', 'come', 'comes', 'concerning', 'consequently', 'consider', 'considering', 'contain', 'containing', 'contains', 'corresponding', 'could', 'course', 'currently', 'd', 'definitely', 'described', 'despite', 'did', 'different', 'do', 'does', 'doing', 'done', 'down', 'downwards', 'during', 'e', 'each', 'edu', 'eg', 'eight', 'either', 'else', 'elsewhere', 'enough', 'entirely', 'especially', 'et', 'etc', 'even', 'ever', 'every', 'everybody', 'everyone', 'everything', 'everywhere', 'ex', 'exactly', 'example', 'except', 'f', 'far', 'few', 'fifth', 'first', 'five', 'followed', 'following', 'follows', 'for', 'former', 'formerly', 'forth', 'four', 'from', 'further', 'furthermore', 'g', 'get', 'gets', 'getting', 'given', 'gives', 'go', 'goes', 'going', 'gone', 'got', 'gotten', 'greetings', 'h', 'had', 'happens', 'hardly', 'has', 'have', 'having', 'he', 'hello', 'help', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'hi', 'him', 'himself', 'his', 'hither', 'hopefully', 'how', 'howbeit', 'however', 'i', 'ie', 'if', 'ignored', 'immediate', 'in', 'inasmuch', 'inc', 'indeed', 'indicate', 'indicated', 'indicates', 'inner', 'insofar', 'instead', 'into', 'inward', 'is', 'it', 'its', 'itself', 'j', 'just', 'k', 'keep', 'keeps', 'kept', 'know', 'knows', 'known', 'l', 'last', 'lately', 'later', 'latter', 'latterly', 'least', 'less', 'lest', 'let', 'like', 'liked', 'likely', 'little', 'look', 'looking', 'looks', 'ltd', 'm', 'mainly', 'many', 'may', 'maybe', 'me', 'mean', 'meanwhile', 'merely', 'might', 'more', 'moreover', 'most', 'mostly', 'much', 'must', 'my', 'myself', 'n', 'name', 'namely', 'nd', 'near', 'nearly', 'necessary', 'need', 'needs', 'neither', 'never', 'nevertheless', 'new', 'next', 'nine', 'no', 'nobody', 'non', 'none', 'noone', 'nor', 'normally', 'not', 'nothing', 'novel', 'now', 'nowhere', 'o', 'obviously', 'of', 'off', 'often', 'oh', 'ok', 'okay', 'old', 'on', 'once', 'one', 'ones', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'ought', 'our', 'ours', 'ourselves', 'out', 'outside', 'over', 'overall', 'own', 'p', 'particular', 'particularly', 'per', 'perhaps', 'placed', 'please', 'plus', 'possible', 'presumably', 'probably', 'provides', 'q', 'que', 'quite', 'qv', 'r', 'rather', 'rd', 're', 'really', 'reasonably', 'regarding', 'regardless', 'regards', 'relatively', 'respectively', 'right', 's', 'said', 'same', 'saw', 'say', 'saying', 'says', 'second', 'secondly', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems', 'seen', 'self', 'selves', 'sensible', 'sent', 'serious', 'seriously', 'seven', 'several', 'shall', 'she', 'should', 'since', 'six', 'so', 'some', 'somebody', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhat', 'somewhere', 'soon', 'sorry', 'specified', 'specify', 'specifying', 'still', 'sub', 'such', 'sup', 'sure', 't', 'take', 'taken', 'tell', 'tends', 'th', 'than', 'thank', 'thanks', 'thanx', 'that', 'thats', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein', 'theres', 'thereupon', 'these', 'they', 'think', 'third', 'this', 'thorough', 'thoroughly', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'took', 'toward', 'towards', 'tried', 'tries', 'truly', 'try', 'trying', 'twice', 'two', 'u', 'un', 'under', 'unfortunately', 'unless', 'unlikely', 'until', 'unto', 'up', 'upon', 'us', 'use', 'used', 'useful', 'uses', 'using', 'usually', 'uucp', 'v', 'value', 'various', 'very', 'via', 'viz', 'vs', 'w', 'want', 'wants', 'was', 'way', 'we', 'welcome', 'well', 'went', 'were', 'what', 'whatever', 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom', 'whose', 'why', 'will', 'willing', 'wish', 'with', 'within', 'without', 'wonder', 'would', 'would', 'x', 'y', 'yes', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves', 'z', 'zero']) + if lang == "en" or lang == "english": + return set( + [ + "a", + "able", + "about", + "above", + "according", + "accordingly", + "across", + "actually", + "after", + "afterwards", + "again", + "against", + "all", + "allow", + "allows", + "almost", + "alone", + "along", + "already", + "also", + "although", + "always", + "am", + "among", + "amongst", + "an", + "and", + "another", + "any", + "anybody", + "anyhow", + "anyone", + "anything", + "anyway", + "anyways", + "anywhere", + "apart", + "appear", + "appreciate", + "appropriate", + "are", + "around", + "as", + "aside", + "ask", + "asking", + "associated", + "at", + "available", + "away", + "awfully", + "b", + "be", + "became", + "because", + "become", + "becomes", + "becoming", + "been", + "before", + "beforehand", + "behind", + "being", + "believe", + "below", + "beside", + "besides", + "best", + "better", + "between", + "beyond", + "both", + "brief", + "but", + "by", + "c", + "came", + "can", + "cannot", + "cant", + "cause", + "causes", + "certain", + "certainly", + "changes", + "clearly", + "co", + "com", + "come", + "comes", + "concerning", + "consequently", + "consider", + "considering", + "contain", + "containing", + "contains", + "corresponding", + "could", + "course", + "currently", + "d", + "definitely", + "described", + "despite", + "did", + "different", + "do", + "does", + "doing", + "done", + "down", + "downwards", + "during", + "e", + "each", + "edu", + "eg", + "eight", + "either", + "else", + "elsewhere", + "enough", + "entirely", + "especially", + "et", + "etc", + "even", + "ever", + "every", + "everybody", + "everyone", + "everything", + "everywhere", + "ex", + "exactly", + "example", + "except", + "f", + "far", + "few", + "fifth", + "first", + "five", + "followed", + "following", + "follows", + "for", + "former", + "formerly", + "forth", + "four", + "from", + "further", + "furthermore", + "g", + "get", + "gets", + "getting", + "given", + "gives", + "go", + "goes", + "going", + "gone", + "got", + "gotten", + "greetings", + "h", + "had", + "happens", + "hardly", + "has", + "have", + "having", + "he", + "hello", + "help", + "hence", + "her", + "here", + "hereafter", + "hereby", + "herein", + "hereupon", + "hers", + "herself", + "hi", + "him", + "himself", + "his", + "hither", + "hopefully", + "how", + "howbeit", + "however", + "i", + "ie", + "if", + "ignored", + "immediate", + "in", + "inasmuch", + "inc", + "indeed", + "indicate", + "indicated", + "indicates", + "inner", + "insofar", + "instead", + "into", + "inward", + "is", + "it", + "its", + "itself", + "j", + "just", + "k", + "keep", + "keeps", + "kept", + "know", + "knows", + "known", + "l", + "last", + "lately", + "later", + "latter", + "latterly", + "least", + "less", + "lest", + "let", + "like", + "liked", + "likely", + "little", + "look", + "looking", + "looks", + "ltd", + "m", + "mainly", + "many", + "may", + "maybe", + "me", + "mean", + "meanwhile", + "merely", + "might", + "more", + "moreover", + "most", + "mostly", + "much", + "must", + "my", + "myself", + "n", + "name", + "namely", + "nd", + "near", + "nearly", + "necessary", + "need", + "needs", + "neither", + "never", + "nevertheless", + "new", + "next", + "nine", + "no", + "nobody", + "non", + "none", + "noone", + "nor", + "normally", + "not", + "nothing", + "novel", + "now", + "nowhere", + "o", + "obviously", + "of", + "off", + "often", + "oh", + "ok", + "okay", + "old", + "on", + "once", + "one", + "ones", + "only", + "onto", + "or", + "other", + "others", + "otherwise", + "ought", + "our", + "ours", + "ourselves", + "out", + "outside", + "over", + "overall", + "own", + "p", + "particular", + "particularly", + "per", + "perhaps", + "placed", + "please", + "plus", + "possible", + "presumably", + "probably", + "provides", + "q", + "que", + "quite", + "qv", + "r", + "rather", + "rd", + "re", + "really", + "reasonably", + "regarding", + "regardless", + "regards", + "relatively", + "respectively", + "right", + "s", + "said", + "same", + "saw", + "say", + "saying", + "says", + "second", + "secondly", + "see", + "seeing", + "seem", + "seemed", + "seeming", + "seems", + "seen", + "self", + "selves", + "sensible", + "sent", + "serious", + "seriously", + "seven", + "several", + "shall", + "she", + "should", + "since", + "six", + "so", + "some", + "somebody", + "somehow", + "someone", + "something", + "sometime", + "sometimes", + "somewhat", + "somewhere", + "soon", + "sorry", + "specified", + "specify", + "specifying", + "still", + "sub", + "such", + "sup", + "sure", + "t", + "take", + "taken", + "tell", + "tends", + "th", + "than", + "thank", + "thanks", + "thanx", + "that", + "thats", + "the", + "their", + "theirs", + "them", + "themselves", + "then", + "thence", + "there", + "thereafter", + "thereby", + "therefore", + "therein", + "theres", + "thereupon", + "these", + "they", + "think", + "third", + "this", + "thorough", + "thoroughly", + "those", + "though", + "three", + "through", + "throughout", + "thru", + "thus", + "to", + "together", + "too", + "took", + "toward", + "towards", + "tried", + "tries", + "truly", + "try", + "trying", + "twice", + "two", + "u", + "un", + "under", + "unfortunately", + "unless", + "unlikely", + "until", + "unto", + "up", + "upon", + "us", + "use", + "used", + "useful", + "uses", + "using", + "usually", + "uucp", + "v", + "value", + "various", + "very", + "via", + "viz", + "vs", + "w", + "want", + "wants", + "was", + "way", + "we", + "welcome", + "well", + "went", + "were", + "what", + "whatever", + "when", + "whence", + "whenever", + "where", + "whereafter", + "whereas", + "whereby", + "wherein", + "whereupon", + "wherever", + "whether", + "which", + "while", + "whither", + "who", + "whoever", + "whole", + "whom", + "whose", + "why", + "will", + "willing", + "wish", + "with", + "within", + "without", + "wonder", + "would", + "would", + "x", + "y", + "yes", + "yet", + "you", + "your", + "yours", + "yourself", + "yourselves", + "z", + "zero", + ] + ) else: - raise NotImplementedError('Only English stop words are currently available.') - + raise NotImplementedError("Only English stop words are currently available.") def _check_input(dataset): if isinstance(dataset, _SFrame): - assert dataset.num_columns() == 1, \ - "The provided SFrame contains more than one column. It should have " +\ - "only one column of type dict." + assert dataset.num_columns() == 1, ( + "The provided SFrame contains more than one column. It should have " + + "only one column of type dict." + ) colname = dataset.column_names()[0] dataset = dataset[colname] - assert isinstance(dataset, _SArray), \ - "Provided data must be an SArray." + assert isinstance(dataset, _SArray), "Provided data must be an SArray." - assert dataset.dtype == dict, \ - "Provided data must be of type dict, representing the documents in " + \ - "bag-of-words format. Please consult the documentation." + assert dataset.dtype == dict, ( + "Provided data must be of type dict, representing the documents in " + + "bag-of-words format. Please consult the documentation." + ) return dataset diff --git a/src/python/turicreate/toolkits/text_classifier/__init__.py b/src/python/turicreate/toolkits/text_classifier/__init__.py index 82a536f012..bc2f4a0010 100644 --- a/src/python/turicreate/toolkits/text_classifier/__init__.py +++ b/src/python/turicreate/toolkits/text_classifier/__init__.py @@ -11,4 +11,4 @@ from __future__ import division as _ from __future__ import absolute_import as _ from ._text_classifier import create -from ._text_classifier import TextClassifier +from ._text_classifier import TextClassifier diff --git a/src/python/turicreate/toolkits/text_classifier/_text_classifier.py b/src/python/turicreate/toolkits/text_classifier/_text_classifier.py index 7bc3e58750..24a8b07439 100644 --- a/src/python/turicreate/toolkits/text_classifier/_text_classifier.py +++ b/src/python/turicreate/toolkits/text_classifier/_text_classifier.py @@ -20,7 +20,7 @@ def _BOW_FEATURE_EXTRACTOR(sf, target=None): Return an SFrame containing a bag of words representation of each column. """ if isinstance(sf, dict): - out = _tc.SArray([sf]).unpack('') + out = _tc.SArray([sf]).unpack("") elif isinstance(sf, _tc.SFrame): out = sf.__copy__() else: @@ -30,9 +30,17 @@ def _BOW_FEATURE_EXTRACTOR(sf, target=None): out[f] = _tc.text_analytics.count_words(out[f]) return out -def create(dataset, target, features = None, drop_stop_words = True, - word_count_threshold = 2, method = 'auto', validation_set = 'auto', - max_iterations = 10): + +def create( + dataset, + target, + features=None, + drop_stop_words=True, + word_count_threshold=2, + method="auto", + validation_set="auto", + max_iterations=10, +): """ Create a model that trains a classifier to classify text from a collection of documents. The model is a @@ -105,9 +113,9 @@ def create(dataset, target, features = None, drop_stop_words = True, _raise_error_if_not_sframe(dataset, "dataset") # Validate method. - if method == 'auto': - method = 'bow-logistic' - if method not in ['bow-logistic']: + if method == "auto": + method = "bow-logistic" + if method not in ["bow-logistic"]: raise ValueError("Unsupported method provided.") # Validate dataset @@ -125,31 +133,37 @@ def create(dataset, target, features = None, drop_stop_words = True, if drop_stop_words: stop_words = _text_analytics.stop_words() for cur_feature in features: - train[cur_feature] = _text_analytics.drop_words(train[cur_feature], - threshold = word_count_threshold, - stop_words = stop_words) + train[cur_feature] = _text_analytics.drop_words( + train[cur_feature], threshold=word_count_threshold, stop_words=stop_words + ) # Check for a validation set. if isinstance(validation_set, _tc.SFrame): validation_set = feature_extractor(validation_set, target) - m = _tc.logistic_classifier.create(train, - target=target, - features=features, - l2_penalty=.2, - max_iterations=max_iterations, - validation_set=validation_set) + m = _tc.logistic_classifier.create( + train, + target=target, + features=features, + l2_penalty=0.2, + max_iterations=max_iterations, + validation_set=validation_set, + ) num_examples = len(dataset) model = TextClassifier() model.__proxy__.update( - {'target': target, - 'features': features, - 'method': method, - 'num_examples': num_examples, - 'num_features': len(features), - 'classifier': m}) + { + "target": target, + "features": features, + "method": method, + "num_examples": num_examples, + "num_features": len(features), + "classifier": m, + } + ) return model + class TextClassifier(_CustomModel): _PYTHON_TEXT_CLASSIFIER_MODEL_VERSION = 1 @@ -168,18 +182,22 @@ def _get_version(self): def _get_native_state(self): import copy + retstate = copy.copy(self.__proxy__.state) - retstate['classifier'] = retstate['classifier'].__proxy__ + retstate["classifier"] = retstate["classifier"].__proxy__ return retstate @classmethod def _load_version(self, state, version): - from turicreate.toolkits.classifier.logistic_classifier import LogisticClassifier - state['classifier'] = LogisticClassifier(state['classifier']) + from turicreate.toolkits.classifier.logistic_classifier import ( + LogisticClassifier, + ) + + state["classifier"] = LogisticClassifier(state["classifier"]) state = _PythonProxy(state) return TextClassifier(state) - def predict(self, dataset, output_type='class'): + def predict(self, dataset, output_type="class"): """ Return predictions for ``dataset``, using the trained model. @@ -218,8 +236,8 @@ class as a vector. The probability of the first class (sorted >>> m.predict(dataset) """ - m = self.__proxy__['classifier'] - target = self.__proxy__['target'] + m = self.__proxy__["classifier"] + target = self.__proxy__["target"] f = _BOW_FEATURE_EXTRACTOR return m.predict(f(dataset, target), output_type=output_type) @@ -254,8 +272,8 @@ def classify(self, dataset): >>> output = m.classify(dataset) """ - m = self.__proxy__['classifier'] - target = self.__proxy__['target'] + m = self.__proxy__["classifier"] + target = self.__proxy__["target"] f = _BOW_FEATURE_EXTRACTOR return m.classify(f(dataset, target)) @@ -272,12 +290,14 @@ def __str__(self): def _get_summary_struct(self): - dataset_fields = [('Number of examples', 'num_examples')] - model_fields = [('Target column', 'target'), - ('Features', 'features'), - ('Method', 'method')] + dataset_fields = [("Number of examples", "num_examples")] + model_fields = [ + ("Target column", "target"), + ("Features", "features"), + ("Method", "method"), + ] sections = [dataset_fields, model_fields] - section_titles = ['dataset', 'Model'] + section_titles = ["dataset", "Model"] return sections, section_titles def __repr__(self): @@ -286,7 +306,7 @@ def __repr__(self): out = _toolkit_repr_print(self, sections, section_titles, width=width) return out - def evaluate(self, dataset, metric='auto', **kwargs): + def evaluate(self, dataset, metric="auto", **kwargs): """ Evaluate the model by making predictions of target values and comparing these to actual values. @@ -325,8 +345,8 @@ def evaluate(self, dataset, metric='auto', **kwargs): create, predict, classify """ - m = self.__proxy__['classifier'] - target = self.__proxy__['target'] + m = self.__proxy__["classifier"] + target = self.__proxy__["target"] f = _BOW_FEATURE_EXTRACTOR test = f(dataset, target) return m.evaluate(test, metric, **kwargs) @@ -335,7 +355,7 @@ def summary(self): """ Get a summary for the underlying classifier. """ - return self.__proxy__['classifier'].summary() + return self.__proxy__["classifier"].summary() def export_coreml(self, filename): """ @@ -353,16 +373,18 @@ def export_coreml(self, filename): from turicreate.extensions import _logistic_classifier_export_as_model_asset from turicreate.toolkits import _coreml_utils - display_name = 'text classifier' + display_name = "text classifier" short_description = _coreml_utils._mlmodel_short_description(display_name) - context = {'class': self.__class__.__name__, - 'short_description': short_description, - } - context['user_defined'] = _coreml_utils._get_tc_version_info() + context = { + "class": self.__class__.__name__, + "short_description": short_description, + } + context["user_defined"] = _coreml_utils._get_tc_version_info() - model = self.__proxy__['classifier'].__proxy__ + model = self.__proxy__["classifier"].__proxy__ _logistic_classifier_export_as_model_asset(model, filename, context) + def _get_str_columns(sf): """ Returns a list of names of columns that are string type. diff --git a/src/python/turicreate/toolkits/topic_model/topic_model.py b/src/python/turicreate/toolkits/topic_model/topic_model.py index bd4b0b55af..d058a14b8b 100644 --- a/src/python/turicreate/toolkits/topic_model/topic_model.py +++ b/src/python/turicreate/toolkits/topic_model/topic_model.py @@ -16,34 +16,41 @@ from turicreate.data_structures.sarray import SArray as _SArray from turicreate.toolkits.text_analytics._util import _check_input from turicreate.toolkits.text_analytics._util import random_split as _random_split -from turicreate.toolkits._internal_utils import _check_categorical_option_type, \ - _precomputed_field, \ - _toolkit_repr_print +from turicreate.toolkits._internal_utils import ( + _check_categorical_option_type, + _precomputed_field, + _toolkit_repr_print, +) import sys as _sys + if _sys.version_info.major == 3: _izip = zip _xrange = range else: from itertools import izip as _izip + _xrange = xrange import operator as _operator import array as _array -def create(dataset, - num_topics=10, - initial_topics=None, - alpha=None, - beta=.1, - num_iterations=10, - num_burnin=5, - associations=None, - verbose=False, - print_interval=10, - validation_set=None, - method='auto'): + +def create( + dataset, + num_topics=10, + initial_topics=None, + alpha=None, + beta=0.1, + num_iterations=10, + num_burnin=5, + associations=None, + verbose=False, + print_interval=10, + validation_set=None, + method="auto", +): """ Create a topic model from the given data set. A topic model assumes each document is a mixture of a set of topics, where for each topic some words @@ -193,24 +200,22 @@ def create(dataset, """ dataset = _check_input(dataset) - _check_categorical_option_type("method", method, ['auto', 'cgs', 'alias']) - if method == 'cgs' or method == 'auto': - model_name = 'cgs_topic_model' + _check_categorical_option_type("method", method, ["auto", "cgs", "alias"]) + if method == "cgs" or method == "auto": + model_name = "cgs_topic_model" else: - model_name = 'alias_topic_model' + model_name = "alias_topic_model" # If associations are provided, check they are in the proper format if associations is None: - associations = _turicreate.SFrame({'word': [], 'topic': []}) - if isinstance(associations, _turicreate.SFrame) and \ - associations.num_rows() > 0: - assert set(associations.column_names()) == set(['word', 'topic']), \ - "Provided associations must be an SFrame containing a word column\ + associations = _turicreate.SFrame({"word": [], "topic": []}) + if isinstance(associations, _turicreate.SFrame) and associations.num_rows() > 0: + assert set(associations.column_names()) == set( + ["word", "topic"] + ), "Provided associations must be an SFrame containing a word column\ and a topic column." - assert associations['word'].dtype == str, \ - "Words must be strings." - assert associations['topic'].dtype == int, \ - "Topic ids must be of int type." + assert associations["word"].dtype == str, "Words must be strings." + assert associations["topic"].dtype == int, "Topic ids must be of int type." if alpha is None: alpha = float(50) / num_topics @@ -224,49 +229,56 @@ def create(dataset, validation_train = _SArray() validation_test = _SArray() - opts = {'model_name': model_name, - 'data': dataset, - 'num_topics': num_topics, - 'num_iterations': num_iterations, - 'print_interval': print_interval, - 'alpha': alpha, - 'beta': beta, - 'num_burnin': num_burnin, - 'associations': associations} + opts = { + "model_name": model_name, + "data": dataset, + "num_topics": num_topics, + "num_iterations": num_iterations, + "print_interval": print_interval, + "alpha": alpha, + "beta": beta, + "num_burnin": num_burnin, + "associations": associations, + } # Initialize the model with basic parameters response = _turicreate.extensions._text.topicmodel_init(opts) - m = TopicModel(response['model']) + m = TopicModel(response["model"]) # If initial_topics provided, load it into the model if isinstance(initial_topics, _turicreate.SFrame): - assert set(['vocabulary', 'topic_probabilities']) == \ - set(initial_topics.column_names()), \ - "The provided initial_topics does not have the proper format, \ + assert set(["vocabulary", "topic_probabilities"]) == set( + initial_topics.column_names() + ), "The provided initial_topics does not have the proper format, \ e.g. wrong column names." - observed_topics = initial_topics['topic_probabilities'].apply(lambda x: len(x)) - assert all(observed_topics == num_topics), \ - "Provided num_topics value does not match the number of provided initial_topics." + observed_topics = initial_topics["topic_probabilities"].apply(lambda x: len(x)) + assert all( + observed_topics == num_topics + ), "Provided num_topics value does not match the number of provided initial_topics." # Rough estimate of total number of words weight = len(dataset) * 1000 - opts = {'model': m.__proxy__, - 'topics': initial_topics['topic_probabilities'], - 'vocabulary': initial_topics['vocabulary'], - 'weight': weight} + opts = { + "model": m.__proxy__, + "topics": initial_topics["topic_probabilities"], + "vocabulary": initial_topics["vocabulary"], + "weight": weight, + } response = _turicreate.extensions._text.topicmodel_set_topics(opts) - m = TopicModel(response['model']) + m = TopicModel(response["model"]) # Train the model on the given data set and retrieve predictions - opts = {'model': m.__proxy__, - 'data': dataset, - 'verbose': verbose, - 'validation_train': validation_train, - 'validation_test': validation_test} + opts = { + "model": m.__proxy__, + "data": dataset, + "verbose": verbose, + "validation_train": validation_train, + "validation_test": validation_test, + } response = _turicreate.extensions._text.topicmodel_train(opts) - m = TopicModel(response['model']) + m = TopicModel(response["model"]) return m @@ -317,24 +329,22 @@ def _get_summary_struct(self): A list of section titles. The order matches that of the 'sections' object. """ - section_titles=['Schema','Settings'] + section_titles = ["Schema", "Settings"] vocab_length = len(self.vocabulary) verbose = self.verbose == 1 - sections=[ - [ - ('Vocabulary Size',_precomputed_field(vocab_length)) - ], - [ - ('Number of Topics', 'num_topics'), - ('alpha','alpha'), - ('beta','beta'), - ('Iterations', 'num_iterations'), - ('Training time', 'training_time'), - ('Verbose', _precomputed_field(verbose)) - ] - ] + sections = [ + [("Vocabulary Size", _precomputed_field(vocab_length))], + [ + ("Number of Topics", "num_topics"), + ("alpha", "alpha"), + ("beta", "beta"), + ("Iterations", "num_iterations"), + ("Training time", "training_time"), + ("Verbose", _precomputed_field(verbose)), + ], + ] return (sections, section_titles) def __repr__(self): @@ -350,14 +360,29 @@ def __repr__(self): extra = [] extra.append(key_str.format("Accessible fields", width, "")) - extra.append(key_str.format("m.topics",width,"An SFrame containing the topics.")) - extra.append(key_str.format("m.vocabulary",width,"An SArray containing the words in the vocabulary.")) + extra.append( + key_str.format("m.topics", width, "An SFrame containing the topics.") + ) + extra.append( + key_str.format( + "m.vocabulary", + width, + "An SArray containing the words in the vocabulary.", + ) + ) extra.append(key_str.format("Useful methods", width, "")) - extra.append(key_str.format("m.get_topics()",width,"Get the most probable words per topic.")) - extra.append(key_str.format("m.predict(new_docs)",width,"Make predictions for new documents.")) - - return out + '\n' + '\n'.join(extra) - + extra.append( + key_str.format( + "m.get_topics()", width, "Get the most probable words per topic." + ) + ) + extra.append( + key_str.format( + "m.predict(new_docs)", width, "Make predictions for new documents." + ) + ) + + return out + "\n" + "\n".join(extra) def _get(self, field): """ @@ -389,9 +414,9 @@ def _get(self, field): Value of the requested field. """ - opts = {'model': self.__proxy__, 'field': field} + opts = {"model": self.__proxy__, "field": field} response = _turicreate.extensions._text.topicmodel_get_value(opts) - return response['value'] + return response["value"] def _training_stats(self): """ @@ -419,16 +444,20 @@ def _training_stats(self): """ fields = self._list_fields() - stat_fields = ['training_time', - 'training_iterations'] - if 'validation_perplexity' in fields: - stat_fields.append('validation_perplexity') + stat_fields = ["training_time", "training_iterations"] + if "validation_perplexity" in fields: + stat_fields.append("validation_perplexity") - ret = {k : self._get(k) for k in stat_fields} + ret = {k: self._get(k) for k in stat_fields} return ret - def get_topics(self, topic_ids=None, num_words=5, cdf_cutoff=1.0, - output_type='topic_probabilities'): + def get_topics( + self, + topic_ids=None, + num_words=5, + cdf_cutoff=1.0, + output_type="topic_probabilities", + ): """ Get the words associated with a given topic. The score column is the @@ -533,41 +562,48 @@ def get_topics(self, topic_ids=None, num_words=5, cdf_cutoff=1.0, ['hidden', 'layer', 'system', 'training', 'vector'], ['component', 'distribution', 'local', 'model', 'optimal']] """ - _check_categorical_option_type('output_type', output_type, - ['topic_probabilities', 'topic_words']) + _check_categorical_option_type( + "output_type", output_type, ["topic_probabilities", "topic_words"] + ) if topic_ids is None: - topic_ids = list(range(self._get('num_topics'))) + topic_ids = list(range(self._get("num_topics"))) - assert isinstance(topic_ids, list), \ - "The provided topic_ids is not a list." + assert isinstance(topic_ids, list), "The provided topic_ids is not a list." if any([type(x) == str for x in topic_ids]): - raise ValueError("Only integer topic_ids can be used at this point in time.") + raise ValueError( + "Only integer topic_ids can be used at this point in time." + ) if not all([x >= 0 and x < self.num_topics for x in topic_ids]): - raise ValueError("Topic id values must be non-negative and less than the " + \ - "number of topics used to fit the model.") - - opts = {'model': self.__proxy__, - 'topic_ids': topic_ids, - 'num_words': num_words, - 'cdf_cutoff': cdf_cutoff} + raise ValueError( + "Topic id values must be non-negative and less than the " + + "number of topics used to fit the model." + ) + + opts = { + "model": self.__proxy__, + "topic_ids": topic_ids, + "num_words": num_words, + "cdf_cutoff": cdf_cutoff, + } response = _turicreate.extensions._text.topicmodel_get_topic(opts) - ret = response['top_words'] + ret = response["top_words"] def sort_wordlist_by_prob(z): words = sorted(z.items(), key=_operator.itemgetter(1), reverse=True) return [word for (word, prob) in words] - if output_type != 'topic_probabilities': - ret = ret.groupby('topic', - {'word': _turicreate.aggregate.CONCAT('word', 'score')}) - words = ret.sort('topic')['word'].apply(sort_wordlist_by_prob) - ret = _SFrame({'words': words}) + if output_type != "topic_probabilities": + ret = ret.groupby( + "topic", {"word": _turicreate.aggregate.CONCAT("word", "score")} + ) + words = ret.sort("topic")["word"].apply(sort_wordlist_by_prob) + ret = _SFrame({"words": words}) return ret - def predict(self, dataset, output_type='assignment', num_burnin=None): + def predict(self, dataset, output_type="assignment", num_burnin=None): """ Use the model to predict topics for each document. The provided `dataset` should be an SArray object where each element is a dict @@ -646,21 +682,18 @@ def predict(self, dataset, output_type='assignment', num_burnin=None): if num_burnin is None: num_burnin = self.num_burnin - opts = {'model': self.__proxy__, - 'data': dataset, - 'num_burnin': num_burnin} + opts = {"model": self.__proxy__, "data": dataset, "num_burnin": num_burnin} response = _turicreate.extensions._text.topicmodel_predict(opts) - preds = response['predictions'] + preds = response["predictions"] # Get most likely topic if probabilities are not requested - if output_type not in ['probability', 'probabilities', 'prob']: + if output_type not in ["probability", "probabilities", "prob"]: # equivalent to numpy.argmax(x) preds = preds.apply(lambda x: max(_izip(x, _xrange(len(x))))[1]) return preds - - def evaluate(self, train_data, test_data=None, metric='perplexity'): + def evaluate(self, train_data, test_data=None, metric="perplexity"): """ Estimate the model's ability to predict new data. Imagine you have a corpus of books. One common approach to evaluating topic models is to @@ -720,14 +753,13 @@ def evaluate(self, train_data, test_data=None, metric='perplexity'): else: test_data = _check_input(test_data) - predictions = self.predict(train_data, output_type='probability') + predictions = self.predict(train_data, output_type="probability") topics = self.topics ret = {} - ret['perplexity'] = perplexity(test_data, - predictions, - topics['topic_probabilities'], - topics['vocabulary']) + ret["perplexity"] = perplexity( + test_data, predictions, topics["topic_probabilities"], topics["vocabulary"] + ) return ret @@ -806,15 +838,19 @@ def perplexity(test_data, predictions, topics, vocabulary): 1720.7 # lower values are better """ test_data = _check_input(test_data) - assert isinstance(predictions, _SArray), \ - "Predictions must be an SArray of vector type." - assert predictions.dtype == _array.array, \ - "Predictions must be probabilities. Try using m.predict() with " + \ - "output_type='probability'." - - opts = {'test_data': test_data, - 'predictions': predictions, - 'topics': topics, - 'vocabulary': vocabulary} + assert isinstance( + predictions, _SArray + ), "Predictions must be an SArray of vector type." + assert predictions.dtype == _array.array, ( + "Predictions must be probabilities. Try using m.predict() with " + + "output_type='probability'." + ) + + opts = { + "test_data": test_data, + "predictions": predictions, + "topics": topics, + "vocabulary": vocabulary, + } response = _turicreate.extensions._text.topicmodel_get_perplexity(opts) - return response['perplexity'] + return response["perplexity"] diff --git a/src/python/turicreate/util/__init__.py b/src/python/turicreate/util/__init__.py index c0a7b71a0a..1f1c9c155b 100644 --- a/src/python/turicreate/util/__init__.py +++ b/src/python/turicreate/util/__init__.py @@ -27,18 +27,22 @@ except ImportError: import ConfigParser as _ConfigParser + def _convert_slashes(path): """ Converts all windows-style slashes to unix-style slashes """ - return path.replace('\\', '/') + return path.replace("\\", "/") + def _get_s3_endpoint(): """ Returns the current S3 Endpoint" """ import turicreate - return turicreate.config.get_runtime_config()['TURI_S3_ENDPOINT'] + + return turicreate.config.get_runtime_config()["TURI_S3_ENDPOINT"] + def _get_aws_credentials(): """ @@ -59,11 +63,15 @@ def _get_aws_credentials(): ('RBZH792CTQPP7T435BGQ', '7x2hMqplWsLpU/qQCN6xAPKcmWo46TlPJXYTvKcv') """ - if (not 'AWS_ACCESS_KEY_ID' in _os.environ): - raise KeyError('No access key found. Please set the environment variable AWS_ACCESS_KEY_ID.') - if (not 'AWS_SECRET_ACCESS_KEY' in _os.environ): - raise KeyError('No secret key found. Please set the environment variable AWS_SECRET_ACCESS_KEY.') - return (_os.environ['AWS_ACCESS_KEY_ID'], _os.environ['AWS_SECRET_ACCESS_KEY']) + if not "AWS_ACCESS_KEY_ID" in _os.environ: + raise KeyError( + "No access key found. Please set the environment variable AWS_ACCESS_KEY_ID." + ) + if not "AWS_SECRET_ACCESS_KEY" in _os.environ: + raise KeyError( + "No secret key found. Please set the environment variable AWS_SECRET_ACCESS_KEY." + ) + return (_os.environ["AWS_ACCESS_KEY_ID"], _os.environ["AWS_SECRET_ACCESS_KEY"]) def _try_inject_s3_credentials(url): @@ -72,20 +80,20 @@ def _try_inject_s3_credentials(url): If s3 url already contains secret key/id pairs, just return as is. """ - assert url.startswith('s3://') + assert url.startswith("s3://") path = url[5:] # Check if the path already contains credentials - tokens = path.split(':') + tokens = path.split(":") # If there are two ':', its possible that we have already injected credentials if len(tokens) == 3: # Edge case: there are exactly two ':'s in the object key which is a false alarm. # We prevent this by checking that '/' is not in the assumed key and id. - if ('/' not in tokens[0]) and ('/' not in tokens[1]): + if ("/" not in tokens[0]) and ("/" not in tokens[1]): return url # S3 url does not contain secret key/id pair, query the environment variables (k, v) = _get_aws_credentials() - return 's3://' + k + ':' + v + ':' + path + return "s3://" + k + ":" + v + ":" + path def _make_internal_url(url): @@ -112,7 +120,7 @@ def _make_internal_url(url): If a bad url is provided. """ if not url: - raise ValueError('Invalid url: %s' % url) + raise ValueError("Invalid url: %s" % url) from .. import _sys_util from . import _file_util @@ -123,22 +131,27 @@ def _make_internal_url(url): # Try to split the url into (protocol, path). protocol = _file_util.get_protocol(url) is_local = False - if protocol in ['http', 'https']: + if protocol in ["http", "https"]: pass - elif protocol == 'hdfs': + elif protocol == "hdfs": if not _sys_util.get_hadoop_class_path(): - raise ValueError("HDFS URL is not supported because Hadoop not found. Please make hadoop available from PATH or set the environment variable HADOOP_HOME and try again.") - elif protocol == 's3': + raise ValueError( + "HDFS URL is not supported because Hadoop not found. Please make hadoop available from PATH or set the environment variable HADOOP_HOME and try again." + ) + elif protocol == "s3": return _try_inject_s3_credentials(url) - elif protocol == '': + elif protocol == "": is_local = True - elif (protocol == 'local' or protocol == 'remote'): + elif protocol == "local" or protocol == "remote": # local and remote are legacy protocol for separate server process is_local = True # This code assumes local and remote are same machine - url = _re.sub(protocol+'://','',url,count=1) + url = _re.sub(protocol + "://", "", url, count=1) else: - raise ValueError('Invalid url protocol %s. Supported url protocols are: local, s3://, https:// and hdfs://' % protocol) + raise ValueError( + "Invalid url protocol %s. Supported url protocols are: local, s3://, https:// and hdfs://" + % protocol + ) if is_local: url = _os.path.abspath(_os.path.expanduser(url)) @@ -167,7 +180,7 @@ def is_directory_archive(path): if not _os.path.isdir(path): return False - ini_path = '/'.join([_convert_slashes(path), 'dir_archive.ini']) + ini_path = "/".join([_convert_slashes(path), "dir_archive.ini"]) if not _os.path.exists(ini_path): return False @@ -176,6 +189,7 @@ def is_directory_archive(path): return True return False + def get_archive_type(path): """ Returns the contents type for the provided archive path. @@ -190,17 +204,18 @@ def get_archive_type(path): Returns a string of: sframe, sgraph, raises TypeError for anything else """ if not is_directory_archive(path): - raise TypeError('Unable to determine the type of archive at path: %s' % path) + raise TypeError("Unable to determine the type of archive at path: %s" % path) try: - ini_path = '/'.join([_convert_slashes(path), 'dir_archive.ini']) + ini_path = "/".join([_convert_slashes(path), "dir_archive.ini"]) parser = _ConfigParser.SafeConfigParser() parser.read(ini_path) - contents = parser.get('metadata', 'contents') + contents = parser.get("metadata", "contents") return contents except Exception as e: - raise TypeError('Unable to determine type of archive for path: %s' % path, e) + raise TypeError("Unable to determine type of archive for path: %s" % path, e) + def crossproduct(d): """ @@ -237,31 +252,35 @@ def crossproduct(d): """ from .. import SArray + d = [list(zip(list(d.keys()), x)) for x in _itertools.product(*list(d.values()))] - sa = [{k:v for (k,v) in x} for x in d] - return SArray(sa).unpack(column_name_prefix='') + sa = [{k: v for (k, v) in x} for x in d] + return SArray(sa).unpack(column_name_prefix="") def get_turicreate_object_type(url): - ''' + """ Given url where a Turi Create object is persisted, return the Turi Create object type: 'model', 'graph', 'sframe', or 'sarray' - ''' + """ from .._connect import main as _glconnect + ret = _glconnect.get_unity().get_turicreate_object_type(_make_internal_url(url)) # to be consistent, we use sgraph instead of graph here - if ret == 'graph': - ret = 'sgraph' + if ret == "graph": + ret = "sgraph" return ret -def _assert_sframe_equal(sf1, - sf2, - check_column_names=True, - check_column_order=True, - check_row_order=True, - float_column_delta=None): +def _assert_sframe_equal( + sf1, + sf2, + check_column_names=True, + check_column_order=True, + check_row_order=True, + float_column_delta=None, +): """ Assert the two SFrames are equal. @@ -300,6 +319,7 @@ def _assert_sframe_equal(sf1, float type. Applies to all float columns. """ from .. import SFrame as _SFrame + if (type(sf1) is not _SFrame) or (type(sf2) is not _SFrame): raise TypeError("Cannot function on types other than SFrames.") @@ -310,8 +330,12 @@ def _assert_sframe_equal(sf1, sf2.materialize() if sf1.num_columns() != sf2.num_columns(): - raise AssertionError("Number of columns mismatched: " + - str(sf1.num_columns()) + " != " + str(sf2.num_columns())) + raise AssertionError( + "Number of columns mismatched: " + + str(sf1.num_columns()) + + " != " + + str(sf2.num_columns()) + ) s1_names = sf1.column_names() s2_names = sf2.column_names() @@ -320,13 +344,23 @@ def _assert_sframe_equal(sf1, sorted_s2_names = sorted(s2_names) if check_column_names: - if (check_column_order and (s1_names != s2_names)) or (sorted_s1_names != sorted_s2_names): - raise AssertionError("SFrame does not have same column names: " + - str(sf1.column_names()) + " != " + str(sf2.column_names())) + if (check_column_order and (s1_names != s2_names)) or ( + sorted_s1_names != sorted_s2_names + ): + raise AssertionError( + "SFrame does not have same column names: " + + str(sf1.column_names()) + + " != " + + str(sf2.column_names()) + ) if sf1.num_rows() != sf2.num_rows(): - raise AssertionError("Number of rows mismatched: " + - str(sf1.num_rows()) + " != " + str(sf2.num_rows())) + raise AssertionError( + "Number of rows mismatched: " + + str(sf1.num_rows()) + + " != " + + str(sf2.num_rows()) + ) if not check_row_order and (sf1.num_rows() > 1): sf1 = sf1.sort(s1_names) @@ -334,9 +368,9 @@ def _assert_sframe_equal(sf1, names_to_check = None if check_column_names: - names_to_check = list(zip(sorted_s1_names, sorted_s2_names)) + names_to_check = list(zip(sorted_s1_names, sorted_s2_names)) else: - names_to_check = list(zip(s1_names, s2_names)) + names_to_check = list(zip(s1_names, s2_names)) for i in names_to_check: col1 = sf1[i[0]] col2 = sf2[i[1]] @@ -346,99 +380,112 @@ def _assert_sframe_equal(sf1, compare_ary = None if col1.dtype == float and float_column_delta is not None: dt = float_column_delta - compare_ary = ((col1 > col2-dt) & (col1 < col2+dt)) + compare_ary = (col1 > col2 - dt) & (col1 < col2 + dt) else: - compare_ary = (sf1[i[0]] == sf2[i[1]]) + compare_ary = sf1[i[0]] == sf2[i[1]] if not compare_ary.all(): count = 0 for j in compare_ary: if not j: - first_row = count - break + first_row = count + break count += 1 - raise AssertionError("Columns " + str(i) + - " are not equal! First differing element is at row " + - str(first_row) + ": " + str((col1[first_row],col2[first_row]))) + raise AssertionError( + "Columns " + + str(i) + + " are not equal! First differing element is at row " + + str(first_row) + + ": " + + str((col1[first_row], col2[first_row])) + ) + def _get_temp_file_location(): - ''' + """ Returns user specified temporary file location. The temporary location is specified through: >>> turicreate.config.set_runtime_config('TURI_CACHE_FILE_LOCATIONS', ...) - ''' + """ from .._connect import main as _glconnect + unity = _glconnect.get_unity() cache_dir = _convert_slashes(unity.get_current_cache_file_location()) if not _os.path.exists(cache_dir): _os.makedirs(cache_dir) return cache_dir + def _make_temp_directory(prefix): - ''' + """ Generate a temporary directory that would not live beyond the lifetime of unity_server. Caller is expected to clean up the temp file as soon as the directory is no longer needed. But the directory will be cleaned as unity_server restarts - ''' + """ temp_dir = _make_temp_filename(prefix=str(prefix)) _os.makedirs(temp_dir) return temp_dir + def _make_temp_filename(prefix): - ''' + """ Generate a temporary file that would not live beyond the lifetime of unity_server. Caller is expected to clean up the temp file as soon as the file is no longer needed. But temp files created using this method will be cleaned up when unity_server restarts - ''' + """ temp_location = _get_temp_file_location() - temp_file_name = '/'.join([temp_location, str(prefix)+str(_uuid.uuid4())]) + temp_file_name = "/".join([temp_location, str(prefix) + str(_uuid.uuid4())]) return temp_file_name + def _pickle_to_temp_location_or_memory(obj): - ''' + """ If obj can be serialized directly into memory (via cloudpickle) this will return the serialized bytes. Otherwise, gl_pickle is attempted and it will then generates a temporary directory serializes an object into it, returning the directory name. This directory will not have lifespan greater than that of unity_server. - ''' - from . import _cloudpickle as cloudpickle - try: - # try cloudpickle first and see if that works - lambda_str = cloudpickle.dumps(obj) - return lambda_str - except: - pass - - # nope. that does not work! lets try again with gl pickle - filename = _make_temp_filename('pickle') - from .. import _gl_pickle - pickler = _gl_pickle.GLPickler(filename) - pickler.dump(obj) - pickler.close() - return filename + """ + from . import _cloudpickle as cloudpickle + + try: + # try cloudpickle first and see if that works + lambda_str = cloudpickle.dumps(obj) + return lambda_str + except: + pass + + # nope. that does not work! lets try again with gl pickle + filename = _make_temp_filename("pickle") + from .. import _gl_pickle + + pickler = _gl_pickle.GLPickler(filename) + pickler.dump(obj) + pickler.close() + return filename def _get_module_from_object(obj): - mod_str = obj.__class__.__module__.split('.')[0] + mod_str = obj.__class__.__module__.split(".")[0] return _sys.modules[mod_str] + def _infer_dbapi2_types(cursor, mod_info): desc = cursor.description result_set_types = [i[1] for i in desc] - dbapi2_to_python = [ # a type code can match more than one, so ordered by - # preference (loop short-circuits when it finds a match - (mod_info['DATETIME'], _datetime.datetime), - (mod_info['ROWID'],int), - (mod_info['NUMBER'],float), - ] + dbapi2_to_python = [ # a type code can match more than one, so ordered by + # preference (loop short-circuits when it finds a match + (mod_info["DATETIME"], _datetime.datetime), + (mod_info["ROWID"], int), + (mod_info["NUMBER"], float), + ] ret_types = [] # Ugly nested loop because the standard only guarantees that a type code @@ -457,13 +504,15 @@ def _infer_dbapi2_types(cursor, mod_info): return ret_types + def _pytype_to_printf(in_type): if in_type == int: - return 'd' + return "d" elif in_type == float: - return 'f' + return "f" else: - return 's' + return "s" + # Automatic GPU detection def _get_cuda_gpus(): @@ -475,31 +524,39 @@ def _get_cuda_gpus(): - memory_total (float, total memory in MiB) """ import subprocess + try: - output = subprocess.check_output(['nvidia-smi', - '--query-gpu=index,gpu_name,memory.free,memory.total', - '--format=csv,noheader,nounits'], - universal_newlines=True) + output = subprocess.check_output( + [ + "nvidia-smi", + "--query-gpu=index,gpu_name,memory.free,memory.total", + "--format=csv,noheader,nounits", + ], + universal_newlines=True, + ) except OSError: return [] except subprocess.CalledProcessError: return [] gpus = [] - for gpu_line in output.split('\n'): + for gpu_line in output.split("\n"): if gpu_line: - index, gpu_name, memory_free, memory_total = gpu_line.split(', ') + index, gpu_name, memory_free, memory_total = gpu_line.split(", ") index = int(index) memory_free = float(memory_free) memory_total = float(memory_total) - gpus.append({ - 'index': index, - 'name': gpu_name, - 'memory_free': memory_free, - 'memory_total': memory_total, - }) + gpus.append( + { + "index": index, + "name": gpu_name, + "memory_free": memory_free, + "memory_total": memory_total, + } + ) return gpus + _CUDA_GPUS = _get_cuda_gpus() @@ -513,6 +570,7 @@ def _num_available_gpus(): return num_cuda from turicreate.toolkits._mps_utils import has_fast_mps_support + if has_fast_mps_support(): return 1 diff --git a/src/python/turicreate/util/_cloudpickle.py b/src/python/turicreate/util/_cloudpickle.py index 6e1e87f73b..32cae455ec 100644 --- a/src/python/turicreate/util/_cloudpickle.py +++ b/src/python/turicreate/util/_cloudpickle.py @@ -66,6 +66,7 @@ if sys.version_info.major == 2: from pickle import Pickler + try: from cStringIO import StringIO except ImportError: @@ -73,11 +74,13 @@ PY3 = False # This import prevents futurize from trying to change types.ClassType -> type from types import ClassType as _ClassType + _class_type = _ClassType else: _class_type = type from pickle import _Pickler as Pickler from io import BytesIO as StringIO + PY3 = True @@ -105,6 +108,7 @@ def _stub(value): invalid syntax on Python 2. If we use this function we also don't need to do the weird freevars/cellvars swap below """ + def inner(value): lambda: cell # make ``cell`` a closure so that we get a STORE_DEREF cell = value @@ -158,25 +162,21 @@ def cell_set(cell, value): """Set the value of a closure cell. """ return types.FunctionType( - _cell_set_template_code, - {}, - '_cell_set_inner', - (), - (cell,), + _cell_set_template_code, {}, "_cell_set_inner", (), (cell,), )(value) -#relevant opcodes -STORE_GLOBAL = opcode.opmap['STORE_GLOBAL'] -DELETE_GLOBAL = opcode.opmap['DELETE_GLOBAL'] -LOAD_GLOBAL = opcode.opmap['LOAD_GLOBAL'] +# relevant opcodes +STORE_GLOBAL = opcode.opmap["STORE_GLOBAL"] +DELETE_GLOBAL = opcode.opmap["DELETE_GLOBAL"] +LOAD_GLOBAL = opcode.opmap["LOAD_GLOBAL"] GLOBAL_OPS = (STORE_GLOBAL, DELETE_GLOBAL, LOAD_GLOBAL) HAVE_ARGUMENT = dis.HAVE_ARGUMENT EXTENDED_ARG = dis.EXTENDED_ARG def islambda(func): - return getattr(func,'__name__') == '' + return getattr(func, "__name__") == "" _BUILTIN_TYPE_NAMES = {} @@ -190,12 +190,13 @@ def _builtin_type(name): if sys.version_info < (3, 4): + def _walk_global_ops(code): """ Yield (opcode, argument number) tuples for all global-referencing instructions in *code*. """ - code = getattr(code, 'co_code', b'') + code = getattr(code, "co_code", b"") if not PY3: code = map(ord, code) @@ -214,7 +215,9 @@ def _walk_global_ops(code): if op in GLOBAL_OPS: yield op, oparg + else: + def _walk_global_ops(code): """ Yield (opcode, argument number) tuples for all @@ -242,7 +245,7 @@ def dump(self, obj): try: return Pickler.dump(self, obj) except RuntimeError as e: - if 'recursion' in e.args[0]: + if "recursion" in e.args[0]: msg = """Could not pickle object as excessively deep recursion required.""" raise pickle.PicklingError(msg) except pickle.PickleError: @@ -256,18 +259,21 @@ def dump(self, obj): if "'i' format requires" in emsg: msg = "Object too large to serialize: %s" % emsg else: - msg = "Could not serialize object: %s: %s" % (e.__class__.__name__, emsg) + msg = "Could not serialize object: %s: %s" % ( + e.__class__.__name__, + emsg, + ) print_exec(sys.stderr) raise pickle.PicklingError(msg) - def save_memoryview(self, obj): """Fallback to save_string""" Pickler.save_string(self, str(obj)) def save_buffer(self, obj): """Fallback to save_string""" - Pickler.save_string(self,str(obj)) + Pickler.save_string(self, str(obj)) + if PY3: dispatch[memoryview] = save_memoryview else: @@ -275,6 +281,7 @@ def save_buffer(self, obj): def save_unsupported(self, obj): raise pickle.PicklingError("Cannot pickle objects of type %s" % type(obj)) + dispatch[types.GeneratorType] = save_unsupported # itertools objects do not pickle! @@ -288,7 +295,7 @@ def save_module(self, obj): """ mod_name = obj.__name__ # If module is successfully found then it is not a dynamically created module - if hasattr(obj, '__file__'): + if hasattr(obj, "__file__"): is_dynamic = False else: try: @@ -302,6 +309,7 @@ def save_module(self, obj): self.save_reduce(dynamic_subimport, (obj.__name__, vars(obj)), obj=obj) else: self.save_reduce(subimport, (obj.__name__,), obj=obj) + dispatch[types.ModuleType] = save_module def save_codeobject(self, obj): @@ -310,18 +318,41 @@ def save_codeobject(self, obj): """ if PY3: args = ( - obj.co_argcount, obj.co_kwonlyargcount, obj.co_nlocals, obj.co_stacksize, - obj.co_flags, obj.co_code, obj.co_consts, obj.co_names, obj.co_varnames, - obj.co_filename, obj.co_name, obj.co_firstlineno, obj.co_lnotab, obj.co_freevars, - obj.co_cellvars + obj.co_argcount, + obj.co_kwonlyargcount, + obj.co_nlocals, + obj.co_stacksize, + obj.co_flags, + obj.co_code, + obj.co_consts, + obj.co_names, + obj.co_varnames, + obj.co_filename, + obj.co_name, + obj.co_firstlineno, + obj.co_lnotab, + obj.co_freevars, + obj.co_cellvars, ) else: args = ( - obj.co_argcount, obj.co_nlocals, obj.co_stacksize, obj.co_flags, obj.co_code, - obj.co_consts, obj.co_names, obj.co_varnames, obj.co_filename, obj.co_name, - obj.co_firstlineno, obj.co_lnotab, obj.co_freevars, obj.co_cellvars + obj.co_argcount, + obj.co_nlocals, + obj.co_stacksize, + obj.co_flags, + obj.co_code, + obj.co_consts, + obj.co_names, + obj.co_varnames, + obj.co_filename, + obj.co_name, + obj.co_firstlineno, + obj.co_lnotab, + obj.co_freevars, + obj.co_cellvars, ) self.save_reduce(types.CodeType, args, obj=obj) + dispatch[types.CodeType] = save_codeobject def save_function(self, obj, name=None): @@ -345,9 +376,9 @@ def save_function(self, obj, name=None): themodule = sys.modules[modname] except KeyError: # eval'd items such as namedtuple give invalid items for their function __module__ - modname = '__main__' + modname = "__main__" - if modname == '__main__': + if modname == "__main__": themodule = None if themodule: @@ -362,14 +393,14 @@ def save_function(self, obj, name=None): # user-defined functions below will fail. # So we pickle them here using save_reduce; have to do it differently # for different python versions. - if not hasattr(obj, '__code__'): + if not hasattr(obj, "__code__"): if PY3: if sys.version_info < (3, 4): raise pickle.PicklingError("Can't pickle %r" % obj) else: rv = obj.__reduce_ex__(self.proto) else: - if hasattr(obj, '__self__'): + if hasattr(obj, "__self__"): rv = (getattr, (obj.__self__, name)) else: raise pickle.PicklingError("Can't pickle %r" % obj) @@ -378,9 +409,11 @@ def save_function(self, obj, name=None): # if func is lambda, def'ed at prompt, is in main, or is nested, then # we'll pickle the actual function object rather than simply saving a # reference (as is done in default pickler), via save_function_tuple. - if (islambda(obj) - or getattr(obj.__code__, 'co_filename', None) == '' - or themodule is None): + if ( + islambda(obj) + or getattr(obj.__code__, "co_filename", None) == "" + or themodule is None + ): self.save_function_tuple(obj) return else: @@ -393,13 +426,14 @@ def save_function(self, obj, name=None): if obj.__dict__: # essentially save_reduce, but workaround needed to avoid recursion self.save(_restore_attr) - write(pickle.MARK + pickle.GLOBAL + modname + '\n' + name + '\n') + write(pickle.MARK + pickle.GLOBAL + modname + "\n" + name + "\n") self.memoize(obj) self.save(obj.__dict__) write(pickle.TUPLE + pickle.REDUCE) else: - write(pickle.GLOBAL + modname + '\n' + name + '\n') + write(pickle.GLOBAL + modname + "\n" + name + "\n") self.memoize(obj) + dispatch[types.FunctionType] = save_function def _save_subimports(self, code, top_level_dependencies): @@ -409,14 +443,18 @@ def _save_subimports(self, code, top_level_dependencies): """ # check if any known dependency is an imported package for x in top_level_dependencies: - if isinstance(x, types.ModuleType) and hasattr(x, '__package__') and x.__package__: + if ( + isinstance(x, types.ModuleType) + and hasattr(x, "__package__") + and x.__package__ + ): # check if the package has any currently loaded sub-imports - prefix = x.__name__ + '.' + prefix = x.__name__ + "." for name, module in sys.modules.items(): # Older versions of pytest will add a "None" module to sys.modules. if name is not None and name.startswith(prefix): # check whether the function can address the sub-module - tokens = set(name[len(prefix):].split('.')) + tokens = set(name[len(prefix) :].split(".")) if not tokens - set(code.co_names): # ensure unpickler executes this import self.save(module) @@ -432,18 +470,18 @@ def save_dynamic_class(self, obj): from global modules. """ clsdict = dict(obj.__dict__) # copy dict proxy to a dict - if not isinstance(clsdict.get('__dict__', None), property): + if not isinstance(clsdict.get("__dict__", None), property): # don't extract dict that are properties - clsdict.pop('__dict__', None) - clsdict.pop('__weakref__', None) + clsdict.pop("__dict__", None) + clsdict.pop("__weakref__", None) # hack as __new__ is stored differently in the __dict__ - new_override = clsdict.get('__new__', None) + new_override = clsdict.get("__new__", None) if new_override: - clsdict['__new__'] = obj.__new__ + clsdict["__new__"] = obj.__new__ # namedtuple is a special case for Spark where we use the _load_namedtuple function - if getattr(obj, '_is_namedtuple_', False): + if getattr(obj, "_is_namedtuple_", False): self.save_reduce(_load_namedtuple, (obj.__name__, obj._fields)) return @@ -471,15 +509,11 @@ def save_dynamic_class(self, obj): # On PyPy, __doc__ is a readonly attribute, so we need to include it in # the initial skeleton class. This is safe because we know that the # doc can't participate in a cycle with the original class. - doc_dict = {'__doc__': clsdict.pop('__doc__', None)} + doc_dict = {"__doc__": clsdict.pop("__doc__", None)} # Create and memoize an empty class with obj's name and bases. save(type(obj)) - save(( - obj.__name__, - obj.__bases__, - doc_dict, - )) + save((obj.__name__, obj.__bases__, doc_dict,)) write(pickle.REDUCE) self.memoize(obj) @@ -506,30 +540,37 @@ def save_function_tuple(self, func): soon as it's created. The other stuff can then be filled in later. """ if is_tornado_coroutine(func): - self.save_reduce(_rebuild_tornado_coroutine, (func.__wrapped__,), - obj=func) + self.save_reduce(_rebuild_tornado_coroutine, (func.__wrapped__,), obj=func) return save = self.save write = self.write - code, f_globals, defaults, closure_values, dct, base_globals = self.extract_func_data(func) + ( + code, + f_globals, + defaults, + closure_values, + dct, + base_globals, + ) = self.extract_func_data(func) save(_fill_function) # skeleton function updater - write(pickle.MARK) # beginning of tuple that _fill_function expects + write(pickle.MARK) # beginning of tuple that _fill_function expects self._save_subimports( - code, - itertools.chain(f_globals.values(), closure_values or ()), + code, itertools.chain(f_globals.values(), closure_values or ()), ) # create a skeleton function object and memoize it save(_make_skel_func) - save(( - code, - len(closure_values) if closure_values is not None else -1, - base_globals, - )) + save( + ( + code, + len(closure_values) if closure_values is not None else -1, + base_globals, + ) + ) write(pickle.REDUCE) self.memoize(func) @@ -545,7 +586,8 @@ def save_function_tuple(self, func): _extract_code_globals_cache = ( weakref.WeakKeyDictionary() if sys.version_info >= (2, 7) and not hasattr(sys, "pypy_version_info") - else {}) + else {} + ) @classmethod def extract_code_globals(cls, co): @@ -560,8 +602,7 @@ def extract_code_globals(cls, co): # PyPy "builtin-code" object out_names = set() else: - out_names = set(names[oparg] - for op, oparg in _walk_global_ops(co)) + out_names = set(names[oparg] for op, oparg in _walk_global_ops(co)) # see if nested function have any global refs if co.co_consts: @@ -611,6 +652,7 @@ def save_builtin_function(self, obj): if obj.__module__ == "__builtin__": return self.save_global(obj) return self.save_function(obj) + dispatch[types.BuiltinFunctionType] = save_builtin_function def save_global(self, obj, name=None, pack=struct.pack): @@ -622,7 +664,9 @@ def save_global(self, obj, name=None, pack=struct.pack): """ if obj.__module__ == "__builtin__" or obj.__module__ == "builtins": if obj in _BUILTIN_TYPE_NAMES: - return self.save_reduce(_builtin_type, (_BUILTIN_TYPE_NAMES[obj],), obj=obj) + return self.save_reduce( + _builtin_type, (_BUILTIN_TYPE_NAMES[obj],), obj=obj + ) if name is None: name = obj.__name__ @@ -634,9 +678,9 @@ def save_global(self, obj, name=None, pack=struct.pack): # https://bitbucket.org/gutworth/six/issues/63/importing-six-breaks-pickling modname = pickle.whichmodule(obj, name) except Exception: - modname = '__main__' + modname = "__main__" - if modname == '__main__': + if modname == "__main__": themodule = None else: __import__(modname) @@ -661,10 +705,16 @@ def save_instancemethod(self, obj): self.save_reduce(getattr, (obj.__self__.__class__, obj.__name__)) else: if PY3: - self.save_reduce(types.MethodType, (obj.__func__, obj.__self__), obj=obj) + self.save_reduce( + types.MethodType, (obj.__func__, obj.__self__), obj=obj + ) else: - self.save_reduce(types.MethodType, (obj.__func__, obj.__self__, obj.__self__.__class__), - obj=obj) + self.save_reduce( + types.MethodType, + (obj.__func__, obj.__self__, obj.__self__.__class__), + obj=obj, + ) + dispatch[types.MethodType] = save_instancemethod def save_inst(self, obj): @@ -682,7 +732,7 @@ def save_inst(self, obj): write = self.write save = self.save - if hasattr(obj, '__getinitargs__'): + if hasattr(obj, "__getinitargs__"): args = obj.__getinitargs__() len(args) # XXX Assert it's a sequence pickle._keep_alive(args, memo) @@ -699,7 +749,7 @@ def save_inst(self, obj): else: for arg in args: save(arg) - write(pickle.INST + cls.__module__ + '\n' + cls.__name__ + '\n') + write(pickle.INST + cls.__module__ + "\n" + cls.__name__ + "\n") self.memoize(obj) @@ -707,8 +757,8 @@ def save_inst(self, obj): getstate = obj.__getstate__ except AttributeError: stuff = obj.__dict__ - #remove items if transient - if hasattr(obj, '__transient__'): + # remove items if transient + if hasattr(obj, "__transient__"): transient = obj.__transient__ stuff = stuff.copy() for k in list(stuff.keys()): @@ -726,6 +776,7 @@ def save_inst(self, obj): def save_property(self, obj): # properties not correctly saved in python self.save_reduce(property, (obj.fget, obj.fset, obj.fdel, obj.__doc__), obj=obj) + dispatch[property] = save_property def save_classmethod(self, obj): @@ -736,17 +787,20 @@ def save_classmethod(self, obj): if isinstance(obj, classmethod): orig_func = orig_func.__func__ # Unbind self.save_reduce(type(obj), (orig_func,), obj=obj) + dispatch[classmethod] = save_classmethod dispatch[staticmethod] = save_classmethod def save_itemgetter(self, obj): """itemgetter serializer (needed for namedtuple support)""" + class Dummy: def __getitem__(self, item): return item + items = obj(Dummy()) if not isinstance(items, tuple): - items = (items, ) + items = (items,) return self.save_reduce(operator.itemgetter, items) if type(operator.itemgetter) is type: @@ -754,10 +808,12 @@ def __getitem__(self, item): def save_attrgetter(self, obj): """attrgetter serializer""" + class Dummy(object): def __init__(self, attrs, index=None): self.attrs = attrs self.index = index + def __getattribute__(self, item): attrs = object.__getattribute__(self, "attrs") index = object.__getattribute__(self, "index") @@ -767,6 +823,7 @@ def __getattribute__(self, item): else: attrs[index] = ".".join([attrs[index], item]) return type(self)(attrs, index) + attrs = [] obj(Dummy(attrs)) return self.save_reduce(operator.attrgetter, tuple(attrs)) @@ -774,8 +831,9 @@ def __getattribute__(self, item): if type(operator.attrgetter) is type: dispatch[operator.attrgetter] = save_attrgetter - def save_reduce(self, func, args, state=None, - listitems=None, dictitems=None, obj=None): + def save_reduce( + self, func, args, state=None, listitems=None, dictitems=None, obj=None + ): """Modified to support __transient__ on new objects Change only affects protocol level 2 (which is always used by PiCloud""" # Assert that args is a tuple or None @@ -783,7 +841,7 @@ def save_reduce(self, func, args, state=None, raise pickle.PicklingError("args from reduce() should be a tuple") # Assert that func is callable - if not hasattr(func, '__call__'): + if not hasattr(func, "__call__"): raise pickle.PicklingError("func from reduce should be callable") save = self.save @@ -791,19 +849,21 @@ def save_reduce(self, func, args, state=None, # Protocol 2 special case: if func's name is __newobj__, use NEWOBJ if self.proto >= 2 and getattr(func, "__name__", "") == "__newobj__": - #Added fix to allow transient + # Added fix to allow transient cls = args[0] if not hasattr(cls, "__new__"): raise pickle.PicklingError( - "args[0] from __newobj__ args has no __new__") + "args[0] from __newobj__ args has no __new__" + ) if obj is not None and cls is not obj.__class__: raise pickle.PicklingError( - "args[0] from __newobj__ args has the wrong class") + "args[0] from __newobj__ args has the wrong class" + ) args = args[1:] save(cls) - #Don't pickle transient entries - if hasattr(obj, '__transient__'): + # Don't pickle transient entries + if hasattr(obj, "__transient__"): transient = obj.__transient__ state = state.copy() @@ -840,31 +900,34 @@ def save_partial(self, obj): """Partial objects do not serialize correctly in python2.x -- this fixes the bugs""" self.save_reduce(_genpartial, (obj.func, obj.args, obj.keywords)) - if sys.version_info < (2,7): # 2.7 supports partial pickling + if sys.version_info < (2, 7): # 2.7 supports partial pickling dispatch[partial] = save_partial - def save_file(self, obj): """Save a file""" try: - import StringIO as pystringIO #we can't use cStringIO as it lacks the name attribute + import StringIO as pystringIO # we can't use cStringIO as it lacks the name attribute except ImportError: import io as pystringIO - if not hasattr(obj, 'name') or not hasattr(obj, 'mode'): - raise pickle.PicklingError("Cannot pickle files that do not map to an actual file") + if not hasattr(obj, "name") or not hasattr(obj, "mode"): + raise pickle.PicklingError( + "Cannot pickle files that do not map to an actual file" + ) if obj is sys.stdout: - return self.save_reduce(getattr, (sys,'stdout'), obj=obj) + return self.save_reduce(getattr, (sys, "stdout"), obj=obj) if obj is sys.stderr: - return self.save_reduce(getattr, (sys,'stderr'), obj=obj) + return self.save_reduce(getattr, (sys, "stderr"), obj=obj) if obj is sys.stdin: raise pickle.PicklingError("Cannot pickle standard input") if obj.closed: raise pickle.PicklingError("Cannot pickle closed files") - if hasattr(obj, 'isatty') and obj.isatty(): + if hasattr(obj, "isatty") and obj.isatty(): raise pickle.PicklingError("Cannot pickle files that map to tty objects") - if 'r' not in obj.mode and '+' not in obj.mode: - raise pickle.PicklingError("Cannot pickle files that are not opened for reading: %s" % obj.mode) + if "r" not in obj.mode and "+" not in obj.mode: + raise pickle.PicklingError( + "Cannot pickle files that are not opened for reading: %s" % obj.mode + ) name = obj.name @@ -877,7 +940,9 @@ def save_file(self, obj): contents = obj.read() obj.seek(curloc) except IOError: - raise pickle.PicklingError("Cannot pickle file %s as it cannot be read" % name) + raise pickle.PicklingError( + "Cannot pickle file %s as it cannot be read" % name + ) retval.write(contents) retval.seek(curloc) @@ -900,34 +965,36 @@ def save_not_implemented(self, obj): dispatch[type(NotImplemented)] = save_not_implemented # WeakSet was added in 2.7. - if hasattr(weakref, 'WeakSet'): + if hasattr(weakref, "WeakSet"): + def save_weakset(self, obj): self.save_reduce(weakref.WeakSet, (list(obj),)) dispatch[weakref.WeakSet] = save_weakset def inject_numpy(self): - numpy = sys.modules.get('numpy') - if not numpy or not hasattr(numpy, 'ufunc'): + numpy = sys.modules.get("numpy") + if not numpy or not hasattr(numpy, "ufunc"): return self.dispatch[numpy.ufunc] = self.__class__.save_ufunc def save_ufunc(self, obj): """Hack function for saving numpy ufunc objects""" name = obj.__name__ - numpy_tst_mods = ['numpy', 'scipy.special'] + numpy_tst_mods = ["numpy", "scipy.special"] for tst_mod_name in numpy_tst_mods: tst_mod = sys.modules.get(tst_mod_name, None) if tst_mod and name in tst_mod.__dict__: return self.save_reduce(_getobject, (tst_mod_name, name)) - raise pickle.PicklingError('cannot save %s. Cannot resolve what module it is defined in' - % str(obj)) + raise pickle.PicklingError( + "cannot save %s. Cannot resolve what module it is defined in" % str(obj) + ) def inject_unity_proxy(self): from turicreate.toolkits._model import Model # get the top level module - tc = __import__(__name__.split('.')[0]) + tc = __import__(__name__.split(".")[0]) if not tc: return ## Make sure the unity objects are not picklable ## @@ -938,13 +1005,22 @@ def inject_unity_proxy(self): self.dispatch[Model] = self.__class__.save_unsupported ## Make sure the underlying cython objects are not picklable ## - self.dispatch[tc._cython.cy_sarray.UnitySArrayProxy] = self.__class__.save_unsupported - self.dispatch[tc._cython.cy_sframe.UnitySFrameProxy] = self.__class__.save_unsupported - self.dispatch[tc._cython.cy_sketch.UnitySketchProxy] = self.__class__.save_unsupported - self.dispatch[tc._cython.cy_graph.UnityGraphProxy] = self.__class__.save_unsupported + self.dispatch[ + tc._cython.cy_sarray.UnitySArrayProxy + ] = self.__class__.save_unsupported + self.dispatch[ + tc._cython.cy_sframe.UnitySFrameProxy + ] = self.__class__.save_unsupported + self.dispatch[ + tc._cython.cy_sketch.UnitySketchProxy + ] = self.__class__.save_unsupported + self.dispatch[ + tc._cython.cy_graph.UnityGraphProxy + ] = self.__class__.save_unsupported self.dispatch[tc._cython.cy_model.UnityModel] = self.__class__.save_unsupported """Special functions for Add-on libraries""" + def inject_addons(self): """Plug in system. Register additional pickling functions if modules already loaded""" self.inject_numpy() @@ -958,26 +1034,30 @@ def save_logger(self, obj): # Tornado support + def is_tornado_coroutine(func): """ Return whether *func* is a Tornado coroutine function. Running coroutines are not supported. """ - if 'tornado.gen' not in sys.modules: + if "tornado.gen" not in sys.modules: return False - gen = sys.modules['tornado.gen'] + gen = sys.modules["tornado.gen"] if not hasattr(gen, "is_coroutine_function"): # Tornado version is too old return False return gen.is_coroutine_function(func) + def _rebuild_tornado_coroutine(func): from tornado import gen + return gen.coroutine(func) # Shorthands for legacy support + def dump(obj, file, protocol=2): CloudPickler(file, protocol).dump(obj) @@ -985,14 +1065,13 @@ def dump(obj, file, protocol=2): def dumps(obj, protocol=2): file = StringIO() - cp = CloudPickler(file,protocol) + cp = CloudPickler(file, protocol) cp.dump(obj) return file.getvalue() - -#hack for __import__ not working as desired +# hack for __import__ not working as desired def subimport(name): __import__(name) return sys.modules[name] @@ -1004,6 +1083,7 @@ def dynamic_subimport(name, vars): sys.modules[name] = mod return mod + # restores function attributes def _restore_attr(obj, attr): for key, val in attr.items(): @@ -1025,21 +1105,23 @@ def _modules_to_main(modList): if not modList: return - main = sys.modules['__main__'] + main = sys.modules["__main__"] for modname in modList: if type(modname) is str: try: mod = __import__(modname) except Exception: - sys.stderr.write('warning: could not import %s\n. ' - 'Your function may unexpectedly error due to this import failing;' - 'A version mismatch is likely. Specific error was:\n' % modname) + sys.stderr.write( + "warning: could not import %s\n. " + "Your function may unexpectedly error due to this import failing;" + "A version mismatch is likely. Specific error was:\n" % modname + ) print_exec(sys.stderr) else: setattr(main, mod.__name__, mod) -#object generators: +# object generators: def _genpartial(func, args, kwds): if not args: args = () @@ -1047,9 +1129,11 @@ def _genpartial(func, args, kwds): kwds = {} return partial(func, *args, **kwds) + def _gen_ellipsis(): return Ellipsis + def _gen_not_implemented(): return NotImplemented @@ -1082,6 +1166,7 @@ def instance(cls): class _empty_cell_value(object): """sentinel for empty closures """ + @classmethod def __reduce__(cls): return cls.__name__ @@ -1109,7 +1194,7 @@ def _make_empty_cell(): if False: # trick the compiler into creating an empty cell in our lambda cell = None - raise AssertionError('this route should not be executed') + raise AssertionError("this route should not be executed") return (lambda: cell).__closure__[0] @@ -1121,12 +1206,12 @@ def _make_skel_func(code, cell_count, base_globals=None): """ if base_globals is None: base_globals = {} - base_globals['__builtins__'] = __builtins__ + base_globals["__builtins__"] = __builtins__ closure = ( tuple(_make_empty_cell() for _ in range(cell_count)) - if cell_count >= 0 else - None + if cell_count >= 0 + else None ) return types.FunctionType(code, base_globals, None, None, closure) @@ -1147,7 +1232,7 @@ def _find_module(mod_name): This function is able to find submodules (e.g. sickit.tree) """ path = None - for part in mod_name.split('.'): + for part in mod_name.split("."): if path is not None: path = [path] file, path, description = imp.find_module(part, path) @@ -1155,17 +1240,20 @@ def _find_module(mod_name): file.close() return path, description + def _load_namedtuple(name, fields): """ Loads a class generated by namedtuple """ from collections import namedtuple + return namedtuple(name, fields) """Constructors for 3rd party libraries Note: These can never be renamed due to client compatibility issues""" + def _getobject(modname, attribute): mod = __import__(modname, fromlist=[attribute]) return mod.__dict__[attribute] diff --git a/src/python/turicreate/util/_config.py b/src/python/turicreate/util/_config.py index 2c69f7c230..a1d041ac24 100644 --- a/src/python/turicreate/util/_config.py +++ b/src/python/turicreate/util/_config.py @@ -11,16 +11,25 @@ import logging import platform + class TuriConfig: - __slots__ = ['turicreate_server', 'server_addr', 'server_bin', 'log_dir', - 'log_rotation_interval','log_rotation_truncate'] + __slots__ = [ + "turicreate_server", + "server_addr", + "server_bin", + "log_dir", + "log_rotation_interval", + "log_rotation_truncate", + ] def __init__(self, server_addr=None): if not server_addr: - server_addr = 'default' + server_addr = "default" self.server_addr = server_addr - gl_root = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), '..')) + gl_root = os.path.abspath( + os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") + ) self.log_rotation_interval = 86400 self.log_rotation_truncate = 8 if "TURI_UNITY" in os.environ: @@ -35,14 +44,18 @@ def __init__(self, server_addr=None): try: self.log_rotation_interval = int(tmp) except: - logging.getLogger(__name__).warning("TURI_LOG_ROTATION_INTERVAL must be an integral value") + logging.getLogger(__name__).warning( + "TURI_LOG_ROTATION_INTERVAL must be an integral value" + ) if "TURI_LOG_ROTATION_TRUNCATE" in os.environ: tmp = os.environ["TURI_LOG_ROTATION_TRUNCATE"] try: self.log_rotation_truncate = int(tmp) except: - logging.getLogger(__name__).warning("TURI_LOG_ROTATION_TRUNCATE must be an integral value") + logging.getLogger(__name__).warning( + "TURI_LOG_ROTATION_TRUNCATE must be an integral value" + ) if "TURI_LOG_PATH" in os.environ: log_dir = os.environ["TURI_LOG_PATH"] @@ -61,8 +74,11 @@ def __init__(self, server_addr=None): def get_unity_log(self): ts = str(int(time.time())) - log_ext = '.log' - root_package_name = 'turicreate' - return os.path.join(self.log_dir, root_package_name + '_server_' + str(ts) + log_ext) + log_ext = ".log" + root_package_name = "turicreate" + return os.path.join( + self.log_dir, root_package_name + "_server_" + str(ts) + log_ext + ) + DEFAULT_CONFIG = TuriConfig() diff --git a/src/python/turicreate/util/_file_util.py b/src/python/turicreate/util/_file_util.py index 03d3dad315..6831bbbca4 100644 --- a/src/python/turicreate/util/_file_util.py +++ b/src/python/turicreate/util/_file_util.py @@ -3,10 +3,10 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" This package provides commonly used methods for dealing with file operation, including working with network file system like S3, http, etc. -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ @@ -19,47 +19,55 @@ __RETRY_TIMES = 5 __SLEEP_SECONDS_BETWEEN_RETRIES = 2 + def get_protocol(path): - '''Given a path, returns the protocol the path uses + """Given a path, returns the protocol the path uses For example, 's3://a/b/c/' returns 's3' 'http://a/b/c' returns 'http' 'tmp/a/bc/' returns '' - ''' - pos = path.find('://') + """ + pos = path.find("://") if pos < 0: - return '' + return "" return path[0:pos].lower() + def expand_full_path(path): - '''Expand a relative path to a full path + """Expand a relative path to a full path For example, '~/tmp' may be expanded to '/Users/username/tmp' 'abc/def' may be expanded to '/pwd/abc/def' - ''' + """ return os.path.abspath(os.path.expanduser(path)) -def exists(path, aws_credentials = {}): + +def exists(path, aws_credentials={}): if is_local_path(path): return os.path.exists(path) else: - raise ValueError('Unsupported protocol %s' % path) + raise ValueError("Unsupported protocol %s" % path) + def is_local_path(path): - '''Returns True if the path indicates a local path, otherwise False''' + """Returns True if the path indicates a local path, otherwise False""" protocol = get_protocol(path) - return protocol != 'hdfs' and protocol != 's3' and \ - protocol != 'http' and protocol != 'https' + return ( + protocol != "hdfs" + and protocol != "s3" + and protocol != "http" + and protocol != "https" + ) -def copy_from_local(localpath, remotepath, is_dir = False, silent = True): +def copy_from_local(localpath, remotepath, is_dir=False, silent=True): if is_local_path(remotepath): if is_dir: shutil.copytree(localpath, remotepath) else: shutil.copy(localpath, remotepath) else: - raise ValueError('Unsupported protocol %s' % remotepath) + raise ValueError("Unsupported protocol %s" % remotepath) diff --git a/src/python/turicreate/util/_progress_table_printer.py b/src/python/turicreate/util/_progress_table_printer.py index 65ef126cb5..fd08a1c63d 100644 --- a/src/python/turicreate/util/_progress_table_printer.py +++ b/src/python/turicreate/util/_progress_table_printer.py @@ -1,7 +1,6 @@ class ProgressTablePrinter(object): - def __init__(self, column_names, column_display_names): - ''' + """ column_names : list(str) Keyword args passed to update(..) @@ -9,32 +8,34 @@ def __init__(self, column_names, column_display_names): Names with are displayed in the header of the table The ordering of column_names and column_display_names must match. - ''' - assert(len(column_names) == len(column_display_names)) + """ + assert len(column_names) == len(column_display_names) num_columns = len(column_names) self.column_names = column_names self.column_width = max(map(lambda x: len(x), column_display_names)) + 2 - self.hr = '+' + '+'.join(['-' * self.column_width] * num_columns) + '+' + self.hr = "+" + "+".join(["-" * self.column_width] * num_columns) + "+" # Print progress table header print(self.hr) - print(('| {:<{width}}' * num_columns + '|').format(*column_display_names, - width=self.column_width - 1)) + print( + ("| {:<{width}}" * num_columns + "|").format( + *column_display_names, width=self.column_width - 1 + ) + ) print(self.hr) - def print_row(self, **kwargs): - ''' + """ keys of kwargs must be the names passed to __init__(...) as `column_names` - ''' - meta_string = '|' + """ + meta_string = "|" for key in self.column_names: - float_specifier = '' + float_specifier = "" if isinstance(kwargs[key], float): - float_specifier = '.3f' + float_specifier = ".3f" meta_string += " {%s:<{width}%s}|" % (key, float_specifier) - kwargs['width'] = self.column_width - 1 + kwargs["width"] = self.column_width - 1 print(meta_string.format(**kwargs)) print(self.hr) diff --git a/src/python/turicreate/util/_sframe_generation.py b/src/python/turicreate/util/_sframe_generation.py index 68b56a4af4..c0327c60ac 100644 --- a/src/python/turicreate/util/_sframe_generation.py +++ b/src/python/turicreate/util/_sframe_generation.py @@ -10,7 +10,9 @@ from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ -def generate_random_sframe(num_rows, column_codes, random_seed = 0): + + +def generate_random_sframe(num_rows, column_codes, random_seed=0): """ Creates a random SFrame with `num_rows` rows and randomly generated column types determined by `column_codes`. The output @@ -70,7 +72,10 @@ def generate_random_sframe(num_rows, column_codes, random_seed = 0): X.materialize() return X -def generate_random_regression_sframe(num_rows, column_codes, random_seed = 0, target_noise_level = 0.25): + +def generate_random_regression_sframe( + num_rows, column_codes, random_seed=0, target_noise_level=0.25 +): """ Creates a random SFrame with `num_rows` rows and randomly generated column types determined by `column_codes`. The output @@ -152,14 +157,21 @@ def generate_random_regression_sframe(num_rows, column_codes, random_seed = 0, t assert isinstance(num_rows, int) assert isinstance(random_seed, int) - X = _generate_random_sframe(num_rows, column_codes, random_seed, True, target_noise_level) + X = _generate_random_sframe( + num_rows, column_codes, random_seed, True, target_noise_level + ) X.materialize() return X -def generate_random_classification_sframe(num_rows, column_codes, num_classes, - misclassification_spread = 0.25, - num_extra_class_bins = None, - random_seed = 0): + +def generate_random_classification_sframe( + num_rows, + column_codes, + num_classes, + misclassification_spread=0.25, + num_extra_class_bins=None, + random_seed=0, +): """ Creates a random SFrame with `num_rows` rows and randomly generated column types determined by `column_codes`. The output @@ -232,7 +244,7 @@ def generate_random_classification_sframe(num_rows, column_codes, num_classes, raise ValueError("num_classes must be >= 2.") if num_extra_class_bins is None: - num_extra_class_bins = 2*num_classes + num_extra_class_bins = 2 * num_classes if num_extra_class_bins < 0: raise ValueError("num_extra_class_bins must be >= 0.") @@ -246,10 +258,14 @@ def generate_random_classification_sframe(num_rows, column_codes, num_classes, assert isinstance(num_classes, int) assert isinstance(num_extra_class_bins, int) - X = _generate_random_classification_sframe( - num_rows, column_codes, random_seed, - num_classes, num_extra_class_bins, misclassification_spread) + num_rows, + column_codes, + random_seed, + num_classes, + num_extra_class_bins, + misclassification_spread, + ) X.materialize() return X diff --git a/src/python/turicreate/util/_type_checks.py b/src/python/turicreate/util/_type_checks.py index ad4f5d4533..1c82417aab 100644 --- a/src/python/turicreate/util/_type_checks.py +++ b/src/python/turicreate/util/_type_checks.py @@ -7,6 +7,7 @@ from __future__ import division as _ from __future__ import absolute_import as _ + def _raise_error_if_not_of_type(arg, expected_type, arg_name=None): """ Check if the input is of expected type. @@ -28,15 +29,19 @@ def _raise_error_if_not_of_type(arg, expected_type, arg_name=None): """ display_name = "%s " % arg_name if arg_name is not None else "Argument " - lst_expected_type = [expected_type] if \ - type(expected_type) == type else expected_type - - err_msg = "%smust be of type %s " % (display_name, - ' or '.join([x.__name__ for x in lst_expected_type])) + lst_expected_type = ( + [expected_type] if type(expected_type) == type else expected_type + ) + + err_msg = "%smust be of type %s " % ( + display_name, + " or ".join([x.__name__ for x in lst_expected_type]), + ) err_msg += "(not %s)." % type(arg).__name__ if not any(map(lambda x: isinstance(arg, x), lst_expected_type)): raise TypeError(err_msg) + def _is_non_string_iterable(obj): # In Python 3, str implements '__iter__'. - return (hasattr(obj, '__iter__') and not isinstance(obj, str)) + return hasattr(obj, "__iter__") and not isinstance(obj, str) diff --git a/src/python/turicreate/util/lambda_closure_capture.py b/src/python/turicreate/util/lambda_closure_capture.py index e6b611fd1a..68fa94aa50 100644 --- a/src/python/turicreate/util/lambda_closure_capture.py +++ b/src/python/turicreate/util/lambda_closure_capture.py @@ -14,6 +14,7 @@ import inspect from .. import meta + class expression_validator(ast.NodeVisitor): """ This tree walk attempts to validate an expression: that the expression @@ -61,9 +62,11 @@ class attribute_reader(ast.NodeVisitor): We need to breakdown the attribute into the original string """ + def default(self, node): - raise NotImplementedError("Cannot process token at " + - str(node.lineno) + ":" + str(node.col_offset)) + raise NotImplementedError( + "Cannot process token at " + str(node.lineno) + ":" + str(node.col_offset) + ) def visit_Name(self, node): return node.id @@ -78,11 +81,12 @@ def __init__(self, name): self.name = name def __str__(self): - return 'λ' + self.name + return "λ" + self.name def __repr__(self): return str(self) + class lambda_closure_visitor(ast.NodeVisitor): """ This implements a *very* limited decompiler. It only handles cases of @@ -91,9 +95,11 @@ class lambda_closure_visitor(ast.NodeVisitor): may be some occurrences of x. No additional statements or expressions are permitted """ - FUNCTION = 0 # I am translating the wrapping lambda function - INNER_CALL = 1 # I am translating the function call inside - PARAMETER = 2 # I am just translating a function parameter + + FUNCTION = 0 # I am translating the wrapping lambda function + INNER_CALL = 1 # I am translating the function call inside + PARAMETER = 2 # I am just translating a function parameter + def __init__(self): # The fn self.closure_fn_name = "" @@ -113,8 +119,9 @@ def __init__(self): self.state = self.FUNCTION def default(self, node): - raise NotImplementedError("Cannot process token at " + - str(node.lineno) + ":" + str(node.col_offset)) + raise NotImplementedError( + "Cannot process token at " + str(node.lineno) + ":" + str(node.col_offset) + ) def __repr__(self): return str(self) @@ -124,39 +131,51 @@ def __str__(self): comma = False for i in self.positional_args: if comma: - ret = ret + ',' + ret = ret + "," ret = ret + str(i) comma = True for i in self.named_args: if comma: - ret = ret + ',' + ret = ret + "," ret = ret + i + ":" + str(self.named_args[i]) comma = True ret = ret + ")" return ret def translate_ast(self, ast_node): - #print(ast.dump(ast_node)) + # print(ast.dump(ast_node)) t = self.visit(ast_node) def visit_Module(self, node): - if (self.state != self.FUNCTION): - raise NotImplementedError("Unexpected module in position " + - str(node.lineno) + ":" + str(node.col_offset)) + if self.state != self.FUNCTION: + raise NotImplementedError( + "Unexpected module in position " + + str(node.lineno) + + ":" + + str(node.col_offset) + ) for line in node.body: self.visit(line) def visit_Call(self, node): - if (self.state != self.INNER_CALL): - raise NotImplementedError("Unexpected call in position " + - str(node.lineno) + ":" + str(node.col_offset)) + if self.state != self.INNER_CALL: + raise NotImplementedError( + "Unexpected call in position " + + str(node.lineno) + + ":" + + str(node.col_offset) + ) self.state = self.INNER_CALL # this is the main closure function call if self.closure_fn_name != "": - raise NotImplementedError("Cannot translate function call " + - str(node.lineno) + ":" + str(node.col_offset)) + raise NotImplementedError( + "Cannot translate function call " + + str(node.lineno) + + ":" + + str(node.col_offset) + ) elif type(node.func) is ast.Name: self.closure_fn_name = node.func.id elif type(node.func) is ast.Attribute: @@ -174,13 +193,18 @@ def visit_Call(self, node): try: expression_validator(self.input_arg_names).visit(arg) # try to evaluate the ast - result = eval(compile(ast.Expression(arg), '', 'eval'), self.caller_globals) + result = eval( + compile(ast.Expression(arg), "", "eval"), + self.caller_globals, + ) except: - raise NotImplementedError("Only simple expressions not using the function arguments are permitted") + raise NotImplementedError( + "Only simple expressions not using the function arguments are permitted" + ) self.positional_args += [result] # keyword arguments next - keywordargs = {i.arg:i.value for i in node.keywords} + keywordargs = {i.arg: i.value for i in node.keywords} for i in keywordargs: arg = keywordargs[i] if type(arg) is ast.Name and arg.id in self.input_arg_names: @@ -189,15 +213,18 @@ def visit_Call(self, node): try: expression_validator(self.input_arg_names).visit(arg) # try to evaluate the ast - result = eval(compile(ast.Expression(arg), '', 'eval'), self.caller_globals) + result = eval( + compile(ast.Expression(arg), "", "eval"), + self.caller_globals, + ) except: - raise NotImplementedError("Only simple expressions not using the function arguments are permitted") + raise NotImplementedError( + "Only simple expressions not using the function arguments are permitted" + ) self.named_args[i] = result - - def visit_arguments(self, node): - if (self.state != self.FUNCTION): + if self.state != self.FUNCTION: raise NotImplementedError("Unexpected function") if sys.version_info.major == 2: self.input_arg_names = [arg.id for arg in node.args] @@ -205,19 +232,18 @@ def visit_arguments(self, node): self.input_arg_names = [arg.arg for arg in node.args] def visit_Name(self, node): - raise NotImplementedError("Unexpected name") + raise NotImplementedError("Unexpected name") def visit_Return(self, node): - if (self.state != self.INNER_CALL): + if self.state != self.INNER_CALL: raise NotImplementedError("Unexpected return") return self.visit(node.value) def visit_Lambda(self, node): return self.visit_FunctionDef(node) - def visit_FunctionDef(self, node): - if (self.state != self.FUNCTION): + if self.state != self.FUNCTION: raise NotImplementedError("Unexpected function") self.visit(node.args) @@ -229,9 +255,9 @@ def visit_FunctionDef(self, node): # it actually shows up in the ast as a Expr.str # so we need to catch that and skip it try: - if type(next_node) is ast.Expr and type(next_node.value) is ast.Str: - # this is *probably* a doc string! - next_node = node.body[1] + if type(next_node) is ast.Expr and type(next_node.value) is ast.Str: + # this is *probably* a doc string! + next_node = node.body[1] except: # just in case the above fails for various reasons like say... # there is *only* a doc string. We still fail with the @@ -250,8 +276,9 @@ def visit_FunctionDef(self, node): def visit_ClassDef(self, node): raise NotImplementedError("Classes are not implemented") + def _isalambda(v): - return isinstance(v, type(lambda: None)) and v.__name__ == '' + return isinstance(v, type(lambda: None)) and v.__name__ == "" def translate(fn): @@ -289,6 +316,8 @@ def translate(fn): raise RuntimeError("Cannot process provided function") visitor.translate_ast(ast_node) return visitor + + # if __name__ == "__main__": # if len(sys.argv) <= 1: # print("Usage:\n\t./Lua_Translator.py \n") diff --git a/src/python/turicreate/version_info.py b/src/python/turicreate/version_info.py index 9952485365..6f34754244 100644 --- a/src/python/turicreate/version_info.py +++ b/src/python/turicreate/version_info.py @@ -3,15 +3,15 @@ # # Use of this source code is governed by a BSD-3-clause license that can # be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -''' +""" Obtain the version number of the current turicreate create. -''' +""" from __future__ import print_function as _ from __future__ import division as _ from __future__ import absolute_import as _ # python egg version -__version__ = '6.0'#{{VERSION_STRING}} +__version__ = "6.0" # {{VERSION_STRING}} version = __version__ -build_number = '0'#{{BUILD_NUMBER}} -git_sha = 'NA'#{{GIT_SHA}} +build_number = "0" # {{BUILD_NUMBER}} +git_sha = "NA" # {{GIT_SHA}} diff --git a/src/python/turicreate/visualization/_plot.py b/src/python/turicreate/visualization/_plot.py index 6a04299661..d5de2ad7a6 100644 --- a/src/python/turicreate/visualization/_plot.py +++ b/src/python/turicreate/visualization/_plot.py @@ -10,41 +10,68 @@ LABEL_DEFAULT = "__TURI_DEFAULT_LABEL" -_target = 'auto' +_target = "auto" _SUCCESS = 0 _CANVAS_PREBUILT_NOT_FOUND_ERROR = 1 _NODE_NOT_FOUND_ERROR_CODE = 127 _PERMISSION_DENIED_ERROR_CODE = 243 + def _get_client_app_path(): (tcviz_dir, _) = _os.path.split(_os.path.dirname(__file__)) - if _sys.platform != 'darwin' and _sys.platform != 'linux2' and _sys.platform != 'linux' : - raise NotImplementedError('Visualization is currently supported only on macOS and Linux.') + if ( + _sys.platform != "darwin" + and _sys.platform != "linux2" + and _sys.platform != "linux" + ): + raise NotImplementedError( + "Visualization is currently supported only on macOS and Linux." + ) - if _sys.platform == 'darwin': - return _os.path.join(tcviz_dir, 'Turi Create Visualization.app', 'Contents', 'MacOS', 'Turi Create Visualization') + if _sys.platform == "darwin": + return _os.path.join( + tcviz_dir, + "Turi Create Visualization.app", + "Contents", + "MacOS", + "Turi Create Visualization", + ) + + if _sys.platform == "linux2" or _sys.platform == "linux": + return _os.path.join( + tcviz_dir, "Turi Create Visualization", "visualization_client" + ) - if _sys.platform == 'linux2' or _sys.platform == 'linux': - return _os.path.join(tcviz_dir, 'Turi Create Visualization', 'visualization_client') def _ensure_web_server(): import turicreate as tc - if (tc.config.get_runtime_config()['TURI_VISUALIZATION_WEB_SERVER_ROOT_DIRECTORY'] == ''): + + if ( + tc.config.get_runtime_config()["TURI_VISUALIZATION_WEB_SERVER_ROOT_DIRECTORY"] + == "" + ): path_to_client = _get_client_app_path() - tc.config.set_runtime_config('TURI_VISUALIZATION_WEB_SERVER_ROOT_DIRECTORY', - _os.path.abspath(_os.path.join(_os.path.dirname(path_to_client), '..', 'Resources', 'build')) + tc.config.set_runtime_config( + "TURI_VISUALIZATION_WEB_SERVER_ROOT_DIRECTORY", + _os.path.abspath( + _os.path.join( + _os.path.dirname(path_to_client), "..", "Resources", "build" + ) + ), ) + def _run_cmdline(command): # runs a shell command p = _Popen(args=command, stdout=_PIPE, stderr=_PIPE, shell=True) - stdout_feed, stderr_feed = p.communicate() # wait for completion + stdout_feed, stderr_feed = p.communicate() # wait for completion exit_code = p.poll() return (exit_code, stdout_feed, stderr_feed) -def set_target(target='auto'): + +def set_target(target="auto"): """ Sets the target for visualizations launched with the `show` method. If unset, or if target is not provided, defaults to 'auto'. @@ -69,8 +96,10 @@ def set_target(target='auto'): * 'none': prevent all visualizations from being displayed. """ global _target - if target not in ['auto', 'browser', 'gui', 'none']: - raise ValueError("Expected target to be one of: 'auto', 'browser', 'gui', 'none'.") + if target not in ["auto", "browser", "gui", "none"]: + raise ValueError( + "Expected target to be one of: 'auto', 'browser', 'gui', 'none'." + ) _target = target @@ -99,9 +128,11 @@ class Plot(object): >>> plt.save('vega_spec.json', False) """ + def __init__(self, vega_spec=None, _proxy=None): if vega_spec is not None: import turicreate as tc + self.__proxy__ = tc.extensions.plot_from_vega_spec(vega_spec) else: self.__proxy__ = _proxy @@ -128,24 +159,27 @@ def show(self): global _target # Suppress visualization output if 'none' target is set - if _target == 'none': + if _target == "none": return # If browser target is set, launch in web browser - if _target == 'browser': + if _target == "browser": # First, make sure TURI_VISUALIZATION_WEB_SERVER_ROOT_DIRECTORY is set _ensure_web_server() # Launch this plot's URL using Python built-in webbrowser module import webbrowser + url = self.get_url() webbrowser.open_new_tab(url) return # If auto target is set, try to show inline in Jupyter Notebook try: - if _target == 'auto' and \ - (get_ipython().__class__.__name__ == "ZMQInteractiveShell" or get_ipython().__class__.__name__ == "Shell"): + if _target == "auto" and ( + get_ipython().__class__.__name__ == "ZMQInteractiveShell" + or get_ipython().__class__.__name__ == "Shell" + ): self._repr_javascript_() return except NameError: @@ -161,8 +195,10 @@ def show(self): # TODO: allow autodetection of light/dark mode. # Disabled for now, since the GUI side needs some work (ie. background color). - plot_variation = 0x10 # force light mode - self.__proxy__.call_function('show', {'path_to_client': path_to_client, 'variation': plot_variation}) + plot_variation = 0x10 # force light mode + self.__proxy__.call_function( + "show", {"path_to_client": path_to_client, "variation": plot_variation} + ) def save(self, filepath): """ @@ -198,56 +234,74 @@ def save(self, filepath): if filepath.endswith(".json"): # save as vega json - spec = self.get_vega(include_data = True) - with open(filepath, 'w') as fp: + spec = self.get_vega(include_data=True) + with open(filepath, "w") as fp: _json.dump(spec, fp) elif filepath.endswith(".png") or filepath.endswith(".svg"): # save as png/svg, but json first - spec = self.get_vega(include_data = True) + spec = self.get_vega(include_data=True) EXTENSION_START_INDEX = -3 extension = filepath[EXTENSION_START_INDEX:] temp_file_tuple = _mkstemp() temp_file_path = temp_file_tuple[1] - with open(temp_file_path, 'w') as fp: + with open(temp_file_path, "w") as fp: _json.dump(spec, fp) dirname = _os.path.dirname(__file__) relative_path_to_vg2png_vg2svg = "../vg2" + extension - absolute_path_to_vg2png_vg2svg = _os.path.join(dirname, - relative_path_to_vg2png_vg2svg) + absolute_path_to_vg2png_vg2svg = _os.path.join( + dirname, relative_path_to_vg2png_vg2svg + ) # try node vg2[png|svg] json_filepath out_filepath - (exitcode, stdout, stderr) = _run_cmdline("node " + - absolute_path_to_vg2png_vg2svg + " " - + temp_file_path + " " + filepath) + (exitcode, stdout, stderr) = _run_cmdline( + "node " + + absolute_path_to_vg2png_vg2svg + + " " + + temp_file_path + + " " + + filepath + ) if exitcode == _NODE_NOT_FOUND_ERROR_CODE: # user doesn't have node installed - raise RuntimeError("Node.js not found. Saving as PNG and SVG" + - " requires Node.js, please download and install Node.js " + - "from here and try again: https://nodejs.org/en/download/") + raise RuntimeError( + "Node.js not found. Saving as PNG and SVG" + + " requires Node.js, please download and install Node.js " + + "from here and try again: https://nodejs.org/en/download/" + ) elif exitcode == _CANVAS_PREBUILT_NOT_FOUND_ERROR: # try to see if canvas-prebuilt is globally installed # if it is, then link it # if not, tell the user to install it - (is_installed_exitcode, + ( + is_installed_exitcode, is_installed_stdout, - is_installed_stderr) = _run_cmdline( - "npm ls -g -json | grep canvas-prebuilt") + is_installed_stderr, + ) = _run_cmdline("npm ls -g -json | grep canvas-prebuilt") if is_installed_exitcode == _SUCCESS: # npm link canvas-prebuilt link_exitcode, link_stdout, link_stderr = _run_cmdline( - "npm link canvas-prebuilt") + "npm link canvas-prebuilt" + ) if link_exitcode == _PERMISSION_DENIED_ERROR_CODE: # They don't have permission, tell them. - raise RuntimeError(link_stderr + '\n\n' + - "`npm link canvas-prebuilt` failed, " + - "Permission Denied.") + raise RuntimeError( + link_stderr + + "\n\n" + + "`npm link canvas-prebuilt` failed, " + + "Permission Denied." + ) elif link_exitcode == _SUCCESS: # canvas-prebuilt link is now successful, so run the # node vg2[png|svg] json_filepath out_filepath # command again. - (exitcode, stdout, stderr) = _run_cmdline("node " + - absolute_path_to_vg2png_vg2svg + " " - + temp_file_path + " " + filepath) + (exitcode, stdout, stderr) = _run_cmdline( + "node " + + absolute_path_to_vg2png_vg2svg + + " " + + temp_file_path + + " " + + filepath + ) if exitcode != _SUCCESS: # something else that we have not identified yet # happened. @@ -255,11 +309,13 @@ def save(self, filepath): else: raise RuntimeError(link_stderr) else: - raise RuntimeError("canvas-prebuilt not found. " + - "Saving as PNG and SVG requires canvas-prebuilt, " + - "please download and install canvas-prebuilt by " + - "running this command, and try again: " + - "`npm install -g canvas-prebuilt`") + raise RuntimeError( + "canvas-prebuilt not found. " + + "Saving as PNG and SVG requires canvas-prebuilt, " + + "please download and install canvas-prebuilt by " + + "running this command, and try again: " + + "`npm install -g canvas-prebuilt`" + ) elif exitcode == _SUCCESS: pass else: @@ -267,20 +323,23 @@ def save(self, filepath): # delete temp file that user didn't ask for _run_cmdline("rm " + temp_file_path) else: - raise NotImplementedError("filename must end in" + - " .json, .svg, or .png") + raise NotImplementedError("filename must end in" + " .json, .svg, or .png") def get_data(self): - return _json.loads(self.__proxy__.call_function('get_data')) + return _json.loads(self.__proxy__.call_function("get_data")) def get_vega(self, include_data=True): # TODO: allow autodetection of light/dark mode. # Disabled for now, since the GUI side needs some work (ie. background color). - plot_variation = 0x10 # force light mode - return _json.loads(self.__proxy__.call_function('get_spec', {'include_data': include_data, 'variation': plot_variation})) + plot_variation = 0x10 # force light mode + return _json.loads( + self.__proxy__.call_function( + "get_spec", {"include_data": include_data, "variation": plot_variation} + ) + ) def materialize(self): - self.__proxy__.call_function('materialize') + self.__proxy__.call_function("materialize") def get_url(self): """ @@ -290,7 +349,7 @@ def get_url(self): -------- The URL will be served by Turi Create on http://localhost. """ - return self.__proxy__.call_function('get_url') + return self.__proxy__.call_function("get_url") def _repr_javascript_(self): from IPython.core.display import display, HTML @@ -298,7 +357,8 @@ def _repr_javascript_(self): self.materialize() vega_spec = self.get_vega(True) - vega_html = ' \ + vega_html = ( + ' \ \ \ \ @@ -330,7 +390,11 @@ def _repr_javascript_(self):
\
\ \ \ ' + ) - display(HTML(' \ + display( + HTML( + ' \ \ - \ \ - ')); + ' + ) + ) + def display_table_in_notebook(sf, title=None): from IPython.core.display import display @@ -365,32 +443,48 @@ def display_table_in_notebook(sf, title=None): def image_formatter(im): image_buffer = BytesIO() - im.save(image_buffer, format='PNG') - return "" + im.save(image_buffer, format="PNG") + return ( + '' + ) import pandas as pd + maximum_rows = 100 if len(sf) > maximum_rows: import warnings - warnings.warn('Displaying only the first {} rows.'.format(maximum_rows)) + + warnings.warn("Displaying only the first {} rows.".format(maximum_rows)) sf = sf[:maximum_rows] - + check_image_column = [_Image == x for x in sf.column_types()] zipped_image_columns = zip(sf.column_names(), check_image_column) image_columns = filter(lambda a: a[1], zipped_image_columns) image_key = [x[0] for x in image_columns] - image_column_formatter = dict.fromkeys(image_key , image_formatter) - - with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', -1): + image_column_formatter = dict.fromkeys(image_key, image_formatter) + + with pd.option_context( + "display.max_rows", + None, + "display.max_columns", + None, + "display.max_colwidth", + -1, + ): if _sys.version_info.major < 3: import cgi - title = cgi.escape(title,quote = True) + + title = cgi.escape(title, quote=True) else: import html - title = html.escape(title,quote = True) + + title = html.escape(title, quote=True) df = sf.to_dataframe() - html_string = ' \ + html_string = ( + ' \ \ \ \ \ -

'+ title +'

\ - '+df.to_html(formatters=image_column_formatter, escape=False, classes='sframe')+'\ +

' + + title + + "

\ + " + + df.to_html( + formatters=image_column_formatter, escape=False, classes="sframe" + ) + + "\ \ - ' + " + ) display(HTML(html_string)) diff --git a/src/python/turicreate/visualization/show.py b/src/python/turicreate/visualization/show.py index 5cd6cc8ca1..e9e20205b4 100644 --- a/src/python/turicreate/visualization/show.py +++ b/src/python/turicreate/visualization/show.py @@ -9,6 +9,7 @@ from ._plot import Plot, LABEL_DEFAULT import turicreate as tc + def _get_title(title): if title == "": title = " " @@ -17,6 +18,7 @@ def _get_title(title): return title + def plot(x, y, xlabel=LABEL_DEFAULT, ylabel=LABEL_DEFAULT, title=LABEL_DEFAULT): """ Plots the data in `x` on the X axis and the data in `y` on the Y axis @@ -77,13 +79,14 @@ def plot(x, y, xlabel=LABEL_DEFAULT, ylabel=LABEL_DEFAULT, title=LABEL_DEFAULT): """ title = _get_title(title) - if not(isinstance(x, tc.SArray)): - raise ValueError("The X axis data should be an SArray.") - if not(isinstance(y, tc.SArray)): - raise ValueError("The Y axis data should be an SArray.") + if not (isinstance(x, tc.SArray)): + raise ValueError("The X axis data should be an SArray.") + if not (isinstance(y, tc.SArray)): + raise ValueError("The Y axis data should be an SArray.") plt_ref = tc.extensions.plot(x, y, xlabel, ylabel, title) return Plot(_proxy=plt_ref) + def show(x, y, xlabel=LABEL_DEFAULT, ylabel=LABEL_DEFAULT, title=LABEL_DEFAULT): """ Plots the data in `x` on the X axis and the data in `y` on the Y axis @@ -146,6 +149,7 @@ def show(x, y, xlabel=LABEL_DEFAULT, ylabel=LABEL_DEFAULT, title=LABEL_DEFAULT): """ plot(x, y, xlabel, ylabel, title).show() + def scatter(x, y, xlabel=LABEL_DEFAULT, ylabel=LABEL_DEFAULT, title=LABEL_DEFAULT): """ Plots the data in `x` on the X axis and the data in `y` on the Y axis @@ -185,18 +189,25 @@ def scatter(x, y, xlabel=LABEL_DEFAULT, ylabel=LABEL_DEFAULT, title=LABEL_DEFAUL >>> y = x * 2 >>> scplt = turicreate.visualization.scatter(x, y) """ - if (not isinstance(x, tc.data_structures.sarray.SArray) or - not isinstance(y, tc.data_structures.sarray.SArray) or - x.dtype not in [int, float] or y.dtype not in [int, float]): - raise ValueError("turicreate.visualization.scatter supports " + - "SArrays of dtypes: int, float") + if ( + not isinstance(x, tc.data_structures.sarray.SArray) + or not isinstance(y, tc.data_structures.sarray.SArray) + or x.dtype not in [int, float] + or y.dtype not in [int, float] + ): + raise ValueError( + "turicreate.visualization.scatter supports " + + "SArrays of dtypes: int, float" + ) # legit input title = _get_title(title) - plt_ref = tc.extensions.plot_scatter(x, y, - xlabel, ylabel,title) + plt_ref = tc.extensions.plot_scatter(x, y, xlabel, ylabel, title) return Plot(_proxy=plt_ref) -def categorical_heatmap(x, y, xlabel=LABEL_DEFAULT, ylabel=LABEL_DEFAULT, title=LABEL_DEFAULT): + +def categorical_heatmap( + x, y, xlabel=LABEL_DEFAULT, ylabel=LABEL_DEFAULT, title=LABEL_DEFAULT +): """ Plots the data in `x` on the X axis and the data in `y` on the Y axis in a 2d categorical heatmap, and returns the resulting Plot object. @@ -234,17 +245,22 @@ def categorical_heatmap(x, y, xlabel=LABEL_DEFAULT, ylabel=LABEL_DEFAULT, title= >>> y = turicreate.SArray(['a','b','c','d','e']) >>> catheat = turicreate.visualization.categorical_heatmap(x, y) """ - if (not isinstance(x, tc.data_structures.sarray.SArray) or - not isinstance(y, tc.data_structures.sarray.SArray) or - x.dtype != str or y.dtype != str): - raise ValueError("turicreate.visualization.categorical_heatmap supports " + - "SArrays of dtype: str") + if ( + not isinstance(x, tc.data_structures.sarray.SArray) + or not isinstance(y, tc.data_structures.sarray.SArray) + or x.dtype != str + or y.dtype != str + ): + raise ValueError( + "turicreate.visualization.categorical_heatmap supports " + + "SArrays of dtype: str" + ) # legit input title = _get_title(title) - plt_ref = tc.extensions.plot_categorical_heatmap(x, y, - xlabel, ylabel, title) + plt_ref = tc.extensions.plot_categorical_heatmap(x, y, xlabel, ylabel, title) return Plot(_proxy=plt_ref) + def heatmap(x, y, xlabel=LABEL_DEFAULT, ylabel=LABEL_DEFAULT, title=LABEL_DEFAULT): """ Plots the data in `x` on the X axis and the data in `y` on the Y axis @@ -283,16 +299,21 @@ def heatmap(x, y, xlabel=LABEL_DEFAULT, ylabel=LABEL_DEFAULT, title=LABEL_DEFAUL >>> y = x * 2 >>> heat = turicreate.visualization.heatmap(x, y) """ - if (not isinstance(x, tc.data_structures.sarray.SArray) or - not isinstance(y, tc.data_structures.sarray.SArray) or - x.dtype not in [int, float] or y.dtype not in [int, float]): - raise ValueError("turicreate.visualization.heatmap supports " + - "SArrays of dtype: int, float") + if ( + not isinstance(x, tc.data_structures.sarray.SArray) + or not isinstance(y, tc.data_structures.sarray.SArray) + or x.dtype not in [int, float] + or y.dtype not in [int, float] + ): + raise ValueError( + "turicreate.visualization.heatmap supports " + + "SArrays of dtype: int, float" + ) title = _get_title(title) - plt_ref = tc.extensions.plot_heatmap(x, y, - xlabel, ylabel, title) + plt_ref = tc.extensions.plot_heatmap(x, y, xlabel, ylabel, title) return Plot(_proxy=plt_ref) + def box_plot(x, y, xlabel=LABEL_DEFAULT, ylabel=LABEL_DEFAULT, title=LABEL_DEFAULT): """ Plots the data in `x` on the X axis and the data in `y` on the Y axis @@ -329,17 +350,22 @@ def box_plot(x, y, xlabel=LABEL_DEFAULT, ylabel=LABEL_DEFAULT, title=LABEL_DEFAU >>> bp = turicreate.visualization.box_plot(tc.SArray(['a','b','c','a','a']),tc.SArray([4.0,3.25,2.1,2.0,1.0])) """ - if (not isinstance(x, tc.data_structures.sarray.SArray) or - not isinstance(y, tc.data_structures.sarray.SArray) or - x.dtype != str or y.dtype not in [int, float]): - raise ValueError("turicreate.visualization.box_plot supports " + - "x as SArray of dtype str and y as SArray of dtype: int, float." + - "\nExample: turicreate.visualization.box_plot(tc.SArray(['a','b','c','a','a']),tc.SArray([4.0,3.25,2.1,2.0,1.0]))") + if ( + not isinstance(x, tc.data_structures.sarray.SArray) + or not isinstance(y, tc.data_structures.sarray.SArray) + or x.dtype != str + or y.dtype not in [int, float] + ): + raise ValueError( + "turicreate.visualization.box_plot supports " + + "x as SArray of dtype str and y as SArray of dtype: int, float." + + "\nExample: turicreate.visualization.box_plot(tc.SArray(['a','b','c','a','a']),tc.SArray([4.0,3.25,2.1,2.0,1.0]))" + ) title = _get_title(title) - plt_ref = tc.extensions.plot_boxes_and_whiskers(x, y, - xlabel, ylabel, title) + plt_ref = tc.extensions.plot_boxes_and_whiskers(x, y, xlabel, ylabel, title) return Plot(_proxy=plt_ref) + def columnwise_summary(sf): """ Plots a columnwise summary of the sframe provided as input, @@ -367,11 +393,13 @@ def columnwise_summary(sf): >>> colsum = turicreate.visualization.columnwise_summary(sf_test) """ if not isinstance(sf, tc.data_structures.sframe.SFrame): - raise ValueError("turicreate.visualization.columnwise_summary " + - "supports SFrame") + raise ValueError( + "turicreate.visualization.columnwise_summary " + "supports SFrame" + ) plt_ref = tc.extensions.plot_columnwise_summary(sf) return Plot(_proxy=plt_ref) + def histogram(sa, xlabel=LABEL_DEFAULT, ylabel=LABEL_DEFAULT, title=LABEL_DEFAULT): """ Plots a histogram of the sarray provided as input, and returns the @@ -405,15 +433,19 @@ def histogram(sa, xlabel=LABEL_DEFAULT, ylabel=LABEL_DEFAULT, title=LABEL_DEFAUL >>> x = turicreate.SArray([1,2,3,4,5,1,1,1,1,2,2,3,2,3,1,1,1,4]) >>> hist = turicreate.visualization.histogram(x) """ - if (not isinstance(sa, tc.data_structures.sarray.SArray) or - sa.dtype not in [int, float]): - raise ValueError("turicreate.visualization.histogram supports " + - "SArrays of dtypes: int, float") + if not isinstance(sa, tc.data_structures.sarray.SArray) or sa.dtype not in [ + int, + float, + ]: + raise ValueError( + "turicreate.visualization.histogram supports " + + "SArrays of dtypes: int, float" + ) title = _get_title(title) - plt_ref = tc.extensions.plot_histogram(sa, - xlabel, ylabel, title) + plt_ref = tc.extensions.plot_histogram(sa, xlabel, ylabel, title) return Plot(_proxy=plt_ref) + def item_frequency(sa, xlabel=LABEL_DEFAULT, ylabel=LABEL_DEFAULT, title=LABEL_DEFAULT): """ Plots an item frequency of the sarray provided as input, and returns the @@ -447,11 +479,10 @@ def item_frequency(sa, xlabel=LABEL_DEFAULT, ylabel=LABEL_DEFAULT, title=LABEL_D >>> x = turicreate.SArray(['a','ab','acd','ab','a','a','a','ab','cd']) >>> ifplt = turicreate.visualization.item_frequency(x) """ - if (not isinstance(sa, tc.data_structures.sarray.SArray) or - sa.dtype != str): - raise ValueError("turicreate.visualization.item_frequency supports " + - "SArrays of dtype str") + if not isinstance(sa, tc.data_structures.sarray.SArray) or sa.dtype != str: + raise ValueError( + "turicreate.visualization.item_frequency supports " + "SArrays of dtype str" + ) title = _get_title(title) - plt_ref = tc.extensions.plot_item_frequency(sa, - xlabel, ylabel, title) + plt_ref = tc.extensions.plot_item_frequency(sa, xlabel, ylabel, title) return Plot(_proxy=plt_ref)