Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove separator from local classifier per parent node #104

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 79 additions & 0 deletions hiclass/DirectedAcyclicGraph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
from hiclass.Node import Node


class DirectedAcyclicGraph:
    """
    Manages the directed acyclic graph used in HiClass.

    It tries to copy networkx API as much as possible,
    but extends it by adding support for multiple nodes with the same name,
    as long as they have different predecessors.
    """

    def __init__(self, n_rows):
        """
        Initialize a directed acyclic graph.

        Parameters
        ----------
        n_rows : int
            The number of rows in x and y, i.e., the features and labels matrices.
        """
        self.root = Node(n_rows, "root", True)
        # Name -> node lookup.
        # NOTE(review): the lookup is keyed by name only, so when the same
        # name is added under different predecessors the entry added last
        # replaces the earlier one here (all nodes stay reachable from root).
        self.nodes = {"root": self.root}

    def add_node(self, node_name):
        """
        Add a new node as successor of the root node.

        Empty names are ignored.

        Parameters
        ----------
        node_name : str
            The name of the node.
        """
        if node_name != "":
            new_node = self.root.add_successor(node_name)
            self.nodes[node_name] = new_node

    def add_path(self, nodes):
        """
        Add new nodes from a path.

        The path is consumed from left to right and stops at the first
        empty label. An empty sequence is ignored.

        Parameters
        ----------
        nodes : np.ndarray
            The list with the path, e.g., [a b c] = a -> b -> c
        """
        if len(nodes) == 0:
            # Nothing to add; indexing nodes[0] below would raise IndexError.
            return
        first = nodes[0]
        leaf = self.root.add_successor(first)
        # NOTE: when the first label is empty, add_successor returns None and
        # that placeholder is still stored under the empty key; existing
        # callers/tests count this entry, so it is preserved on purpose.
        self.nodes[first] = leaf
        if first == "":
            return
        for name in nodes[1:]:
            if name == "":
                # Remaining labels are padding; the path ends here.
                break
            leaf = leaf.add_successor(name)
            self.nodes[name] = leaf

    def is_acyclic(self):
        """
        Check that no node is reached more than once when walking from the root.

        NOTE(review): a node reachable through two different paths is also
        reported as a violation, not only true cycles.

        Returns
        -------
        acyclic : bool
            False if some node is visited twice during the traversal,
            True otherwise.
        """
        visited = set()
        # A plain stack gives O(1) pops; the visiting order does not affect
        # whether a node is encountered twice.
        to_visit = [self.root]
        while to_visit:
            current = to_visit.pop()
            if current in visited:
                return False
            visited.add(current)
            to_visit.extend(current.successors.values())
        return True

    def get_parent_nodes(self):
        """
        Return the registered nodes that have at least one successor.

        Returns
        -------
        parent_nodes : list
            Nodes from ``self.nodes`` (in insertion order) that are not leaves.
        """
        return [
            node
            for node in self.nodes.values()
            # Skip the None placeholder that add_path stores for empty paths,
            # and skip leaf nodes.
            if node is not None and len(node.successors) > 0
        ]
124 changes: 96 additions & 28 deletions hiclass/LocalClassifierPerParentNode.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,17 @@
Numeric and string output labels are both handled.
"""

from copy import deepcopy

import networkx as nx
import numpy as np
from copy import deepcopy
from sklearn.base import BaseEstimator
from sklearn.utils.validation import _check_sample_weight
from sklearn.utils.validation import check_array, check_is_fitted

from hiclass.ConstantClassifier import ConstantClassifier
from hiclass.DirectedAcyclicGraph import DirectedAcyclicGraph
from hiclass.HierarchicalClassifier import HierarchicalClassifier
from hiclass.HierarchicalClassifier import make_leveled


class LocalClassifierPerParentNode(BaseEstimator, HierarchicalClassifier):
Expand Down Expand Up @@ -98,7 +100,7 @@ def fit(self, X, y, sample_weight=None):
Fitted estimator.
"""
# Execute common methods necessary before fitting
super()._pre_fit(X, y, sample_weight)
self._pre_fit(X, y, sample_weight)

# Fit local classifiers in DAG
super().fit(X, y)
Expand Down Expand Up @@ -157,6 +159,90 @@ def predict(self, X):

return y

def _pre_fit(self, X, y, sample_weight):
    """
    Run the preparation steps that must happen before fitting.

    Stores the inputs on the estimator (``X_``, ``y_``, ``sample_weight_``),
    then builds and checks the hierarchy and initializes the local
    classifiers, in that order.

    Parameters
    ----------
    X : array-like
        The training input samples.
    y : array-like
        The target labels.
    sample_weight : array-like or None
        Optional per-sample weights; stored as None when not given.
    """
    if self.bert:
        # No validation in this mode: inputs are only converted to arrays
        self.X_ = np.array(X)
        self.y_ = np.array(y)
    else:
        # Validate shapes and convert to np.ndarray if need be
        self.X_, self.y_ = self._validate_data(
            X, y, multi_output=True, accept_sparse="csr", allow_nd=True
        )

    self.sample_weight_ = (
        None if sample_weight is None else _check_sample_weight(sample_weight, X)
    )

    # presumably equalizes the depth of the label rows -- confirm in
    # hiclass.HierarchicalClassifier.make_leveled
    self.y_ = make_leveled(self.y_)

    # Logger must exist before the steps below, which log through it
    self._create_logger()

    # Build the DAG from self.y_ and keep it in self.hierarchy_
    self._create_digraph()

    # Export the DAG to CSV (for Gephi) when the user passed edge_list
    self._export_digraph()

    # Refuse to continue if the graph is not directed acyclic
    self._assert_digraph_is_dag()

    # Binary policies expect 2D y, so 1D y is converted
    self._convert_1d_y_to_2d()

    # Attach local classifiers to the DAG
    self._initialize_local_classifiers()

def _create_digraph(self):
    """
    Create ``self.hierarchy_`` from ``self.y_``.

    Also records the dtype of ``self.y_`` in ``self.dtype_``.

    Raises
    ------
    ValueError
        If ``self.y_`` has more than 2 dimensions.
    """
    n_samples = self.X_.shape[0]
    self.hierarchy_ = DirectedAcyclicGraph(n_samples)

    # Remember the dtype of the labels
    self.dtype_ = self.y_.dtype

    # Unsupported dimension: neither the 1D nor the 2D builder handles it
    if self.y_.ndim > 2:
        self.logger_.error(f"y with {self.y_.ndim} dimensions detected")
        raise ValueError(
            f"Creating graph from y with {self.y_.ndim} dimensions is not supported"
        )

    # Each builder only acts when y_ has the matching dimensionality
    self._create_digraph_1d()
    self._create_digraph_2d()

def _create_digraph_1d(self):
    """
    Populate the hierarchy from 1D labels.

    A 2D ``self.y_`` with a single column is flattened first. Does nothing
    when ``self.y_`` is not 1D after that step.
    """
    labels = self.y_
    single_column = labels.ndim == 2 and labels.shape[1] == 1
    if single_column:
        # 1D labels disguised as a 2D column
        self.logger_.info("Converting y to 1D")
        labels = labels.flatten()
        self.y_ = labels

    if labels.ndim != 1:
        return

    # Flat labels mean a single level in the hierarchy
    self.max_levels_ = 1
    self.logger_.info(f"Creating digraph from {self.y_.size} 1D labels")
    add_node = self.hierarchy_.add_node
    for label in labels:
        add_node(label)

def _create_digraph_2d(self):
    """
    Populate the hierarchy from 2D labels, one path per row.

    Does nothing when ``self.y_`` is not 2D.
    """
    if self.y_.ndim != 2:
        return

    rows, n_levels = self.y_.shape
    # The number of columns is the depth of the hierarchy
    self.max_levels_ = n_levels
    self.logger_.info(f"Creating digraph from {rows} 2D labels")
    add_path = self.hierarchy_.add_path
    for path in self.y_:
        add_path(path)

def _assert_digraph_is_dag(self):
    """
    Ensure that ``self.hierarchy_`` is directed acyclic.

    Raises
    ------
    ValueError
        If the hierarchy reports that it is not acyclic.
    """
    if self.hierarchy_.is_acyclic():
        return
    self.logger_.error("Cycle detected in graph")
    raise ValueError("Graph is not directed acyclic")

def _predict_remaining_levels(self, X, y):
for level in range(1, y.shape[1]):
predecessors = set(y[:, level - 1])
Expand All @@ -172,40 +258,22 @@ def _predict_remaining_levels(self, X, y):

def _initialize_local_classifiers(self):
super()._initialize_local_classifiers()
local_classifiers = {}
nodes = self._get_parents()
for node in nodes:
local_classifiers[node] = {"classifier": deepcopy(self.local_classifier_)}
nx.set_node_attributes(self.hierarchy_, local_classifiers)

def _get_parents(self):
nodes = []
for node in self.hierarchy_.nodes:
# Skip only leaf nodes
successors = list(self.hierarchy_.successors(node))
if len(successors) > 0:
nodes.append(node)
return nodes
parent_nodes = self.hierarchy_.get_parent_nodes()
for node in parent_nodes:
node.classifier = deepcopy(self.local_classifier_)

def _get_successors(self, node):
successors = list(self.hierarchy_.successors(node))
mask = np.isin(self.y_, successors).any(axis=1)
mask = node.get_successors_mask()
X = self.X_[mask]
y = []
for row in self.y_[mask]:
if node == self.root_:
y.append(row[0])
else:
y.append(row[np.where(row == node)[0][0] + 1])
y = np.array(y)
y = self.y_[mask]
sample_weight = (
self.sample_weight_[mask] if self.sample_weight_ is not None else None
)
return X, y, sample_weight

@staticmethod
def _fit_classifier(self, node):
classifier = self.hierarchy_.nodes[node]["classifier"]
classifier = self.hierarchy_.nodes[node.name].classifier
# get children examples
X, y, sample_weight = self._get_successors(node)
unique_y = np.unique(y)
Expand All @@ -222,5 +290,5 @@ def _fit_classifier(self, node):

def _fit_digraph(self, local_mode: bool = False, use_joblib: bool = False):
self.logger_.info("Fitting local classifiers")
nodes = self._get_parents()
nodes = self.hierarchy_.get_parent_nodes()
self._fit_node_classifier(nodes, local_mode, use_joblib)
45 changes: 45 additions & 0 deletions hiclass/Node.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import numpy as np

class Node:
    """Manages data for an individual node in the hierarchy."""

    def __init__(self, n_rows, name, default_mask):
        """
        Initialize an individual node.

        Parameters
        ----------
        n_rows : int
            The number of rows in x and y.
        name : str
            The name of the node.
        default_mask : Bool
            The default value of the mask, i.e., True or False.
        """
        self.n_rows = n_rows
        # One entry per row, every entry initialized to default_mask
        self.mask = np.full(n_rows, default_mask)
        # Successor name -> successor Node
        self.successors = {}
        self.name = name
        self.classifier = None

    def add_successor(self, successor_name):
        """
        Add a new successor, or return the existing one with that name.

        Parameters
        ----------
        successor_name : str
            The name of the new successor.

        Returns
        -------
        successor : Node or None
            The successor registered under ``successor_name``; it is created
            with an all-False mask if it did not exist yet.
            None if ``successor_name`` is the empty string.
        """
        if successor_name == "":
            # Empty names never become nodes
            return None
        if successor_name not in self.successors:
            self.successors[successor_name] = Node(
                self.n_rows, successor_name, False
            )
        return self.successors[successor_name]
4 changes: 4 additions & 0 deletions hiclass/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
"""Init module for the library."""

from .DirectedAcyclicGraph import DirectedAcyclicGraph
from .LocalClassifierPerLevel import LocalClassifierPerLevel
from .LocalClassifierPerNode import LocalClassifierPerNode
from .LocalClassifierPerParentNode import LocalClassifierPerParentNode
from .MultiLabelLocalClassifierPerNode import MultiLabelLocalClassifierPerNode
from .MultiLabelLocalClassifierPerParentNode import (
MultiLabelLocalClassifierPerParentNode,
)
from .Node import Node
from ._version import get_versions

__version__ = get_versions()["version"]
Expand All @@ -18,4 +20,6 @@
"LocalClassifierPerLevel",
"MultiLabelLocalClassifierPerNode",
"MultiLabelLocalClassifierPerParentNode",
"Node",
"DirectedAcyclicGraph",
]
59 changes: 59 additions & 0 deletions tests/test_DirectedAcyclicGraph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import numpy as np

from hiclass import DirectedAcyclicGraph


def test_add_node():
    """Adding the same names twice must not create duplicate entries."""
    dag = DirectedAcyclicGraph(3)
    for name in ("node1", "node2", "node1", "node2"):
        dag.add_node(name)
    # root plus the two distinct names
    assert 3 == len(dag.nodes)
    for expected in ("root", "node1", "node2"):
        assert expected in dag.nodes


def test_add_path():
    """Repeated, padded and empty paths are all accepted by add_path."""
    unique_paths = [
        ["a", "c", "d"],
        ["b", "c", "e"],
        ["a", "c", "f"],
        ["c", "", ""],
    ]
    # Every path appears twice, followed by one all-empty row
    paths = np.array(unique_paths + unique_paths + [["", "", ""]])
    dag = DirectedAcyclicGraph(paths.shape[0])
    for path in paths:
        dag.add_path(path)
    # root, a, b, c, d, e, f, plus the "" key stored for the all-empty row
    assert 8 == len(dag.nodes)


def test_is_acyclic():
    """Paths built with add_path always leave the graph acyclic."""
    n_rows = 3
    dag = DirectedAcyclicGraph(n_rows)
    dag.add_path([0, 1, 2])
    dag.add_path([0, 2, 3])
    assert dag.is_acyclic() is True
    # A path that revisits a label (0 -> 2 -> 0) does not introduce a cycle:
    # the repeated label becomes a new node under a different predecessor,
    # so the graph is still reported as acyclic.
    dag.add_path([0, 2, 0])
    assert dag.is_acyclic() is True


def test_get_parent_nodes():
    """Only nodes that have successors are returned, in insertion order."""
    dag = DirectedAcyclicGraph(3)
    for path in (["a", "b", "c"], ["d", "e", "f"]):
        dag.add_path(path)
    parents = dag.get_parent_nodes()
    # Leaves "c" and "f" are excluded
    assert 5 == len(parents)
    expected_names = ["root", "a", "b", "d", "e"]
    assert expected_names == [parent.name for parent in parents]
Loading
Loading