Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Typing and linting #91

Merged
merged 5 commits into from
Dec 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@ on: [ push, pull_request ]
jobs:
build:
runs-on: ubuntu-latest
continue-on-error: ${{ matrix.python-version == '3.12' }}
strategy:
fail-fast: false
matrix:
python-version: [ "3.8", "3.9", "3.10", "3.11" ]
python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
[![Codecov][codecov_badge]][codecov_link]
[![Documentation][docs_badge]][docs_link]

[github_tests_badge]: https://github.com/Toloka/crowdlib/workflows/Tests/badge.svg?branch=main
[github_tests_link]: https://github.com/Toloka/crowdlib/actions?query=workflow:Tests
[github_tests_badge]: https://github.com/Toloka/crowd-kit/actions/workflows/tests.yml/badge.svg?branch=main
[github_tests_link]: https://github.com/Toloka/crowd-kit/actions/workflows/tests.yml
[codecov_badge]: https://codecov.io/gh/Toloka/crowd-kit/branch/main/graph/badge.svg
[codecov_link]: https://codecov.io/gh/Toloka/crowd-kit
[docs_badge]: https://img.shields.io/badge/docs-toloka.ai-1E2126
Expand Down
18 changes: 9 additions & 9 deletions crowdkit/aggregation/base/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"BasePairwiseAggregator",
]

from typing import Optional
from typing import Any, Optional

import attr
import pandas as pd
Expand All @@ -24,7 +24,7 @@ class BaseClassificationAggregator:
is the task's most likely true label.
"""

labels_: Optional[pd.Series] = named_series_attrib(name="agg_label")
labels_: Optional["pd.Series[Any]"] = named_series_attrib(name="agg_label")

def fit(self, data: pd.DataFrame) -> "BaseClassificationAggregator":
"""Args:
Expand All @@ -36,7 +36,7 @@ def fit(self, data: pd.DataFrame) -> "BaseClassificationAggregator":
"""
raise NotImplementedError()

def fit_predict(self, data: pd.DataFrame) -> pd.Series:
def fit_predict(self, data: pd.DataFrame) -> "pd.Series[Any]":
"""Args:
data (DataFrame): Workers' labeling results.
A pandas.DataFrame containing `task`, `worker` and `label` columns.
Expand All @@ -58,7 +58,7 @@ class BaseImageSegmentationAggregator:
is the task's aggregated segmentation.
"""

segmentations_: pd.Series = named_series_attrib(name="agg_segmentation")
segmentations_: "pd.Series[Any]" = named_series_attrib(name="agg_segmentation")

def fit(self, data: pd.DataFrame) -> "BaseImageSegmentationAggregator":
"""Args:
Expand All @@ -70,7 +70,7 @@ def fit(self, data: pd.DataFrame) -> "BaseImageSegmentationAggregator":
"""
raise NotImplementedError()

def fit_predict(self, data: pd.DataFrame) -> pd.Series:
def fit_predict(self, data: pd.DataFrame) -> "pd.Series[Any]":
"""Args:
data (DataFrame): Workers' segmentations.
A pandas.DataFrame containing `worker`, `task` and `segmentation` columns.
Expand Down Expand Up @@ -122,7 +122,7 @@ class BaseTextsAggregator:
is the task's text.
"""

texts_: pd.Series = named_series_attrib(name="agg_text")
texts_: "pd.Series[Any]" = named_series_attrib(name="agg_text")

def fit(self, data: pd.DataFrame) -> "BaseTextsAggregator":
"""Args:
Expand All @@ -133,7 +133,7 @@ def fit(self, data: pd.DataFrame) -> "BaseTextsAggregator":
"""
raise NotImplementedError()

def fit_predict(self, data: pd.DataFrame) -> pd.Series:
def fit_predict(self, data: pd.DataFrame) -> "pd.Series[Any]":
"""Args:
data (DataFrame): Workers' text outputs.
A pandas.DataFrame containing `task`, `worker` and `text` columns.
Expand All @@ -153,7 +153,7 @@ class BasePairwiseAggregator:
A pandas.Series indexed by labels and holding the corresponding labels' scores.
"""

scores_: pd.Series = named_series_attrib(name="agg_score")
scores_: "pd.Series[Any]" = named_series_attrib(name="agg_score")

def fit(self, data: pd.DataFrame) -> "BasePairwiseAggregator":
"""Args:
Expand All @@ -166,7 +166,7 @@ def fit(self, data: pd.DataFrame) -> "BasePairwiseAggregator":
"""
raise NotImplementedError()

def fit_predict(self, data: pd.DataFrame) -> pd.Series:
def fit_predict(self, data: pd.DataFrame) -> "pd.Series[Any]":
"""Args:
data (DataFrame): Workers' pairwise comparison results.
A pandas.DataFrame containing `worker`, `left`, `right`, and `label` columns.
Expand Down
46 changes: 25 additions & 21 deletions crowdkit/aggregation/classification/dawid_skene.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
__all__ = ["DawidSkene", "OneCoinDawidSkene"]

from typing import List, Optional
from typing import Any, List, Optional, cast

import attr
import numpy as np
Expand Down Expand Up @@ -81,7 +81,7 @@ class DawidSkene(BaseClassificationAggregator):
tol: float = attr.ib(default=1e-5)

probas_: Optional[pd.DataFrame] = attr.ib(init=False)
priors_: Optional[pd.Series] = named_series_attrib(name="prior")
priors_: Optional["pd.Series[Any]"] = named_series_attrib(name="prior")
# labels_
errors_: Optional[pd.DataFrame] = attr.ib(init=False)
loss_history_: List[float] = attr.ib(init=False)
Expand All @@ -103,7 +103,7 @@ def _m_step(data: pd.DataFrame, probas: pd.DataFrame) -> pd.DataFrame:

@staticmethod
def _e_step(
data: pd.DataFrame, priors: pd.Series, errors: pd.DataFrame
data: pd.DataFrame, priors: "pd.Series[Any]", errors: pd.DataFrame
) -> pd.DataFrame:
"""
Performs E-step of the Dawid-Skene algorithm.
Expand All @@ -115,7 +115,7 @@ def _e_step(
# We have to multiply lots of probabilities and such products are known to converge
# to zero exponentially fast. To avoid floating-point precision problems we work with
# logs of original values
joined = data.join(np.log2(errors), on=["worker", "label"])
joined = data.join(np.log2(errors), on=["worker", "label"]) # type: ignore
joined.drop(columns=["worker", "label"], inplace=True)
log_likelihoods = np.log2(priors) + joined.groupby("task", sort=False).sum()
log_likelihoods.rename_axis("label", axis=1, inplace=True)
Expand All @@ -135,23 +135,23 @@ def _e_step(
scaled_likelihoods.columns = pd.Index(
scaled_likelihoods.columns, name="label", dtype=data.label.dtype
)
return scaled_likelihoods
return cast(pd.DataFrame, scaled_likelihoods)

def _evidence_lower_bound(
self,
data: pd.DataFrame,
probas: pd.DataFrame,
priors: pd.Series,
priors: "pd.Series[Any]",
errors: pd.DataFrame,
) -> float:
# calculate joint probability log-likelihood expectation over probas
joined = data.join(np.log(errors), on=["worker", "label"])
joined = data.join(np.log(errors), on=["worker", "label"]) # type: ignore

# escape boolean index/column names to prevent confusion between indexing by boolean array and iterable of names
joined = joined.rename(columns={True: "True", False: "False"}, copy=False)
priors = priors.rename(index={True: "True", False: "False"}, copy=False)

joined.loc[:, priors.index] = joined.loc[:, priors.index].add(np.log(priors))
joined.loc[:, priors.index] = joined.loc[:, priors.index].add(np.log(priors)) # type: ignore

joined.set_index(["task", "worker"], inplace=True)
joint_expectation = (
Expand Down Expand Up @@ -223,9 +223,11 @@ def fit_predict_proba(self, data: pd.DataFrame) -> pd.DataFrame:
Each probability is in the range from 0 to 1, all task probabilities must sum up to 1.
"""

return self.fit(data).probas_
self.fit(data)
assert self.probas_ is not None, "no probas_"
return self.probas_

def fit_predict(self, data: pd.DataFrame) -> pd.Series:
def fit_predict(self, data: pd.DataFrame) -> "pd.Series[Any]":
"""Fits the model to the training data and returns the aggregated results.
Args:
data (DataFrame): The training dataset of workers' labeling results
Expand All @@ -234,7 +236,9 @@ def fit_predict(self, data: pd.DataFrame) -> pd.Series:
Series: Task labels. The `pandas.Series` data is indexed by `task` so that `labels.loc[task]` is the most likely true label of tasks.
"""

return self.fit(data).labels_
self.fit(data)
assert self.labels_ is not None, "no labels_"
return self.labels_


@attr.s
Expand Down Expand Up @@ -307,37 +311,37 @@ class OneCoinDawidSkene(DawidSkene):
tol: float = attr.ib(default=1e-5)

probas_: Optional[pd.DataFrame] = attr.ib(init=False)
priors_: Optional[pd.Series] = named_series_attrib(name="prior")
priors_: Optional["pd.Series[Any]"] = named_series_attrib(name="prior")
errors_: Optional[pd.DataFrame] = attr.ib(init=False)
skills_: Optional[pd.Series] = attr.ib(init=False)
skills_: Optional["pd.Series[Any]"] = attr.ib(init=False)
loss_history_: List[float] = attr.ib(init=False)

@staticmethod
def _assign_skills(row: pd.Series, skills: pd.DataFrame) -> pd.DataFrame:
def _assign_skills(row: "pd.Series[Any]", skills: pd.DataFrame) -> pd.DataFrame:
"""
Assigns user skills to error matrix row by row.
"""
num_categories = len(row)
for column_name, _ in row.items():
if column_name == row.name[1]:
row[column_name] = skills[row.name[0]]
if column_name == row.name[1]: # type: ignore
row[column_name] = skills[row.name[0]] # type: ignore
else:
row[column_name] = (1 - skills[row.name[0]]) / (num_categories - 1)
return row
row[column_name] = (1 - skills[row.name[0]]) / (num_categories - 1) # type: ignore
return row # type: ignore

@staticmethod
def _process_skills_to_errors(
data: pd.DataFrame, probas: pd.DataFrame, skills: pd.Series
data: pd.DataFrame, probas: pd.DataFrame, skills: "pd.Series[Any]"
) -> pd.DataFrame:
errors = DawidSkene._m_step(data, probas)

errors = errors.apply(OneCoinDawidSkene._assign_skills, args=(skills,), axis=1)
errors = errors.apply(OneCoinDawidSkene._assign_skills, args=(skills,), axis=1) # type: ignore
errors.clip(lower=_EPS, upper=1 - _EPS, inplace=True)

return errors

@staticmethod
def _m_step(data: pd.DataFrame, probas: pd.DataFrame) -> pd.Series:
def _m_step(data: pd.DataFrame, probas: pd.DataFrame) -> "pd.Series[Any]": # type: ignore
"""Performs M-step of Homogeneous Dawid-Skene algorithm.

Calculates a worker skill as their accuracy according to the label probability.
Expand Down
Loading