diff --git a/CHANGELOG.md b/CHANGELOG.md index f10143a6..dd27bb8d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,12 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added +- 5 new distance/similarity + 1. KoppenI + 2. KoppenII + 3. KuderRichardson + 4. KuhnsI + 5. KuhnsII - `feature_request.yml` template - `config.yml` for issue template - `SECURITY.md` diff --git a/Document/Distance.ipynb b/Document/Distance.ipynb index 5995a019..467cb7e7 100644 --- a/Document/Distance.ipynb +++ b/Document/Distance.ipynb @@ -2773,6 +2773,278 @@ "" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Köppen I" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Köppen I correlation [[38]](#ref38)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$sim_{KoppenI} =\n", + "\\frac{\\frac{2 \\times TP+FP+FN}{2}.\\frac{2 \\times TN+FP+FN}{2} - \\frac{FP+FN}{2}}\n", + "{\\frac{2 \\times TP+FP+FN}{2}.\\frac{2 \\times TN+FP+FN}{2}}\n", + "$$" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0.96875, 1: 0.9368421052631579, 2: 0.9300699300699301}" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.KoppenI)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Köppen II" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Köppen II correlation [[38]](#ref38)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$sim_{KoppenII} =\n", + "TP + \\frac{FP + FN}{2}\n", + "$$" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 4.0, 1: 2.5, 2: 5.5}" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.KoppenII)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## KuderRichardson" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Kuder & Richardson correlation [[39]](#ref39)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$corr_{KuderRichardson} =\n", + "\\frac{4 \\times (TP \\times TN - FP \\times FN)}\n", + "{(TP+FP)(FN+TN) + (TP+FN)(FP+TN) + 2(TP \\times TN - FP \\times FN)}\n", + "$$" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0.8076923076923077, 1: 0.4067796610169492, 2: 0.2891566265060241}" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.KuderRichardson)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## KuhnsI" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Kuhns I correlation [[40]](#ref40)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$corr_{KuhnsI} =\n", + "\\frac{2 \\times \\delta(TP + FP, TP + FN)}\n", + "{N}\n", + "$$\n", + "\n", + "$$\n", + "\\delta(TP + FP, TP + FN) = TP - \\frac{(TP + FP) \\times (TP + FN)}{N}\n", + "$$" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0.2916666666666667, 1: 0.08333333333333333, 2: 0.08333333333333333}" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.KuhnsI)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## KuhnsII" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Kuhns II correlation [[40]](#ref40)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$corr_{KuhnsII} =\n", + "\\frac{\\delta(TP + FP, TP + FN)}\n", + "{\\max(TP + FP, TP + FN)}\n", + "$$\n", + "\n", + "$$\n", + "\\delta(TP + FP, TP + FN) = TP - \\frac{(TP + FP) \\times (TP + FN)}{N}\n", + "$$" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0.35, 1: 0.16666666666666666, 2: 0.08333333333333333}" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.KuhnsII)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -2856,7 +3128,13 @@ "\n", "
36- M. G. Kendall, \"A new measure of rank correlation,\" Biometrika, vol. 30, no. 1/2, pp. 81-93, 1938.
\n", "\n", - "
37- R. N. Kent and S. L. Foster, \"Direct observational procedures: Methodological issues in naturalistic settings,\" Handbook of behavioral assessment, pp. 279-328, 1977.
" + "
37- R. N. Kent and S. L. Foster, \"Direct observational procedures: Methodological issues in naturalistic settings,\" Handbook of behavioral assessment, pp. 279-328, 1977.
\n", + "\n", + "
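38- W. Köppen, \"Die Aufeinanderfolge der unperiodischen Witterungserscheinungen nach den Grundsätzen der Wahrscheinlichkeitsrechnung,\" in Repertorium für Meteorologie, vol. 2, Akademiia Nauk, pp. 189-238, 1870.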
\n", + "\n", + "
39- G. F. Kuder and M. W. Richardson, \"The theory of the estimation of test reliability,\" Psychometrika, vol. 2, no. 3, pp. 151-160, 1937.
\n", + "\n", + "
40- J. L. Kuhns, \"The continuum of coefficients of association,\" in Statistical Association Methods for Mechanized Documentation, National Bureau of Standards Miscellaneous Publication 269, pp. 33-40, 1964.
" ] } ], diff --git a/Otherfiles/notebook_to_html.py b/Otherfiles/notebook_to_html.py index aaaa1aa1..d651b32c 100644 --- a/Otherfiles/notebook_to_html.py +++ b/Otherfiles/notebook_to_html.py @@ -22,7 +22,7 @@ "Example8"] MAIN_DOCS_LIST = ["Distance", - "Document"] + "Document"] NOTEBOOK_EXTENSION = ".ipynb" @@ -61,7 +61,7 @@ nb = nbformat.read(f, as_version=4) ep.preprocess( nb, { - 'metadata': { + 'metadata': { 'path': OUTPUT_FOLDER_PATH}}) with open(notebook_copy_path, 'w', encoding='utf-8') as f: nbformat.write(nb, f) @@ -89,7 +89,7 @@ nb = nbformat.read(f, as_version=4) ep.preprocess( nb, { - 'metadata': { + 'metadata': { 'path': OUTPUT_FOLDER_PATH}}) with open(notebook_copy_path, 'w', encoding='utf-8') as f: nbformat.write(nb, f) diff --git a/Test/verified_test.py b/Test/verified_test.py index b266b8cc..990660d0 100644 --- a/Test/verified_test.py +++ b/Test/verified_test.py @@ -406,6 +406,16 @@ >>> assert isclose(cm2.distance(metric=DistanceType.KentFosterI)[1], -0.23529411764705888, abs_tol=ABS_TOL, rel_tol=REL_TOL) >>> assert isclose(cm1.distance(metric=DistanceType.KentFosterII)[1], -0.0012804097311239404, abs_tol=ABS_TOL, rel_tol=REL_TOL) >>> assert isclose(cm2.distance(metric=DistanceType.KentFosterII)[1], -0.002196997436837158, abs_tol=ABS_TOL, rel_tol=REL_TOL) +>>> assert isclose(cm1.distance(metric=DistanceType.KoppenI)[1], 0.9993589743589744, abs_tol=ABS_TOL, rel_tol=REL_TOL) # normalizer: None +>>> assert isclose(cm2.distance(metric=DistanceType.KoppenI)[1], 0.9991825772172593, abs_tol=ABS_TOL, rel_tol=REL_TOL) # normalizer: None +>>> assert isclose(cm1.distance(metric=DistanceType.KoppenII)[1], 4.0, abs_tol=ABS_TOL, rel_tol=REL_TOL) +>>> assert isclose(cm2.distance(metric=DistanceType.KoppenII)[1], 5.5, abs_tol=ABS_TOL, rel_tol=REL_TOL) +>>> assert isclose(cm1.distance(metric=DistanceType.KuderRichardson)[1], 0.6643835616438356, abs_tol=ABS_TOL, rel_tol=REL_TOL) +>>> assert isclose(cm2.distance(metric=DistanceType.KuderRichardson)[1], 
0.5285677463699631, abs_tol=ABS_TOL, rel_tol=REL_TOL) +>>> assert isclose(cm1.distance(metric=DistanceType.KuhnsI)[1], 0.005049979175343606, abs_tol=ABS_TOL, rel_tol=REL_TOL) +>>> assert isclose(cm2.distance(metric=DistanceType.KuhnsI)[1], 0.005004425239483548, abs_tol=ABS_TOL, rel_tol=REL_TOL) +>>> assert isclose(cm1.distance(metric=DistanceType.KuhnsII)[1], 0.49489795918367346, abs_tol=ABS_TOL, rel_tol=REL_TOL) +>>> assert isclose(cm2.distance(metric=DistanceType.KuhnsII)[1], 0.32695578231292516, abs_tol=ABS_TOL, rel_tol=REL_TOL) >>> mlcm = MultiLabelCM(actual_vector=[{"cat", "bird"}, {"dog"}], predict_vector=[{"cat"}, {"dog", "bird"}], classes=["cat", "dog", "bird"]) # Verified Case -- (http://bitly.ws/GNq2) >>> mlcm.actual_vector_multihot [[1, 0, 1], [0, 1, 0]] diff --git a/pycm/pycm_distance.py b/pycm/pycm_distance.py index 57820b89..9b7a70da 100644 --- a/pycm/pycm_distance.py +++ b/pycm/pycm_distance.py @@ -67,6 +67,11 @@ class DistanceType(Enum): KendallTau = "KendallTau" KentFosterI = "KentFosterI" KentFosterII = "KentFosterII" + KoppenI = "KoppenI" + KoppenII = "KoppenII" + KuderRichardson = "KuderRichardson" + KuhnsI = "KuhnsI" + KuhnsII = "KuhnsII" def AMPLE_calc(TP, FP, FN, TN): @@ -1219,6 +1224,118 @@ def KentFosterII_calc(TP, FP, FN, TN): return "None" +def KoppenI_calc(TP, FP, FN, TN): + """ + Calculate Koppen I correlation. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Koppen I correlation as float + """ + try: + part1 = (2 * TP + FP + FN) / 2 + part2 = (2 * TN + FP + FN) / 2 + part3 = part1 * part2 + part4 = (FP + FN) / 2 + return (part3 - part4) / part3 + except Exception: + return "None" + + +def KoppenII_calc(TP, FP, FN, TN): + """ + Calculate Koppen II similarity. 
+ + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Koppen II similarity as float + """ + try: + return TP + (FP + FN) / 2 + except Exception: + return "None" + + +def KuderRichardson_calc(TP, FP, FN, TN): + """ + Calculate Kuder & Richardson correlation. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Kuder & Richardson correlation as float + """ + try: + part1 = 4 * (TP * TN - FP * FN) + part2 = (TP + FP) * (FN + TN) + (TP + FN) * (FP + TN) + part3 = 2 * (TP * TN - FP * FN) + return part1 / (part2 + part3) + except Exception: + return "None" + + +def KuhnsI_calc(TP, FP, FN, TN): + """ + Calculate Kuhns I correlation. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Kuhns I correlation as float + """ + try: + n = TP + FP + FN + TN + delta = TP - ((TP + FP) * (TP + FN)) / n + return 2 * delta / n + except Exception: + return "None" + + +def KuhnsII_calc(TP, FP, FN, TN): + """ + Calculate Kuhns II correlation. 
+ + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Kuhns II correlation as float + """ + try: + n = TP + FP + FN + TN + delta = TP - ((TP + FP) * (TP + FN)) / n + part1 = max(TP + FP, TP + FN) + return delta / part1 + except Exception: + return "None" + + DISTANCE_MAPPER = { DistanceType.AMPLE: AMPLE_calc, DistanceType.Anderberg: Anderberg_calc, @@ -1272,5 +1389,10 @@ def KentFosterII_calc(TP, FP, FN, TN): DistanceType.HawkinsDotson: HawkinsDotson_calc, DistanceType.KendallTau: KendallTau_calc, DistanceType.KentFosterI: KentFosterI_calc, - DistanceType.KentFosterII: KentFosterII_calc + DistanceType.KentFosterII: KentFosterII_calc, + DistanceType.KoppenI: KoppenI_calc, + DistanceType.KoppenII: KoppenII_calc, + DistanceType.KuderRichardson: KuderRichardson_calc, + DistanceType.KuhnsI: KuhnsI_calc, + DistanceType.KuhnsII: KuhnsII_calc, } diff --git a/pycm/pycm_param.py b/pycm/pycm_param.py index 5f50d763..20ea9e11 100644 --- a/pycm/pycm_param.py +++ b/pycm/pycm_param.py @@ -337,7 +337,6 @@ " Supported parameters are: ") + ", ".join(CI_CLASS_LIST) + ", " + ", ".join(CI_OVERALL_LIST) - MULTICLASS_RECOMMEND = [ "ERR", "TPR Micro",