diff --git a/CHANGELOG.md b/CHANGELOG.md index 345c4f15..a8a0cabe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,31 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Added +- 23 new distance/similarity + 1. Dennis + 2. Digby + 3. Dispersion + 4. Doolittle + 5. Eyraud + 6. Fager & McGowan + 7. Faith + 8. Fleiss-Levin-Paik + 9. Forbes I + 10. Forbes II + 11. Fossum + 12. Gilbert & Wells + 13. Goodall + 14. Goodman & Kruskal's Lambda + 15. Goodman & Kruskal Lambda-r + 16. Guttman's Lambda A + 17. Guttman's Lambda B + 18. Hamann + 19. Harris & Lahey + 20. Hawkins & Dotson + 21. Kendall's Tau + 22. Kent & Foster I + 23. Kent & Foster II ### Changed - `sort` parameter added to `relabel` method - Document modified diff --git a/Document/Distance.ipynb b/Document/Distance.ipynb index 3277728e..2d9e7622 100644 --- a/Document/Distance.ipynb +++ b/Document/Distance.ipynb @@ -4,7 +4,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "
Please cite us if you use the software
" + "Please cite us if you use the software
\n", + "\n", + "\n", + " \n", + " " ] }, { @@ -1578,57 +1588,1022 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## References" + "## Dennis" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "1- C. C. Little, \"Abydos Documentation,\" 2018.\n", - "\n", - "
2- V. Dallmeier, C. Lindig, and A. Zeller, \"Lightweight defect localization for Java,\" in European conference on object-oriented programming, 2005: Springer, pp. 528-550.\n", - "\n", - "
3- R. Abreu, P. Zoeteweij, and A. J. Van Gemund, \"An evaluation of similarity coefficients for software fault localization,\" in 2006 12th Pacific Rim International Symposium on Dependable Computing (PRDC'06), 2006: IEEE, pp. 39-46.\n", - "\n", - "
4- M. R. Anderberg, Cluster analysis for applications: probability and mathematical statistics: a series of monographs and textbooks. Academic press, 2014.\n", - "\n", - "
5- A. M. Andrés and P. F. Marzo, \"Delta: A new measure of agreement between two raters,\" British journal of mathematical and statistical psychology, vol. 57, no. 1, pp. 1-19, 2004.\n", - "\n", - "
6- C. Baroni-Urbani and M. W. Buser, \"Similarity of binary data,\" Systematic Zoology, vol. 25, no. 3, pp. 251-259, 1976.\n", - "\n", - "
7- V. Batagelj and M. Bren, \"Comparing resemblance measures,\" Journal of classification, vol. 12, no. 1, pp. 73-90, 1995.\n", - "\n", - "
8- F. B. Baulieu, \"A classification of presence/absence based dissimilarity coefficients,\" Journal of Classification, vol. 6, no. 1, pp. 233-246, 1989.\n", - "\n", - "
9- F. B. Baulieu, \"Two variant axiom systems for presence/absence based dissimilarity coefficients,\" Journal of Classification, vol. 14, no. 1, pp. 0159-0170, 1997.\n", - "\n", - "
10- R. Benini, Principii di demografia. Barbera, 1901.\n", - "\n", - "
11- G. N. Lance and W. T. Williams, \"Computer programs for hierarchical polythetic classification (“similarity analyses”),\" The Computer Journal, vol. 9, no. 1, pp. 60-64, 1966.\n", - "\n", - "
12- G. N. Lance and W. T. Williams, \"Mixed-Data Classificatory Programs I - Agglomerative Systems,\" Australian Computer Journal, vol. 1, no. 1, pp. 15-20, 1967.\n", - "\n", - "
13- P. W. Clement, \"A formula for computing inter-observer agreement,\" Psychological Reports, vol. 39, no. 1, pp. 257-258, 1976.\n", - "\n", - "
14- V. Consonni and R. Todeschini, \"New similarity coefficients for binary data,\" Match-Communications in Mathematical and Computer Chemistry, vol. 68, no. 2, p. 581, 2012." + "Dennis similarity [[15]](#ref15)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", - "" + "$$sim_{Dennis} =\n", + "\\frac{TP-\\frac{(TP+FP)\\times(TP+FN)}{POP}}{\\sqrt{\\frac{(TP+FP)\\times(TP+FN)}{POP}}}$$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cm.distance(metric=DistanceType.Dennis)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
1- C. C. Little, \"Abydos Documentation,\" 2018.\n", + "\n", + "
2- V. Dallmeier, C. Lindig, and A. Zeller, \"Lightweight defect localization for Java,\" in European conference on object-oriented programming, 2005: Springer, pp. 528-550.\n", + "\n", + "
3- R. Abreu, P. Zoeteweij, and A. J. Van Gemund, \"An evaluation of similarity coefficients for software fault localization,\" in 2006 12th Pacific Rim International Symposium on Dependable Computing (PRDC'06), 2006: IEEE, pp. 39-46.\n", + "\n", + "
4- M. R. Anderberg, Cluster analysis for applications: probability and mathematical statistics: a series of monographs and textbooks. Academic press, 2014.\n", + "\n", + "
5- A. M. Andrés and P. F. Marzo, \"Delta: A new measure of agreement between two raters,\" British journal of mathematical and statistical psychology, vol. 57, no. 1, pp. 1-19, 2004.\n", + "\n", + "
6- C. Baroni-Urbani and M. W. Buser, \"Similarity of binary data,\" Systematic Zoology, vol. 25, no. 3, pp. 251-259, 1976.\n", + "\n", + "
7- V. Batagelj and M. Bren, \"Comparing resemblance measures,\" Journal of classification, vol. 12, no. 1, pp. 73-90, 1995.\n", + "\n", + "
8- F. B. Baulieu, \"A classification of presence/absence based dissimilarity coefficients,\" Journal of Classification, vol. 6, no. 1, pp. 233-246, 1989.\n", + "\n", + "
9- F. B. Baulieu, \"Two variant axiom systems for presence/absence based dissimilarity coefficients,\" Journal of Classification, vol. 14, no. 1, pp. 159-170, 1997.\n", + "\n", + "
10- R. Benini, Principii di demografia. Barbera, 1901.\n", + "\n", + "
11- G. N. Lance and W. T. Williams, \"Computer programs for hierarchical polythetic classification (“similarity analyses”),\" The Computer Journal, vol. 9, no. 1, pp. 60-64, 1966.\n", + "\n", + "
12- G. N. Lance and W. T. Williams, \"Mixed-Data Classificatory Programs I - Agglomerative Systems,\" Australian Computer Journal, vol. 1, no. 1, pp. 15-20, 1967.\n", + "\n", + "
13- P. W. Clement, \"A formula for computing inter-observer agreement,\" Psychological Reports, vol. 39, no. 1, pp. 257-258, 1976.\n", + "\n", + "
14- V. Consonni and R. Todeschini, \"New similarity coefficients for binary data,\" Match-Communications in Mathematical and Computer Chemistry, vol. 68, no. 2, p. 581, 2012.\n", + "\n", + "
15- S. F. Dennis, \"The Construction of a Thesaurus Automatically From a Sample of Text,\" in Statistical Association Methods for Mechanized Documentation: Symposium Proceedings, 1965, vol. 269: US Government Printing Office, p. 61.\n", + "\n", + "
16- P. G. Digby, \"Approximating the tetrachoric correlation coefficient,\" Biometrics, pp. 753-757, 1983.\n", + "\n", + "
17- IBM Corp, \"IBM SPSS Statistics Algorithms,\" ed: IBM Corp Armonk, NY, USA, 2017.\n", + "\n", + "
18- M. H. Doolittle, \"The verification of predictions,\" Bulletin of the Philosophical Society of Washington, vol. 7, pp. 122-127, 1885.\n", + "\n", + "
19- H. Eyraud, \"Les principes de la mesure des correlations,\" Ann. Univ. Lyon, III. Ser., Sect. A, vol. 1, no. 30-47, p. 111, 1936.\n", + "\n", + "
20- E. W. Fager, \"Determination and analysis of recurrent groups,\" Ecology, vol. 38, no. 4, pp. 586-595, 1957.\n", + "\n", + "
21- E. W. Fager and J. A. McGowan, \"Zooplankton Species Groups in the North Pacific: Co-occurrences of species can be used to derive groups whose members react similarly to water-mass types,\" Science, vol. 140, no. 3566, pp. 453-460, 1963.\n", + "\n", + "
22- D. P. Faith, \"Asymmetric binary similarity measures,\" Oecologia, vol. 57, pp. 287-290, 1983.\n", + "\n", + "
23- J. L. Fleiss, B. Levin, and M. C. Paik, Statistical methods for rates and proportions. John Wiley & Sons, 2013.\n", + "\n", + "
24- S. A. Forbes, On the local distribution of certain Illinois fishes: an essay in statistical ecology. Illinois State Laboratory of Natural History, 1907.\n", + "\n", + "
25- A. Mozley, \"The statistical analysis of the distribution of pond molluscs in western Canada,\" The American Naturalist, vol. 70, no. 728, pp. 237-244, 1936.\n", + "\n", + "
26- S. A. Forbes, \"Method of determining and measuring the associative relations of species,\" Science, vol. 61, no. 1585, pp. 518-524, 1925.\n", + "\n", + "
27- E. G. Fossum and G. Kaskey, \"Optimization and standardization of information retrieval language and systems,\" SPERRY RAND CORP PHILADELPHIA PA UNIVAC DIV, 1966.\n", + "\n", + "
28- N. Gilbert and T. C. Wells, \"Analysis of quadrat data,\" The Journal of Ecology, pp. 675-685, 1966.\n", + "\n", + "
29- D. W. Goodall, \"The distribution of the matching coefficient,\" Biometrics, pp. 647-656, 1967.\n", + "\n", + "
30- B. Austin and R. R. Colwell, \"Evaluation of some coefficients for use in numerical taxonomy of microorganisms,\" International Journal of Systematic and Evolutionary Microbiology, vol. 27, no. 3, pp. 204-210, 1977.\n", + "\n", + "
31- L. A. Goodman and W. H. Kruskal, Measures of association for cross classifications. Springer, 1979.\n", + "\n", + "
32- L. Guttman, \"An outline of the statistical theory of prediction,\" The prediction of personal adjustment, vol. 48, pp. 253-318, 1941.\n", + "\n", + "
33- U. Hamann, \"Merkmalsbestand und verwandtschaftsbeziehungen der farinosae: ein beitrag zum system der monokotyledonen,\" Willdenowia, pp. 639-768, 1961.\n", + "\n", + "
34- F. C. Harris and B. B. Lahey, \"A method for combining occurrence and nonoccurrence interobserver agreement scores,\" Journal of Applied Behavior Analysis, vol. 11, no. 4, pp. 523-527, 1978.\n", + "\n", + "
35- R. P. Hawkins and V. A. Dotson, \"Reliability Scores That Delude: An Alice in Wonderland Trip Through the Misleading Characteristics of Inter-Observer Agreement Scores in Interval Recording,\" 1973.\n", + "\n", + "
36- M. G. Kendall, \"A new measure of rank correlation,\" Biometrika, vol. 30, no. 1/2, pp. 81-93, 1938.\n", + "\n", + "
37- R. N. Kent and S. L. Foster, \"Direct observational procedures: Methodological issues in naturalistic settings,\" Handbook of behavioral assessment, pp. 279-328, 1977." ] } ], diff --git a/Test/verified_test.py b/Test/verified_test.py index ffb959d9..44e72f8a 100644 --- a/Test/verified_test.py +++ b/Test/verified_test.py @@ -549,5 +549,97 @@ 0.48072545510682463 >>> cm2.distance(metric=DistanceType.ConsonniTodeschiniV)[1] 0.4003930264973547 +>>> cm1.distance(metric=DistanceType.Dennis)[1] +13.857142857142858 +>>> cm2.distance(metric=DistanceType.Dennis)[1] +10.028539207654113 +>>> cm1.distance(metric=DistanceType.Digby)[1] +0.9774244829419212 +>>> cm2.distance(metric=DistanceType.Digby)[1] +0.9491281473458171 +>>> cm1.distance(metric=DistanceType.Dispersion)[1] +0.002524989587671803 +>>> cm2.distance(metric=DistanceType.Dispersion)[1] +0.002502212619741774 +>>> cm1.distance(metric=DistanceType.Doolittle)[1] +0.24744247205785666 +>>> cm2.distance(metric=DistanceType.Doolittle)[1] +0.13009912077202224 +>>> cm1.distance(metric=DistanceType.Eyraud)[1] +-1.438198553583169e-06 +>>> cm2.distance(metric=DistanceType.Eyraud)[1] +-1.5399964580081465e-06 +>>> cm1.distance(metric=DistanceType.FagerMcGowan)[1] +0.25 +>>> cm2.distance(metric=DistanceType.FagerMcGowan)[1] +0.16102422643817918 +>>> cm1.distance(metric=DistanceType.Faith)[1] +0.4987244897959184 +>>> cm2.distance(metric=DistanceType.Faith)[1] +0.4968112244897959 +>>> cm1.distance(metric=DistanceType.FleissLevinPaik)[1] +0.9974358974358974 +>>> cm2.distance(metric=DistanceType.FleissLevinPaik)[1] +0.9955041746949261 +>>> cm1.distance(metric=DistanceType.ForbesI)[1] +98.0 +>>> cm2.distance(metric=DistanceType.ForbesI)[1] +52.266666666666666 +>>> cm1.distance(metric=DistanceType.ForbesII)[1] +0.49743589743589745 +>>> cm2.distance(metric=DistanceType.ForbesII)[1] +0.3953727506426735 +>>> cm1.distance(metric=DistanceType.Fossum)[1] +110.25 +>>> cm2.distance(metric=DistanceType.Fossum)[1] +58.8 +>>> 
cm1.distance(metric=DistanceType.GilbertWells)[1] +20.176174477346354 +>>> cm2.distance(metric=DistanceType.GilbertWells)[1] +16.717742356979358 +>>> abs(cm1.distance(metric=DistanceType.Goodall)[1] - 0.9544884026871964) < 1e-15 +True +>>> abs(cm2.distance(metric=DistanceType.Goodall)[1] - 0.9397552079794624) < 1e-15 +True +>>> cm1.distance(metric=DistanceType.GoodmanKruskalLambda)[1] +0.0 +>>> cm2.distance(metric=DistanceType.GoodmanKruskalLambda)[1] +0.0 +>>> cm1.distance(metric=DistanceType.GoodmanKruskalLambdaR)[1] +0.0 +>>> cm2.distance(metric=DistanceType.GoodmanKruskalLambdaR)[1] +-0.2727272727272727 +>>> cm1.distance(metric=DistanceType.GuttmanLambdaA)[1] +0.0 +>>> cm2.distance(metric=DistanceType.GuttmanLambdaA)[1] +0.0 +>>> cm1.distance(metric=DistanceType.GuttmanLambdaB)[1] +0.0 +>>> cm2.distance(metric=DistanceType.GuttmanLambdaB)[1] +0.0 +>>> cm1.distance(metric=DistanceType.Hamann)[1] +0.9897959183673469 +>>> cm2.distance(metric=DistanceType.Hamann)[1] +0.9821428571428571 +>>> cm1.distance(metric=DistanceType.HarrisLahey)[1] +0.3367085964820711 +>>> cm2.distance(metric=DistanceType.HarrisLahey)[1] +0.22761577457069784 +>>> cm1.distance(metric=DistanceType.HawkinsDotson)[1] +0.6641091219096334 +>>> cm2.distance(metric=DistanceType.HawkinsDotson)[1] +0.606635407786303 +>>> cm1.distance(metric=DistanceType.KendallTau)[1] +0.0025282143508744493 +>>> cm2.distance(metric=DistanceType.KendallTau)[1] +0.00250866630176975 +>>> cm1.distance(metric=DistanceType.KentFosterI)[1] +-0.19999999999999996 +>>> cm2.distance(metric=DistanceType.KentFosterI)[1] +-0.23529411764705888 +>>> cm1.distance(metric=DistanceType.KentFosterII)[1] +-0.0012804097311239404 +>>> cm2.distance(metric=DistanceType.KentFosterII)[1] +-0.002196997436837158 """ diff --git a/pycm/pycm_compare.py b/pycm/pycm_compare.py index 3b0f7284..4df3e5ba 100644 --- a/pycm/pycm_compare.py +++ b/pycm/pycm_compare.py @@ -314,7 +314,8 @@ def __compare_assign_handler__( compare.classes = 
list(cm_dict.values())[0].classes compare.class_weight = {k: 1 for k in compare.classes} compare.class_benchmark_weight = {k: 1 for k in CLASS_BENCHMARK_LIST} - compare.overall_benchmark_weight = {k: 0 if k in KAPPA_BENCHMARK_LIST[1:] else 1 for k in OVERALL_BENCHMARK_LIST} + compare.overall_benchmark_weight = { + k: 0 if k in KAPPA_BENCHMARK_LIST[1:] else 1 for k in OVERALL_BENCHMARK_LIST} compare.digit = digit compare.best = None compare.best_name = None diff --git a/pycm/pycm_distance.py b/pycm/pycm_distance.py index 19298b1b..4dba3a2f 100644 --- a/pycm/pycm_distance.py +++ b/pycm/pycm_distance.py @@ -42,6 +42,29 @@ class DistanceType(Enum): ConsonniTodeschiniIII = "ConsonniTodeschiniIII" ConsonniTodeschiniIV = "ConsonniTodeschiniIV" ConsonniTodeschiniV = "ConsonniTodeschiniV" + Dennis = "Dennis" + Digby = "Digby" + Dispersion = "Dispersion" + Doolittle = "Doolittle" + Eyraud = "Eyraud" + FagerMcGowan = "FagerMcGowan" + Faith = "Faith" + FleissLevinPaik = "FleissLevinPaik" + ForbesI = "ForbesI" + ForbesII = "ForbesII" + Fossum = "Fossum" + GilbertWells = "GilbertWells" + Goodall = "Goodall" + GoodmanKruskalLambda = "GoodmanKruskalLambda" + GoodmanKruskalLambdaR = "GoodmanKruskalLambdaR" + GuttmanLambdaA = "GuttmanLambdaA" + GuttmanLambdaB = "GuttmanLambdaB" + Hamann = "Hamann" + HarrisLahey = "HarrisLahey" + HawkinsDotson = "HawkinsDotson" + KendallTau = "KendallTau" + KentFosterI = "KentFosterI" + KentFosterII = "KentFosterII" def AMPLE_calc(TP, FP, FN, TN): @@ -679,6 +702,521 @@ def ConsonniTodeschiniV_calc(TP, FP, FN, TN): return "None" +def Dennis_calc(TP, FP, FN, TN): + """ + Calculate Dennis similarity. 
+ + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Dennis similarity as float + """ + try: + n = TP + FP + FN + TN + part1 = ((TP + FP) * (TP + FN)) / n + return (TP - part1) / math.sqrt(part1) + except Exception: + return "None" + + +def Digby_calc(TP, FP, FN, TN): + """ + Calculate Digby correlation. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Digby correlation as float + """ + try: + part1 = (TP * TN) ** 0.75 + part2 = (FP * FN) ** 0.75 + return (part1 - part2) / (part1 + part2) + except Exception: + return "None" + + +def Dispersion_calc(TP, FP, FN, TN): + """ + Calculate Dispersion correlation. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Dispersion correlation as float + """ + try: + n = TP + FP + FN + TN + part1 = TP * TN + part2 = FP * FN + return (part1 - part2) / (n ** 2) + except Exception: + return "None" + + +def Doolittle_calc(TP, FP, FN, TN): + """ + Calculate Doolittle similarity. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Doolittle similarity as float + """ + try: + n = TP + FP + FN + TN + part1 = (TP + FP) * (TP + FN) + part2 = (TN + FP) * (TN + FN) + return ((TP * n - part1) ** 2) / (part1 * part2) + except Exception: + return "None" + + +def Eyraud_calc(TP, FP, FN, TN): + """ + Calculate Eyraud similarity. 
+ + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Eyraud similarity as float + """ + try: + part1 = (TP + FP) * (TP + FN) + part2 = (TN + FP) * (TN + FN) + return (TP - part1) / (part1 * part2) + except Exception: + return "None" + + +def FagerMcGowan_calc(TP, FP, FN, TN): + """ + Calculate Fager & McGowan similarity. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Fager & McGowan similarity as float + """ + try: + part1 = math.sqrt((TP + FP) * (TP + FN)) + part2 = math.sqrt(max((TP + FP), (TP + FN))) + return (TP / part1) - (1 / (2 * part2)) + except Exception: + return "None" + + +def Faith_calc(TP, FP, FN, TN): + """ + Calculate Faith similarity. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Faith similarity as float + """ + try: + n = TP + FP + FN + TN + return (TP + (TN / 2)) / n + except Exception: + return "None" + + +def FleissLevinPaik_calc(TP, FP, FN, TN): + """ + Calculate Fleiss-Levin-Paik similarity. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Fleiss-Levin-Paik similarity as float + """ + try: + part1 = 2 * TN + return part1 / (part1 + FP + FN) + except Exception: + return "None" + + +def ForbesI_calc(TP, FP, FN, TN): + """ + Calculate Forbes I similarity. 
+ + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Forbes I similarity as float + """ + try: + n = TP + FP + FN + TN + part1 = (TP + FP) * (TP + FN) + return (n * TP) / part1 + except Exception: + return "None" + + +def ForbesII_calc(TP, FP, FN, TN): + """ + Calculate Forbes II correlation. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Forbes II correlation as float + """ + try: + n = TP + FP + FN + TN + part1 = (FP * FN) - (TP * TN) + part2 = (TP + FP) * (TP + FN) + part3 = min((TP + FP), (TP + FN)) + return part1 / (part2 - (n * part3)) + except Exception: + return "None" + + +def Fossum_calc(TP, FP, FN, TN): + """ + Calculate Fossum similarity. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Fossum similarity as float + """ + try: + n = TP + FP + FN + TN + part1 = (TP - 0.5) ** 2 + part2 = (TP + FP) * (TP + FN) + return (n * part1) / part2 + except Exception: + return "None" + + +def GilbertWells_calc(TP, FP, FN, TN): + """ + Calculate Gilbert & Wells similarity. 
+ + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Gilbert & Wells similarity as float + """ + try: + n = TP + FP + FN + TN + part1 = (TP + FP) * (TP + FN) * (TN + FP) * (TN + FN) + part2 = math.factorial(TP + FP) * math.factorial(TP + FN) * \ + math.factorial(TN + FP) * math.factorial(TN + FN) + part3 = math.factorial(n) * math.factorial(TP) * \ + math.factorial(FP) * math.factorial(FN) * math.factorial(TN) + return math.log((n ** 3) / (2 * math.pi * part1)) + \ + 2 * math.log(part3 / part2) + except Exception: + return "None" + + +def Goodall_calc(TP, FP, FN, TN): + """ + Calculate Goodall similarity. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Goodall similarity as float + """ + try: + n = TP + FP + FN + TN + part1 = math.sqrt((TP + TN) / n) + return (2 / math.pi) * math.asin(part1) + except Exception: + return "None" + + +def GoodmanKruskalLambda_calc(TP, FP, FN, TN): + """ + Calculate Goodman & Kruskal's Lambda similarity. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Goodman & Kruskal's Lambda similarity as float + """ + try: + n = TP + FP + FN + TN + part1 = max(TP, FP) + max(FN, TN) + max(TP, FN) + max(FP, TN) + part2 = max(TP + FP, FN + TN) + max(TP + FN, FP + TN) + return (0.5 * (part1 - part2)) / (n - 0.5 * part2) + except Exception: + return "None" + + +def GoodmanKruskalLambdaR_calc(TP, FP, FN, TN): + """ + Calculate Goodman & Kruskal Lambda-r correlation. 
+ + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Goodman & Kruskal Lambda-r correlation as float + """ + try: + n = TP + FP + FN + TN + part1 = 0.5 * (max(TP + FP, FN + TN) + max(TP + FN, FP + TN)) + return (TP + TN - part1) / (n - part1) + except Exception: + return "None" + + +def GuttmanLambdaA_calc(TP, FP, FN, TN): + """ + Calculate Guttman's Lambda A similarity. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Guttman's Lambda A similarity as float + """ + try: + n = TP + FP + FN + TN + part1 = max(TP, FN) + max(FP, TN) + part2 = max(TP + FP, FN + TN) + return (part1 - part2) / (n - part2) + except Exception: + return "None" + + +def GuttmanLambdaB_calc(TP, FP, FN, TN): + """ + Calculate Guttman's Lambda B similarity. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Guttman's Lambda B similarity as float + """ + try: + n = TP + FP + FN + TN + part1 = max(TP, FP) + max(FN, TN) + part2 = max(TP + FN, FP + TN) + return (part1 - part2) / (n - part2) + except Exception: + return "None" + + +def Hamann_calc(TP, FP, FN, TN): + """ + Calculate Hamann correlation. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Hamann correlation as float + """ + try: + n = TP + FP + FN + TN + + return (TP + TN - FP - FN) / n + except Exception: + return "None" + + +def HarrisLahey_calc(TP, FP, FN, TN): + """ + Calculate Harris & Lahey similarity. 
+ + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Harris & Lahey similarity as float + """ + try: + n = TP + FP + FN + TN + part1 = TP / (TP + FP + FN) + part2 = (2 * TN + FP + FN) / (2 * n) + part3 = TN / (TN + FP + FN) + part4 = (2 * TP + FP + FN) / (2 * n) + return part1 * part2 + part3 * part4 + except Exception: + return "None" + + +def HawkinsDotson_calc(TP, FP, FN, TN): + """ + Calculate Hawkins & Dotson similarity. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Hawkins & Dotson similarity as float + """ + try: + return 0.5 * ((TP / (TP + FP + FN)) + (TN / (TN + FN + FP))) + except Exception: + return "None" + + +def KendallTau_calc(TP, FP, FN, TN): + """ + Calculate Kendall's Tau correlation. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Kendall's Tau correlation as float + """ + try: + n = TP + FP + FN + TN + return (2 * (TP + TN - FP - FN)) / (n * (n - 1)) + except Exception: + return "None" + + +def KentFosterI_calc(TP, FP, FN, TN): + """ + Calculate Kent & Foster I similarity. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Kent & Foster I similarity as float + """ + try: + part1 = ((TP + FP) * (TP + FN)) / (TP + FP + FN) + return (TP - part1) / (TP - part1 + FP + FN) + except Exception: + return "None" + + +def KentFosterII_calc(TP, FP, FN, TN): + """ + Calculate Kent & Foster II similarity. 
+ + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Kent & Foster II similarity as float + """ + try: + part1 = ((TN + FP) * (TN + FN)) / (TN + FP + FN) + return (TN - part1) / (TN - part1 + FP + FN) + except Exception: + return "None" + + DISTANCE_MAPPER = { DistanceType.AMPLE: AMPLE_calc, DistanceType.Anderberg: Anderberg_calc, @@ -710,4 +1248,27 @@ def ConsonniTodeschiniV_calc(TP, FP, FN, TN): DistanceType.ConsonniTodeschiniIII: ConsonniTodeschiniIII_calc, DistanceType.ConsonniTodeschiniIV: ConsonniTodeschiniIV_calc, DistanceType.ConsonniTodeschiniV: ConsonniTodeschiniV_calc, + DistanceType.Dennis: Dennis_calc, + DistanceType.Digby: Digby_calc, + DistanceType.Dispersion: Dispersion_calc, + DistanceType.Doolittle: Doolittle_calc, + DistanceType.Eyraud: Eyraud_calc, + DistanceType.FagerMcGowan: FagerMcGowan_calc, + DistanceType.Faith: Faith_calc, + DistanceType.FleissLevinPaik: FleissLevinPaik_calc, + DistanceType.ForbesI: ForbesI_calc, + DistanceType.ForbesII: ForbesII_calc, + DistanceType.Fossum: Fossum_calc, + DistanceType.GilbertWells: GilbertWells_calc, + DistanceType.Goodall: Goodall_calc, + DistanceType.GoodmanKruskalLambda: GoodmanKruskalLambda_calc, + DistanceType.GoodmanKruskalLambdaR: GoodmanKruskalLambdaR_calc, + DistanceType.GuttmanLambdaA: GuttmanLambdaA_calc, + DistanceType.GuttmanLambdaB: GuttmanLambdaB_calc, + DistanceType.Hamann: Hamann_calc, + DistanceType.HarrisLahey: HarrisLahey_calc, + DistanceType.HawkinsDotson: HawkinsDotson_calc, + DistanceType.KendallTau: KendallTau_calc, + DistanceType.KentFosterI: KentFosterI_calc, + DistanceType.KentFosterII: KentFosterII_calc } diff --git a/pycm/pycm_output.py b/pycm/pycm_output.py index e3e1a541..93451031 100644 --- a/pycm/pycm_output.py +++ b/pycm/pycm_output.py @@ -519,5 +519,5 @@ def online_help(param=None, alt_link=False): print('Example : online_help("J") or 
online_help(2)\n') for index, item in enumerate(params_link_keys): print(str(index + 1) + "-" + item) - except Exception: # pragma: no cover + except Exception: # pragma: no cover print("Error in online help") diff --git a/pycm/pycm_overall_func.py b/pycm/pycm_overall_func.py index c83bdcbc..af46ed60 100644 --- a/pycm/pycm_overall_func.py +++ b/pycm/pycm_overall_func.py @@ -1036,7 +1036,8 @@ def overall_statistics(**kwargs): result["SOA6(Matthews)"] = MCC_analysis(result["Overall MCC"]) result["SOA7(Lambda A)"] = lambda_analysis(result["Lambda A"]) result["SOA8(Lambda B)"] = lambda_analysis(result["Lambda B"]) - result["SOA9(Krippendorff Alpha)"] = alpha_analysis(result["Krippendorff Alpha"]) + result["SOA9(Krippendorff Alpha)"] = alpha_analysis( + result["Krippendorff Alpha"]) result["SOA10(Pearson C)"] = pearson_C_analysis(result["Pearson C"]) result["FPR Macro"] = complement(result["TNR Macro"]) result["FNR Macro"] = complement(result["TPR Macro"]) diff --git a/pycm/pycm_param.py b/pycm/pycm_param.py index 54b4849e..35a7b483 100644 --- a/pycm/pycm_param.py +++ b/pycm/pycm_param.py @@ -772,7 +772,7 @@ "Medium": "LawnGreen", "Strong": "Green", "None": "White"} - } +} BENCHMARK_LIST = list(BENCHMARK_COLOR.keys()) diff --git a/pycm/pycm_util.py b/pycm/pycm_util.py index 6252ca51..c13748b3 100644 --- a/pycm/pycm_util.py +++ b/pycm/pycm_util.py @@ -719,6 +719,20 @@ def thresholds_calc(probs): return thresholds +def char_num_transformer(input_item): + """ + Transform the input string to a proper key for char-num sorting. + + :param input_item: input item + :type input_item: str + :return: key as tuple + """ + return [(input_item, False, False) if not re.findall(r'\d+', input_item) + else (input_item[:re.search(r'\d+', input_item).start()], + int(re.findall(r'\d+', input_item)[0]), + input_item[re.search(r'\d+', input_item).end():])] + + def sort_char_num(input_list): """ Sort a list of strings first alphabetically and then numerically. 
@@ -727,8 +741,4 @@ def sort_char_num(input_list): :type input_list: iterable :return: a sorted list of strings """ - sort_by = lambda x: [(x, False, False) if not re.findall(r'\d+', x) - else (x[:re.search(r'\d+', x).start()], - int(re.findall(r'\d+', x)[0]), - x[re.search(r'\d+', x).end():])] - return sorted(input_list, key=sort_by) + return sorted(input_list, key=char_num_transformer)