plot-thresholds

#!/usr/bin/python
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
import argparse
import time
from bisect import bisect_left

import colour
from munge import data_to_clusters, array_to_link_pairs
from munge import write_normalised_png
from meta import read_info_json, CLUSTERINESS

from eval import print_links
from eval import load_clustering_json, load_ranking_json, calc_map

from interpret import find_n_links, find_n_clusters
from interpret import apply_interpret_options
from interpret import add_interpret_options, find_clusteriness_anchors
from interpret import clusteriness_to_threshold, threshold_to_clusteriness

from language import TRAINING_CORPUS


def latex_diff(c_score, score):
    """Format ``c_score - score`` with three decimals for a LaTeX table cell.

    When ``c_score`` is strictly greater than ``score`` the number is wrapped
    in a LaTeX bold command; otherwise the bare number is returned.
    """
    delta = '%.3f' % (c_score - score)
    improved = c_score > score
    if not improved:
        return delta
    return '\\textbf{' + delta + '}'


def calc_fbcubed(truth, clusters):
    """Return the B-cubed F-score of `clusters` measured against `truth`.

    truth:    mapping from each document to the set of documents that make
              up its true cluster (indexed as ``truth[d]``).
    clusters: iterable of sets of documents (each must support ``&`` with
              the truth sets and ``len``).

    For every document the per-item precision is the overlap between its
    predicted and true cluster divided by the predicted cluster size, and
    the per-item recall is the same overlap divided by the true cluster
    size.  The result is the harmonic mean of the averaged precision and
    the averaged recall.

    Returns 0.0 when there is nothing to score (no documents) or when the
    summed precision or recall is zero, instead of dividing by zero.

    Raises KeyError if a clustered document is missing from `truth`.
    """
    precision_sum = 0.0
    recall_sum = 0.0
    n_docs = 0
    for cluster in clusters:
        cluster_size = len(cluster)
        n_docs += cluster_size
        for d in cluster:
            tcluster = truth[d]
            correct = float(len(cluster & tcluster))
            precision_sum += correct / cluster_size
            recall_sum += correct / len(tcluster)

    # An empty clustering (or one with no overlap at all) has no meaningful
    # harmonic mean; treat it as the worst possible score.
    if not precision_sum or not recall_sum:
        return 0.0

    # n_docs / sum is the reciprocal of the mean precision (resp. recall),
    # so this is 2 / (1/P + 1/R).
    inv_precision = n_docs / precision_sum
    inv_recall = n_docs / recall_sum
    return 2.0 / (inv_precision + inv_recall)


def main():
    parser = argparse.ArgumentParser()
    add_interpret_options(parser, corpus_dir=TRAINING_CORPUS)
    parser.add_argument('-g', '--ground-truth-dir',
                        help='cluster ground truth (json)')

    parser.add_argument('--n-clusters', action='store_true',
                        help='plot the number of clusters at each threshold')

    parser.add_argument('--n-links', action='store_true',
                        help='plot the number of links at each threshold')

    parser.add_argument('--delta', action='store_true',
                        help='changes in threshold')

    parser.add_argument('-m', '--show-map', action='store_true',
                        help='show mean average precision')

    parser.add_argument('--print-links', action='store_true',
                        help='print links, highlighting truth')

    parser.add_argument('--time', action='store_true',
                        help=('show no figure, print elapsed '
                              'time for first problem'))

    parser.add_argument('--no-plot', action='store_true',
                        help="don't actually plot")

    parser.add_argument('--no-show', action='store_true',
                        help="plot but don't show (use with --save-as)")

    parser.add_argument('--save-as',
                        help=("Save images to this filename;"
                              "include {p} for problem id"))

    parser.add_argument('--threshold-x', action='store_true',
                        help="use the threshold as X axis")

    parser.add_argument('--n-clusters-x', action='store_true',
                        help="use the number of clusters as X axis")

    parser.add_argument('--no-fbcubed', action='store_true',
                        help="don't do fbcubed even with ground truth")

    parser.add_argument('--png-save-dir',
                        help="save array images here")

    parser.add_argument('--negative-png', action='store_true',
                        help="invert array image colour (white=0)")

    parser.add_argument('--clusteriness', type=float,
                        help="show fbcubed at this clusteriness")

    parser.add_argument('--auto-clusteriness', action='store_true',
                        help="use the pan-cluster default clusteriness")

    parser.add_argument('--annotate', action='store_true',
                        help="show clusteriness anchor points")

    parser.add_argument('--printable', action='store_true',
                        help="optimise for print")

    parser.add_argument('--latex-table', action='store_true',
                        help="make command-line output like a latex table")

    args = parser.parse_args()

    affinities, all_names = apply_interpret_options(args)

    fbcubed_bests = []
    fbcubed_lasts = []
    fbcubed_clusteriness = []
    fbcubed_c = []
    fbcubed_c_score = []
    map_results = []
    true_clusters = None
    true_ranking = None

    if args.printable:
        styles = {
            #'n clusters': {'color': '#ffffff', 'marker': 'o', 'edgecolors': '#333333'},
            'n clusters': {'marker': '+', 'color': '#990000'},
            'thresholds': {'color': 'k', 'marker': '|'},
            'delta': {'color': '#666666', 'marker': 'd'},
            'n links': {'color': '#cccccc', 'marker': '.'},
            'fbcubed': {'color': '#000077', 'marker': 'x'},
            'cliff': {'color': '#999999', 'zorder': -1, 'linestyle': '--'},
            'fbcubed threshold': {'color': '#999999', 'zorder': -1,
                                  'linestyle': ':'},
            'd median': {'color': '#999999', 'zorder': -1, 'linestyle': '--'},
        }
    else:
        styles = {
            'n clusters': {'color': 'g', 'marker': '.'},
            'thresholds': {'color': 'r', 'marker': '.'},
            'delta': {'color': '#00ccff', 'marker': '.'},
            'n links': {'color': '#ffcc00', 'marker': '.'},
            'fbcubed': {'color': 'b', 'marker': '.'},
            'cliff': {'color': '#cccccc', 'zorder': -1},
            'fbcubed threshold': {'color': '#cc0000', 'zorder': -1},
            'd median': {'color': '#cccccc', 'zorder': -1},
        }

    if args.time:
        start_time = time.time()

    if args.latex_table:
        print '\\hline'

    if args.ground_truth_dir:
        info = read_info_json(os.path.dirname(args.ground_truth_dir.rstrip('/')))
        genre_map = {x[0]: x[2] for x in info}
        lang_map = {x[0]: x[1] for x in info}
        clusteriness_map = {}
        if args.auto_clusteriness:
            for pid, lang, genre in info:
                clusteriness_map[pid] = CLUSTERINESS[lang][genre == 'reviews']

        if not args.no_fbcubed:
            true_clusters = {}
            for pid in affinities:
                true_clusters[pid] = {}
                fn = os.path.join(args.ground_truth_dir, pid,
                                  'clustering.json')
                documents, clusters = load_clustering_json(fn)
                for cluster in clusters:
                    for doc in cluster:
                        true_clusters[pid][doc] = cluster

        if args.show_map or args.print_links:
            true_ranking = {}
            for pid in affinities:
                fn = os.path.join(args.ground_truth_dir, pid, 'ranking.json')
                true_ranking[pid] = load_ranking_json(fn)

    elif args.show_map or args.print_links:
        print ("you need to supply --ground-truth-dir for "
               "--show-map or --print_links")
        sys.exit(1)
    else:
        genre_map = {}
        lang_map = {}

    n_problems = len(affinities)

    for pid, data in sorted(affinities.items()):
        if not args.latex_table:
            print "%s%s%s %s" % (colour.RED, pid, colour.C_NORMAL,
                                 genre_map.get(pid, '-'))
        else:
            print "%s %8s & %s &" % (lang_map[pid], genre_map[pid], pid),

        names = all_names[pid]

        if args.png_save_dir:
            fn = "%s/%s.png" % (args.png_save_dir, pid)
            write_normalised_png(data, fn, negative=args.negative_png)

        thresholds = np.unique(data)

        if args.n_clusters or args.n_clusters_x:
            n_clusters = find_n_clusters(data, thresholds)
            n_clusters = np.array(n_clusters) / float(data.shape[0])

        fig = plt.figure(figsize=(12, 10))
        ax = fig.add_subplot(111)
        ax.set_title(pid)
        if args.no_plot:
            def plot(*args, **kwargs):
                pass
        elif args.threshold_x:
            def plot(seq, label):
                ax.scatter(thresholds, seq, label=label, **styles[label])
        elif args.n_clusters_x:
            def plot(seq, label):
                ax.scatter(n_clusters, seq, label=label, **styles[label])
        else:
            def plot(seq, label):
                ax.plot(seq, label=label, **styles[label])
            plot(thresholds, 'thresholds')

        if args.show_map:
            links = array_to_link_pairs(data, names)
            map = calc_map(links, true_ranking[pid])
            if not args.latex_table:
                print "MAP is %f" % map
            else:
                print "%.3f &" % (map),

            ax.set_title("%s: MAP %.3g" % (pid, map))
            map_results.append(map)

        if args.print_links:
            links = array_to_link_pairs(data, names, True)
            print_links(links, true_ranking[pid], names)

        if args.delta:
            deltas = np.diff(thresholds)
            plot([0] + list(deltas), 'delta')

        if args.n_clusters:
            plot(n_clusters, 'n clusters')

        if args.n_links:
            n_links = find_n_links(data, thresholds)
            plot(n_links, 'n links')

        if true_clusters is not None:
            fbcubed = []
            old_clusters = None
            score = None
            best_fbcubed = -1
            best_fbcubed_c = 0
            for t in thresholds:
                clusters = data_to_clusters(data, t, names)
                if clusters != old_clusters:
                    score = calc_fbcubed(true_clusters[pid], clusters)
                    old_clusters = clusters
                    if score > best_fbcubed:
                        best_fbcubed = score
                        best_fbcubed_c = len(clusters)
                        best_fbcubed_t = t
                fbcubed.append(score)
            plot(fbcubed, 'fbcubed')

            clusteriness = threshold_to_clusteriness(data, best_fbcubed_t)
            if not args.latex_table:
                print ("FBCUBED best: %.3f at %d/%d (threshold %.2f, "
                       "clusteriness %.3g, %d%% last)" %
                       (best_fbcubed,
                        best_fbcubed_c,
                        data.shape[0],
                        best_fbcubed_t,
                        clusteriness,
                        0.5 + 100 * best_fbcubed / score))
            else:
                diff = latex_diff(best_fbcubed, score)
                print "%.3f & %.3f & %.2f & %s &" % (score, best_fbcubed,
                                                     clusteriness,
                                                     diff),

            fbcubed_bests.append(best_fbcubed)
            fbcubed_c.append(best_fbcubed_c / float(data.shape[0]))
            fbcubed_lasts.append(score)
            fbcubed_clusteriness.append(clusteriness)
            if args.clusteriness or args.auto_clusteriness:
                c = clusteriness_map.get(pid, args.clusteriness)
                t2 = clusteriness_to_threshold(data, c)
                clusters = data_to_clusters(data, t2, names)
                c_score = calc_fbcubed(true_clusters[pid], clusters)
                fbcubed_c_score.append(c_score)
                if not args.latex_table:
                    colour.print_CYAN("fbcubed at clusteriness %.3g "
                                      "(threshold %.3g): %3g (%d%%)" %
                                      (c, t2, c_score,
                                       0.5 + 100.0 * c_score / score))
                else:
                    diff = latex_diff(c_score, score)
                    print "%.3f & %.2f & %s \\\\" % (c_score, c, diff)

                if args.annotate:
                    cliff, d_median = find_clusteriness_anchors(data)
                    if not args.threshold_x:
                        cliff = bisect_left(thresholds, cliff)
                        d_median = bisect_left(thresholds, d_median)
                        t2 = bisect_left(thresholds, t2)
                    ax.axvline(cliff, 0.05, 0.95, **styles['cliff'])
                    ax.axvline(d_median, 0.25, 0.95, **styles['d median'])
                    ax.axvline(t2, 0.15, 0.95, **styles['fbcubed threshold'])

        if args.time:
            elapsed = time.time() - start_time
            print "%s %s took %.2f seconds" % (pid, data.shape, elapsed)
            sys.exit()

        if not args.no_plot:
            plt.legend(loc='lower right', numpoints=1, frameon=False,
                       borderpad=0)
            if not args.no_show:
                plt.show()
            if args.save_as:
                fn = args.save_as.format(p=pid)
                plt.savefig(fn, dpi=150, transparent=True,
                            frameon=False)

    if args.latex_table:
        sys.exit()

    if true_ranking:
        print "average MAP is        %f" % (sum(map_results) / n_problems)
        print "harmonic mean MAP is  %f" % (n_problems /
                                            sum(1.0 / max(x, 1e-99)
                                                for x in map_results))
        print "geometric mean MAP is %f" % (reduce(float.__mul__, map_results)
                                            ** (1.0 / n_problems))

    if true_clusters is not None:
        ave_best = sum(fbcubed_bests) / n_problems
        print ("average best fbcubed %f at %.1f%% (%d%% last score) "
               "ave. clusteriness %.3g" %
               (ave_best, 0.5 + 100.0 * sum(fbcubed_c) / n_problems,
                0.5 + 100.0 * sum(fbcubed_bests) / sum(fbcubed_lasts),
                sum(fbcubed_clusteriness) / n_problems))
    if fbcubed_c_score:
        print ("average fbcubed at specified clusteriness: %3g (%d%%)" %
               (sum(fbcubed_c_score) / n_problems,
                0.5 + 100.0 * sum(fbcubed_c_score) / sum(fbcubed_lasts)))

# Run only when executed as a script, so importing this file has no
# side effects.
if __name__ == '__main__':
    main()