Calculate exact null distribution #85

Closed

Conversation

afermg (Contributor) commented Feb 22, 2025

I was going to give up after #84, but it actually performs very well on real datasets over at the map repo.

afermg (Contributor, Author) commented Feb 22, 2025

The last bottleneck I've found as data size increases is building the ranked lists. It is a much less important bottleneck, but the optimisations seem simple enough. The snippet below shows that this method matches what the current build_rank_lists does, up to the ordering of tied similarities.

import duckdb
import numpy as np
from copairs.map.average_precision import build_rank_lists

def build_rank_duckdb(pos_pairs, neg_pairs, pos_sims, neg_sims):
    pos_pairs, neg_pairs, pos_sims, neg_sims = pos_pairs.T, neg_pairs.T, pos_sims.T, neg_sims.T
    with duckdb.connect(":memory:"):
        # Combine relevance labels: 1 for positive pairs and 0 for negative pairs
        query = (
            "SELECT *,{val} AS label"
            " FROM {var}_pairs"
            " POSITIONAL JOIN (SELECT #1 AS sim FROM {var}_sims)"
        )
        
        pos_table = duckdb.sql(query.format(var="pos", val=1))
        neg_table = duckdb.sql(query.format(var="neg", val=0))

        joint = pos_table.union(neg_table)

        # Pivot to have all indices in one column
        pivot_query = (
            "UNPIVOT joint"
            " ON column0,column1"
            " INTO NAME col VALUE ix"
        )
        pivoted = duckdb.sql(pivot_query)

        # Sort first by similarity and then by index number
        sort_query = (
            "SELECT label"
            " FROM pivoted"
            " ORDER BY ix ASC,"
            "          sim DESC"
        )
        rel_k = duckdb.sql(sort_query)

        # Count
        count_query = (
            "SELECT ix,COUNT(ix) AS counts"
            " FROM pivoted"
            " GROUP BY ix"
            " ORDER BY ix ASC"
        )
        counted = duckdb.sql(count_query).fetchnumpy()
        
    return counted["ix"],rel_k.fetchnumpy()["label"],counted["counts"].astype(np.uint32)


pos_pairs = np.array([[1,2], [2,3], [4,5]])
pos_sims = np.array([0.99,0.5,0.2])
neg_pairs = pos_pairs + 1
neg_sims = 1-pos_sims

print("Original method, results:")
print("ix: {}\nrel: k{}\ncounts: {}".format(*build_rank_lists(pos_pairs, neg_pairs,pos_sims, neg_sims)))
print("New method, results:")
print("ix: {}\nrel: k{}\ncounts: {}".format(*build_rank_duckdb(pos_pairs, neg_pairs,pos_sims, neg_sims)))
"""
Original method, results:
ix: [1 2 3 4 5 6]
rel: [1 1 1 0 1 0 0 0 1 0 1 0]
counts: [1 3 3 2 2 1]
New method, results:
ix: [1 2 3 4 5 6]
rel: [1 1 1 0 0 1 0 0 1 0 1 0]
counts: [1 3 3 2 2 1]
"""

afermg (Contributor, Author) commented Feb 22, 2025

The current implementation scales better than a DuckDB one. I presume it is due to the number of operations and re-sortings that we need to do in DuckDB. So I will leave build_rank_lists untouched, but at least now we know.

setup = (
    """
import duckdb
import numpy as np
from copairs.map.average_precision import build_rank_lists
from timeit import Timer

pos_pairs = np.random.randint({max_n}, size=({npos},2))
neg_pairs = np.random.randint({max_n}, size=({nneg},2))
pos_sims = np.random.random({npos})
neg_sims = np.random.random({nneg})

def build_rank_duckdb(pos_pairs, neg_pairs, pos_sims, neg_sims):
    pos_pairs, neg_pairs, pos_sims, neg_sims = pos_pairs.T, neg_pairs.T, pos_sims.T, neg_sims.T
    with duckdb.connect(":memory:"):
        # Combine relevance labels: 1 for positive pairs and 0 for negative pairs
        query = (
            "SELECT *,{{val}} AS label"
            " FROM {{var}}_pairs"
            " POSITIONAL JOIN (SELECT #1 AS sim FROM {{var}}_sims)"
        )
        
        pos_table = duckdb.sql(query.format(var="pos", val=1))
        neg_table = duckdb.sql(query.format(var="neg", val=0))

        joint = pos_table.union(neg_table)

        # Pivot to have all indices in one column
        pivot_query = (
            "UNPIVOT joint"
            " ON column0,column1"
            " INTO NAME col VALUE ix"
        )
        pivoted = duckdb.sql(pivot_query)

        # Sort first by similarity and then by index number
        sort_query = (
            "SELECT label"
            " FROM pivoted"
            " ORDER BY ix ASC,"
            "          sim DESC"
        )
        rel_k = duckdb.sql(sort_query)

        # Count
        count_query = (
            "SELECT ix,COUNT(ix) AS counts"
            " FROM pivoted"
            " GROUP BY ix"
            " ORDER BY ix ASC"
        )
        counted = duckdb.sql(count_query).fetchnumpy()
        
    return counted["ix"],rel_k.fetchnumpy()["label"],counted["counts"].astype(np.uint32)

    """
)

from itertools import product

max_n = [int(10**i) for i in range(2,8)]
npairs = [int(10**i) for i in range(3,10)]

n_times = 3
n_repeats = 5
for npos,nneg, max_n in product(npairs,npairs,max_n):
    for fn in (
            "build_rank_duckdb(pos_pairs, neg_pairs, pos_sims, neg_sims)",
            "build_rank_lists(pos_pairs, neg_pairs, pos_sims, neg_sims)",
    ):
            times = Timer(fn, setup=setup.format(npos=npos,nneg=nneg, max_n=max_n)).repeat(n_times, n_repeats)
            avg = sum(times) / n_repeats
            print(f"{fn.split('(')[0]},{npos=},{nneg=},{max_n} has an avg time of {avg:0.3f} secs.")

"""
build_rank_duckdb,npos=1000,nneg=1000,100 has an avg time of 0.069 secs.
build_rank_lists,npos=1000,nneg=1000,100 has an avg time of 0.001 secs.
build_rank_duckdb,npos=1000,nneg=1000,1000 has an avg time of 0.068 secs.
build_rank_lists,npos=1000,nneg=1000,1000 has an avg time of 0.001 secs.
build_rank_duckdb,npos=1000,nneg=1000,10000 has an avg time of 0.071 secs.
build_rank_lists,npos=1000,nneg=1000,10000 has an avg time of 0.001 secs.
build_rank_duckdb,npos=1000,nneg=1000,100000 has an avg time of 0.070 secs.
build_rank_lists,npos=1000,nneg=1000,100000 has an avg time of 0.001 secs.
build_rank_duckdb,npos=1000,nneg=1000,1000000 has an avg time of 0.070 secs.
build_rank_lists,npos=1000,nneg=1000,1000000 has an avg time of 0.001 secs.
build_rank_duckdb,npos=1000,nneg=1000,10000000 has an avg time of 0.070 secs.
build_rank_lists,npos=1000,nneg=1000,10000000 has an avg time of 0.001 secs.
build_rank_duckdb,npos=1000,nneg=10000,100 has an avg time of 0.114 secs.
build_rank_lists,npos=1000,nneg=10000,100 has an avg time of 0.006 secs.
build_rank_duckdb,npos=1000,nneg=10000,1000 has an avg time of 0.113 secs.
build_rank_lists,npos=1000,nneg=10000,1000 has an avg time of 0.007 secs.
build_rank_duckdb,npos=1000,nneg=10000,10000 has an avg time of 0.115 secs.
build_rank_lists,npos=1000,nneg=10000,10000 has an avg time of 0.008 secs.
build_rank_duckdb,npos=1000,nneg=10000,100000 has an avg time of 0.119 secs.
build_rank_lists,npos=1000,nneg=10000,100000 has an avg time of 0.007 secs.
build_rank_duckdb,npos=1000,nneg=10000,1000000 has an avg time of 0.119 secs.
build_rank_lists,npos=1000,nneg=10000,1000000 has an avg time of 0.007 secs.
build_rank_duckdb,npos=1000,nneg=10000,10000000 has an avg time of 0.119 secs.
build_rank_lists,npos=1000,nneg=10000,10000000 has an avg time of 0.007 secs.
build_rank_duckdb,npos=1000,nneg=100000,100 has an avg time of 0.555 secs.
build_rank_lists,npos=1000,nneg=100000,100 has an avg time of 0.069 secs.
build_rank_duckdb,npos=1000,nneg=100000,1000 has an avg time of 0.550 secs.
build_rank_lists,npos=1000,nneg=100000,1000 has an avg time of 0.081 secs.
build_rank_duckdb,npos=1000,nneg=100000,10000 has an avg time of 0.569 secs.
build_rank_lists,npos=1000,nneg=100000,10000 has an avg time of 0.094 secs.
build_rank_duckdb,npos=1000,nneg=100000,100000 has an avg time of 0.591 secs.
build_rank_lists,npos=1000,nneg=100000,100000 has an avg time of 0.098 secs.
build_rank_duckdb,npos=1000,nneg=100000,1000000 has an avg time of 0.614 secs.
build_rank_lists,npos=1000,nneg=100000,1000000 has an avg time of 0.103 secs.
build_rank_duckdb,npos=1000,nneg=100000,10000000 has an avg time of 0.621 secs.
build_rank_lists,npos=1000,nneg=100000,10000000 has an avg time of 0.100 secs.
build_rank_duckdb,npos=1000,nneg=1000000,100 has an avg time of 4.839 secs.
build_rank_lists,npos=1000,nneg=1000000,100 has an avg time of 0.813 secs.
build_rank_duckdb,npos=1000,nneg=1000000,1000 has an avg time of 4.894 secs.
"""

@@ -531,7 +532,7 @@ def get_null_dists(
     # Function to generate null distributions for each configuration
     def par_func(i):
         num_pos, total = confs[i]
-        null_dists[i] = null_dist_cached(num_pos, total, seeds[i], null_size, cache_dir)
+        null_dists[i] = get_random_ap(total, num_pos)
alxndrkalinin (Collaborator) commented Feb 23, 2025

I'm a bit confused by the setting: we need to get a whole distribution here, while get_random_ap returns a single score.
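
For context, one way to see the gap: a p-value needs many null AP scores per (num_pos, total) configuration, whereas an expected value is a single number. Below is a minimal Monte Carlo sketch of such a null, with a hypothetical helper name; this is not the copairs null_dist_cached implementation.

import numpy as np

def sample_ap_null(total, num_pos, null_size, seed=0):
    # Shuffle num_pos relevant labels among `total` ranks and compute the AP
    # of each random ranking; the collection of scores is the null distribution.
    rng = np.random.default_rng(seed)
    rel = np.zeros(total, dtype=bool)
    rel[:num_pos] = True
    ranks = np.arange(1, total + 1)
    null = np.empty(null_size)
    for i in range(null_size):
        perm = rng.permutation(rel)
        precision_at_k = np.cumsum(perm) / ranks
        null[i] = precision_at_k[perm].mean()
    return null

null = sample_ap_null(total=100, num_pos=10, null_size=10_000)
p_value = (np.sum(null >= 0.35) + 1) / (len(null) + 1)  # usual permutation-style p-value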

afermg (Contributor, Author) replied
My bad, it already calculates the exact expected random average precision for M choose n; the p-value should then probably be calculated in a different way, not as "the proportion of null scores >= observed score".
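
If the goal were an exact null rather than only its expectation, one option (hypothetical, not what this PR or copairs does) is to enumerate all C(total, num_pos) placements of the positives among the ranks and compute the AP of each; the p-value is then the exact proportion of arrangements scoring at or above the observed AP. This is only feasible for small totals.

from itertools import combinations

import numpy as np

def exact_ap_null(total, num_pos):
    # Enumerate every C(total, num_pos) arrangement of relevant items and
    # compute its AP. Hypothetical brute-force helper, for illustration only.
    ranks = np.arange(1, total + 1)
    aps = []
    for hits in combinations(range(total), num_pos):
        rel = np.zeros(total, dtype=bool)
        rel[list(hits)] = True
        precision_at_k = np.cumsum(rel) / ranks
        aps.append(precision_at_k[rel].mean())
    return np.asarray(aps)

null = exact_ap_null(total=10, num_pos=3)  # 120 arrangements
expected_ap = null.mean()                  # exact expected random AP
p_value = np.mean(null >= 0.8)             # exact p-value for an observed AP of 0.8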

afermg (Contributor, Author) commented Feb 26, 2025

After a chat with @alxndrkalinin, it turns out that getting the expected AP is not that useful. Will close the issue in favour of the current implementation.

afermg closed this on Feb 26, 2025