Skip to content

Commit

Permalink
Merge pull request #33 from tpn/linear-regression-dev
Browse files Browse the repository at this point in the history
Implement linear regression + score/rank coverage type awareness.
  • Loading branch information
tpn authored Jan 19, 2021
2 parents 01af687 + 7bc8587 commit bb811d9
Show file tree
Hide file tree
Showing 27 changed files with 4,027 additions and 311 deletions.
4 changes: 4 additions & 0 deletions include/CompiledPerfectHash.props
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,15 @@
<DefaultPlatformToolset>v142</DefaultPlatformToolset>
</PropertyGroup>

<!--
Disable CodeAnalysis for now; it causes heap exhaustion when compiling
large tables.
<PropertyGroup>
<PGDDirectory>$(SolutionDir)$(Platform)\</PGDDirectory>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<RunCodeAnalysis>true</RunCodeAnalysis>
</PropertyGroup>
-->

<ItemDefinitionGroup>
<ClCompile>
Expand Down
34 changes: 33 additions & 1 deletion include/PerfectHash.h
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,8 @@ typedef const UNICODE_STRING *PCUNICODE_STRING;

typedef _Null_terminated_ CONST CHAR *PCSZ;

typedef DOUBLE *PDOUBLE;

//
// Define a helper union that allows easy access to the bytes and shorts
// making up a ULONG. This is predominantly used by the hash routines that
Expand Down Expand Up @@ -3171,6 +3173,12 @@ IsValidPerfectHashTableCreateParameterId(
ENTRY(NumberOfEmptyPages, Lowest, <) \
ENTRY(NumberOfEmptyLargePages, Lowest, <) \
ENTRY(NumberOfEmptyCacheLines, Lowest, <) \
ENTRY(NumberOfUsedPages, Highest, >) \
ENTRY(NumberOfUsedLargePages, Highest, >) \
ENTRY(NumberOfUsedCacheLines, Highest, >) \
ENTRY(NumberOfUsedPages, Lowest, <) \
ENTRY(NumberOfUsedLargePages, Lowest, <) \
ENTRY(NumberOfUsedCacheLines, Lowest, <) \
ENTRY(MaxGraphTraversalDepth, Highest, >) \
ENTRY(MaxGraphTraversalDepth, Lowest, <) \
ENTRY(TotalGraphTraversals, Highest, >) \
Expand All @@ -3188,7 +3196,15 @@ IsValidPerfectHashTableCreateParameterId(
ENTRY(NumberOfCacheLinesUsedByKeysSubset, Lowest, <) \
ENTRY(NumberOfPagesUsedByKeysSubset, Highest, >) \
ENTRY(NumberOfLargePagesUsedByKeysSubset, Highest, >) \
LAST_ENTRY(NumberOfCacheLinesUsedByKeysSubset, Highest, >)
ENTRY(NumberOfCacheLinesUsedByKeysSubset, Highest, >) \
ENTRY(PredictedNumberOfFilledCacheLines, Lowest, <) \
ENTRY(PredictedNumberOfFilledCacheLines, Highest, >) \
ENTRY(Slope, Lowest, <) \
ENTRY(Slope, Highest, >) \
ENTRY(Score, Lowest , <) \
ENTRY(Score, Highest , >) \
ENTRY(Rank, Lowest , <) \
LAST_ENTRY(Rank, Highest , >)

#define BEST_COVERAGE_TYPE_TABLE_ENTRY(ENTRY) \
BEST_COVERAGE_TYPE_TABLE(ENTRY, ENTRY, ENTRY)
Expand Down Expand Up @@ -3282,6 +3298,22 @@ DoesBestCoverageTypeUseValueArray(
);
}

//
// Returns TRUE if the given best coverage type's comparison value is a
// DOUBLE (slope, predicted number of filled cache lines, or rank), FALSE
// otherwise.
//
// NOTE(review): the Lowest/Highest Score coverage types are absent here —
// presumably Score is integral, not DOUBLE; verify against the coverage
// structure definition.
//

FORCEINLINE
BOOLEAN
DoesBestCoverageTypeUseDouble(
    _In_ PERFECT_HASH_TABLE_BEST_COVERAGE_TYPE_ID Type
    )
{
    return (
        Type == BestCoverageTypeLowestSlopeId ||
        Type == BestCoverageTypeHighestSlopeId ||
        Type == BestCoverageTypeLowestPredictedNumberOfFilledCacheLinesId ||
        Type == BestCoverageTypeHighestPredictedNumberOfFilledCacheLinesId ||
        Type == BestCoverageTypeLowestRankId ||
        Type == BestCoverageTypeHighestRankId
    );
}

typedef struct _PERFECT_HASH_TABLE_CREATE_PARAMETER {
PERFECT_HASH_TABLE_CREATE_PARAMETER_ID Id;
ULONG Padding;
Expand Down
393 changes: 222 additions & 171 deletions include/PerfectHashEvents.h

Large diffs are not rendered by default.

58 changes: 48 additions & 10 deletions python/perfecthash/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,7 @@
'ProcessID',
'ThreadID',
'CPU',
'ActivityId',
'KeysProcessed',
'NumberOfKeys',
'LastKey',
Expand Down Expand Up @@ -425,6 +426,7 @@
'ProcessID',
'ThreadID',
'CPU',
'ActivityId',
'KeysProcessed',
'NumberOfKeys',
'LastKey',
Expand Down Expand Up @@ -481,6 +483,7 @@
'ProcessID',
'ThreadID',
'CPU',
'ActivityId',
'NumberOfKeys',
'Cycles',
'Microseconds',
Expand All @@ -498,6 +501,7 @@
' etw:Related ActivityId,'
' etw:UserSid,'
' etw:SessionId,'
' KeysFileName,'
' Attempt,'
' ElapsedMilliseconds,'
' CoverageType,'
Expand Down Expand Up @@ -551,7 +555,12 @@
' NumberOfAssignedPerCacheLineCounts_13,'
' NumberOfAssignedPerCacheLineCounts_14,'
' NumberOfAssignedPerCacheLineCounts_15,'
' NumberOfAssignedPerCacheLineCounts_16'
' NumberOfAssignedPerCacheLineCounts_16,'
' Slope,'
' Intercept,'
' CorrelationCoefficient,'
' Score,'
' Rank'
)

FOUND_GRAPH_CSV_HEADER_SHARED = (
Expand All @@ -564,6 +573,7 @@
'RelatedActivityId',
'UserSid',
'SessionId',
'KeysFileName',
'Attempt',
'ElapsedMilliseconds',
'CoverageType',
Expand Down Expand Up @@ -618,6 +628,11 @@
'NumberOfAssignedPerCacheLineCounts_14',
'NumberOfAssignedPerCacheLineCounts_15',
'NumberOfAssignedPerCacheLineCounts_16',
'Slope',
'Intercept',
'CorrelationCoefficient',
'Score',
'Rank',
)

FOUND_GRAPH_CSV_HEADER_SLIM_SHARED = (
Expand All @@ -626,6 +641,8 @@
'ProcessID',
'ThreadID',
'CPU',
'ActivityId',
'KeysFileName',
'Attempt',
'ElapsedMilliseconds',
'CoverageType',
Expand Down Expand Up @@ -680,6 +697,11 @@
'NumberOfAssignedPerCacheLineCounts_14',
'NumberOfAssignedPerCacheLineCounts_15',
'NumberOfAssignedPerCacheLineCounts_16',
'Slope',
'Intercept',
'CorrelationCoefficient',
'Score',
'Rank',
)

# FoundNewBestGraph
Expand Down Expand Up @@ -732,6 +754,7 @@
' etw:Related ActivityId,'
' etw:UserSid,'
' etw:SessionId,'
' KeysFileName,'
' Attempt,'
' NumberOfKeys,'
' NumberOfVertices'
Expand All @@ -747,6 +770,7 @@
'RelatedActivityId',
'UserSid',
'SessionId',
'KeysFileName',
'Attempt',
'NumberOfKeys',
'NumberOfVertices',
Expand All @@ -759,6 +783,8 @@
'ProcessID',
'ThreadID',
'CPU',
'ActivityId',
'KeysFileName',
'Attempt',
'NumberOfKeys',
'NumberOfVertices',
Expand All @@ -778,6 +804,7 @@
' etw:Related ActivityId,'
' etw:UserSid,'
' etw:SessionId,'
' KeysFileName,'
' Attempt,'
' NumberOfKeys,'
' NumberOfVertices,'
Expand All @@ -796,6 +823,7 @@
'RelatedActivityId',
'UserSid',
'SessionId',
'KeysFileName',
'Attempt',
'NumberOfKeys',
'NumberOfVertices',
Expand All @@ -811,6 +839,8 @@
'ProcessID',
'ThreadID',
'CPU',
'ActivityId',
'KeysFileName',
'Attempt',
'NumberOfKeys',
'NumberOfVertices',
Expand Down Expand Up @@ -856,6 +886,7 @@
'ProcessID',
'ThreadID',
'CPU',
'ActivityId',
'BytesRequested',
)

Expand Down Expand Up @@ -898,6 +929,7 @@
'ProcessID',
'ThreadID',
'CPU',
'ActivityId',
'BytesRequested',
'Result',
)
Expand Down Expand Up @@ -1900,35 +1932,35 @@ def get_yyyy_mm_dd_subdirs(dirname):

def get_csv_files(directory):
    """Return the set of PerfectHashBulkCreate*.csv file paths found
    anywhere under `directory` (searched recursively).
    """
    import glob
    # iglob already yields each match once; wrap directly in set() instead
    # of an intermediate generator expression.
    return set(
        glob.iglob(
            f'{directory}/**/PerfectHashBulkCreate*.csv',
            recursive=True
        )
    )

def get_all_bulk_create_parquet_files(directory):
    """Return the set of PerfectHashBulkCreate*.parquet file paths found
    under `directory` (recursively) or directly within it, excluding any
    path containing the substring 'failed'.
    """
    import glob
    recursive_matches = {
        f for f in glob.iglob(
            f'{directory}/**/PerfectHashBulkCreate*.parquet',
            recursive=True
        ) if 'failed' not in f
    }
    # Also pick up files sitting directly in `directory`; the union with
    # the recursive pass dedupes any overlap.
    toplevel_matches = {
        f for f in glob.iglob(
            f'{directory}/PerfectHashBulkCreate*.parquet',
            recursive=False
        ) if 'failed' not in f
    }
    return recursive_matches | toplevel_matches

def get_best_bulk_create_parquet_files(directory):
    """Return the set of PerfectHashBulkCreateBest*.parquet file paths
    found anywhere under `directory` (recursively), excluding any path
    containing the substring 'failed'.
    """
    import glob
    return {
        f for f in glob.iglob(
            f'{directory}/**/PerfectHashBulkCreateBest*.parquet',
            recursive=True
        ) if 'failed' not in f
    }

def convert_csv_to_parquet(path, base_research_dir, out=None):
if not out:
Expand Down Expand Up @@ -2289,6 +2321,7 @@ def process_xperf_perfecthash_csv(path, out=None):
'ProcessID',
'ThreadID',
'CPU',
'ActivityId',
'NumberOfKeys',
'NumberOfVertices',
'NumberOfEmptyVertices',
Expand All @@ -2310,8 +2343,9 @@ def process_xperf_perfecthash_csv(path, out=None):
'ProcessID',
'ThreadID',
'CPU',
'ActivityId',
'BytesRequested',
'Success',
'Result',
]]

df.to_csv(path)
Expand All @@ -2322,6 +2356,7 @@ def process_xperf_perfecthash_csv(path, out=None):
#===============================================================================

def get_cache_line_coverage(df):
import numpy as np
count = df.NewBestGraphCount.values[0]
keys = [
f'BestGraph{i}_CountOfCacheLinesWithNumberOfAssigned_{n}'
Expand All @@ -2335,8 +2370,11 @@ def get_cache_line_coverage(df):
return (keys, values, attempts, columns)

def ridgeline_plot(df):
import joypy
import pandas as pd
import matplotlib.pyplot as plt
plt.ioff()
from matplotlib import cm
#plt.ioff()
keys_name = df.KeysName.values[0]
hash_func = df.HashFunction.values[0]
best_coverage_type = df.BestCoverageType.values[0]
Expand Down
5 changes: 3 additions & 2 deletions python/perfecthash/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -498,8 +498,9 @@ class PathArg(PathInvariant):

def run(self):

from os.path import basename
from .analysis import convert_csv_to_parquet
convert_csv_to_parquet(self._path, self._out)
convert_csv_to_parquet(self._path, basename(self._path))

class ConvertAllCsvToParquet(InvariantAwareCommand):
"""
Expand All @@ -526,7 +527,7 @@ def run(self):

if path:
from os.path import basename
path = basename(path)
base = basename(path)
else:
path = base

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ DECLARE_BENCHMARK_FULL_CPH_ROUTINE()
for (Count = Iterations; Count != 0; Count--) {

//
// Loop through the entire key set and insert a rotated version of the key.
// Loop through the entire key set and insert a rotated version
// of the key.
//

FOR_EACH_KEY {
Expand All @@ -34,8 +35,8 @@ DECLARE_BENCHMARK_FULL_CPH_ROUTINE()
}

//
// Loop through the entire set again and ensure that lookup returns the
// rotated version.
// Loop through the entire set again and ensure that lookup
// returns the rotated version.
//

FOR_EACH_KEY {
Expand Down Expand Up @@ -72,7 +73,8 @@ DECLARE_BENCHMARK_FULL_CPH_ROUTINE()
for (Count = Iterations; Count != 0; Count--) {

//
// Loop through the entire key set and insert a rotated version of the key.
// Loop through the entire key set and insert a rotated version
// of the key.
//

FOR_EACH_KEY {
Expand All @@ -82,8 +84,8 @@ DECLARE_BENCHMARK_FULL_CPH_ROUTINE()
}

//
// Loop through the entire set again and ensure that lookup returns the
// rotated version.
// Loop through the entire set again and ensure that lookup
// returns the rotated version.
//

FOR_EACH_KEY {
Expand Down Expand Up @@ -116,3 +118,4 @@ DECLARE_BENCHMARK_FULL_CPH_ROUTINE()
return Best;
}

// vim:set ts=8 sw=4 sts=4 tw=80 expandtab :
Loading

0 comments on commit bb811d9

Please sign in to comment.