Skip to content

Commit

Permalink
Merge pull request #15 from Kanaries/feat_subspace_new_alg
Browse files Browse the repository at this point in the history
Feat: subspace new algorithm
  • Loading branch information
ObservedObserver authored Jan 14, 2020
2 parents 32d11f4 + 4cdaac3 commit 19bcc63
Show file tree
Hide file tree
Showing 12 changed files with 203 additions and 57 deletions.
4 changes: 2 additions & 2 deletions packages/frontend/src/workers/cluster.worker.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* eslint no-restricted-globals: 0 */
import { kruskalMST } from 'visual-insights';

const PearsonThreshold = 0.5;
function sum (arr) {
let ans = 0;
let len = arr.length;
Expand All @@ -15,7 +15,7 @@ const cluster = (e) => {
const { spaces, maxGroupNumber } = e.data;
let result = [];
for (let space of spaces) {
const { edgesInMST, groups } = kruskalMST(space.matrix, maxGroupNumber);
const { edgesInMST, groups } = kruskalMST(space.matrix, maxGroupNumber, PearsonThreshold);
let measureGroups = new Map();
for (let i = 0; i < groups.length; i++) {
if (!measureGroups.has(groups[i])) {
Expand Down
16 changes: 11 additions & 5 deletions packages/visual-insights/src/dashboard/index.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import { FieldsFeature, correlation, linearMapPositive } from "../insights/impurity";
import { DataSource, OperatorType } from "../commonTypes";
import cluster, { kruskalMST } from "../insights/cluster";
import cluster from "../insights/cluster";
import aggregate from 'cube-core';
import { normalize, entropy } from "../impurityMeasure";
import { crammersV } from './utils';
import { CrammersVThreshold, PearsonCorrelation } from '../insights/config';

interface DashBoardSpace {
dimensions: string[];
Expand Down Expand Up @@ -48,7 +49,8 @@ export function getDashBoardSubspace (dataSource: DataSource, dimensions: string
const measureGroups = cluster({
matrix: correlationMatrix,
measures,
groupMaxSize: Math.round(measures.length / 6) // todo: make a config: max 6 measures in a dashboard
groupMaxSize: Math.round(measures.length / 6), // todo: make a config: max 6 measures in a dashboard
threshold: PearsonCorrelation.weak
})

const dimCorrelationMatrix = dimensions.map(d => dimensions.map(d => 0));
Expand Down Expand Up @@ -119,7 +121,8 @@ export function getDashBoardView (dashBoardSpace: DashBoardSpace, dataSource: Da
const measureGroups = cluster({
matrix: dashBoardSpace.correlationMatrix,
measures: measures,
groupMaxSize: Math.round(measures.length / 3) // todo: make a config: max 3 measures in a chart
groupMaxSize: Math.round(measures.length / 3), // todo: make a config: max 3 measures in a chart
threshold: PearsonCorrelation.strong
});
for (let group of measureGroups) {
const meaIndexList = group.map(mea => measures.indexOf(mea))
Expand All @@ -145,7 +148,8 @@ export function getDashBoardView (dashBoardSpace: DashBoardSpace, dataSource: Da
matrix: dimensionCorrelationMatrix,
measures: dimensions,
groupMaxSize: 2, // todo: make a config: max 2 dimensions in a chart
limitSize: true
limitSize: true,
threshold: CrammersVThreshold
})

const dimGroupEntropyMatrix = getEntropyMatrix(dimensionGroups, measures, dataSource);
Expand Down Expand Up @@ -173,4 +177,6 @@ function minIndex(arr: number[]) {
}
}
return pos;
}
}

export { crammersV }
128 changes: 96 additions & 32 deletions packages/visual-insights/src/dashboard/utils.ts
Original file line number Diff line number Diff line change
@@ -1,45 +1,109 @@
import { DataSource } from "../commonTypes";

type nestTree = Map<string, Map<string, number>>;
/**
 * Pearson chi-squared statistic of a contingency table stored sparsely.
 *
 * The table is given as a nested map `x -> (y -> count)`; cells that are
 * absent from the map are treated as having an observed count of 0.
 *
 * The statistic is defined over ALL cells of the table:
 *   chi^2 = sum over (x, y) of (O - E)^2 / E,  E = rowSum(x) * colSum(y) / N.
 * A cell with O = 0 still contributes E, so iterating only the stored cells and
 * summing (O - E)^2 / E would underestimate the result. Expanding the square gives
 * the equivalent form
 *   chi^2 = (sum over cells with O > 0 of O^2 / E) - N
 * which only needs the stored cells and therefore keeps the sparse traversal.
 *
 * @param nestTree sparse contingency table, `x -> (y -> observed count)`
 * @param xSet distinct members of the first field (row labels)
 * @param ySet distinct members of the second field (column labels)
 * @returns the chi-squared statistic; 0 when an argument is undefined or the table is empty
 */
export function chiSquared(nestTree: nestTree, xSet: Set<string>, ySet: Set<string>): number {
  if (typeof nestTree === 'undefined' || typeof xSet === 'undefined' || typeof ySet === 'undefined') {
    return 0;
  }
  const rowSums = new Map<string, number>();
  const colSums = new Map<string, number>();
  let totalSum = 0;
  for (const x of xSet) {
    rowSums.set(x, 0);
  }
  for (const y of ySet) {
    colSums.set(y, 0);
  }
  for (const [x, node] of nestTree) {
    for (const [y, counter] of node) {
      // `|| 0` also covers labels that appear in the tree but not in xSet / ySet.
      rowSums.set(x, (rowSums.get(x) || 0) + counter);
      colSums.set(y, (colSums.get(y) || 0) + counter);
      totalSum += counter;
    }
  }
  // Empty table: every expected value would be 0 / 0.
  if (totalSum === 0) {
    return 0;
  }

  let observedTerm = 0;
  for (const [x, node] of nestTree) {
    const rowSum = rowSums.get(x) || 0;
    for (const [y, observed] of node) {
      // A stored zero contributes O^2 / E = 0; skipping it also avoids 0 / 0
      // when its whole row or column is empty.
      if (observed === 0) {
        continue;
      }
      const expected = rowSum * (colSums.get(y) || 0) / totalSum;
      observedTerm += observed ** 2 / expected;
    }
  }
  // Clamp: floating point rounding can push an exact 0 slightly below zero,
  // which would turn into NaN once a caller takes the square root.
  return Math.max(0, observedTerm - totalSum);
}

/**
 * Cramér's V association between two categorical fields of a data source.
 *
 * Builds a sparse contingency table (`x -> (y -> count)`) in a single pass over
 * the records, computes its chi-squared statistic with `chiSquared`, and
 * normalises it as sqrt(chi^2 / (n * min(|X| - 1, |Y| - 1))).
 *
 * @param dataSource records to analyse
 * @param fieldX key of the first field in each record
 * @param fieldY key of the second field in each record
 * @returns Cramér's V; 0 when it is undefined, i.e. the data source is empty or
 *          one of the fields has fewer than two distinct values (the
 *          normalising denominator would be 0 and the result NaN)
 */
export function crammersV(dataSource: DataSource, fieldX: string, fieldY: string): number {
  const xSet = new Set<string>();
  const ySet = new Set<string>();
  const nestTree = new Map<string, Map<string, number>>();
  const len = dataSource.length;
  for (let i = 0; i < len; i++) {
    const record = dataSource[i];
    xSet.add(record[fieldX]);
    ySet.add(record[fieldY]);
    let node = nestTree.get(record[fieldX]);
    if (node === undefined) {
      node = new Map<string, number>();
      nestTree.set(record[fieldX], node);
    }
    node.set(record[fieldY], (node.get(record[fieldY]) || 0) + 1);
  }
  // min(|X| - 1, |Y| - 1) is 0 for a constant field; guard the division so a
  // NaN never reaches the correlation matrix built from this value.
  const freedom = Math.min(xSet.size - 1, ySet.size - 1);
  if (len === 0 || freedom <= 0) {
    return 0;
  }
  const chis = chiSquared(nestTree, xSet, ySet);
  return Math.sqrt(chis / (len * freedom));
}


// can be used for test
// export function crammersV(dataSource: DataSource, fieldX: string, fieldY: string): number {
// const xSet = new Set(dataSource.map(d => d[fieldX]))
// const ySet = new Set(dataSource.map(d => d[fieldY]))
// const xMembers = [...xSet];
// const yMembers = [...ySet];
// let xDict = {};
// let yDict = {};
// for (let i = 0; i < xMembers.length; i++) {
// xDict[xMembers[i]] = i;
// }
// for (let i = 0; i < yMembers.length; i++) {
// yDict[yMembers[i]] = i;
// }
// // let matrix: number[][] = xMembers.map(x => yMembers.map(y => 0));
// let matrix: number[][] = [];
// for (let i = 0; i < xMembers.length; i++) {
// matrix.push([]);
// for (let j = 0; j < yMembers.length; j++) {
// matrix[i].push(0);
// }
// }
// for (let record of dataSource) {
// matrix[xDict[record[fieldX]]][yDict[record[fieldY]]]++;
// }
// const chis = chiSquared(matrix);
// const V = Math.sqrt(chis / (dataSource.length * Math.min(xMembers.length - 1, yMembers.length - 1)))
// return V;
// }

// export function chiSquared(matrix: number[][] = [[]]): number {
// let rowSums = matrix.map(m => 0);
// let colSums = matrix[0].map(m => 0);
// let totalSum = 0;
// for (let i = 0; i < matrix.length; i++) {
// for (let j = 0; j < matrix[i].length; j++) {
// rowSums[i] += matrix[i][j];
// colSums[j] += matrix[i][j];
// totalSum += matrix[i][j];
// }
// }
// let chis = 0;
// for (let i = 0; i < matrix.length; i++) {
// for (let j = 0; j < matrix[i].length; j++) {
// let observed = matrix[i][j];
// let expected = rowSums[i] * colSums[j] / totalSum;
// chis += (observed - expected) ** 2 / expected;
// }
// }
// return chis;
// }
3 changes: 2 additions & 1 deletion packages/visual-insights/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import * as Distribution from './distribution';

import * as ImpurityMeasure from './impurityMeasure';

import getInsightViews, { analysisDimensions, getCombination, clusterMeasures, kruskalMST } from './insights/index';
import getInsightViews, { analysisDimensions, getCombination, clusterMeasures, kruskalMST, getDimSetsBasedOnClusterGroups } from './insights/index';
import * as Cleaner from './cleaner/index';

import * as UnivariateSummary from './univariateSummary/index'
Expand All @@ -29,6 +29,7 @@ export {
Cleaner,
getInsightViews,
getCombination,
getDimSetsBasedOnClusterGroups,
clusterMeasures,
kruskalMST
}
14 changes: 7 additions & 7 deletions packages/visual-insights/src/insights/cluster.ts
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ function unionWithEffect (parents: number[], sizes: number[], n1: number, n2: nu
* @param matrix adjmatrix
* @param groupNumber number of group generated by clustering
*/
function kruskal(matrix: number[][], groupNumber: number): Map<number, number[]> {
function kruskal(matrix: number[][], groupNumber: number, threshold: number | undefined = 0): Map<number, number[]> {
const edges = turnAdjMatrix2List(matrix);
edges.sort((a, b) => b[1] - a[1]);
const parents = matrix.map((m, i) => i);
Expand All @@ -68,7 +68,7 @@ function kruskal(matrix: number[][], groupNumber: number): Map<number, number[]>
parents[i] = find(parents, i)
}
let set = new Set(parents);
if (set.size <= groupNumber){
if (set.size <= groupNumber || edge[1] < threshold){
break;
}
}
Expand Down Expand Up @@ -127,7 +127,6 @@ export function kruskalMSTWithLimitSize(matrix: number[][], limitSize: number =
const parents = matrix.map((m, i) => i);
const cloneParents = matrix.map((m, i) => i);
const sizes = matrix.map(() => 1);
let inCutEdge = false;
for (let edge of edges) {
if (findWithEffect(parents, sizes, edge[0][0]) !== findWithEffect(parents, sizes, edge[0][1])) {
if (sizes[edge[0][0]] + sizes[edge[0][1]] > limitSize) {
Expand Down Expand Up @@ -159,7 +158,7 @@ export function kruskalMSTWithLimitSize(matrix: number[][], limitSize: number =
* @param matrix
* @param groupNumber number of group generated by clustering
*/
export function kruskalMST(matrix: number[][], groupNumber: number = 4) {
export function kruskalMST(matrix: number[][], groupNumber: number = 4, threshold: number | undefined = 0) {
const edges = turnAdjMatrix2List(matrix);
edges.sort((a, b) => b[1] - a[1]);

Expand All @@ -180,7 +179,7 @@ export function kruskalMST(matrix: number[][], groupNumber: number = 4) {
let set = new Set(parents);
// TODO:
// + use kruskalMST instead of kruskal.
if (set.size <= groupNumber) {
if (set.size <= groupNumber || edge[1] < threshold) {
inCutEdge = true;
} else {
groups = [...parents]
Expand All @@ -199,15 +198,16 @@ interface ClusterProps {
method?: string;
groupMaxSize?: number;
limitSize?: boolean;
threshold?: number;
}

function cluster ({ matrix, measures ,method = 'kruskal', groupMaxSize = 4, limitSize = false }: ClusterProps): string[][] {
function cluster ({ matrix, measures, method = 'kruskal', groupMaxSize = 4, limitSize = false, threshold = 0 }: ClusterProps): string[][] {
// const groups = kruskal({ matrix, groupMaxSize });
let groups;
if (limitSize) {
groups = kruskalWithLimitSize(matrix, groupMaxSize)
} else {
groups = kruskal(matrix, groupMaxSize)
groups = kruskal(matrix, groupMaxSize, threshold)
}

let ans: string[][] = [];
Expand Down
9 changes: 8 additions & 1 deletion packages/visual-insights/src/insights/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,11 @@ export const Depth = 4;

export const VisualLimit = 8;

export const TopKPercentField = 0.8;
export const TopKPercentField = 0.8;

// Threshold handed to the clustering step when grouping dimensions by their
// Cramér's V association.
export const CrammersVThreshold = 0.3;

// Thresholds handed to the clustering step when grouping measures by their
// correlation matrix (presumably Pearson coefficients, per the name — confirm
// against the matrix builder).
export const PearsonCorrelation = {
strong: 0.5,
weak: 0.3
};
35 changes: 34 additions & 1 deletion packages/visual-insights/src/insights/impurity.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
import aggregate from 'cube-core';
import { entropy, normalize } from '../impurityMeasure';
import { DataSource, OperatorType } from '../commonTypes';
import { crammersV } from '../dashboard/utils';
import { CrammersVThreshold } from './config';
import cluster from './cluster';
// insights like outlier and trend both request high impurity of dimension.
const maxVisualChannel = 8;
function getCombination(elements: string[], start: number = 1, end: number = elements.length): string[][] {
Expand All @@ -22,6 +25,36 @@ function getCombination(elements: string[], start: number = 1, end: number = ele
}
return ans
}
/**
 * Builds the symmetric dimension-by-dimension association matrix.
 *
 * The diagonal is fixed to 1; each off-diagonal pair is computed once with
 * `crammersV` and written to both mirrored cells.
 *
 * @param dataSource records the association is measured on
 * @param dimensions field keys; row/column `i` of the result corresponds to `dimensions[i]`
 * @returns a `dimensions.length` x `dimensions.length` matrix
 */
function getDimCorrelationMatrix(dataSource: DataSource, dimensions: string[]): number[][] {
  const size = dimensions.length;
  const matrix: number[][] = [];
  for (let row = 0; row < size; row++) {
    const cells: number[] = [];
    for (let col = 0; col < size; col++) {
      cells.push(0);
    }
    matrix.push(cells);
  }
  for (let row = 0; row < size; row++) {
    matrix[row][row] = 1;
    for (let col = row + 1; col < size; col++) {
      const association = crammersV(dataSource, dimensions[row], dimensions[col]);
      matrix[col][row] = association;
      matrix[row][col] = association;
    }
  }
  return matrix;
}

/**
 * Produces candidate dimension sets by first clustering the dimensions and then
 * enumerating combinations inside each cluster, instead of enumerating
 * combinations over all dimensions at once.
 *
 * @param dataSource records used to measure the association between dimensions
 * @param dimensions field keys to cluster and combine
 * @returns the concatenation of `getCombination(group)` for every cluster, in cluster order
 */
export function getDimSetsBasedOnClusterGroups(dataSource: DataSource, dimensions: string[]): string[][] {
  const maxDimNumberInView = 4;
  const dimCorrelationMatrix = getDimCorrelationMatrix(dataSource, dimensions);
  // groupMaxSize here means group number.
  const groups: string[][] = cluster({
    matrix: dimCorrelationMatrix,
    measures: dimensions,
    groupMaxSize: Math.round(dimensions.length / maxDimNumberInView),
    threshold: CrammersVThreshold
  });
  // todo: maybe a threshold would be better ?
  const dimSets: string[][] = [];
  for (const group of groups) {
    const combineDimSet: string[][] = getCombination(group);
    dimSets.push(...combineDimSet);
  }
  return dimSets;
}

export function linearMapPositive (arr: number[]): number[] {
let min = Math.min(...arr);
Expand All @@ -48,7 +81,7 @@ export function correlation(dataSource: DataSource, fieldX: string, fieldY: stri
export type FieldsFeature = [string[], any, number[][]];
function analysisDimensions(dataSource: DataSource, dimensions: string[], measures: string[], operator: OperatorType | undefined = 'sum'): FieldsFeature[] {
let impurityList: FieldsFeature[] = [];
let dimSet = getCombination(dimensions)
let dimSet = getDimSetsBasedOnClusterGroups(dataSource, dimensions);
for (let dset of dimSet) {
let impurity = {};
let aggData = aggregate({
Expand Down
12 changes: 9 additions & 3 deletions packages/visual-insights/src/insights/index.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { analysisDimensions, getCombination } from './impurity';
import { TopKSingleField, TopKPercentField, Depth, VisualLimit } from './config';
import { analysisDimensions, getCombination, getDimSetsBasedOnClusterGroups } from './impurity';
import { TopKPercentField } from './config';
import { entropy, normalize } from '../impurityMeasure';
import { memberCount } from '../utils'
import cluster, { kruskalMST } from './cluster';
Expand Down Expand Up @@ -52,4 +52,10 @@ function getInsightViews(dataSource: DataSource, originDimensions: string[], mea
}

export default getInsightViews;
export { analysisDimensions, getCombination, cluster as clusterMeasures, kruskalMST }
export {
analysisDimensions,
getCombination,
getDimSetsBasedOnClusterGroups,
cluster as clusterMeasures,
kruskalMST
};
2 changes: 0 additions & 2 deletions packages/visual-insights/src/specification.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
import { DataSource, FieldType, Field, FieldImpurity, Specification, View } from './commonTypes';
import fieldsAnalysis from './fieldAnalysis';
import {
// isFieldCategory,
// isFieldContinous,
memberCount
} from './utils';
import { FieldSummary } from './univariateSummary';
interface VisualElements {
position: number;
color: number;
Expand Down
Loading

0 comments on commit 19bcc63

Please sign in to comment.