diff --git a/packages/frontend/src/workers/cluster.worker.js b/packages/frontend/src/workers/cluster.worker.js index eff186a8..e5c24776 100644 --- a/packages/frontend/src/workers/cluster.worker.js +++ b/packages/frontend/src/workers/cluster.worker.js @@ -1,6 +1,6 @@ /* eslint no-restricted-globals: 0 */ import { kruskalMST } from 'visual-insights'; - +const PearsonThreshold = 0.5; function sum (arr) { let ans = 0; let len = arr.length; @@ -15,7 +15,7 @@ const cluster = (e) => { const { spaces, maxGroupNumber } = e.data; let result = []; for (let space of spaces) { - const { edgesInMST, groups } = kruskalMST(space.matrix, maxGroupNumber); + const { edgesInMST, groups } = kruskalMST(space.matrix, maxGroupNumber, PearsonThreshold); let measureGroups = new Map(); for (let i = 0; i < groups.length; i++) { if (!measureGroups.has(groups[i])) { diff --git a/packages/visual-insights/src/dashboard/index.ts b/packages/visual-insights/src/dashboard/index.ts index ca8a6cc7..75cd286e 100644 --- a/packages/visual-insights/src/dashboard/index.ts +++ b/packages/visual-insights/src/dashboard/index.ts @@ -4,6 +4,7 @@ import cluster from "../insights/cluster"; import aggregate from 'cube-core'; import { normalize, entropy } from "../impurityMeasure"; import { crammersV } from './utils'; +import { CrammersVThreshold, PearsonCorrelation } from '../insights/config'; interface DashBoardSpace { dimensions: string[]; @@ -48,7 +49,8 @@ export function getDashBoardSubspace (dataSource: DataSource, dimensions: string const measureGroups = cluster({ matrix: correlationMatrix, measures, - groupMaxSize: Math.round(measures.length / 6) // todo: make a config: max 6 measures in a dashboard + groupMaxSize: Math.round(measures.length / 6), // todo: make a config: max 6 measures in a dashboard + threshold: PearsonCorrelation.weak }) const dimCorrelationMatrix = dimensions.map(d => dimensions.map(d => 0)); @@ -119,7 +121,8 @@ export function getDashBoardView (dashBoardSpace: DashBoardSpace, dataSource: Da const measureGroups = cluster({ matrix: dashBoardSpace.correlationMatrix, measures: measures, - groupMaxSize: Math.round(measures.length / 3) // todo: make a config: max 3 measures in a chart + groupMaxSize: Math.round(measures.length / 3), // todo: make a config: max 3 measures in a chart + threshold: PearsonCorrelation.strong }); for (let group of measureGroups) { const meaIndexList = group.map(mea => measures.indexOf(mea)) @@ -145,7 +148,8 @@ export function getDashBoardView (dashBoardSpace: DashBoardSpace, dataSource: Da matrix: dimensionCorrelationMatrix, measures: dimensions, groupMaxSize: 2, // todo: make a config: max 2 dimensions in a chart - limitSize: true + limitSize: true, + threshold: CrammersVThreshold }) const dimGroupEntropyMatrix = getEntropyMatrix(dimensionGroups, measures, dataSource); diff --git a/packages/visual-insights/src/insights/cluster.ts b/packages/visual-insights/src/insights/cluster.ts index 674fec64..705c1ad3 100644 --- a/packages/visual-insights/src/insights/cluster.ts +++ b/packages/visual-insights/src/insights/cluster.ts @@ -55,7 +55,7 @@ function unionWithEffect (parents: number[], sizes: number[], n1: number, n2: nu * @param matrix adjmatrix * @param groupNumber number of group generated by clustering */ -function kruskal(matrix: number[][], groupNumber: number): Map { +function kruskal(matrix: number[][], groupNumber: number, threshold: number | undefined = 0): Map { const edges = turnAdjMatrix2List(matrix); edges.sort((a, b) => b[1] - a[1]); const parents = matrix.map((m, i) => i); @@ -68,7 +68,7 @@ function kruskal(matrix: number[][], groupNumber: number): Map parents[i] = find(parents, i) } let set = new Set(parents); - if (set.size <= groupNumber){ + if (set.size <= groupNumber || edge[1] < threshold){ break; } } @@ -158,7 +158,7 @@ export function kruskalMSTWithLimitSize(matrix: number[][], limitSize: number = * @param matrix * @param groupNumber number of group generated by clustering */ -export function kruskalMST(matrix: number[][], groupNumber: number = 4) { +export function kruskalMST(matrix: number[][], groupNumber: number = 4, threshold: number | undefined = 0) { const edges = turnAdjMatrix2List(matrix); edges.sort((a, b) => b[1] - a[1]); @@ -179,7 +179,7 @@ export function kruskalMST(matrix: number[][], groupNumber: number = 4) { let set = new Set(parents); // TODO: // + use kruskalMST instead of kruskal. - if (set.size <= groupNumber) { + if (set.size <= groupNumber || edge[1] < threshold) { inCutEdge = true; } else { groups = [...parents] @@ -198,15 +198,16 @@ interface ClusterProps { method?: string; groupMaxSize?: number; limitSize?: boolean; + threshold?: number; } -function cluster ({ matrix, measures, method = 'kruskal', groupMaxSize = 4, limitSize = false }: ClusterProps): string[][] { +function cluster ({ matrix, measures, method = 'kruskal', groupMaxSize = 4, limitSize = false, threshold = 0 }: ClusterProps): string[][] { // const groups = kruskal({ matrix, groupMaxSize }); let groups; if (limitSize) { groups = kruskalWithLimitSize(matrix, groupMaxSize) } else { - groups = kruskal(matrix, groupMaxSize) + groups = kruskal(matrix, groupMaxSize, threshold) } let ans: string[][] = []; diff --git a/packages/visual-insights/src/insights/config.ts b/packages/visual-insights/src/insights/config.ts index 45ba0a70..b486faa6 100644 --- a/packages/visual-insights/src/insights/config.ts +++ b/packages/visual-insights/src/insights/config.ts @@ -4,4 +4,11 @@ export const Depth = 4; export const VisualLimit = 8; -export const TopKPercentField = 0.8; \ No newline at end of file +export const TopKPercentField = 0.8; + +export const CrammersVThreshold = 0.3; + +export const PearsonCorrelation = { + strong: 0.5, + weak: 0.3 +}; \ No newline at end of file diff --git a/packages/visual-insights/src/insights/impurity.ts b/packages/visual-insights/src/insights/impurity.ts index b2c1c73b..6c20ff87 100644 --- a/packages/visual-insights/src/insights/impurity.ts +++ b/packages/visual-insights/src/insights/impurity.ts @@ -3,6 +3,7 @@ import aggregate from 'cube-core'; import { entropy, normalize } from '../impurityMeasure'; import { DataSource, OperatorType } from '../commonTypes'; import { crammersV } from '../dashboard/utils'; +import { CrammersVThreshold } from './config'; import cluster from './cluster'; // insights like outlier and trend both request high impurity of dimension. const maxVisualChannel = 8; @@ -44,7 +45,8 @@ export function getDimSetsBasedOnClusterGroups(dataSource: DataSource, dimension let groups: string[][] = cluster({ matrix: dimCorrelationMatrix, measures: dimensions, - groupMaxSize: Math.round(dimensions.length / maxDimNumberInView) + groupMaxSize: Math.round(dimensions.length / maxDimNumberInView), + threshold: CrammersVThreshold }); // todo: maybe a threhold would be better ? for (let group of groups) { @@ -79,7 +81,7 @@ export function correlation(dataSource: DataSource, fieldX: string, fieldY: stri export type FieldsFeature = [string[], any, number[][]]; function analysisDimensions(dataSource: DataSource, dimensions: string[], measures: string[], operator: OperatorType | undefined = 'sum'): FieldsFeature[] { let impurityList: FieldsFeature[] = []; - let dimSet = getCombination(dimensions) + let dimSet = getDimSetsBasedOnClusterGroups(dataSource, dimensions); for (let dset of dimSet) { let impurity = {}; let aggData = aggregate({