diff --git a/packages/frontend/src/workers/cluster.worker.js b/packages/frontend/src/workers/cluster.worker.js index eff186a8..e5c24776 100644 --- a/packages/frontend/src/workers/cluster.worker.js +++ b/packages/frontend/src/workers/cluster.worker.js @@ -1,6 +1,6 @@ /* eslint no-restricted-globals: 0 */ import { kruskalMST } from 'visual-insights'; - +const PearsonThreshold = 0.5; function sum (arr) { let ans = 0; let len = arr.length; @@ -15,7 +15,7 @@ const cluster = (e) => { const { spaces, maxGroupNumber } = e.data; let result = []; for (let space of spaces) { - const { edgesInMST, groups } = kruskalMST(space.matrix, maxGroupNumber); + const { edgesInMST, groups } = kruskalMST(space.matrix, maxGroupNumber, PearsonThreshold); let measureGroups = new Map(); for (let i = 0; i < groups.length; i++) { if (!measureGroups.has(groups[i])) { diff --git a/packages/visual-insights/src/dashboard/index.ts b/packages/visual-insights/src/dashboard/index.ts index 0fa9ecc8..75cd286e 100644 --- a/packages/visual-insights/src/dashboard/index.ts +++ b/packages/visual-insights/src/dashboard/index.ts @@ -1,9 +1,10 @@ import { FieldsFeature, correlation, linearMapPositive } from "../insights/impurity"; import { DataSource, OperatorType } from "../commonTypes"; -import cluster, { kruskalMST } from "../insights/cluster"; +import cluster from "../insights/cluster"; import aggregate from 'cube-core'; import { normalize, entropy } from "../impurityMeasure"; import { crammersV } from './utils'; +import { CrammersVThreshold, PearsonCorrelation } from '../insights/config'; interface DashBoardSpace { dimensions: string[]; @@ -48,7 +49,8 @@ export function getDashBoardSubspace (dataSource: DataSource, dimensions: string const measureGroups = cluster({ matrix: correlationMatrix, measures, - groupMaxSize: Math.round(measures.length / 6) // todo: make a config: max 6 measures in a dashboard + groupMaxSize: Math.round(measures.length / 6), // todo: make a config: max 6 measures in a dashboard + threshold: PearsonCorrelation.weak }) const dimCorrelationMatrix = dimensions.map(d => dimensions.map(d => 0)); @@ -119,7 +121,8 @@ export function getDashBoardView (dashBoardSpace: DashBoardSpace, dataSource: Da const measureGroups = cluster({ matrix: dashBoardSpace.correlationMatrix, measures: measures, - groupMaxSize: Math.round(measures.length / 3) // todo: make a config: max 3 measures in a chart + groupMaxSize: Math.round(measures.length / 3), // todo: make a config: max 3 measures in a chart + threshold: PearsonCorrelation.strong }); for (let group of measureGroups) { const meaIndexList = group.map(mea => measures.indexOf(mea)) @@ -145,7 +148,8 @@ export function getDashBoardView (dashBoardSpace: DashBoardSpace, dataSource: Da matrix: dimensionCorrelationMatrix, measures: dimensions, groupMaxSize: 2, // todo: make a config: max 2 dimensions in a chart - limitSize: true + limitSize: true, + threshold: CrammersVThreshold }) const dimGroupEntropyMatrix = getEntropyMatrix(dimensionGroups, measures, dataSource); @@ -173,4 +177,6 @@ function minIndex(arr: number[]) { } } return pos; -} \ No newline at end of file +} + +export { crammersV } \ No newline at end of file diff --git a/packages/visual-insights/src/dashboard/utils.ts b/packages/visual-insights/src/dashboard/utils.ts index 7da9dc40..c4aab723 100644 --- a/packages/visual-insights/src/dashboard/utils.ts +++ b/packages/visual-insights/src/dashboard/utils.ts @@ -1,45 +1,109 @@ import { DataSource } from "../commonTypes"; - -export function chiSquared(matrix: number[][] = [[]]): number { - let rowSums = matrix.map(m => 0); - let colSums = matrix[0].map(m => 0); +type nestTree = Map>; +export function chiSquared(nestTree: nestTree, xSet: Set, ySet: Set): number { + if (typeof nestTree === 'undefined' || typeof xSet === 'undefined' || typeof ySet === 'undefined') { + return 0; + } + let rowSums = new Map(); + let colSums = new Map(); let totalSum = 0; - for (let i = 0; i < matrix.length; i++) { - for (let j = 0; j < matrix[i].length; j++) { - rowSums[i] += matrix[i][j]; - colSums[j] += matrix[i][j]; - totalSum += matrix[i][j]; + for (let x of xSet) { + rowSums.set(x, 0); + } + for (let y of ySet) { + colSums.set(y, 0); + } + for (let [x, node] of nestTree) { + for (let [y, counter] of node) { + rowSums.set(x, rowSums.get(x) + counter); + colSums.set(y, colSums.get(y) + counter); + totalSum += counter; } } + let chis = 0; - for (let i = 0; i < matrix.length; i++) { - for (let j = 0; j < matrix[i].length; j++) { - let observed = matrix[i][j]; - let expected = rowSums[i] * colSums[j] / totalSum; + for (let [x, node] of nestTree) { + for (let [y, observed] of node) { + let expected = rowSums.get(x) * colSums.get(y) / totalSum; chis += (observed - expected) ** 2 / expected; } } return chis; } -export function crammersV(dataSource: DataSource, fieldX: string, fieldY: string) { - const xSet = new Set(dataSource.map(d => d[fieldX])) - const ySet = new Set(dataSource.map(d => d[fieldY])) - const xMembers = [...xSet]; - const yMembers = [...ySet]; - let xDict = {}; - let yDict = {}; - for (let i = 0; i < xMembers.length; i++) { - xDict[xMembers[i]] = i; - } - for (let i = 0; i < yMembers.length; i++) { - yDict[yMembers[i]] = i; - } - let matrix: number[][] = xMembers.map(x => yMembers.map(y => 0)); - for (let record of dataSource) { - matrix[xDict[record[fieldX]]][yDict[record[fieldY]]]++; + +export function crammersV(dataSource: DataSource, fieldX: string, fieldY: string): number { + const xSet = new Set() + const ySet = new Set() + const nestTree = new Map>(); + let len = dataSource.length; + for (let i = 0; i < len; i++) { + let record = dataSource[i]; + xSet.add(record[fieldX]) + ySet.add(record[fieldY]); + if (!nestTree.has(record[fieldX])) { + nestTree.set(record[fieldX], new Map()); + } + let node = nestTree.get(record[fieldX]); + if (!node.has(record[fieldY])) { + node.set(record[fieldY], 0); + } + node.set(record[fieldY], node.get(record[fieldY]) + 1); } - const chis = chiSquared(matrix); - const V = Math.sqrt(chis / (dataSource.length * Math.min(xMembers.length - 1, yMembers.length - 1))) + const chis = chiSquared(nestTree, xSet, ySet); + const V = Math.sqrt(chis / (dataSource.length * Math.min(xSet.size - 1, ySet.size - 1))) return V; -} \ No newline at end of file +} + + +// can be used for test +// export function crammersV(dataSource: DataSource, fieldX: string, fieldY: string): number { +// const xSet = new Set(dataSource.map(d => d[fieldX])) +// const ySet = new Set(dataSource.map(d => d[fieldY])) +// const xMembers = [...xSet]; +// const yMembers = [...ySet]; +// let xDict = {}; +// let yDict = {}; +// for (let i = 0; i < xMembers.length; i++) { +// xDict[xMembers[i]] = i; +// } +// for (let i = 0; i < yMembers.length; i++) { +// yDict[yMembers[i]] = i; +// } +// // let matrix: number[][] = xMembers.map(x => yMembers.map(y => 0)); +// let matrix: number[][] = []; +// for (let i = 0; i < xMembers.length; i++) { +// matrix.push([]); +// for (let j = 0; j < yMembers.length; j++) { +// matrix[i].push(0); +// } +// } +// for (let record of dataSource) { +// matrix[xDict[record[fieldX]]][yDict[record[fieldY]]]++; +// } +// const chis = chiSquared(matrix); +// const V = Math.sqrt(chis / (dataSource.length * Math.min(xMembers.length - 1, yMembers.length - 1))) +// return V; +// } + +// export function chiSquared(matrix: number[][] = [[]]): number { +// let rowSums = matrix.map(m => 0); +// let colSums = matrix[0].map(m => 0); +// let totalSum = 0; +// for (let i = 0; i < matrix.length; i++) { +// for (let j = 0; j < matrix[i].length; j++) { +// rowSums[i] += matrix[i][j]; +// colSums[j] += matrix[i][j]; +// totalSum += matrix[i][j]; +// } +// } +// let chis = 0; +// for (let i = 0; i < matrix.length; i++) { +// for (let j = 0; j < matrix[i].length; j++) { +// let observed = matrix[i][j]; +// let expected = rowSums[i] * colSums[j] / totalSum; +// chis += (observed - expected) ** 2 / expected; +// } +// } +// return chis; +// } \ No newline at end of file diff --git a/packages/visual-insights/src/index.ts b/packages/visual-insights/src/index.ts index 6286bb77..7e6422c2 100644 --- a/packages/visual-insights/src/index.ts +++ b/packages/visual-insights/src/index.ts @@ -7,7 +7,7 @@ import * as Distribution from './distribution'; import * as ImpurityMeasure from './impurityMeasure'; -import getInsightViews, { analysisDimensions, getCombination, clusterMeasures, kruskalMST } from './insights/index'; +import getInsightViews, { analysisDimensions, getCombination, clusterMeasures, kruskalMST, getDimSetsBasedOnClusterGroups } from './insights/index'; import * as Cleaner from './cleaner/index'; import * as UnivariateSummary from './univariateSummary/index' @@ -29,6 +29,7 @@ export { Cleaner, getInsightViews, getCombination, + getDimSetsBasedOnClusterGroups, clusterMeasures, kruskalMST } \ No newline at end of file diff --git a/packages/visual-insights/src/insights/cluster.ts b/packages/visual-insights/src/insights/cluster.ts index cdd2c5e6..705c1ad3 100644 --- a/packages/visual-insights/src/insights/cluster.ts +++ b/packages/visual-insights/src/insights/cluster.ts @@ -55,7 +55,7 @@ function unionWithEffect (parents: number[], sizes: number[], n1: number, n2: nu * @param matrix adjmatrix * @param groupNumber number of group generated by clustering */ -function kruskal(matrix: number[][], groupNumber: number): Map { +function kruskal(matrix: number[][], groupNumber: number, threshold: number | undefined = 0): Map { const edges = turnAdjMatrix2List(matrix); edges.sort((a, b) => b[1] - a[1]); const parents = matrix.map((m, i) => i); @@ -68,7 +68,7 @@ function kruskal(matrix: number[][], groupNumber: number): Map parents[i] = find(parents, i) } let set = new Set(parents); - if (set.size <= groupNumber){ + if (set.size <= groupNumber || edge[1] < threshold){ break; } } @@ -127,7 +127,6 @@ export function kruskalMSTWithLimitSize(matrix: number[][], limitSize: number = const parents = matrix.map((m, i) => i); const cloneParents = matrix.map((m, i) => i); const sizes = matrix.map(() => 1); - let inCutEdge = false; for (let edge of edges) { if (findWithEffect(parents, sizes, edge[0][0]) !== findWithEffect(parents, sizes, edge[0][1])) { if (sizes[edge[0][0]] + sizes[edge[0][1]] > limitSize) { @@ -159,7 +158,7 @@ export function kruskalMSTWithLimitSize(matrix: number[][], limitSize: number = * @param matrix * @param groupNumber number of group generated by clustering */ -export function kruskalMST(matrix: number[][], groupNumber: number = 4) { +export function kruskalMST(matrix: number[][], groupNumber: number = 4, threshold: number | undefined = 0) { const edges = turnAdjMatrix2List(matrix); edges.sort((a, b) => b[1] - a[1]); @@ -180,7 +179,7 @@ export function kruskalMST(matrix: number[][], groupNumber: number = 4) { let set = new Set(parents); // TODO: // + use kruskalMST instead of kruskal. - if (set.size <= groupNumber) { + if (set.size <= groupNumber || edge[1] < threshold) { inCutEdge = true; } else { groups = [...parents] @@ -199,15 +198,16 @@ interface ClusterProps { method?: string; groupMaxSize?: number; limitSize?: boolean; + threshold?: number; } -function cluster ({ matrix, measures ,method = 'kruskal', groupMaxSize = 4, limitSize = false }: ClusterProps): string[][] { +function cluster ({ matrix, measures, method = 'kruskal', groupMaxSize = 4, limitSize = false, threshold = 0 }: ClusterProps): string[][] { // const groups = kruskal({ matrix, groupMaxSize }); let groups; if (limitSize) { groups = kruskalWithLimitSize(matrix, groupMaxSize) } else { - groups = kruskal(matrix, groupMaxSize) + groups = kruskal(matrix, groupMaxSize, threshold) } let ans: string[][] = []; diff --git a/packages/visual-insights/src/insights/config.ts b/packages/visual-insights/src/insights/config.ts index 45ba0a70..b486faa6 100644 --- a/packages/visual-insights/src/insights/config.ts +++ b/packages/visual-insights/src/insights/config.ts @@ -4,4 +4,11 @@ export const Depth = 4; export const VisualLimit = 8; -export const TopKPercentField = 0.8; \ No newline at end of file +export const TopKPercentField = 0.8; + +export const CrammersVThreshold = 0.3; + +export const PearsonCorrelation = { + strong: 0.5, + weak: 0.3 +}; \ No newline at end of file diff --git a/packages/visual-insights/src/insights/impurity.ts b/packages/visual-insights/src/insights/impurity.ts index 112359be..6c20ff87 100644 --- a/packages/visual-insights/src/insights/impurity.ts +++ b/packages/visual-insights/src/insights/impurity.ts @@ -2,6 +2,9 @@ import aggregate from 'cube-core'; import { entropy, normalize } from '../impurityMeasure'; import { DataSource, OperatorType } from '../commonTypes'; +import { crammersV } from '../dashboard/utils'; +import { CrammersVThreshold } from './config'; +import cluster from './cluster'; // insights like outlier and trend both request high impurity of dimension. const maxVisualChannel = 8; function getCombination(elements: string[], start: number = 1, end: number = elements.length): string[][] { @@ -22,6 +25,36 @@ function getCombination(elements: string[], start: number = 1, end: number = ele } return ans } +function getDimCorrelationMatrix(dataSource: DataSource, dimensions: string[]): number[][] { + let matrix: number[][] = dimensions.map(d => dimensions.map(d => 0)); + for (let i = 0; i < dimensions.length; i++) { + matrix[i][i] = 1; + for(let j = i + 1; j < dimensions.length; j++) { + matrix[i][j] = matrix[j][i] = crammersV(dataSource, dimensions[i], dimensions[j]); + } + } + return matrix; +} + +export function getDimSetsBasedOnClusterGroups(dataSource: DataSource, dimensions: string[]): string[][] { + const maxDimNumberInView = 4; + let dimSets: string[][] = []; + let dimCorrelationMatrix = getDimCorrelationMatrix(dataSource, dimensions); + console.log(dimCorrelationMatrix) + // groupMaxSize here means group number. + let groups: string[][] = cluster({ + matrix: dimCorrelationMatrix, + measures: dimensions, + groupMaxSize: Math.round(dimensions.length / maxDimNumberInView), + threshold: CrammersVThreshold + }); + // todo: maybe a threhold would be better ? + for (let group of groups) { + let combineDimSet: string[][] = getCombination(group); + dimSets.push(...combineDimSet); + } + return dimSets; +} export function linearMapPositive (arr: number[]): number[] { let min = Math.min(...arr); @@ -48,7 +81,7 @@ export function correlation(dataSource: DataSource, fieldX: string, fieldY: stri export type FieldsFeature = [string[], any, number[][]]; function analysisDimensions(dataSource: DataSource, dimensions: string[], measures: string[], operator: OperatorType | undefined = 'sum'): FieldsFeature[] { let impurityList: FieldsFeature[] = []; - let dimSet = getCombination(dimensions) + let dimSet = getDimSetsBasedOnClusterGroups(dataSource, dimensions); for (let dset of dimSet) { let impurity = {}; let aggData = aggregate({ diff --git a/packages/visual-insights/src/insights/index.ts b/packages/visual-insights/src/insights/index.ts index d2e69085..4f6417d0 100644 --- a/packages/visual-insights/src/insights/index.ts +++ b/packages/visual-insights/src/insights/index.ts @@ -1,5 +1,5 @@ -import { analysisDimensions, getCombination } from './impurity'; -import { TopKSingleField, TopKPercentField, Depth, VisualLimit } from './config'; +import { analysisDimensions, getCombination, getDimSetsBasedOnClusterGroups } from './impurity'; +import { TopKPercentField } from './config'; import { entropy, normalize } from '../impurityMeasure'; import { memberCount } from '../utils' import cluster, { kruskalMST } from './cluster'; @@ -52,4 +52,10 @@ function getInsightViews(dataSource: DataSource, originDimensions: string[], mea } export default getInsightViews; -export { analysisDimensions, getCombination, cluster as clusterMeasures, kruskalMST } \ No newline at end of file +export { + analysisDimensions, + getCombination, + getDimSetsBasedOnClusterGroups, + cluster as clusterMeasures, + kruskalMST +}; \ No newline at end of file diff --git a/packages/visual-insights/src/specification.ts b/packages/visual-insights/src/specification.ts index 85b12604..60592735 100644 --- a/packages/visual-insights/src/specification.ts +++ b/packages/visual-insights/src/specification.ts @@ -1,11 +1,9 @@ import { DataSource, FieldType, Field, FieldImpurity, Specification, View } from './commonTypes'; -import fieldsAnalysis from './fieldAnalysis'; import { // isFieldCategory, // isFieldContinous, memberCount } from './utils'; -import { FieldSummary } from './univariateSummary'; interface VisualElements { position: number; color: number; diff --git a/packages/visual-insights/test/dashboard.js b/packages/visual-insights/test/dashboard.js index 502bbe82..4fb45b5c 100644 --- a/packages/visual-insights/test/dashboard.js +++ b/packages/visual-insights/test/dashboard.js @@ -38,5 +38,29 @@ describe('insights test', function () { console.log(result) assert.equal(result.length, Math.pow(2, 6) - 1) }) + /** + * test example + * https://www.empirical-methods.hslu.ch/decisiontree/relationship/chi-square-contingency/ + */ + it('test crammerV', function () { + let matrix = [ + [19, 32, 83, 97, 48], + [2, 6, 16, 42, 26], + [0, 1, 3, 21, 10] + ] + let data = [] + for (let i = 0; i < matrix.length; i++) { + for (let j = 0; j < matrix[i].length; j++) { + for (let k = 0; k < matrix[i][j]; k++) { + data.push({ + x: 'x' + i, + y: 'y' + j + }) + } + } + } + let result = DashBoard.crammersV(data, 'x', 'y'); + assert.equal(result - 0.187 < 0.001, true); + }) }) diff --git a/packages/visual-insights/test/insights.js b/packages/visual-insights/test/insights.js index f10bf47c..50ff7127 100644 --- a/packages/visual-insights/test/insights.js +++ b/packages/visual-insights/test/insights.js @@ -2,7 +2,7 @@ const fs = require('fs'); const assert = require('assert'); const path = require('path'); -const { analysisDimensions, Cleaner, getInsightViews, getCombination } = require('../build/cjs/index'); +const { analysisDimensions, Cleaner, getInsightViews, getCombination, getDimSetsBasedOnClusterGroups } = require('../build/cjs/index'); const datasetPath = path.resolve(__dirname, './dataset/airbnb.json'); const dataset = JSON.parse(fs.readFileSync(datasetPath).toString()); @@ -35,5 +35,12 @@ describe('insights test', function () { console.log(result) assert.equal(result.length, Math.pow(2, 6) - 1) }) + + it('print(clusterCombination vs. combination)', function () { + let result = getDimSetsBasedOnClusterGroups(cleanData, dimensions); + let unClusterResult = getCombination(dimensions); + console.log(result.length, unClusterResult.length, result) + assert.equal(result.length <= unClusterResult.length, true); + }) }) diff --git a/packages/visual-insights/test/utils.js b/packages/visual-insights/test/utils.js index c7648c04..492c2114 100644 --- a/packages/visual-insights/test/utils.js +++ b/packages/visual-insights/test/utils.js @@ -52,10 +52,10 @@ describe('utils test', function () { let t = true; for (let row of data) { if (typeof row[field] === 'number') { - let { groups: { left, right } } = /\[(?([0-9.]+|-Infinity)), (?[0-9.]+|Infinity)\)/.exec(row[newField]); + let { groups: { left, right } } = /.*\[(?([0-9.e+]+|-Infinity)), (?[0-9.e+]+|Infinity)\)/.exec(row[newField]); left = Number(left); right = Number(right); - if (!(left <= row[field] && row[field] < right)) { + if (!(left <= row[field] && row[field] <= right)) { t = false; } } else {