diff --git a/packages/encoder/bag-of-paths-encoder/src/index.ts b/packages/encoder/bag-of-paths-encoder/src/index.ts index bc596594..f9cc1d0d 100644 --- a/packages/encoder/bag-of-paths-encoder/src/index.ts +++ b/packages/encoder/bag-of-paths-encoder/src/index.ts @@ -1,9 +1,11 @@ import type { GraphModel } from '@cm2ml/ir' +import type { StructuredOutput } from '@cm2ml/plugin' import { ExecutionError, batchTryCatch, compose, definePlugin, defineStructuredBatchPlugin, getFirstNonError } from '@cm2ml/plugin' import { Stream } from '@yeger/streams' import type { CompiledTemplates } from './bop-types' import { pathWeightTypes, sortOrders } from './bop-types' +import type { EncodedPath } from './encoding' import { encodePaths } from './encoding' import { collectPaths } from './paths' import type { PruneMethod } from './prune' @@ -14,11 +16,13 @@ export type { PathWeight } from './bop-types' export { pathWeightTypes } export type { EncodedModelMember, EncodedPath } from './encoding' +export interface BagOfPathsMetadata { + idAttribute: string | undefined + typeAttributes: string[] | undefined +} + interface PrecomputedMetadata { - metamodelData: { - idAttribute: string | undefined - typeAttributes: string[] | undefined - } + metamodelData: BagOfPathsMetadata compiledTemplates: CompiledTemplates } @@ -72,6 +76,11 @@ const TemplateCompiler = defineStructuredBatchPlugin({ }, }) +export interface BagOfPathsData { + paths: EncodedPath[] + mapping: string[] +} + const PathBuilder = definePlugin({ name: 'path-builder', parameters: { @@ -134,7 +143,7 @@ const PathBuilder = definePlugin({ group: 'Paths', }, }, - invoke: ({ data, metadata }: { data: GraphModel | ExecutionError, metadata: PrecomputedMetadata }, parameters) => { + invoke: ({ data, metadata }: { data: GraphModel | ExecutionError, metadata: PrecomputedMetadata }, parameters): StructuredOutput | ExecutionError => { if (data instanceof ExecutionError) { return data } diff --git a/packages/encoder/feature-encoder/src/features.ts b/packages/encoder/feature-encoder/src/features.ts index 7b8b4dc0..c0f96d3e 100644 --- a/packages/encoder/feature-encoder/src/features.ts +++ b/packages/encoder/feature-encoder/src/features.ts @@ -58,7 +58,24 @@ export interface FeatureDeriverSettings extends FeatureEncoderProviderSettings { edgeFeatureOverride: FeatureMetadata | null } -export type FeatureContext = ReturnType +export interface FeatureContext { + staticData: StaticFeatureData + canEncodeNodeAttribute: (attribute: Attribute) => boolean + canEncodeEdgeAttribute: (attribute: Attribute) => boolean + mapNodeAttribute: (attribute: Attribute) => number | string | null + mapEdgeAttribute: (attribute: Attribute) => number | string | null + getNodeFeatureVector: (node: GraphNode) => FeatureVector + getEdgeFeatureVector: (edge: GraphEdge) => FeatureVector +} + +export interface StaticFeatureData { + nodeFeatures: FeatureMetadata + edgeFeatures: FeatureMetadata + onlyEncodedFeatures: boolean + idAttribute: string | undefined + typeAttributes: string[] | undefined + nameAttribute: string | undefined +} export function deriveFeatures(models: GraphModel[], settings: FeatureDeriverSettings) { const nodes = Stream.from(models).flatMap(({ nodes }) => nodes).cache() @@ -70,15 +87,17 @@ export function deriveFeatures(models: GraphModel[], settings: FeatureDeriverSet const edgeFeatures: FeatureMetadata = internalEdgeFeatures.map(([name, type, encoder]) => [name, type, encoder?.export?.() ?? null] as const) const metamodel = models[0]?.metamodel + + const staticData: StaticFeatureData = { + edgeFeatures, + nodeFeatures, + onlyEncodedFeatures: settings.onlyEncodedFeatures, + idAttribute: metamodel?.idAttribute, + typeAttributes: metamodel?.typeAttributes, + nameAttribute: metamodel?.nameAttribute, + } return { - staticData: { - edgeFeatures, - nodeFeatures, - onlyEncodedFeatures: settings.onlyEncodedFeatures, - idAttribute: metamodel?.idAttribute, - typeAttributes: metamodel?.typeAttributes, - nameAttribute: metamodel?.nameAttribute, - }, + staticData, canEncodeNodeAttribute: (attribute: Attribute) => nodeEncoderProvider.canEncodeAttribute(attribute), canEncodeEdgeAttribute: (attribute: Attribute) => edgeEncoderProvider.canEncodeAttribute(attribute), mapNodeAttribute: createAttributeMapper(nodeEncoderProvider), diff --git a/packages/encoder/feature-encoder/src/index.ts b/packages/encoder/feature-encoder/src/index.ts index e4fdbb9e..8aaf2957 100644 --- a/packages/encoder/feature-encoder/src/index.ts +++ b/packages/encoder/feature-encoder/src/index.ts @@ -1,13 +1,15 @@ import { GraphModel } from '@cm2ml/ir' +import type { StructuredOutput } from '@cm2ml/plugin' import { ExecutionError, ValidationError, compose, definePlugin, defineStructuredBatchPlugin, getFirstNonError } from '@cm2ml/plugin' import { lazy } from '@cm2ml/utils' import { Stream } from '@yeger/streams' import { ZodError } from 'zod' import { getFeatureMetadataFromFile } from './feature-metadata-extractor' +import type { FeatureContext, FeatureVector, StaticFeatureData } from './features' import { FeatureMetadataSchema, deriveFeatures } from './features' -export type { FeatureContext, FeatureMetadata, FeatureName, FeatureType, FeatureVector } from './features' +export type { FeatureContext, FeatureMetadata, FeatureName, FeatureType, FeatureVector, StaticFeatureData } from './features' export const FeatureEncoder = defineStructuredBatchPlugin({ name: 'feature-encoder', @@ -63,12 +65,12 @@ export const FeatureEncoder = defineStructuredBatchPlugin({ group: 'features', }, }, - invoke(input: (GraphModel | ExecutionError)[], parameters) { + invoke(input: (GraphModel | ExecutionError)[], parameters): (StructuredOutput | ExecutionError)[] { try { const models = input.filter((item) => item instanceof GraphModel) const nodeFeatureOverride = parameters.nodeFeatures !== '' ? FeatureMetadataSchema.parse(JSON.parse(parameters.nodeFeatures)) : null const edgeFeatureOverride = parameters.edgeFeatures !== '' ? FeatureMetadataSchema.parse(JSON.parse(parameters.edgeFeatures)) : null - const features = lazy(() => deriveFeatures(models, { ...parameters, nodeFeatureOverride, edgeFeatureOverride })) + const features: FeatureContext = lazy(() => deriveFeatures(models, { ...parameters, nodeFeatureOverride, edgeFeatureOverride })) return input.map((item) => { if (item instanceof ExecutionError) { return item @@ -87,10 +89,18 @@ export const FeatureEncoder = defineStructuredBatchPlugin({ }, }) +export interface EncodedFeatures { + nodes: FeatureVector[] + edges: FeatureVector[] +} + +/** + * Encodes a graph model with feature vectors. + */ export const StandaloneFeatureEncoder = compose(FeatureEncoder, definePlugin({ name: 'feature-vector-generator', parameters: {}, - invoke(batch, _parameters) { + invoke(batch, _parameters): (StructuredOutput | ExecutionError)[] { const firstNonError = getFirstNonError(batch) const metadata = firstNonError?.metadata.staticData return batch.map((item) => { diff --git a/packages/encoder/graph-encoder/src/edge-encoder.ts b/packages/encoder/graph-encoder/src/edge-encoder.ts index 7b3fece8..f357a93b 100644 --- a/packages/encoder/graph-encoder/src/edge-encoder.ts +++ b/packages/encoder/graph-encoder/src/edge-encoder.ts @@ -6,6 +6,10 @@ import { Stream } from '@yeger/streams' export const formats = ['list', 'matrix'] as const export type Format = typeof formats[number] +export type AdjacencyEncoding = (AdjacencyListEncoding | AdjacencyMatrixEncoding) & { + nodeFeatureVectors: FeatureVector[] +} + export const EdgeEncoder = defineStructuredPlugin({ name: 'edge-encoder', parameters: { @@ -39,11 +43,13 @@ export const EdgeEncoder = defineStructuredPlugin({ .map(getNodeFeatureVector) .toArray() + const encodedData: AdjacencyEncoding = { + ...edgeEncoding, + nodeFeatureVectors, + } + return { - data: { - ...edgeEncoding, - nodeFeatureVectors, - }, + data: encodedData, metadata: staticData, } }, @@ -59,12 +65,19 @@ function getSortedIds(model: GraphModel) { export type AdjacencyList = [number, number][] | [number, number, number][] +export interface AdjacencyListEncoding { + format: 'list' + list: AdjacencyList + nodes: string[] + edgeFeatureVectors: FeatureVector[] +} + function encodeAsAdjacencyList( edges: ReadonlySet, sortedIds: string[], weighted: boolean, getEdgeFeatureVector: (edge: GraphEdge) => FeatureVector, -) { +): AdjacencyListEncoding { const list = new Array< readonly [number, number] | readonly [number, number, number] >() @@ -124,11 +137,18 @@ function sortAdjacencyList(list: AdjacencyList) { export type AdjacencyMatrix = number[][] +export interface AdjacencyMatrixEncoding { + format: 'matrix' + matrix: AdjacencyMatrix + nodes: string[] + edgeFeatureVectors: FeatureVector[] +} + function encodeAsAdjacencyMatrix( edges: ReadonlySet, sortedIds: string[], weighted: boolean, -) { +): AdjacencyMatrixEncoding { const matrix = createAdjacencyMatrix(sortedIds.length) fillAdjacencyMatrix(matrix, edges, sortedIds, weighted) return { format: 'matrix' as const, matrix, nodes: sortedIds, edgeFeatureVectors: [] } diff --git a/packages/encoder/graph-encoder/src/index.ts b/packages/encoder/graph-encoder/src/index.ts index a870fa5a..76183ce7 100644 --- a/packages/encoder/graph-encoder/src/index.ts +++ b/packages/encoder/graph-encoder/src/index.ts @@ -3,6 +3,12 @@ import { batchTryCatch, compose } from '@cm2ml/plugin' import { EdgeEncoder } from './edge-encoder' -export type { AdjacencyList, AdjacencyMatrix } from './edge-encoder' +export type { AdjacencyEncoding, AdjacencyList, AdjacencyListEncoding, AdjacencyMatrix, AdjacencyMatrixEncoding } from './edge-encoder' +/** + * Encodes a graph model as a raw graph with feature vectors and adjacency data. + * + * **Requirements:** + * - Each node must have a unique id. + */ export const GraphEncoder = compose(FeatureEncoder, batchTryCatch(EdgeEncoder), 'raw-graph') diff --git a/packages/encoder/pattern-miner/src/frequency.ts b/packages/encoder/pattern-miner/src/frequency.ts index 1b6f84b7..2e104145 100644 --- a/packages/encoder/pattern-miner/src/frequency.ts +++ b/packages/encoder/pattern-miner/src/frequency.ts @@ -4,11 +4,13 @@ import type { MinedPattern } from './mining' import type { LabeledEdge, SerializedLabeledEdge } from './normalization' import type { FrequencyParameters, PatternOrder } from './pattern-types' +export type PatternData = SerializedLabeledEdge[] + export interface PatternWithFrequency { /** * The pattern */ - pattern: SerializedLabeledEdge[] + pattern: PatternData /** * A DOT-notation graph of the pattern. */ diff --git a/packages/encoder/pattern-miner/src/index.ts b/packages/encoder/pattern-miner/src/index.ts index b7220106..bbdf8292 100644 --- a/packages/encoder/pattern-miner/src/index.ts +++ b/packages/encoder/pattern-miner/src/index.ts @@ -1,16 +1,26 @@ import type { GraphModel } from '@cm2ml/ir' -import type { InferOut } from '@cm2ml/plugin' +import type { InferOut, StructuredOutput } from '@cm2ml/plugin' import { ExecutionError, batchTryCatch, compose, definePlugin } from '@cm2ml/plugin' import { Stream } from '@yeger/streams' import { embedPartitions } from './embedding' +import type { PatternWithFrequency } from './frequency' import { calculateFrequencies } from './frequency' +import type { MinedPattern } from './mining' import { minePatterns } from './mining' +import type { PatternMapping } from './normalization' import { normalizePartitions } from './normalization' import { partitionNodes } from './partitioning' import { restorePartitionEdges } from './restoration' -export type { PatternWithFrequency } from './frequency' +export type { PatternData, PatternWithFrequency } from './frequency' +export type { MinedPattern } from './mining' +export type { PatternMapping, SerializedLabeledEdge } from './normalization' + +interface PatternData { + patterns: MinedPattern[] + mapping: PatternMapping +} const ModelPatternMiner = batchTryCatch(definePlugin({ name: 'patterns', @@ -65,15 +75,16 @@ const ModelPatternMiner = batchTryCatch(definePlugin({ group: 'mining', }, }, - invoke(model: GraphModel, parameters) { + invoke(model: GraphModel, parameters): StructuredOutput { const partitions = partitionNodes(model, parameters) .map(restorePartitionEdges) const { normalizedPartitions, mapping } = normalizePartitions(partitions, parameters) const embedding = embedPartitions(normalizedPartitions) const patterns = minePatterns(embedding, parameters) + const data: PatternData = { patterns, mapping } return { - data: { patterns, mapping }, - metadata: {}, + data, + metadata: null, } }, })) @@ -107,7 +118,7 @@ const PatternFrequencyMiner = definePlugin({ group: 'filter', }, }, - invoke(batch: InferOut, parameters) { + invoke(batch: InferOut, parameters): (StructuredOutput)[] { const patterns = Stream .from(batch) .map((result) => result instanceof ExecutionError ? [] : result.data.patterns) @@ -120,4 +131,10 @@ const PatternFrequencyMiner = definePlugin({ }, }) +/** + * Detects patterns in graph models. + * + * **Requirements:** + * - Each node must have a unique id. + */ export const PatternMiner = compose(ModelPatternMiner, PatternFrequencyMiner, 'pattern-miner') diff --git a/packages/encoder/pattern-miner/src/normalization.ts b/packages/encoder/pattern-miner/src/normalization.ts index b6f9befb..72b91bfa 100644 --- a/packages/encoder/pattern-miner/src/normalization.ts +++ b/packages/encoder/pattern-miner/src/normalization.ts @@ -2,6 +2,11 @@ import type { GraphEdge, GraphNode } from '@cm2ml/ir' import type { NormalizationParameters } from './pattern-types' +/** + * Maps the ID of a labeled node to the IDs of the nodes in the original graph. + */ +export type PatternMapping = Record + export function normalizePartitions(partitions: Set[], parameters: NormalizationParameters) { const normalizedLabeledNodes: LabeledNode[][] = [] const crossPartitionMapping: Record> = {} @@ -18,9 +23,12 @@ export function normalizePartitions(partitions: Set[], parameters: No crossPartitionMapping[labeledNodeId].add(graphNodeId) }) }) + const mapping: PatternMapping = Object + .fromEntries(Object.entries(crossPartitionMapping) + .map(([key, value]) => [key, Array.from(value)])) return { normalizedPartitions: normalizedLabeledNodes, - mapping: Object.fromEntries(Object.entries(crossPartitionMapping).map(([key, value]) => [key, Array.from(value)])), + mapping, } } diff --git a/packages/encoder/tree-encoder/src/index.ts b/packages/encoder/tree-encoder/src/index.ts index 75cb6878..243a04ad 100644 --- a/packages/encoder/tree-encoder/src/index.ts +++ b/packages/encoder/tree-encoder/src/index.ts @@ -1,17 +1,21 @@ -import type { FeatureContext } from '@cm2ml/feature-encoder' +import type { FeatureContext, StaticFeatureData } from '@cm2ml/feature-encoder' import { FeatureEncoder } from '@cm2ml/feature-encoder' import type { GraphModel } from '@cm2ml/ir' -import type { InferOut } from '@cm2ml/plugin' +import type { InferOut, StructuredOutput } from '@cm2ml/plugin' import { ExecutionError, batchTryCatch, compose, definePlugin, defineStructuredPlugin, getFirstNonError } from '@cm2ml/plugin' import { CompactTreeBuilder } from './tree-builder/compact-tree-builder' import { GlobalTreeBuilder } from './tree-builder/global-tree-builder' import { LocalTreeBuilder } from './tree-builder/local-tree-builder' import { isValidTreeFormat, treeFormats } from './tree-model' -import type { RecursiveTreeNode, TreeNodeValue } from './tree-model' +import type { RecursiveTreeNode, TreeModel, TreeNodeValue } from './tree-model' +import type { Vocabularies } from './vocabulary' import { getVocabularies } from './vocabulary' export type * from './tree-model' +export { type Vocabularies } from './vocabulary' + +export type TreeBasedData = TreeModel const treeBuilders = { compact: CompactTreeBuilder, @@ -42,8 +46,8 @@ const TreeTransformer = defineStructuredPlugin({ group: 'vocabulary', }, }, - invoke({ data: model, metadata: featureContext }: { data: GraphModel, metadata: FeatureContext }, parameters) { - function createTreeModel() { + invoke({ data: model, metadata: featureContext }: { data: GraphModel, metadata: FeatureContext }, parameters): StructuredOutput { + function createTreeModel(): TreeBasedData { if (!isValidTreeFormat(parameters.format)) { throw new Error(`Invalid tree format: ${parameters.format}.`) } @@ -58,13 +62,17 @@ const TreeTransformer = defineStructuredPlugin({ }, }) +interface VocabularyMetadata extends Partial { + vocabularies: Vocabularies +} + const BuildVocabulary = definePlugin({ name: 'build-vocabulary', parameters: {}, - invoke(input: (InferOut | ExecutionError)[], _parameters) { + invoke(input: (InferOut | ExecutionError)[], _parameters): (StructuredOutput | ExecutionError)[] { const trees = input.filter((item): item is InferOut => !(item instanceof ExecutionError)).map((item) => item.data) const vocabularies = getVocabularies(trees) - const newMetadata = { + const newMetadata: VocabularyMetadata = { ...(getFirstNonError(input)?.metadata ?? {}), vocabularies, } @@ -86,7 +94,7 @@ export type Word2IdMapping = Record type BuildVocabularyOutput = InferOut -type BuildVocabularyMetadata = Exclude['metadata'] +export type TreeBasedMetadata = VocabularyMetadata & { id2WordMapping: Id2WordMapping } const WordsToIds = definePlugin({ name: 'words-to-ids', @@ -104,9 +112,9 @@ const WordsToIds = definePlugin({ group: 'vocabulary', }, }, - invoke(input: BuildVocabularyOutput, parameters) { + invoke(input: BuildVocabularyOutput, parameters): (StructuredOutput | ExecutionError)[] { function alignOutType() { - let metadata: BuildVocabularyMetadata & { id2WordMapping: Id2WordMapping } | null = null + let metadata: TreeBasedMetadata | null = null return input.map((item) => { if (item instanceof ExecutionError) { return item @@ -146,7 +154,7 @@ const WordsToIds = definePlugin({ children: node.children.map(mapNode), } } - const newMetadata = { + const newMetadata: TreeBasedMetadata = { ...firstValidInput.metadata, id2WordMapping, } @@ -166,6 +174,13 @@ const WordsToIds = definePlugin({ }, }) +/** + * Encodes a graph model as a tree. + * + * **Requirements:** + * - Each node must have a unique id. + * - Each node must have a type (`global` format only). + */ export const TreeEncoder = compose( compose(FeatureEncoder, batchTryCatch(TreeTransformer)), compose(BuildVocabulary, WordsToIds), diff --git a/packages/encoder/tree-encoder/src/vocabulary.ts b/packages/encoder/tree-encoder/src/vocabulary.ts index e2e43e5f..0250d134 100644 --- a/packages/encoder/tree-encoder/src/vocabulary.ts +++ b/packages/encoder/tree-encoder/src/vocabulary.ts @@ -2,7 +2,13 @@ import { Stream } from '@yeger/streams' import type { RecursiveTreeNode, TreeModel, TreeNodeValue } from './tree-model' -export function getVocabularies(trees: TreeModel[]) { +export interface Vocabularies { + staticVocabulary: TreeNodeValue[] + dynamicVocabulary: TreeNodeValue[] + vocabulary: TreeNodeValue[] +} + +export function getVocabularies(trees: TreeModel[]): Vocabularies { const staticVocabulary = getVocabulary(trees, 'static') const dynamicVocabulary = getVocabulary(trees, 'dynamic') const vocabulary = Stream diff --git a/packages/visualizer/src/components/encoder/encodings/pattern/PatternEncoding.tsx b/packages/visualizer/src/components/encoder/encodings/pattern/PatternEncoding.tsx index 0aaf293a..02ca4644 100644 --- a/packages/visualizer/src/components/encoder/encodings/pattern/PatternEncoding.tsx +++ b/packages/visualizer/src/components/encoder/encodings/pattern/PatternEncoding.tsx @@ -1,3 +1,4 @@ +import type { PatternMapping, PatternWithFrequency, SerializedLabeledEdge } from '@cm2ml/builtin' import { PatternMiner } from '@cm2ml/builtin' import type { GraphModel } from '@cm2ml/ir' import { ExecutionError } from '@cm2ml/plugin' @@ -56,15 +57,8 @@ export function PatternEncoding({ model, parameters }: Props) { ) } -interface PatternProps { - pattern: { - source: string - target: string - tag: string - }[] - absoluteFrequency: number - graph: string - mapping: Record +interface PatternProps extends Omit { + mapping: PatternMapping } function Pattern({ pattern, absoluteFrequency, mapping, graph }: PatternProps) { @@ -111,12 +105,8 @@ function Pattern({ pattern, absoluteFrequency, mapping, graph }: PatternProps) { } interface LabeledEdgeProps { - edge: { - source: string - target: string - tag: string - } - mapping: Record + edge: SerializedLabeledEdge + mapping: PatternMapping } function mapsToGraphNode(patternNodeId: string, mapping: Record, graphNodeId: string) { @@ -159,7 +149,7 @@ function LabeledEdge({ edge, mapping }: LabeledEdgeProps) { interface LabeledNodeProps { nodeId: string - mapping: Record + mapping: PatternMapping isEdgeSelected: boolean } diff --git a/packages/visualizer/src/components/encoder/encodings/pattern/PatternGraph.tsx b/packages/visualizer/src/components/encoder/encodings/pattern/PatternGraph.tsx index 4cfc9d09..53aefbf2 100644 --- a/packages/visualizer/src/components/encoder/encodings/pattern/PatternGraph.tsx +++ b/packages/visualizer/src/components/encoder/encodings/pattern/PatternGraph.tsx @@ -1,4 +1,4 @@ -import type { PatternWithFrequency } from '@cm2ml/builtin' +import type { PatternMapping, PatternWithFrequency } from '@cm2ml/builtin' import { debounce } from '@yeger/debounce' import { Stream } from '@yeger/streams' import type { RefObject } from 'react' @@ -16,7 +16,7 @@ export type Pattern = PatternWithFrequency['pattern'] export interface Props { pattern: Pattern - mapping: Record + mapping: PatternMapping } export function PatternGraph({ pattern, mapping }: Props) {