Add hierarchical and semantic chunking strategy (aws-samples#569)

* semantic chunking add * add hierarchical chunking * fix frontend and cdk * frontend fix * interface refactor fix * remove commented-out code * frontend fix * package.json fix * frontend and cdk param validation fix * comment add --------- Co-authored-by: Workshop Participant <[email protected]>
DiUS · Oct 29, 2024 · aa04f0d · aa04f0d
1 parent e98774a
commit aa04f0d
Show file tree

Hide file tree

Showing 13 changed files with 921 additions and 213 deletions.
diff --git a/backend/app/repositories/models/custom_bot_kb.py b/backend/app/repositories/models/custom_bot_kb.py
@@ -24,12 +24,44 @@ class OpenSearchParamsModel(BaseModel):
     analyzer: AnalyzerParamsModel | None
 
 
+class DefaultParamsModel(BaseModel):
+    chunking_strategy: type_kb_chunking_strategy = "default"
+
+
+class FixedSizeParamsModel(BaseModel):
+    chunking_strategy: type_kb_chunking_strategy = "fixed_size"
+    max_tokens: int | None = None
+    overlap_percentage: int | None = None
+
+
+class HierarchicalParamsModel(BaseModel):
+    chunking_strategy: type_kb_chunking_strategy = "hierarchical"
+    overlap_tokens: int | None = None
+    max_parent_token_size: int | None = None
+    max_child_token_size: int | None = None
+
+
+class SemanticParamsModel(BaseModel):
+    chunking_strategy: type_kb_chunking_strategy = "semantic"
+    max_tokens: int | None = None
+    buffer_size: int | None = None
+    breakpoint_percentile_threshold: int | None = None
+
+
+class NoneParamsModel(BaseModel):
+    chunking_strategy: type_kb_chunking_strategy = "none"
+
+
 class BedrockKnowledgeBaseModel(BaseModel):
     embeddings_model: type_kb_embeddings_model
     open_search: OpenSearchParamsModel
-    chunking_strategy: type_kb_chunking_strategy
+    chunking_configuration: (
+        DefaultParamsModel
+        | FixedSizeParamsModel
+        | HierarchicalParamsModel
+        | SemanticParamsModel
+        | NoneParamsModel
+    )
     search_params: SearchParamsModel
-    max_tokens: int | None = None
-    overlap_percentage: int | None = None
     knowledge_base_id: str | None = None
     data_source_ids: list[str] | None = None
diff --git a/backend/app/routes/schemas/bot_kb.py b/backend/app/routes/schemas/bot_kb.py
@@ -4,7 +4,13 @@
 from pydantic import Field
 
 # Ref: https://docs.aws.amazon.com/bedrock/latest/APIReference/API_agent_ChunkingConfiguration.html
-type_kb_chunking_strategy = Literal["default", "fixed_size", "none"]
+type_kb_chunking_strategy = Literal[
+    "default",
+    "fixed_size",
+    "hierarchical",
+    "semantic",
+    "none",
+]
 type_kb_embeddings_model = Literal["titan_v2", "cohere_multilingual_v3"]
 type_kb_search_type = Literal["hybrid", "semantic"]
 
@@ -38,22 +44,58 @@ class OpenSearchParams(BaseSchema):
     analyzer: AnalyzerParams | None
 
 
+class DefaultParams(BaseSchema):
+    chunking_strategy: type_kb_chunking_strategy = "default"
+
+
+class FixedSizeParams(BaseSchema):
+    chunking_strategy: type_kb_chunking_strategy = "fixed_size"
+    max_tokens: int | None = None
+    overlap_percentage: int | None = None
+
+
+class HierarchicalParams(BaseSchema):
+    chunking_strategy: type_kb_chunking_strategy = "hierarchical"
+    overlap_tokens: int | None = None
+    max_parent_token_size: int | None = None
+    max_child_token_size: int | None = None
+
+
+class SemanticParams(BaseSchema):
+    chunking_strategy: type_kb_chunking_strategy = "semantic"
+    max_tokens: int | None = None
+    buffer_size: int | None = None
+    breakpoint_percentile_threshold: int | None = None
+
+
+class NoneParams(BaseSchema):
+    chunking_strategy: type_kb_chunking_strategy = "none"
+
+
 class BedrockKnowledgeBaseInput(BaseSchema):
     embeddings_model: type_kb_embeddings_model
     open_search: OpenSearchParams
-    chunking_strategy: type_kb_chunking_strategy
+    chunking_configuration: (
+        DefaultParams
+        | FixedSizeParams
+        | HierarchicalParams
+        | SemanticParams
+        | NoneParams
+    )
     search_params: SearchParams
-    max_tokens: int | None = None
-    overlap_percentage: int | None = None
     knowledge_base_id: str | None = None
 
 
 class BedrockKnowledgeBaseOutput(BaseSchema):
     embeddings_model: type_kb_embeddings_model
     open_search: OpenSearchParams
-    chunking_strategy: type_kb_chunking_strategy
+    chunking_configuration: (
+        DefaultParams
+        | FixedSizeParams
+        | HierarchicalParams
+        | SemanticParams
+        | NoneParams
+    )
     search_params: SearchParams
-    max_tokens: int | None = None
-    overlap_percentage: int | None = None
     knowledge_base_id: str | None = None
     data_source_ids: list[str] | None = None
diff --git a/cdk/bin/bedrock-custom-bot.ts b/cdk/bin/bedrock-custom-bot.ts
@@ -49,20 +49,34 @@ console.log("guardrails: ", guardrails);
 console.log("existingS3Urls: ", existingS3Urls);
 
 const embeddingsModel = getEmbeddingModel(knowledgeBase.embeddings_model.S);
-const chunkingStrategy = getChunkingStrategy(knowledgeBase.chunking_strategy.S);
-const maxTokens: number | undefined = knowledgeBase.max_tokens
-  ? Number(knowledgeBase.max_tokens.N)
+
+const maxTokens: number | undefined = knowledgeBase.chunking_configuration.M.max_tokens
+  ? Number(knowledgeBase.chunking_configuration.M.max_tokens.N)
   : undefined;
 const instruction: string | undefined = knowledgeBase.instruction
   ? knowledgeBase.instruction.S
   : undefined;
 const analyzer = knowledgeBase.open_search.M.analyzer.M
   ? getAnalyzer(knowledgeBase.open_search.M.analyzer.M)
   : undefined;
-const overlapPercentage: number | undefined = knowledgeBase.overlap_percentage
-  ? Number(knowledgeBase.overlap_percentage.N)
+const overlapPercentage: number | undefined = knowledgeBase.chunking_configuration.M.overlap_percentage
+  ? Number(knowledgeBase.chunking_configuration.M.overlap_percentage.N)
+  : undefined;
+const overlapTokens: number | undefined = knowledgeBase.chunking_configuration.M.overlap_tokens
+  ? Number(knowledgeBase.chunking_configuration.M.overlap_tokens.N)
+  : undefined;
+const maxParentTokenSize: number | undefined = knowledgeBase.chunking_configuration.M.max_parent_token_size
+  ? Number(knowledgeBase.chunking_configuration.M.max_parent_token_size.N)
+  : undefined;
+const maxChildTokenSize: number | undefined = knowledgeBase.chunking_configuration.M.max_child_token_size
+  ? Number(knowledgeBase.chunking_configuration.M.max_child_token_size.N)
+  : undefined;
+const bufferSize: number | undefined = knowledgeBase.chunking_configuration.M.buffer_size
+  ? Number(knowledgeBase.chunking_configuration.M.buffer_size.N)
+  : undefined;
+const breakpointPercentileThreshold: number | undefined = knowledgeBase.chunking_configuration.M.breakpoint_percentile_threshold
+  ? Number(knowledgeBase.chunking_configuration.M.breakpoint_percentile_threshold.N)
   : undefined;
-
 const is_guardrail_enabled: boolean | undefined =
   guardrails.is_guardrail_enabled
     ? Boolean(guardrails.is_guardrail_enabled.BOOL)
@@ -94,6 +108,19 @@ const guardrailArn: number | undefined = guardrails.guardrail_arn
 const guardrailVersion: number | undefined = guardrails.guardrail_version
   ? Number(guardrails.guardrail_version.N)
   : undefined;
+const chunkingStrategy = getChunkingStrategy(
+  knowledgeBase.chunking_configuration.M.chunking_strategy.S,
+  knowledgeBase.embeddings_model.S,
+  {
+    maxTokens,
+    overlapPercentage,
+    overlapTokens,
+    maxParentTokenSize,
+    maxChildTokenSize,
+    bufferSize,
+    breakpointPercentileThreshold,
+  }
+);
 
 console.log("embeddingsModel: ", embeddingsModel);
 console.log("chunkingStrategy: ", chunkingStrategy);

diff --git a/cdk/lib/bedrock-custom-bot-stack.ts b/cdk/lib/bedrock-custom-bot-stack.ts
@@ -9,9 +9,13 @@ import { VectorCollectionStandbyReplicas } from "@cdklabs/generative-ai-cdk-cons
 import * as s3 from "aws-cdk-lib/aws-s3";
 import {
   BedrockFoundationModel,
+} from "@cdklabs/generative-ai-cdk-constructs/lib/cdk-lib/bedrock";
+import {
   ChunkingStrategy,
+} from "@cdklabs/generative-ai-cdk-constructs/lib/cdk-lib/bedrock/data-sources/chunking";
+import {
   S3DataSource,
-} from "@cdklabs/generative-ai-cdk-constructs/lib/cdk-lib/bedrock";
+} from "@cdklabs/generative-ai-cdk-constructs/lib/cdk-lib/bedrock/data-sources/s3-data-source";
 import { KnowledgeBase } from "@cdklabs/generative-ai-cdk-constructs/lib/cdk-lib/bedrock";
 import { aws_bedrock as bedrock } from "aws-cdk-lib";
 
@@ -98,8 +102,6 @@ export class BedrockCustomBotStack extends Stack {
         knowledgeBase: kb,
         dataSourceName: bucket.bucketName,
         chunkingStrategy: props.chunkingStrategy,
-        maxTokens: props.maxTokens,
-        overlapPercentage: props.overlapPercentage,
         inclusionPrefixes: inclusionPrefixes,
       });
     });

diff --git a/cdk/lib/utils/bedrock-knowledge-base-args.ts b/cdk/lib/utils/bedrock-knowledge-base-args.ts
@@ -1,14 +1,28 @@
 import {
   BedrockFoundationModel,
-  ChunkingStrategy,
 } from "@cdklabs/generative-ai-cdk-constructs/lib/cdk-lib/bedrock";
+import {
+  HierarchicalChunkingProps,
+  ChunkingStrategy,
+} from "@cdklabs/generative-ai-cdk-constructs/lib/cdk-lib/bedrock/data-sources/chunking";
 import { Analyzer } from "@cdklabs/generative-ai-cdk-constructs/lib/cdk-lib/opensearch-vectorindex";
 import {
   CharacterFilterType,
   TokenFilterType,
   TokenizerType,
 } from "@cdklabs/generative-ai-cdk-constructs/lib/cdk-lib/opensearchserverless";
 
+interface FixedSizeOptions {
+  readonly maxTokens: number;
+  readonly overlapPercentage: number;
+}
+
+interface SemanticOptions {
+  readonly maxTokens: number;
+  readonly bufferSize: number;
+  readonly breakpointPercentileThreshold: number;
+}
+
 export const getEmbeddingModel = (
   embeddingsModel: string
 ): BedrockFoundationModel => {
@@ -23,13 +37,40 @@ export const getEmbeddingModel = (
 };
 
 export const getChunkingStrategy = (
-  chunkingStrategy: string
+  chunkingStrategy: string,
+  embeddingsModel: string,
+  options?: Partial<FixedSizeOptions & HierarchicalChunkingProps & SemanticOptions>
 ): ChunkingStrategy => {
   switch (chunkingStrategy) {
     case "default":
       return ChunkingStrategy.DEFAULT;
     case "fixed_size":
+      if (options?.maxTokens !== undefined && options?.overlapPercentage !== undefined) {
+        return ChunkingStrategy.fixedSize({
+          maxTokens: options.maxTokens,
+          overlapPercentage: options.overlapPercentage
+        });
+      }
       return ChunkingStrategy.FIXED_SIZE;
+    case "hierarchical":
+      if (options?.overlapTokens !== undefined && options?.maxParentTokenSize !== undefined && options?.maxChildTokenSize !== undefined) {
+        return ChunkingStrategy.hierarchical({
+          overlapTokens: options.overlapTokens,
+          maxParentTokenSize: options.maxParentTokenSize,
+          maxChildTokenSize: options.maxChildTokenSize
+        });
+      }
+      return embeddingsModel === 'titan_v2' ? ChunkingStrategy.HIERARCHICAL_TITAN : ChunkingStrategy.HIERARCHICAL_COHERE;
+    case "semantic":
+      // Compare each option against undefined explicitly instead of relying on truthiness: bufferSize can be 0, and a truthiness check would treat it as missing and fall back to the default semantic strategy even when the other parameters were changed.
+      if (options?.maxTokens !== undefined && options?.bufferSize !== undefined && options?.breakpointPercentileThreshold !== undefined) {
+        return ChunkingStrategy.semantic({
+          maxTokens: options.maxTokens,
+          bufferSize: options.bufferSize,
+          breakpointPercentileThreshold: options.breakpointPercentileThreshold
+        });
+      }
+      return ChunkingStrategy.SEMANTIC;
     case "none":
       return ChunkingStrategy.NONE;
     default: