Skip to content

Commit

Permalink
Add hierarchical and semantic chunking strategy (aws-samples#569)
Browse files Browse the repository at this point in the history
* semantic chunking add

* add hierarchical chunking

* fix frontend and cdk

* frontend fix

* interface refactor fix

* remove commented-out code

* frontend fix

* package.json fix

* frontend and cdk param validation fix

* comment add

---------

Co-authored-by: Workshop Participant <[email protected]>
  • Loading branch information
Tsujiba and Workshop Participant authored Oct 29, 2024
1 parent e98774a commit aa04f0d
Show file tree
Hide file tree
Showing 13 changed files with 921 additions and 213 deletions.
38 changes: 35 additions & 3 deletions backend/app/repositories/models/custom_bot_kb.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,44 @@ class OpenSearchParamsModel(BaseModel):
analyzer: AnalyzerParamsModel | None


# Chunking configuration variant for the "default" strategy; it carries no tuning fields.
# It is one member of the BedrockKnowledgeBaseModel.chunking_configuration union.
# NOTE(review): chunking_strategy is typed with the full type_kb_chunking_strategy alias, so only
# the default value ties this model to "default" — confirm whether a narrower literal is intended.
class DefaultParamsModel(BaseModel):
chunking_strategy: type_kb_chunking_strategy = "default"


# Chunking configuration variant for the "fixed_size" strategy.
# It is one member of the BedrockKnowledgeBaseModel.chunking_configuration union.
class FixedSizeParamsModel(BaseModel):
chunking_strategy: type_kb_chunking_strategy = "fixed_size"
# Both tuning values are optional and default to None.
max_tokens: int | None = None
overlap_percentage: int | None = None


# Chunking configuration variant for the "hierarchical" strategy.
# It is one member of the BedrockKnowledgeBaseModel.chunking_configuration union.
class HierarchicalParamsModel(BaseModel):
chunking_strategy: type_kb_chunking_strategy = "hierarchical"
# All three tuning values are optional and default to None.
overlap_tokens: int | None = None
max_parent_token_size: int | None = None
max_child_token_size: int | None = None


# Chunking configuration variant for the "semantic" strategy.
# It is one member of the BedrockKnowledgeBaseModel.chunking_configuration union.
class SemanticParamsModel(BaseModel):
chunking_strategy: type_kb_chunking_strategy = "semantic"
# All three tuning values are optional and default to None.
max_tokens: int | None = None
buffer_size: int | None = None
breakpoint_percentile_threshold: int | None = None


# Chunking configuration variant for the "none" strategy; it carries no tuning fields.
# It is one member of the BedrockKnowledgeBaseModel.chunking_configuration union.
class NoneParamsModel(BaseModel):
chunking_strategy: type_kb_chunking_strategy = "none"


class BedrockKnowledgeBaseModel(BaseModel):
embeddings_model: type_kb_embeddings_model
open_search: OpenSearchParamsModel
chunking_strategy: type_kb_chunking_strategy
chunking_configuration: (
DefaultParamsModel
| FixedSizeParamsModel
| HierarchicalParamsModel
| SemanticParamsModel
| NoneParamsModel
)
search_params: SearchParamsModel
max_tokens: int | None = None
overlap_percentage: int | None = None
knowledge_base_id: str | None = None
data_source_ids: list[str] | None = None
56 changes: 49 additions & 7 deletions backend/app/routes/schemas/bot_kb.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,13 @@
from pydantic import Field

# Ref: https://docs.aws.amazon.com/bedrock/latest/APIReference/API_agent_ChunkingConfiguration.html
type_kb_chunking_strategy = Literal["default", "fixed_size", "none"]
type_kb_chunking_strategy = Literal[
"default",
"fixed_size",
"hierarchical",
"semantic",
"none",
]
type_kb_embeddings_model = Literal["titan_v2", "cohere_multilingual_v3"]
type_kb_search_type = Literal["hybrid", "semantic"]

Expand Down Expand Up @@ -38,22 +44,58 @@ class OpenSearchParams(BaseSchema):
analyzer: AnalyzerParams | None


# Schema variant for the "default" chunking strategy; it carries no tuning fields.
# It is a member of the chunking_configuration union on BedrockKnowledgeBaseInput and
# BedrockKnowledgeBaseOutput.
# NOTE(review): chunking_strategy is typed with the full type_kb_chunking_strategy Literal, so only
# the default value ties this schema to "default" — confirm whether a narrower literal is intended.
class DefaultParams(BaseSchema):
chunking_strategy: type_kb_chunking_strategy = "default"


# Schema variant for the "fixed_size" chunking strategy.
# It is a member of the chunking_configuration union on BedrockKnowledgeBaseInput and
# BedrockKnowledgeBaseOutput.
class FixedSizeParams(BaseSchema):
chunking_strategy: type_kb_chunking_strategy = "fixed_size"
# Both tuning values are optional and default to None.
max_tokens: int | None = None
overlap_percentage: int | None = None


# Schema variant for the "hierarchical" chunking strategy.
# It is a member of the chunking_configuration union on BedrockKnowledgeBaseInput and
# BedrockKnowledgeBaseOutput.
class HierarchicalParams(BaseSchema):
chunking_strategy: type_kb_chunking_strategy = "hierarchical"
# All three tuning values are optional and default to None.
overlap_tokens: int | None = None
max_parent_token_size: int | None = None
max_child_token_size: int | None = None


# Schema variant for the "semantic" chunking strategy.
# It is a member of the chunking_configuration union on BedrockKnowledgeBaseInput and
# BedrockKnowledgeBaseOutput.
class SemanticParams(BaseSchema):
chunking_strategy: type_kb_chunking_strategy = "semantic"
# All three tuning values are optional and default to None.
max_tokens: int | None = None
buffer_size: int | None = None
breakpoint_percentile_threshold: int | None = None


# Schema variant for the "none" chunking strategy; it carries no tuning fields.
# It is a member of the chunking_configuration union on BedrockKnowledgeBaseInput and
# BedrockKnowledgeBaseOutput.
class NoneParams(BaseSchema):
chunking_strategy: type_kb_chunking_strategy = "none"


class BedrockKnowledgeBaseInput(BaseSchema):
embeddings_model: type_kb_embeddings_model
open_search: OpenSearchParams
chunking_strategy: type_kb_chunking_strategy
chunking_configuration: (
DefaultParams
| FixedSizeParams
| HierarchicalParams
| SemanticParams
| NoneParams
)
search_params: SearchParams
max_tokens: int | None = None
overlap_percentage: int | None = None
knowledge_base_id: str | None = None


class BedrockKnowledgeBaseOutput(BaseSchema):
embeddings_model: type_kb_embeddings_model
open_search: OpenSearchParams
chunking_strategy: type_kb_chunking_strategy
chunking_configuration: (
DefaultParams
| FixedSizeParams
| HierarchicalParams
| SemanticParams
| NoneParams
)
search_params: SearchParams
max_tokens: int | None = None
overlap_percentage: int | None = None
knowledge_base_id: str | None = None
data_source_ids: list[str] | None = None
39 changes: 33 additions & 6 deletions cdk/bin/bedrock-custom-bot.ts
Original file line number Diff line number Diff line change
Expand Up @@ -49,20 +49,34 @@ console.log("guardrails: ", guardrails);
console.log("existingS3Urls: ", existingS3Urls);

const embeddingsModel = getEmbeddingModel(knowledgeBase.embeddings_model.S);
const chunkingStrategy = getChunkingStrategy(knowledgeBase.chunking_strategy.S);
const maxTokens: number | undefined = knowledgeBase.max_tokens
? Number(knowledgeBase.max_tokens.N)

const maxTokens: number | undefined = knowledgeBase.chunking_configuration.M.max_tokens
? Number(knowledgeBase.chunking_configuration.M.max_tokens.N)
: undefined;
const instruction: string | undefined = knowledgeBase.instruction
? knowledgeBase.instruction.S
: undefined;
const analyzer = knowledgeBase.open_search.M.analyzer.M
? getAnalyzer(knowledgeBase.open_search.M.analyzer.M)
: undefined;
const overlapPercentage: number | undefined = knowledgeBase.overlap_percentage
? Number(knowledgeBase.overlap_percentage.N)
const overlapPercentage: number | undefined = knowledgeBase.chunking_configuration.M.overlap_percentage
? Number(knowledgeBase.chunking_configuration.M.overlap_percentage.N)
: undefined;
// Hierarchical (overlapTokens, maxParentTokenSize, maxChildTokenSize) and semantic
// (bufferSize, breakpointPercentileThreshold) chunking options. Each one is read from the
// chunking_configuration map and converted with Number() when its attribute is present;
// otherwise it stays undefined.
const overlapTokens: number | undefined = knowledgeBase.chunking_configuration.M.overlap_tokens
? Number(knowledgeBase.chunking_configuration.M.overlap_tokens.N)
: undefined;
const maxParentTokenSize: number | undefined = knowledgeBase.chunking_configuration.M.max_parent_token_size
? Number(knowledgeBase.chunking_configuration.M.max_parent_token_size.N)
: undefined;
const maxChildTokenSize: number | undefined = knowledgeBase.chunking_configuration.M.max_child_token_size
? Number(knowledgeBase.chunking_configuration.M.max_child_token_size.N)
: undefined;
const bufferSize: number | undefined = knowledgeBase.chunking_configuration.M.buffer_size
? Number(knowledgeBase.chunking_configuration.M.buffer_size.N)
: undefined;
const breakpointPercentileThreshold: number | undefined = knowledgeBase.chunking_configuration.M.breakpoint_percentile_threshold
? Number(knowledgeBase.chunking_configuration.M.breakpoint_percentile_threshold.N)
: undefined;

const is_guardrail_enabled: boolean | undefined =
guardrails.is_guardrail_enabled
? Boolean(guardrails.is_guardrail_enabled.BOOL)
Expand Down Expand Up @@ -94,6 +108,19 @@ const guardrailArn: number | undefined = guardrails.guardrail_arn
const guardrailVersion: number | undefined = guardrails.guardrail_version
? Number(guardrails.guardrail_version.N)
: undefined;
// Resolve the ChunkingStrategy from the strategy name stored in chunking_configuration.
// The embeddings model name is passed as the second argument, and every parsed tuning
// value is forwarded in the options object (values that were absent are undefined).
const chunkingStrategy = getChunkingStrategy(
knowledgeBase.chunking_configuration.M.chunking_strategy.S,
knowledgeBase.embeddings_model.S,
{
maxTokens,
overlapPercentage,
overlapTokens,
maxParentTokenSize,
maxChildTokenSize,
bufferSize,
breakpointPercentileThreshold,
}
);

console.log("embeddingsModel: ", embeddingsModel);
console.log("chunkingStrategy: ", chunkingStrategy);
Expand Down
8 changes: 5 additions & 3 deletions cdk/lib/bedrock-custom-bot-stack.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,13 @@ import { VectorCollectionStandbyReplicas } from "@cdklabs/generative-ai-cdk-cons
import * as s3 from "aws-cdk-lib/aws-s3";
import {
BedrockFoundationModel,
} from "@cdklabs/generative-ai-cdk-constructs/lib/cdk-lib/bedrock";
import {
ChunkingStrategy,
} from "@cdklabs/generative-ai-cdk-constructs/lib/cdk-lib/bedrock/data-sources/chunking";
import {
S3DataSource,
} from "@cdklabs/generative-ai-cdk-constructs/lib/cdk-lib/bedrock";
} from "@cdklabs/generative-ai-cdk-constructs/lib/cdk-lib/bedrock/data-sources/s3-data-source";
import { KnowledgeBase } from "@cdklabs/generative-ai-cdk-constructs/lib/cdk-lib/bedrock";
import { aws_bedrock as bedrock } from "aws-cdk-lib";

Expand Down Expand Up @@ -98,8 +102,6 @@ export class BedrockCustomBotStack extends Stack {
knowledgeBase: kb,
dataSourceName: bucket.bucketName,
chunkingStrategy: props.chunkingStrategy,
maxTokens: props.maxTokens,
overlapPercentage: props.overlapPercentage,
inclusionPrefixes: inclusionPrefixes,
});
});
Expand Down
45 changes: 43 additions & 2 deletions cdk/lib/utils/bedrock-knowledge-base-args.ts
Original file line number Diff line number Diff line change
@@ -1,14 +1,28 @@
import {
BedrockFoundationModel,
ChunkingStrategy,
} from "@cdklabs/generative-ai-cdk-constructs/lib/cdk-lib/bedrock";
import {
HierarchicalChunkingProps,
ChunkingStrategy,
} from "@cdklabs/generative-ai-cdk-constructs/lib/cdk-lib/bedrock/data-sources/chunking";
import { Analyzer } from "@cdklabs/generative-ai-cdk-constructs/lib/cdk-lib/opensearch-vectorindex";
import {
CharacterFilterType,
TokenFilterType,
TokenizerType,
} from "@cdklabs/generative-ai-cdk-constructs/lib/cdk-lib/opensearchserverless";

/**
 * Tuning values for the "fixed_size" chunking strategy.
 * getChunkingStrategy accepts these through its optional `options` argument and
 * forwards them to ChunkingStrategy.fixedSize when both are defined.
 */
interface FixedSizeOptions {
readonly maxTokens: number;
readonly overlapPercentage: number;
}

/**
 * Tuning values for the "semantic" chunking strategy.
 * getChunkingStrategy accepts these through its optional `options` argument and
 * forwards them to ChunkingStrategy.semantic when all three are defined.
 */
interface SemanticOptions {
readonly maxTokens: number;
readonly bufferSize: number;
readonly breakpointPercentileThreshold: number;
}

export const getEmbeddingModel = (
embeddingsModel: string
): BedrockFoundationModel => {
Expand All @@ -23,13 +37,40 @@ export const getEmbeddingModel = (
};

export const getChunkingStrategy = (
chunkingStrategy: string
chunkingStrategy: string,
embeddingsModel: string,
options?: Partial<FixedSizeOptions & HierarchicalChunkingProps & SemanticOptions>
): ChunkingStrategy => {
switch (chunkingStrategy) {
case "default":
return ChunkingStrategy.DEFAULT;
case "fixed_size":
if (options?.maxTokens !== undefined && options?.overlapPercentage !== undefined) {
return ChunkingStrategy.fixedSize({
maxTokens: options.maxTokens,
overlapPercentage: options.overlapPercentage
});
}
return ChunkingStrategy.FIXED_SIZE;
case "hierarchical":
if (options?.overlapTokens !== undefined && options?.maxParentTokenSize !== undefined && options?.maxChildTokenSize !== undefined) {
return ChunkingStrategy.hierarchical({
overlapTokens: options.overlapTokens,
maxParentTokenSize: options.maxParentTokenSize,
maxChildTokenSize: options.maxChildTokenSize
});
}
return embeddingsModel === 'titan_v2' ? ChunkingStrategy.HIERARCHICAL_TITAN : ChunkingStrategy.HIERARCHICAL_COHERE;
case "semantic":
// Compare against undefined explicitly rather than relying on truthiness: bufferSize may legitimately be 0, and a truthiness check would then fall back to the default values even when the other parameters were changed.
if (options?.maxTokens !== undefined && options?.bufferSize !== undefined && options?.breakpointPercentileThreshold !== undefined) {
return ChunkingStrategy.semantic({
maxTokens: options.maxTokens,
bufferSize: options.bufferSize,
breakpointPercentileThreshold: options.breakpointPercentileThreshold
});
}
return ChunkingStrategy.SEMANTIC;
case "none":
return ChunkingStrategy.NONE;
default:
Expand Down
Loading

0 comments on commit aa04f0d

Please sign in to comment.