Skip to content

Commit 3a60ab3

Browse files
authored
Adding new KB sync options and fixing older ones (#7679)
1 parent 75f9f78 commit 3a60ab3

File tree

9 files changed

+225
-3
lines changed

9 files changed

+225
-3
lines changed

.github/workflows/KB_Updater.yml

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
# Deploys .tools/lambda/KB_Updater.py to the KB_Updater Lambda function
# whenever the script changes on main, or on manual dispatch.
name: Deploy KB_Updater Lambda Function
on:
  push:
    branches: ["main"]
    paths:
      - '.tools/lambda/KB_Updater.py'
  workflow_dispatch:

# id-token: write is required for the OIDC role assumption below.
# NOTE(review): specifying any permission resets all others to "none",
# which would leave actions/checkout without repo read access on private
# repos — contents: read is stated explicitly to be safe.
permissions:
  id-token: write
  contents: read

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v5
        with:
          role-to-assume: ${{ secrets.AWS_ASSUME_ROLE }}
          aws-region: us-west-2

      # Zip the single-file Lambda and push the new code bundle.
      - name: Deploy Lambda
        run: |
          cd .tools/lambda
          zip function.zip KB_Updater.py
          aws lambda update-function-code \
            --function-name KB_Updater \
            --zip-file fileb://function.zip

.github/workflows/sync-S3-KB.yml

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,21 @@ on:
2121
- php
2222
- cpp
2323
- kotlin
24+
- steering_docs
25+
- specs
26+
- coding-standards
2427

2528
permissions:
2629
id-token: write
2730

2831
jobs:
2932
run_job_with_aws:
3033
runs-on: ubuntu-latest
34+
strategy:
35+
matrix:
36+
sdk_name: ${{ github.event_name == 'push' && fromJSON('["javascriptv3","dotnetv4","javav2","rustv1","gov2","swift","python","ruby","php","cpp","kotlin","steering_docs","specs","coding-standards"]') || fromJSON(format('["{0}"]', github.event.inputs.sdk_name)) }}
3137
env:
32-
sdk_name: ${{ github.event.inputs.sdk_name || 'python' }}
38+
sdk_name: ${{ matrix.sdk_name }}
3339

3440
steps:
3541
- name: Checkout
@@ -53,11 +59,35 @@ jobs:
5359
echo "S3_LANGUAGE=rust" >> $GITHUB_ENV
5460
elif [ "$sdk_name" == "gov2" ]; then
5561
echo "S3_LANGUAGE=go" >> $GITHUB_ENV
62+
elif [ "$sdk_name" == "steering_docs" ]; then
63+
echo "S3_LANGUAGE=steering-docs" >> $GITHUB_ENV
64+
elif [ "$sdk_name" == "coding-standards" ]; then
65+
echo "S3_LANGUAGE=coding-standards" >> $GITHUB_ENV
66+
elif [ "$sdk_name" == "specs" ]; then
67+
echo "S3_LANGUAGE=final-specs" >> $GITHUB_ENV
5668
else
5769
echo "S3_LANGUAGE=$sdk_name" >> $GITHUB_ENV
5870
fi
59-
71+
72+
- name: Filter SPECIFICATION.md files for specs
73+
if: ${{ env.sdk_name == 'specs' }}
74+
run: |
75+
find ./scenarios -name "SPECIFICATION.md" | while read file; do
76+
mkdir -p "./filtered_specs/$(dirname "$file")"
77+
cp "$file" "./filtered_specs/$file"
78+
done
79+
80+
- name: Clone and filter for coding standards
81+
if: ${{ env.sdk_name == 'coding-standards' }}
82+
run: |
83+
git clone https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.wiki.git wiki-repo
84+
mkdir -p ./filtered-wiki
85+
find ./wiki-repo -type f -name "*[Gg]uidelines*.md" -o -name "*[Ss]tandards*.md" | while read file; do
86+
cp "$file" ./filtered-wiki/
87+
done
88+
6089
- name: Extract and copy premium examples in temp. dir.
90+
if: ${{ contains(fromJSON('["javascriptv3","dotnetv4","javav2","rustv1","gov2","swift","python","ruby","php","cpp","kotlin"]'), env.sdk_name) }}
6191
run: |
6292
MARKDOWN_FILE="./$sdk_name/premium-ex.md"
6393
@@ -99,14 +129,26 @@ jobs:
99129
fi
100130
done
101131
102-
- name: Upload/Sync to S3
132+
- name: Upload/Sync to S3 (SDK languages)
133+
if: ${{ contains(fromJSON('["javascriptv3","dotnetv4","javav2","rustv1","gov2","swift","python","ruby","php","cpp","kotlin"]'), env.sdk_name) }}
103134
run: |
104135
for level in "basics" "feature-scenario" "complex-feature-scenario"; do
105136
if [ -d "./extracted_snippets/$level" ]; then
106137
aws s3 sync "./extracted_snippets/$level/" "s3://$S3_LANGUAGE-premium-bucket/$level/" --delete
107138
echo "Uploaded $level examples to S3"
108139
fi
109140
done
141+
142+
- name: Upload/Sync to S3 (Other directories)
143+
if: ${{ contains(fromJSON('["steering_docs","coding-standards","specs"]'), env.sdk_name) }}
144+
run: |
145+
if [ "$sdk_name" == "steering_docs" ]; then
146+
aws s3 sync "./$sdk_name/" "s3://$S3_LANGUAGE-bucket/" --delete
147+
elif [ "$sdk_name" == "coding-standards" ]; then
148+
aws s3 sync "./filtered-wiki/" "s3://$S3_LANGUAGE-bucket/" --delete
149+
else
150+
aws s3 sync "./filtered_specs/" "s3://$S3_LANGUAGE-bucket/" --delete
151+
fi
110152
111153
- name: Sync Knowledge Base Data Source
112154
run: |

.tools/lambda/KB_Updater.py

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
import boto3
5+
import json
6+
import datetime
7+
import time
8+
9+
class DateTimeEncoder(json.JSONEncoder):
    """JSON encoder that serializes datetime objects as ISO-8601 strings."""

    def default(self, obj):
        # Only datetimes get special treatment; everything else is deferred
        # to the base class (which raises TypeError for unknown types).
        if not isinstance(obj, datetime.datetime):
            return super().default(obj)
        return obj.isoformat()
14+
15+
def get_knowledge_base_id(knowledge_base_name, region_name, bedrock_agent):
    """Return the ID of the knowledge base whose name matches exactly.

    Args:
        knowledge_base_name: Exact KB name to look up.
        region_name: Unused here; the client is already region-bound.
        bedrock_agent: boto3 bedrock-agent client.

    Raises:
        ValueError: If no knowledge base with that name exists.
    """
    summaries = bedrock_agent.list_knowledge_bases()['knowledgeBaseSummaries']
    for summary in summaries:
        if summary['name'] != knowledge_base_name:
            continue
        return summary['knowledgeBaseId']
    raise ValueError(f"Knowledge base '{knowledge_base_name}' not found")
21+
22+
def get_or_create_data_source(knowledge_base_id, language, region_name, bedrock_agent):
    """Find the data source for *language* in the knowledge base, creating it if absent.

    Args:
        knowledge_base_id: ID of the knowledge base to search/modify.
        language: S3_LANGUAGE value (e.g. "python", "steering-docs").
        region_name: Unused here; the client is already region-bound.
        bedrock_agent: boto3 bedrock-agent client.

    Returns:
        (data_source_id, data_source_name, created) where *created* is True
        only when a new data source had to be created.
    """
    # List existing data sources and reuse one whose name contains the
    # language (substring match, so "python" finds "python-premium-data-source").
    # The "default" data source is never reused.
    response = bedrock_agent.list_data_sources(knowledgeBaseId=knowledge_base_id)
    for ds in response['dataSourceSummaries']:
        if language in ds['name'] and ds['name'] != "default":
            return ds['dataSourceId'], ds['name'], False  # Found existing

    # Non-SDK sources sync from plain "<language>-bucket"; SDK languages use
    # the "-premium-" buckets. Fix: "coding-standards" belongs in the
    # non-premium branch — lambda_handler already names its KB
    # "coding-standards-KB" and the sync workflow uploads to
    # "s3://coding-standards-bucket", so the premium bucket never exists.
    if language in ["steering-docs", "final-specs", "coding-standards"]:
        ds_name = f"{language}-data-source"
        bucket_name = f"{language}-bucket"
    else:
        ds_name = f"{language}-premium-data-source"
        bucket_name = f"{language}-premium-bucket"

    # Create new data source if none found.
    response = bedrock_agent.create_data_source(
        knowledgeBaseId=knowledge_base_id,
        name=ds_name,
        dataSourceConfiguration={
            "type": "S3",
            "s3Configuration": {
                "bucketArn": f"arn:aws:s3:::{bucket_name}"
            }
        },
        vectorIngestionConfiguration={
            "chunkingConfiguration": {
                # Hierarchical chunking: 1500-token parents, 300-token
                # children, 75-token overlap between child chunks.
                "chunkingStrategy": "HIERARCHICAL",
                "hierarchicalChunkingConfiguration": {
                    "levelConfigurations": [
                        {"maxTokens": 1500},
                        {"maxTokens": 300}
                    ],
                    "overlapTokens": 75
                }
            }
        }
    )
    return response['dataSource']['dataSourceId'], response['dataSource']['name'], True  # Created new
65+
66+
def sync_data_source(knowledge_base_id, data_source_id, region_name, bedrock_agent):
    """Start an ingestion job for the data source and return the raw response.

    Args:
        knowledge_base_id: ID of the knowledge base owning the data source.
        data_source_id: ID of the data source to ingest.
        region_name: Unused here; the client is already region-bound.
        bedrock_agent: boto3 bedrock-agent client.
    """
    return bedrock_agent.start_ingestion_job(
        knowledgeBaseId=knowledge_base_id,
        dataSourceId=data_source_id,
    )
72+
73+
def monitor_ingestion_job(knowledge_base_id, data_source_id, ingestion_job_id, region_name, bedrock_agent):
    """Poll an ingestion job every 5 s until it reaches a terminal state.

    Args:
        knowledge_base_id: ID of the knowledge base.
        data_source_id: ID of the data source being ingested.
        ingestion_job_id: ID of the job to monitor.
        region_name: Unused here; the client is already region-bound.
        bedrock_agent: boto3 bedrock-agent client.

    Returns:
        The final get_ingestion_job response on COMPLETE/FAILED/STOPPED, or a
        {"status": "TIMEOUT", ...} dict when the polling budget is exhausted.
    """
    max_attempts = 100
    poll_seconds = 5

    for _ in range(max_attempts):
        job_status = bedrock_agent.get_ingestion_job(
            knowledgeBaseId=knowledge_base_id,
            dataSourceId=data_source_id,
            ingestionJobId=ingestion_job_id
        )

        status = job_status['ingestionJob']['status']
        print(f"Current status: {status} - {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

        if status in ['COMPLETE', 'FAILED', 'STOPPED']:
            return job_status

        time.sleep(poll_seconds)

    # Fix: report the actual budget — the old message hard-coded "5 minutes",
    # but 100 polls x 5 s is 500 seconds.
    return {"status": "TIMEOUT",
            "message": f"Job monitoring timed out after {max_attempts * poll_seconds} seconds"}
94+
95+
def lambda_handler(event, context):
    """Lambda entry point: ensure a data source exists for the requested
    language, start an ingestion job, wait for it, and report the outcome.

    Args:
        event: Optional keys "language" (default "python") and
            "region_name" (default "us-west-2").
        context: Lambda context (unused).

    Returns:
        {"statusCode": 200, "body": <JSON string>} where the body carries
        data-source info, ingestion-job status, and ingestion statistics.
    """
    language = event.get('language', 'python')
    region_name = event.get('region_name', 'us-west-2')

    # Non-SDK knowledge bases drop the "-premium" infix in their names.
    if language in ["steering-docs", "final-specs","coding-standards"]:
        knowledge_base_name = f"{language}-KB"
    else:
        knowledge_base_name = f"{language}-premium-KB"

    bedrock_agent = boto3.client('bedrock-agent', region_name=region_name)
    knowledge_base_id = get_knowledge_base_id(knowledge_base_name, region_name, bedrock_agent)

    # Get or create data source.
    data_source_id, data_source_name, is_new = get_or_create_data_source(
        knowledge_base_id, language, region_name, bedrock_agent
    )

    results = {
        "data_source": {"id": data_source_id, "name": data_source_name, "is_new": is_new},
        "ingestion_job": {},
        "statistics": None,
    }

    # Sync the data source.
    print(f"Syncing data source {data_source_name}...")
    sync_result = sync_data_source(knowledge_base_id, data_source_id, region_name, bedrock_agent)
    ingestion_job_id = sync_result['ingestionJob']['ingestionJobId']
    results["ingestion_job"] = {"id": ingestion_job_id, "status": "STARTED"}

    # Monitor the ingestion job until terminal state or timeout.
    final_status = monitor_ingestion_job(
        knowledge_base_id, data_source_id, ingestion_job_id, region_name, bedrock_agent
    )
    job_info = final_status.get('ingestionJob', {})
    results["ingestion_job"]["status"] = job_info.get('status', 'UNKNOWN')

    # Surface ingestion statistics when the service returned them.
    if 'statistics' in job_info:
        stats = job_info['statistics']
        results["statistics"] = {
            "documents_processed": stats.get('numberOfDocumentsScanned', 0),
            "documents_failed": stats.get('numberOfDocumentsFailed', 0),
            "documents_indexed": stats.get('numberOfNewDocumentsIndexed', 0),
            "documents_modified_indexed": stats.get('numberOfModifiedDocumentsIndexed', 0),
        }

    return {
        'statusCode': 200,
        'body': json.dumps(results, cls=DateTimeEncoder),
    }
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

0 commit comments

Comments
 (0)