Skip to content

Guardrails Example

Add safety controls to protect your AI agents.

Custom Validator

Create a validator to filter unsafe input:

from akordi_agents.core.interfaces import ValidatorInterface
from akordi_agents.models.validation_models import ValidationResult, ValidationError
from typing import Dict, Any

class SafetyValidator(ValidatorInterface):
    """Validate user input for safety before it reaches the LLM."""

    def __init__(self):
        # Case-insensitive substring blocklist applied to the query text.
        self.prohibited_patterns = [
            "hack", "exploit", "bypass", "illegal",
            "harmful", "weapon", "violence"
        ]
        self.max_query_length = 5000

    def validate(self, data: Dict[str, Any]) -> ValidationResult:
        """Check the "query" field for length, emptiness, and blocked terms."""
        query = data.get("query", "")
        found = []

        # Length check
        if len(query) > self.max_query_length:
            found.append(ValidationError(
                field="query",
                message=f"Query exceeds maximum length of {self.max_query_length} characters"
            ))

        # Whitespace-only or empty queries are rejected outright.
        if not query.strip():
            found.append(ValidationError(
                field="query",
                message="Query cannot be empty"
            ))

        # At most one prohibited-content error is reported, regardless of
        # how many patterns match.
        lowered = query.lower()
        if any(pattern in lowered for pattern in self.prohibited_patterns):
            found.append(ValidationError(
                field="query",
                message="Query contains prohibited content"
            ))

        return ValidationResult(
            is_valid=not found,
            errors=found
        )

    def get_validator_name(self) -> str:
        return "safety_validator"

Using the Validator

from akordi_agents.core import create_langgraph_agent
from akordi_agents.services import AWSBedrockService

# Create validator
validator = SafetyValidator()

# Create agent with guardrails; per the blocked-query handling below,
# "enable_validation" runs the validator on each request before the LLM call.
agent = create_langgraph_agent(
    name="safe_agent",
    llm_service=AWSBedrockService(),
    validator=validator,
    config={
        "enable_validation": True,
        "temperature": 0.1,
    }
)

# Safe query — passes validation, so the LLM response is available.
response = agent.process_request({
    "query": "How do I bake a chocolate cake?",
})
print("Response:", response["llm_response"]["response"])

# Unsafe query — contains the blocked term "hack", so validation rejects it.
response = agent.process_request({
    "query": "How do I hack into a system?",
})
if not response.get("success"):
    print("Blocked:", response.get("validation_errors"))

AWS Bedrock Guardrails

Use AWS Bedrock's built-in guardrails:

Create a Guardrail

import boto3

# Management client for guardrail CRUD operations.
client = boto3.client("bedrock", region_name="us-east-1")

response = client.create_guardrail(
    name="production-guardrail",
    description="Guardrail for production AI agents",

    # Content filtering: strength applies separately to user input and
    # model output for each harm category.
    contentPolicyConfig={
        "filtersConfig": [
            {"type": "HATE", "inputStrength": "HIGH", "outputStrength": "HIGH"},
            {"type": "VIOLENCE", "inputStrength": "HIGH", "outputStrength": "HIGH"},
            {"type": "SEXUAL", "inputStrength": "HIGH", "outputStrength": "HIGH"},
            {"type": "INSULTS", "inputStrength": "MEDIUM", "outputStrength": "MEDIUM"},
        ]
    },

    # Topic blocking: denies queries matching the topic definition/examples.
    topicPolicyConfig={
        "topicsConfig": [
            {
                "name": "illegal_activities",
                "definition": "Topics related to illegal activities or hacking",
                "examples": [
                    "How to hack",
                    "How to steal",
                    "How to make illegal substances"
                ],
                "type": "DENY"
            }
        ]
    },

    # Word filters: explicit word list plus the managed profanity list.
    wordPolicyConfig={
        "wordsConfig": [
            {"text": "profanity1"},
            {"text": "profanity2"},
        ],
        "managedWordListsConfig": [
            {"type": "PROFANITY"}
        ]
    },

    # PII handling: ANONYMIZE masks the value; BLOCK rejects the content.
    sensitiveInformationPolicyConfig={
        "piiEntitiesConfig": [
            {"type": "EMAIL", "action": "ANONYMIZE"},
            {"type": "PHONE", "action": "ANONYMIZE"},
            {"type": "SSN", "action": "BLOCK"},
            {"type": "CREDIT_DEBIT_CARD_NUMBER", "action": "BLOCK"},
        ]
    },
)

# Keep the returned ID — it is needed to create versions and to attach
# the guardrail at inference time.
guardrail_id = response["guardrailId"]
print(f"Created guardrail: {guardrail_id}")

Create Guardrail Version

# Create a numbered version of the guardrail; inference calls reference
# a specific version (see the guardrailVersion parameter used later).
version_response = client.create_guardrail_version(
    guardrailIdentifier=guardrail_id,
    description="Production version 1"
)

version = version_response["version"]
print(f"Created version: {version}")

Using the CLI

# Create default guardrail
poetry run python examples/create_guardrail.py --create-default

# Create a version
poetry run python examples/create_guardrail.py \
  --create-version \
  --guardrail-id your-guardrail-id

# List guardrails
poetry run python examples/create_guardrail.py --list

Bedrock Guardrail Integration

Use the SDK's built-in BedrockGuardrail class:

from akordi_agents.guard_kit.bedrock.bedrock import BedrockGuardrail

# Create a guardrail instance scoped to the given AWS region
guardrail = BedrockGuardrail(region_name="us-east-1")

# Create a default guardrail with content filters; the prefix namespaces
# the guardrail name per application
response = guardrail.create_default_guardrail(
    name_prefix="my_app"
)
guardrail_id = response["guardrailId"]

# Create a version for production use
version_response = guardrail.create_guardrail_version(
    guardrail_id=guardrail_id,
    guardrail_version_description="Production version"
)
version = version_response["version"]

# Both values are needed when attaching the guardrail to model calls
print(f"Guardrail ID: {guardrail_id}")
print(f"Version: {version}")

Custom LLM Service with Guardrails

From examples/agent_with_guardrails.py:

from akordi_agents.core import LLMServiceInterface, create_langgraph_agent
from akordi_agents.guard_kit.bedrock.bedrock import BedrockGuardrail
from akordi_agents.services.llm_service import AWSBedrockService
from akordi_agents.models.llm_models import ClaudeConfig


class GuardrailEnabledLLMService(LLMServiceInterface):
    """LLM service with integrated Bedrock guardrails."""

    def __init__(self, model_id: str, guardrail_id: str, guardrail_version: str = "1"):
        # Delegate actual model invocation to the stock Bedrock service.
        self.llm_service = AWSBedrockService(model_id)
        self.guardrail_id = guardrail_id
        self.guardrail_version = guardrail_version

    def generate_response(self, prompt: str, context: str = None, **kwargs):
        """Invoke the model with the guardrail identifiers attached.

        Returns a dict carrying the model response on success, or an
        error payload (never raises) when invocation fails.
        """
        try:
            model_config = ClaudeConfig(
                max_tokens=kwargs.get("max_tokens", 2000),
                model_id=self.llm_service.model_id,
                temperature=kwargs.get("temperature", 0.1),
            )

            # Guardrail identifier + version are forwarded so Bedrock
            # applies the guardrail on this invocation.
            result = self.llm_service.invoke_model(
                prompt=prompt,
                config=model_config,
                system_message=kwargs.get("system_message"),
                guardrailIdentifier=self.guardrail_id,
                guardrailVersion=self.guardrail_version,
            )

            return {
                "response": result.get("response", ""),
                "model_info": {"model_id": self.llm_service.model_id},
                "guardrail_status": "applied",
            }
        except Exception as e:
            # Surface the failure to the caller as data instead of raising.
            return {
                "response": f"Error: {str(e)}",
                "error": str(e),
            }

    def get_service_name(self) -> str:
        return "guardrail_enabled_llm"


# Usage: wire the guardrail-aware service into an agent.
guardrail_llm = GuardrailEnabledLLMService(
    model_id="anthropic.claude-3-sonnet-20240229-v1:0",
    guardrail_id="your-guardrail-id",  # replace with the ID from create_guardrail
    guardrail_version="1"
)

agent = create_langgraph_agent(
    name="safe_agent",
    llm_service=guardrail_llm,
)

Combining Multiple Validators

Layer multiple validation approaches:

class CompositeValidator(ValidatorInterface):
    """Run several validators in order, short-circuiting on the first failure."""

    def __init__(self, validators: list):
        self.validators = validators

    def validate(self, data: Dict[str, Any]) -> ValidationResult:
        """Aggregate errors from each validator until one reports invalid."""
        collected = []

        for child in self.validators:
            outcome = child.validate(data)
            collected.extend(outcome.errors)

            # Stop on first blocking error — later validators never run.
            if not outcome.is_valid:
                break

        return ValidationResult(
            is_valid=not collected,
            errors=collected
        )

    def get_validator_name(self) -> str:
        return "composite_validator"


# Use multiple validators — cheap checks first, the Bedrock-backed one last.
# NOTE: LengthValidator and BedrockValidator are illustrative; define them
# (or substitute your own) before running this snippet.
validator = CompositeValidator([
    LengthValidator(max_length=5000),
    SafetyValidator(),
    BedrockValidator(guardrail_id="your-id"),
])

agent = create_langgraph_agent(
    name="multi_validated_agent",
    llm_service=llm_service,
    validator=validator,
    config={"enable_validation": True}
)

Output Validation

Validate LLM responses before returning:

class OutputValidator:
    """Screen LLM output through a Bedrock guardrail before it is returned."""

    def __init__(self, guardrail: BedrockGuardrail):
        self.guardrail = guardrail

    def validate_output(self, response: str) -> tuple[bool, str]:
        """Return (True, response) when the text is safe, else (False, reason)."""
        verdict = self.guardrail.validate_output(response)

        if verdict.blocked:
            return False, "Response contained inappropriate content"

        return True, response


# Use in custom processing
class SafeAgent:
    """Wrap an agent so successful LLM responses are validated before return."""

    def __init__(self, agent, output_validator):
        self.agent = agent
        self.output_validator = output_validator

    def process_request(self, request_data):
        """Run the wrapped agent, then screen any successful response text."""
        result = self.agent.process_request(request_data)

        # Failed requests pass through untouched.
        if not result.get("success"):
            return result

        text = result["llm_response"]["response"]
        is_safe, screened = self.output_validator.validate_output(text)

        if not is_safe:
            return {
                "success": False,
                "error": "Response validation failed"
            }

        # Replace the raw text with the (possibly sanitized) validated text.
        result["llm_response"]["response"] = screened
        return result

Complete Example

#!/usr/bin/env python
"""Guardrails example with AWS Bedrock."""

import os
from akordi_agents.core import create_langgraph_agent
from akordi_agents.services import AWSBedrockService
from akordi_agents.core.interfaces import ValidatorInterface
from akordi_agents.models.validation_models import ValidationResult, ValidationError


class SafetyValidator(ValidatorInterface):
    """Reject queries containing any blocked term (case-insensitive)."""

    def __init__(self):
        self.blocked_terms = ["hack", "exploit", "illegal"]

    def validate(self, data):
        """Return an invalid result if a blocked term appears in the query."""
        lowered = data.get("query", "").lower()

        if any(term in lowered for term in self.blocked_terms):
            return ValidationResult(
                is_valid=False,
                errors=[ValidationError("query", "Blocked content")]
            )

        return ValidationResult(is_valid=True, errors=[])

    def get_validator_name(self):
        return "safety_validator"


def main():
    """Demo: run a mix of safe and unsafe queries through a guarded agent."""
    # Point the AWS SDK at a region (assumes credentials are configured
    # elsewhere, e.g. via environment or ~/.aws).
    os.environ["AWS_REGION"] = "us-east-1"

    # Create agent with guardrails
    agent = create_langgraph_agent(
        name="safe_agent",
        llm_service=AWSBedrockService(),
        validator=SafetyValidator(),
        config={"enable_validation": True}
    )

    # Test queries
    test_queries = [
        "How do I make a chocolate cake?",
        "What's the weather forecast?",
        "How do I hack into a computer?",  # Should be blocked
        "Tell me about machine learning",
    ]

    for query in test_queries:
        print(f"\nQuery: {query}")
        print("-" * 50)

        response = agent.process_request({
            "query": query,
            "system_message": "You are a helpful assistant.",
        })

        if response.get("success"):
            # Truncate long model output for readable console display.
            print(f"Response: {response['llm_response']['response'][:100]}...")
        else:
            # Blocked queries surface validation errors instead of a response.
            print(f"BLOCKED: {response.get('validation_errors', response.get('error'))}")


if __name__ == "__main__":
    main()

Running Examples

# With guardrails
poetry run python examples/agent_with_guardrails.py

# Without guardrails (for comparison)
poetry run python examples/agent_without_guardrails.py

# Create guardrail
poetry run python examples/create_guardrail.py --create-default

Best Practices

  1. Always validate input in production systems
  2. Use multiple validation layers for defense in depth
  3. Log validation failures for monitoring and improvement
  4. Keep blocklists updated with new threat patterns
  5. Test guardrails thoroughly before deployment
  6. Monitor false positives to avoid blocking legitimate queries

Next Steps