DevOps & Cloud — Documented
terraform-architect
Terraform and Terragrunt IaC architect. Module design, remote state, multi-environment configs, CI/CD for infrastructure, provider version management, and production patterns for AWS, GCP, and Azure.
Share:
Installation
npx clawhub@latest install terraform-architect
View the full skill documentation and source below.
Documentation
Terraform Infrastructure as Code Architect
Terraform Mental Model
terraform init → download providers, set up backend
terraform plan → show what WILL change (never destructive)
terraform apply → make it so
terraform destroy → nuclear option
State is everything. State = Terraform's view of what it created. Protect it.
Project Structure
Single Environment (Simple)
├── main.tf # Resources
├── variables.tf # Input declarations
├── outputs.tf # Output declarations
├── versions.tf # Provider + terraform version constraints
├── terraform.tfvars # Variable values (git-ignored for secrets)
└── terraform.tfvars.example
Multi-Environment (Production Pattern)
├── modules/
│ ├── vpc/
│ │ ├── main.tf
│ │ ├── variables.tf
│ │ └── outputs.tf
│ ├── eks/
│ └── rds/
├── environments/
│ ├── dev/
│ │ ├── main.tf
│ │ ├── variables.tf
│ │ └── backend.tf
│ ├── staging/
│ └── prod/
└── .terraform.lock.hcl # Lock file — ALWAYS commit this
Terragrunt Structure (Best for large orgs)
├── terragrunt.hcl # Root config — remote state, common vars
├── modules/ # Reusable modules
│ ├── vpc/
│ └── eks/
└── live/
├── dev/
│ ├── vpc/
│ │ └── terragrunt.hcl # Thin config, DRY
│ └── eks/
│ └── terragrunt.hcl
├── staging/
└── prod/
versions.tf — Always Pin Versions
terraform {
  # Constrain the Terraform CLI itself: any 1.x release from 1.6 onward,
  # but never a future 2.x that could introduce breaking changes.
  required_version = ">= 1.6.0, < 2.0.0"

  required_providers {
    # "~>" pins the major version: minor/patch upgrades flow automatically,
    # while a major bump must be made explicitly after review.
    aws = {
      source  = "hashicorp/aws"
      version = "~> 5.0"
    }
    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "~> 2.23"
    }
    random = {
      source  = "hashicorp/random"
      version = "~> 3.5"
    }
  }
}
Remote State (Required for Teams)
# backend.tf — S3 remote state + DynamoDB locking for AWS.
# NOTE: backend blocks cannot use variables or interpolation — every value
# must be a literal (use partial configuration / -backend-config for per-env values).
terraform {
backend "s3" {
bucket = "mycompany-terraform-state"
key = "production/myapp/terraform.tfstate"
region = "us-east-1"
encrypt = true # SSE-S3
kms_key_id = "arn:aws:kms:..." # Optional KMS
dynamodb_table = "terraform-state-lock" # Prevents concurrent applies
# Workspaces need no key change: the S3 backend automatically stores
# non-default workspaces under "env:/<workspace>/<key>".
}
}
# Create the state bucket and lock table ONCE, manually or with a bootstrap module.
resource "aws_s3_bucket" "terraform_state" {
  bucket = "mycompany-terraform-state"

  lifecycle {
    prevent_destroy = true # Never accidentally delete the state bucket
  }
}

# Versioning lets you roll back to an earlier state file after corruption
# or a bad apply — essential for state recovery.
resource "aws_s3_bucket_versioning" "state" {
  bucket = aws_s3_bucket.terraform_state.id
  versioning_configuration {
    status = "Enabled"
  }
}

# State files contain secrets in plaintext — always encrypt at rest.
resource "aws_s3_bucket_server_side_encryption_configuration" "state" {
  bucket = aws_s3_bucket.terraform_state.id
  rule {
    apply_server_side_encryption_by_default {
      sse_algorithm = "aws:kms"
    }
  }
}

# A state bucket must never be publicly reachable.
resource "aws_s3_bucket_public_access_block" "state" {
  bucket                  = aws_s3_bucket.terraform_state.id
  block_public_acls       = true
  block_public_policy     = true
  ignore_public_acls      = true
  restrict_public_buckets = true
}

# Lock table: Terraform writes a "LockID" item while an operation is in
# flight, preventing concurrent state mutation by other engineers/CI runs.
resource "aws_dynamodb_table" "terraform_lock" {
  name         = "terraform-state-lock"
  billing_mode = "PAY_PER_REQUEST" # No capacity planning for rare, tiny writes
  hash_key     = "LockID"

  attribute {
    name = "LockID"
    type = "S"
  }
}
Writing Good Modules
Module Structure
# modules/rds/variables.tf
# Module inputs: variables without a default are required from the caller.
variable "identifier" {
description = "Unique identifier for this RDS instance"
type = string
}
variable "instance_class" {
description = "RDS instance type"
type = string
default = "db.t3.micro"
# Custom validation runs at plan time — fail fast on obviously bad input.
validation {
condition = can(regex("^db\\.", var.instance_class))
error_message = "instance_class must start with 'db.'"
}
}
variable "tags" {
description = "Tags to apply to all resources"
type = map(string)
default = {} # Optional — callers may omit entirely
}
# modules/rds/main.tf
# Production-grade PostgreSQL instance: encrypted storage, environment-aware
# backup/deletion policy, enhanced monitoring and Performance Insights.
resource "aws_db_instance" "main" {
identifier = var.identifier
engine = "postgres"
engine_version = "16.1"
instance_class = var.instance_class
allocated_storage = var.storage_gb
max_allocated_storage = var.storage_gb * 4 # Auto-scale up
storage_type = "gp3"
storage_encrypted = true # Always encrypt
kms_key_id = var.kms_key_arn
db_name = var.database_name
username = var.master_username
password = random_password.master.result # NOTE: also lands in Terraform state — protect the state backend
vpc_security_group_ids = [aws_security_group.rds.id]
db_subnet_group_name = aws_db_subnet_group.main.name
# Prod keeps 30 days of backups and blocks deletion; other envs stay cheap.
backup_retention_period = var.environment == "prod" ? 30 : 7
backup_window = "03:00-04:00"
maintenance_window = "sun:04:00-sun:05:00"
deletion_protection = var.environment == "prod"
skip_final_snapshot = var.environment != "prod"
final_snapshot_identifier = var.environment == "prod" ? "${var.identifier}-final" : null
# Enable enhanced monitoring (OS-level metrics at 60s granularity)
monitoring_interval = 60
monitoring_role_arn = aws_iam_role.rds_monitoring.arn
# Enable Performance Insights
performance_insights_enabled = true
performance_insights_retention_period = 7 # days
tags = merge(var.tags, {
Name = var.identifier
Environment = var.environment
Terraform = "true"
})
}
# Random password — never hardcode
resource "random_password" "master" {
length = 32
special = true
override_special = "!#$%&*()-_=+[]{}<>:?" # Restrict specials to a set RDS accepts
}
# Store credentials in Secrets Manager so applications never read Terraform state.
resource "aws_secretsmanager_secret" "rds_password" {
name = "/${var.environment}/${var.identifier}/db-password"
}
resource "aws_secretsmanager_secret_version" "rds_password" {
secret_id = aws_secretsmanager_secret.rds_password.id
secret_string = jsonencode({
username = var.master_username
password = random_password.master.result
host = aws_db_instance.main.address
port = aws_db_instance.main.port
dbname = var.database_name
})
}
# modules/rds/outputs.tf
# Module outputs — every output carries a description for `terraform-docs`
# and for readers of `terraform output`.
output "endpoint" {
  description = "RDS connection endpoint"
  value       = aws_db_instance.main.address
}

output "port" {
  description = "Port the database instance listens on"
  value       = aws_db_instance.main.port
}

output "secret_arn" {
  description = "ARN of the Secrets Manager secret containing credentials"
  value       = aws_secretsmanager_secret.rds_password.arn
}
Variables and Locals
# variables.tf
variable "environment" {
description = "Deployment environment"
type = string
# Whitelist the allowed environments — typos fail at plan time, not apply.
validation {
condition = contains(["dev", "staging", "prod"], var.environment)
error_message = "environment must be dev, staging, or prod"
}
}
variable "region" {
type = string
default = "us-east-1"
}
# locals.tf — computed values and DRY helpers (locals are private to the module)
locals {
# Common tags applied to ALL resources
common_tags = {
Project = "myapp"
Environment = var.environment
ManagedBy = "terraform"
Owner = "platform-team"
CostCenter = "engineering"
}
# Environment-specific sizes — look up with local.instance_sizes[var.environment]
instance_sizes = {
dev = "t3.micro"
staging = "t3.small"
prod = "m5.large"
}
is_prod = var.environment == "prod"
# Naming convention shared by every resource in this configuration
name_prefix = "myapp-${var.environment}"
}
Data Sources and Imports
# Reference existing resources without managing them (read-only lookups)
data "aws_vpc" "main" {
filter {
name = "tag:Name"
values = ["${var.environment}-vpc"]
}
}
data "aws_subnets" "private" {
filter {
name = "vpc-id"
values = [data.aws_vpc.main.id]
}
tags = {
Tier = "private"
}
}
# Identity/region of the current credentials — handy for ARNs and policies
data "aws_caller_identity" "current" {}
data "aws_region" "current" {}
# Use in resources
resource "aws_security_group" "app" {
vpc_id = data.aws_vpc.main.id
# ...
}
# Import existing resource into state (Terraform 1.5+).
# NOTE: `import` is a TOP-LEVEL block in a .tf file, not nested inside
# the terraform {} block; run plan/apply once to complete the import.
import {
to = aws_s3_bucket.existing
id = "my-existing-bucket-name"
}
For_Each and Dynamic Blocks
# Create multiple resources from a map — map keys give each instance a
# stable address, so adding/removing one entry never churns the others.
variable "services" {
type = map(object({
port = number
image = string
}))
default = {
api = { port = 8080, image = "api:1.0" }
web = { port = 3000, image = "web:1.0" }
}
}
resource "aws_ecs_service" "services" {
for_each = var.services
name = "${local.name_prefix}-${each.key}"
# each.key = "api" or "web"
# each.value.port = 8080 or 3000
}
# Dynamic blocks — avoid repetition when a nested block recurs per element
resource "aws_security_group" "app" {
name = "${local.name_prefix}-app"
vpc_id = data.aws_vpc.main.id
# One ingress block is generated per element of var.allowed_ports;
# inside `content`, the iterator is named after the block ("ingress").
dynamic "ingress" {
for_each = var.allowed_ports
content {
from_port = ingress.value
to_port = ingress.value
protocol = "tcp"
cidr_blocks = var.allowed_cidrs
}
}
}
Lifecycle and Dependencies
resource "aws_instance" "app" {
  ami           = data.aws_ami.app.id
  instance_type = "t3.micro"

  lifecycle {
    # Create replacement before destroying old (zero-downtime)
    create_before_destroy = true

    # Prevent accidental destroy. NOTE: lifecycle meta-arguments only accept
    # literal values — `prevent_destroy = local.is_prod` is a Terraform error,
    # so set this per environment directory rather than via a local/variable.
    prevent_destroy = true

    # Ignore changes to specific fields (e.g., auto-updated by AWS)
    ignore_changes = [ami, tags["LastDeployment"]]

    # Force replacement when the launch template publishes a new version
    replace_triggered_by = [aws_launch_template.app.latest_version]
  }

  # Explicit dependency when Terraform can't infer it from references
  depends_on = [aws_iam_role_policy_attachment.app]
}
CI/CD for Infrastructure
GitHub Actions Workflow
# .github/workflows/terraform.yml
# Plan on every PR (with the diff posted as a comment); apply only on
# pushes to main, using the exact plan file produced in the same run.
name: Terraform

on:
  push:
    branches: [main]
    paths: ['infrastructure/**']
  pull_request:
    paths: ['infrastructure/**']

jobs:
  terraform:
    runs-on: ubuntu-latest
    defaults:
      run:
        working-directory: infrastructure/environments/prod
    permissions:
      id-token: write      # For OIDC auth to AWS
      contents: read
      pull-requests: write # To post the plan as a PR comment
    steps:
      - uses: actions/checkout@v4

      - name: Configure AWS credentials (OIDC - no long-lived keys)
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::123456789:role/github-actions-terraform
          aws-region: us-east-1

      - uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: "~> 1.6"

      - run: terraform init
      - run: terraform validate
      - run: terraform fmt -check -recursive

      # tfsec security scanning — pin to a release tag, never a moving ref
      - uses: aquasecurity/tfsec-action@v1.0.3

      # checkov policy-as-code
      - uses: bridgecrewio/checkov-action@master
        with:
          directory: .
          soft_fail: false

      - name: Terraform Plan
        id: plan
        run: terraform plan -out=tfplan -no-color

      # Post plan diff as PR comment (requires setup-terraform's wrapper,
      # which exposes stdout as a step output — enabled by default)
      - uses: actions/github-script@v7
        if: github.event_name == 'pull_request'
        with:
          script: |
            const output = `#### Terraform Plan \`${{ steps.plan.outcome }}\`
            \`\`\`
            ${{ steps.plan.outputs.stdout }}
            \`\`\``;
            github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: output
            });

      # Only apply on main branch push — and only the reviewed plan file
      - name: Terraform Apply
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        run: terraform apply tfplan
Common Patterns and Anti-Patterns
✅ DO
# Use remote state — never commit tfstate
# Use modules for reusable infrastructure
# Use locals for computed values
# Use count/for_each for multiple similar resources
# Tag every resource with environment, project, owner
# Use random_password, never hardcode credentials
# Enable deletion_protection on production databases
# Use data sources to reference existing infrastructure
# Lock provider versions with .terraform.lock.hcl
❌ DON'T
# Don't commit .terraform/ directory
# Don't commit terraform.tfstate or terraform.tfstate.backup
# Don't hardcode credentials or secrets
# Don't use count when you need stable identity (use for_each with maps)
# Don't run unattended applies without review — use -auto-approve in CI only behind approval gates
# Don't ignore the plan output
# Don't use latest provider versions in production
Terragrunt DRY Pattern
# terragrunt.hcl (root)
# Shared by every module under live/ via include — one place for remote
# state config and common inputs.
locals {
# Walk up the directory tree to find per-account and per-env settings
account_vars = read_terragrunt_config(find_in_parent_folders("account.hcl"))
env_vars = read_terragrunt_config(find_in_parent_folders("env.hcl"))
account_id = local.account_vars.locals.aws_account_id
environment = local.env_vars.locals.environment
}
# Remote state config — DRY across all modules
remote_state {
backend = "s3"
config = {
bucket = "mycompany-terraform-${local.account_id}"
# Each module gets a distinct state key derived from its directory path
key = "${path_relative_to_include()}/terraform.tfstate"
region = "us-east-1"
encrypt = true
dynamodb_table = "terraform-state-lock"
}
# Terragrunt writes backend.tf into each module dir automatically
generate = {
path = "backend.tf"
if_exists = "overwrite_terragrunt"
}
}
# Common inputs for all modules (merged with each module's own inputs)
inputs = {
environment = local.environment
tags = {
ManagedBy = "terragrunt"
Environment = local.environment
}
}
# live/prod/rds/terragrunt.hcl
# Thin per-environment config: inherit state/inputs from the root,
# point at the shared module, and wire in outputs from the vpc stack.
include "root" {
  path = find_in_parent_folders()
}

terraform {
  source = "../../../modules/rds"
}

dependency "vpc" {
  config_path = "../vpc"

  # Placeholder outputs let `validate`/`plan` succeed before the vpc stack
  # has ever been applied; `apply` always uses the real outputs.
  mock_outputs = {
    vpc_id             = "vpc-00000000"
    private_subnet_ids = ["subnet-00000000"]
  }
  mock_outputs_allowed_terraform_commands = ["validate", "plan"]
}

inputs = {
  vpc_id         = dependency.vpc.outputs.vpc_id
  subnet_ids     = dependency.vpc.outputs.private_subnet_ids
  identifier     = "myapp-prod"
  instance_class = "db.m5.large"
  storage_gb     = 100
}