DevOps & Cloud · Documented
performance-engineer
Application performance engineering. k6 load testing, database query optimization, Python/Node profiling, caching strategies, SLI/SLO definitions, and finding bottlenecks before your users do.
Share:
Installation
npx clawhub@latest install performance-engineer
View the full skill documentation and source below.
Documentation
Performance Engineering
Performance First Principles
Measure → Identify bottleneck → Fix → Verify improvement → Repeat
Golden signals (Google SRE):
Latency: How long do requests take?
Traffic: How many requests per second?
Errors: What percentage of requests fail?
Saturation: How full is your system? (CPU, memory, queue depth)
Don't optimize without measuring:
"Premature optimization is the root of all evil" — Knuth
Profile first, assume nothing.
Load Testing with k6
// load-test.js
import http from 'k6/http';
import { check, sleep } from 'k6';
import { Rate, Trend } from 'k6/metrics';

// Custom metrics shared by all VUs
const errorRate = new Rate('errors');
const checkoutTrend = new Trend('checkout_duration');

export const options = {
  // Ramp up, sustain, ramp down
  stages: [
    { duration: '2m', target: 10 },   // Ramp to 10 users
    { duration: '5m', target: 100 },  // Ramp to 100 users
    { duration: '10m', target: 100 }, // Sustain 100 users
    { duration: '2m', target: 0 },    // Ramp down
  ],
  thresholds: {
    // SLOs — the test run fails if these are breached.
    // BUG FIX: `http_req_duration` appeared twice as an object key; in a
    // JS object literal the second key silently overwrites the first, so
    // the p(95)<500 threshold was never registered. Both percentile
    // thresholds must live in ONE array under a single key.
    http_req_duration: ['p(95)<500', 'p(99)<2000'], // 95th < 500ms, 99th < 2s
    errors: ['rate<0.01'],             // Error rate < 1%
    checkout_duration: ['p(90)<1000'], // Custom: checkout < 1s
  },
};

const BASE_URL = 'https://api.example.com';
export function setup() {
// Runs once before the whole test, in its own VU — create test data.
// Whatever is returned here is passed as `data` to default() and teardown().
const res = http.post(`${BASE_URL}/test/setup`);
return { token: res.json('token') };
}
// Main VU scenario: login → browse → checkout. Runs in a loop for the
// duration of each stage; `data` is whatever setup() returned.
export default function (data) {
  // --- Authenticate --------------------------------------------------
  const loginRes = http.post(`${BASE_URL}/auth/login`, JSON.stringify({
    email: '[email protected]',
    password: 'testpass',
  }), {
    headers: { 'Content-Type': 'application/json' },
  });
  check(loginRes, {
    'login status 200': (r) => r.status === 200,
    'has token': (r) => r.json('token') !== undefined,
  });
  errorRate.add(loginRes.status !== 200);

  const token = loginRes.json('token');
  const headers = {
    'Authorization': `Bearer ${token}`,
    'Content-Type': 'application/json',
  };

  // --- Browse products ------------------------------------------------
  http.get(`${BASE_URL}/products`, { headers });
  sleep(1);

  // --- Checkout (critical path) ----------------------------------------
  const checkoutStart = Date.now();
  const checkoutRes = http.post(`${BASE_URL}/checkout`, JSON.stringify({
    items: [{ product_id: 'prod_123', quantity: 1 }],
  }), { headers });
  checkoutTrend.add(Date.now() - checkoutStart);
  // BUG FIX: the check label said "checkout 200" while the assertion
  // (correctly, for resource creation) expects 201 — label now matches.
  check(checkoutRes, { 'checkout status 201': (r) => r.status === 201 });
  // Consistency fix: feed the shared `errors` Rate from the critical path
  // too, not only from login, so the errors<1% threshold covers checkout.
  errorRate.add(checkoutRes.status !== 201);

  sleep(Math.random() * 3); // Simulate user think time
}
export function teardown(data) {
  // Runs once after the test completes — remove the fixtures setup() created.
  // BUG FIX: k6's http module names its DELETE helper `del` (not `delete`,
  // which is a reserved word); `http.delete` is undefined and the original
  // call would throw a TypeError at runtime.
  http.del(`${BASE_URL}/test/cleanup`);
}
# Run the load test locally with the stages/thresholds defined in the script
k6 run load-test.js
# Override settings via environment variables (read as __ENV.BASE_URL in the script)
k6 run -e BASE_URL=https://staging.api.com load-test.js
# Distributed execution on k6 Cloud infrastructure
k6 cloud load-test.js
# Stream results to InfluxDB so a Grafana dashboard can chart them live
k6 run --out influxdb=http://localhost:8086/k6 load-test.js
Database Query Optimization
-- Find slow queries (PostgreSQL).
-- Requires the pg_stat_statements extension (shared_preload_libraries +
-- CREATE EXTENSION pg_stat_statements). Column names below are the
-- PostgreSQL 13+ spellings (total_exec_time / mean_exec_time, in ms).
-- Filtering on calls > 100 skips one-off queries; ordering by total time
-- surfaces the queries that cost the most overall, not just the slowest one.
SELECT
query,
calls,
total_exec_time / 1000 AS total_seconds,
mean_exec_time / 1000 AS mean_seconds,
rows,
shared_blks_hit,
shared_blks_read
FROM pg_stat_statements
WHERE calls > 100
ORDER BY total_exec_time DESC
LIMIT 20;
-- Find missing indexes (tables with heavy sequential scans).
-- A table that is seq-scanned often AND reads many tuples per scan is the
-- classic candidate for an index; NULLIF guards the division when a table
-- has never been seq-scanned.
SELECT
relname AS table_name,
seq_scan,
seq_tup_read,
idx_scan,
ROUND(seq_tup_read::NUMERIC / NULLIF(seq_scan, 0)) AS avg_scan_size
FROM pg_stat_user_tables
WHERE seq_scan > 100
AND seq_tup_read > 1000000
ORDER BY seq_tup_read DESC;
-- Find unused indexes (they slow down every write while serving no reads).
-- BUG FIX: pg_stat_user_indexes exposes the table and index names as
-- relname / indexrelname — the columns `tablename` and `indexname` do not
-- exist in this view (they belong to pg_tables / pg_indexes), so the
-- original query failed with "column does not exist".
SELECT
schemaname || '.' || relname AS table_name,
indexrelname AS index_name,
idx_scan,
pg_size_pretty(pg_relation_size(indexrelid)) AS size
FROM pg_stat_user_indexes
WHERE idx_scan = 0
AND schemaname NOT IN ('pg_catalog', 'pg_toast')
-- NOTE: idx_scan = 0 also matches indexes that enforce PRIMARY KEY /
-- UNIQUE constraints — verify before dropping anything.
ORDER BY pg_relation_size(indexrelid) DESC;
-- EXPLAIN ANALYZE actually EXECUTES the query and reports real timings;
-- BUFFERS adds cache-hit vs disk-read counts per plan node.
EXPLAIN (ANALYZE, BUFFERS, FORMAT TEXT)
SELECT u.*, COUNT(o.id) AS order_count
FROM users u
LEFT JOIN orders o ON o.user_id = u.id
WHERE u.status = 'active'
GROUP BY u.id;
-- What to look for in the plan output:
-- Seq Scan on large table → needs index
-- Hash Join / Nested Loop with large estimates → add index
-- Buffers: hit=0, read=N → cold cache, IO bound
-- Actual rows >> Estimated rows → stale planner statistics; run ANALYZE table
Python Profiling
# cProfile — CPU profiling
import cProfile
import pstats
from io import StringIO
def profile_function():
    """Profile my_slow_function() with cProfile and print the 20 hottest entries.

    Results are sorted by cumulative time (time spent in a function plus
    everything it calls), which is usually the right view when hunting for
    the slow code *path* rather than one slow function.
    """
    profiler = cProfile.Profile()
    profiler.enable()
    my_slow_function()  # the code under measurement
    profiler.disable()
    report = StringIO()
    stats = pstats.Stats(profiler, stream=report)
    stats.sort_stats('cumulative').print_stats(20)  # top 20 by cumulative time
    print(report.getvalue())
# Memory profiling with memory_profiler (third-party: pip install memory-profiler)
from memory_profiler import profile
@profile
def memory_intensive():
large_list = [i for i in range(1_000_000)]
# The @profile decorator prints per-line memory usage when this function runs
return sum(large_list)
# py-spy — sampling profiler that attaches to a LIVE process (no code changes)
# pip install py-spy
# py-spy record -o profile.svg --pid 12345   # writes a flame-graph SVG
# py-spy top --pid 12345 (live view like htop)
# tracemalloc — stdlib allocation tracker; useful for finding memory leaks.
# Must be started BEFORE the code you want to measure allocates.
import tracemalloc
tracemalloc.start()
# ... run code ...
snapshot = tracemalloc.take_snapshot()
top_stats = snapshot.statistics('lineno')
# Print the ten source lines responsible for the most allocated memory
for stat in top_stats[:10]:
print(stat)
# line_profiler — per-line timing (third-party)
# pip install line_profiler
from line_profiler import LineProfiler
lp = LineProfiler()
lp_wrapper = lp(my_function)  # wrap the function whose lines you want timed
lp_wrapper(*args)  # call through the wrapper with the usual arguments
lp.print_stats()  # prints hits / time / %time for every line of my_function
Node.js / JavaScript Profiling
// Built-in V8 profiler: run with --prof, then post-process the isolate log
node --prof app.js
node --prof-process isolate-0x*.log > processed.txt
// Clinic.js suite (flame graphs, heap profiles, event-loop diagnostics)
// npm install -g clinic
clinic doctor -- node app.js
clinic flame -- node app.js
clinic heapprofile -- node app.js
// Heap snapshot analysis (node:v8 module)
v8.writeHeapSnapshot(); // Write to file, open in Chrome DevTools
// Event-loop lag monitoring — sustained delay means something is blocking
// the loop (CPU-heavy work, synchronous I/O) and starving other requests.
const { monitorEventLoopDelay } = require('node:perf_hooks');

const loopDelay = monitorEventLoopDelay({ resolution: 20 });
loopDelay.enable();

// Histogram values are reported in nanoseconds; convert to milliseconds.
const toMs = (nanos) => nanos / 1e6;

setInterval(() => {
  console.log({
    min: toMs(loopDelay.min),
    max: toMs(loopDelay.max),
    mean: toMs(loopDelay.mean),
    p99: toMs(loopDelay.percentile(99)),
  });
}, 10000);
Caching Strategy
import redis
import pickle
import hashlib
import functools
from typing import Callable, Any
# Module-level Redis client shared by every cached function below.
# decode_responses=False keeps values as raw bytes, which pickle requires.
r = redis.Redis(decode_responses=False) # Binary for pickle
def cache(ttl: int = 3600, key_prefix: str = ""):
    """Function-level Redis caching decorator.

    Args:
        ttl: Cache entry lifetime in seconds (default 1 hour).
        key_prefix: Optional prefix to namespace keys (e.g. per service).

    The wrapped function gains an ``invalidate(*args, **kwargs)`` attribute
    that deletes the cached entry for that exact argument combination.

    NOTE: values are stored with pickle — only cache data your own code
    produced; never unpickle content an outside party can write to Redis.
    """
    def decorator(func: Callable) -> Callable:
        def _cache_key(args, kwargs) -> str:
            # Deterministic key from function identity + arguments; kwargs
            # are sorted so {a: 1, b: 2} and {b: 2, a: 1} hit the same key.
            key_data = f"{key_prefix}{func.__name__}:{args}:{sorted(kwargs.items())}"
            # md5 keeps keys short and uniform; collision resistance is not
            # security-critical for a cache key.
            return hashlib.md5(key_data.encode()).hexdigest()

        @functools.wraps(func)
        def wrapper(*args, **kwargs) -> Any:
            cache_key = _cache_key(args, kwargs)
            cached = r.get(cache_key)
            if cached is not None:
                return pickle.loads(cached)  # cache hit
            # Cache miss — compute, store with TTL, return.
            result = func(*args, **kwargs)
            r.setex(cache_key, ttl, pickle.dumps(result))
            return result

        def invalidate(*args, **kwargs):
            """Delete the cached entry for this exact argument combination."""
            r.delete(_cache_key(args, kwargs))

        # DRY FIX: key construction was duplicated verbatim in wrapper and
        # invalidate; any drift between the two copies would make
        # invalidation silently miss. Both paths now share _cache_key.
        wrapper.invalidate = invalidate
        return wrapper
    return decorator
@cache(ttl=300)  # profiles change rarely; 5 minutes is an acceptable staleness window
def get_user_profile(user_id: str) -> dict:
return db.query_user(user_id)
# Explicitly drop the cached entry whenever the underlying user record changes
get_user_profile.invalidate("user-123")
SLI/SLO Definition Template
# slos.yaml — SLI/SLO definitions for the payment API
service: payment-api
slos:
- name: api_availability
description: "API should be available 99.9% of the time"
sli:
type: availability
# Ratio of non-5xx responses to all responses, over a 5-minute rate window
metric: sum(rate(http_requests_total{job="payment-api",code!~"5.."}[5m])) / sum(rate(http_requests_total{job="payment-api"}[5m]))
target: 0.999 # 99.9% → roughly 43 minutes of error budget per 30 days
window: 30d
alerting:
# Multi-window burn-rate alerting (Google SRE Workbook): a 14.4x burn
# sustained for 1h consumes 2% of a 30-day budget (fast-burn page);
# a 6x burn sustained for 6h consumes 5% of it (slow-burn ticket).
burnRate1h: 14.4 # fast burn: 2% of the monthly budget gone in 1h
burnRate6h: 6.0 # slow burn: 5% of the monthly budget gone in 6h
- name: api_latency_p99
description: "99th percentile request duration < 2 seconds"
sli:
type: latency
# p99 estimated from Prometheus histogram buckets over a 5-minute window
metric: histogram_quantile(0.99, rate(http_request_duration_seconds_bucket{job="payment-api"}[5m]))
target_ms: 2000
window: 30d
- name: checkout_success_rate
description: "Checkout requests should succeed 99.5% of the time"
sli:
type: success_rate
metric: sum(rate(checkout_total{status="success"}[5m])) / sum(rate(checkout_total[5m]))
target: 0.995
window: 30d
error_budget:
alerting:
- severity: warning
condition: "burn rate > 2x in 6h"
- severity: critical
condition: "burn rate > 5x in 1h"