Data & Analytics — Documented
sql-expert
Advanced SQL with window functions, CTEs, recursive queries, indexing strategies, EXPLAIN ANALYZE, JSONB, full-text search, and query optimization. PostgreSQL-first with coverage for MySQL and SQLite.
Share:
Installation
npx clawhub@latest install sql-expert
View the full skill documentation and source below.
Documentation
Advanced SQL Expert
PostgreSQL First (Applies to Most Databases)
Window Functions — Most Underused SQL Feature
Window functions compute across rows without collapsing them. No GROUP BY, no subqueries needed.
-- Syntax: function() OVER (PARTITION BY ... ORDER BY ... ROWS/RANGE ...)
-- Running total
-- NOTE: with only ORDER BY, the default frame is RANGE ... CURRENT ROW,
-- which includes ALL peers — every row tied on order_date gets the same
-- running total. An explicit ROWS frame gives a true row-by-row total.
SELECT
order_date,
amount,
SUM(amount) OVER (
ORDER BY order_date
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
) AS running_total
FROM orders;
-- Rank within each department
-- Tie behavior: RANK() leaves gaps after ties (1,1,3); DENSE_RANK() does
-- not (1,1,2); ROW_NUMBER() is always unique but breaks ties arbitrarily.
SELECT
name,
department,
salary,
RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS dept_rank,
DENSE_RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS dept_dense_rank,
ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) AS row_num
FROM employees;
-- Get top 3 per category
-- ROW_NUMBER() picks an arbitrary winner among tied sales; use RANK()
-- here instead if the requirement is "top 3 including ties".
WITH ranked AS (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY category ORDER BY sales DESC) AS rn
FROM products
)
SELECT * FROM ranked WHERE rn <= 3;
-- Moving average (7-day)
-- NOTE: ROWS BETWEEN 6 PRECEDING is a 7-ROW window; it is only a true
-- 7-DAY window if daily_revenue has exactly one row per calendar day with
-- no gaps — confirm against the table's grain. The first 6 rows average
-- fewer than 7 values.
SELECT
date,
revenue,
AVG(revenue) OVER (
ORDER BY date
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
) AS moving_avg_7d
FROM daily_revenue;
-- LAG / LEAD — compare with previous/next row
-- LAG/LEAD return NULL when there is no previous/next row (an optional
-- third argument supplies a default), so prev_day and day_over_day are
-- NULL on the first row.
SELECT
date,
revenue,
LAG(revenue, 1) OVER (ORDER BY date) AS prev_day,
revenue - LAG(revenue, 1) OVER (ORDER BY date) AS day_over_day,
LEAD(revenue, 1) OVER (ORDER BY date) AS next_day
FROM daily_revenue;
-- FIRST_VALUE / LAST_VALUE — keep context while aggregating
-- The explicit UNBOUNDED PRECEDING/FOLLOWING frame is not strictly needed
-- for FIRST_VALUE, but it IS required for LAST_VALUE (the default frame
-- ends at CURRENT ROW, so LAST_VALUE would just return the current row) —
-- always spell the frame out in this pattern.
SELECT
name,
department,
salary,
FIRST_VALUE(name) OVER (
PARTITION BY department
ORDER BY salary DESC
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
) AS highest_paid_in_dept
FROM employees;
-- NTILE — percentile buckets
-- NTILE(4) assigns buckets 1 (lowest salaries) through 4 (highest);
-- bucket sizes differ by at most one row.
-- PERCENT_RANK() = (rank - 1) / (rows - 1), ranging over [0, 1].
SELECT
name,
salary,
NTILE(4) OVER (ORDER BY salary) AS quartile,
PERCENT_RANK() OVER (ORDER BY salary) AS pct_rank
FROM employees;
CTEs (Common Table Expressions)
-- Basic CTE — readable, not a performance benefit
-- (PostgreSQL 12+ inlines CTEs into the outer query; before 12 they were
-- optimization fences and always materialized.)
WITH monthly_revenue AS (
-- One row per calendar month with completed-order revenue.
SELECT
DATE_TRUNC('month', created_at) AS month,
SUM(amount) AS total
FROM orders
WHERE status = 'completed'
GROUP BY 1
),
monthly_growth AS (
-- Month-over-month growth. NULLIF guards divide-by-zero; both
-- prev_month and growth_pct are NULL for the first month (no LAG row).
SELECT
month,
total,
LAG(total) OVER (ORDER BY month) AS prev_month,
ROUND(
(total - LAG(total) OVER (ORDER BY month)) /
NULLIF(LAG(total) OVER (ORDER BY month), 0) * 100, 2
) AS growth_pct
FROM monthly_revenue
)
SELECT * FROM monthly_growth
ORDER BY month;
-- Multiple CTEs
-- FIX: the original selected users.user_id in active_users but joined on
-- users.id at the bottom — inconsistent key. The users PK is `id` (see
-- the u.id join below), so alias it to user_id for one consistent join
-- key through the pipeline.
WITH
-- Active users only.
active_users AS (
SELECT id AS user_id FROM users WHERE status = 'active'
),
-- Orders placed within the last 30 days.
recent_orders AS (
SELECT * FROM orders
WHERE created_at > NOW() - INTERVAL '30 days'
),
-- Per-user order count and spend, restricted to active users.
user_order_counts AS (
SELECT o.user_id, COUNT(*) AS order_count, SUM(o.amount) AS total_spent
FROM recent_orders o
JOIN active_users u ON u.user_id = o.user_id
GROUP BY o.user_id
)
SELECT
u.email,
uc.order_count,
uc.total_spent
FROM user_order_counts uc
JOIN users u ON u.id = uc.user_id
ORDER BY uc.total_spent DESC;
Recursive CTEs — Hierarchies and Graphs
-- Employee hierarchy (org chart)
-- WARNING: a cycle in manager_id (bad data) makes this recurse forever.
-- PostgreSQL 14+ can guard with the CYCLE clause; otherwise carry an
-- ancestor array and filter, as in the category example below.
WITH RECURSIVE employee_tree AS (
-- Base case: CEO (no manager)
SELECT id, name, manager_id, 0 AS depth, name::TEXT AS path
FROM employees
WHERE manager_id IS NULL
UNION ALL
-- Recursive case: reports to someone in the tree
SELECT e.id, e.name, e.manager_id, et.depth + 1,
et.path || ' > ' || e.name
FROM employees e
JOIN employee_tree et ON e.manager_id = et.id
)
SELECT id, name, depth, path
FROM employee_tree
ORDER BY path;
-- Category tree traversal
-- The ancestors array doubles as a cycle guard if you add
-- "AND NOT c.id = ANY(cp.ancestors)" to the recursive member's join.
WITH RECURSIVE category_path AS (
SELECT id, name, parent_id, ARRAY[id] AS ancestors
FROM categories
WHERE parent_id IS NULL
UNION ALL
SELECT c.id, c.name, c.parent_id, cp.ancestors || c.id
FROM categories c
JOIN category_path cp ON c.parent_id = cp.id
)
SELECT * FROM category_path;
-- Find all descendants of a node
-- (Starting mid-tree: the base case seeds one node, recursion adds children.)
WITH RECURSIVE subtree AS (
SELECT id, name FROM categories WHERE id = 5 -- Start node
UNION ALL
SELECT c.id, c.name
FROM categories c
JOIN subtree s ON c.parent_id = s.id
)
SELECT * FROM subtree;
Query Optimization
EXPLAIN ANALYZE — Read It
-- Always use EXPLAIN ANALYZE to see actual execution
-- CAUTION: ANALYZE really executes the statement — wrap data-modifying
-- statements (INSERT/UPDATE/DELETE) in BEGIN ... ROLLBACK when explaining.
EXPLAIN (ANALYZE, BUFFERS, FORMAT TEXT)
SELECT u.name, COUNT(o.id)
FROM users u
LEFT JOIN orders o ON o.user_id = u.id
GROUP BY u.id, u.name;
-- Key things to look for:
-- Seq Scan → table scan (bad on large tables, needs index)
-- Index Scan → using an index (good)
-- Hash Join → building hash table in memory
-- Nested Loop → O(n*m) — watch for this on large datasets
-- rows= in actual vs estimated → large divergence = bad statistics → ANALYZE table
-- Buffers: hit= (cached) vs read= (disk) → high reads = memory pressure
Indexing Strategy
-- B-tree index (default) — equality, range, ORDER BY
CREATE INDEX idx_orders_user_id ON orders(user_id);
CREATE INDEX idx_orders_status_created ON orders(status, created_at DESC);
-- Partial index — index only the rows you query most
-- (smaller and cheaper to maintain; used only when the query's WHERE
-- clause implies the index predicate)
CREATE INDEX idx_active_users ON users(email) WHERE status = 'active';
CREATE INDEX idx_pending_orders ON orders(created_at) WHERE status = 'pending';
-- Covering index (include) — index-only scan, no heap fetch
CREATE INDEX idx_orders_user_covering ON orders(user_id) INCLUDE (amount, status, created_at);
-- GIN index for arrays and JSONB
-- jsonb_path_ops is smaller/faster but supports only @> — not ? / ?| / ?&.
CREATE INDEX idx_products_tags ON products USING GIN(tags);
CREATE INDEX idx_user_meta ON users USING GIN(metadata jsonb_path_ops);
-- Text search
-- Queries must use this exact to_tsvector expression to hit the index.
CREATE INDEX idx_products_search ON products USING GIN(
to_tsvector('english', name || ' ' || COALESCE(description, ''))
);
-- Multicolumn index — column order matters!
-- Query: WHERE a = 1 AND b > 5 ORDER BY c
-- Good: INDEX ON (a, b, c) — leftmost prefix matches
-- Bad: INDEX ON (c, b, a) — can't use for this query
-- Check index usage
-- FIX: pg_stat_user_indexes names the index column `indexrelname`
-- (`indexname` exists only in the pg_indexes catalog view, so the
-- original query raised "column does not exist").
-- idx_scan stuck at 0 over a long uptime suggests a droppable index.
SELECT indexrelname, idx_scan, idx_tup_read, idx_tup_fetch
FROM pg_stat_user_indexes
WHERE relname = 'orders'
ORDER BY idx_scan DESC;
-- Find missing indexes (sequential scans on large tables)
-- High avg rows per sequential scan = a filter that deserves an index.
SELECT relname, seq_scan, seq_tup_read, idx_scan,
ROUND(seq_tup_read::NUMERIC / NULLIF(seq_scan, 0)) AS avg_rows_per_scan
FROM pg_stat_user_tables
WHERE seq_scan > 100
ORDER BY seq_tup_read DESC;
JSON/JSONB Operations (PostgreSQL)
-- JSONB is binary-stored, indexed, fast. Use it.
-- (Prefer JSONB over plain JSON: JSON stores raw text and cannot use GIN
-- indexes or containment operators.)
-- Create table with JSONB
CREATE TABLE events (
id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
type TEXT NOT NULL,
data JSONB NOT NULL,
created_at TIMESTAMPTZ DEFAULT NOW()
);
-- Insert
INSERT INTO events (type, data) VALUES
('user.signup', '{"user_id": "abc", "email": "[email protected]", "source": "organic"}');
-- Query by JSON field
SELECT * FROM events
WHERE data->>'email' = '[email protected]'; -- ->> extracts as text
SELECT * FROM events
WHERE data->'metadata'->>'plan' = 'pro'; -- Nested field: -> keeps JSONB, final ->> yields text
-- Containment operator @> (uses GIN index)
SELECT * FROM events
WHERE data @> '{"source": "organic"}'; -- data contains this JSON
-- Array operations
-- NOTE: the ? and ?| operators are index-supported only by the default
-- jsonb_ops GIN opclass, NOT jsonb_path_ops — check your index's opclass.
SELECT data->'tags' FROM products WHERE id = 1;
SELECT * FROM products WHERE data->'tags' ? 'featured'; -- Contains element
SELECT * FROM products WHERE data->'tags' ?| ARRAY['sale', 'new']; -- Any of
-- Update JSONB
-- || merges top-level keys only (shallow); jsonb_set targets a nested path.
UPDATE users
SET metadata = metadata || '{"plan": "pro"}'::JSONB
WHERE id = 'user-123';
UPDATE users
SET metadata = jsonb_set(metadata, '{address,city}', '"San Francisco"')
WHERE id = 'user-123';
-- Extract and aggregate
SELECT
data->>'source' AS acquisition_source,
COUNT(*) AS signups,
COUNT(*) FILTER (WHERE data->>'plan' = 'pro') AS pro_signups
FROM events
WHERE type = 'user.signup'
GROUP BY 1
ORDER BY 2 DESC;
-- Full-text search on JSONB
-- Uses a GIN index only if one was built on this exact to_tsvector
-- expression; otherwise this is a sequential scan.
SELECT * FROM products
WHERE to_tsvector('english', data->>'description') @@ plainto_tsquery('wireless headphones');
N+1 Query Problem and Solutions
-- BAD: N+1 (1 query for orders + N queries for users)
-- This happens in ORMs when you do: orders.each { |o| o.user.name }
-- GOOD: Single query with JOIN
SELECT o.id, o.amount, u.name, u.email
FROM orders o
JOIN users u ON u.id = o.user_id
WHERE o.status = 'pending';
-- Batch loading pattern with IN
-- Instead of: SELECT * FROM users WHERE id = ? (per row)
SELECT id, name, email
FROM users
WHERE id = ANY(ARRAY['id1', 'id2', 'id3']::UUID[]);
-- Lateral join — per-row subquery (powerful, explicit)
SELECT u.id, u.name, recent_orders.*
FROM users u
CROSS JOIN LATERAL (
SELECT COUNT(*) AS order_count, SUM(amount) AS total_spent
FROM orders
WHERE user_id = u.id
AND created_at > NOW() - INTERVAL '90 days'
) AS recent_orders;
Aggregations and Analytics Patterns
-- Conditional aggregation
-- FILTER is the ANSI form of conditional aggregation (PostgreSQL, SQLite);
-- the portable fallback is SUM(CASE WHEN ... THEN amount ELSE 0 END).
SELECT
DATE_TRUNC('month', created_at) AS month,
COUNT(*) AS total_orders,
COUNT(*) FILTER (WHERE status = 'completed') AS completed,
COUNT(*) FILTER (WHERE status = 'refunded') AS refunded,
SUM(amount) FILTER (WHERE status = 'completed') AS revenue,
AVG(amount) AS avg_order_value,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY amount) AS median_order
FROM orders
GROUP BY 1
ORDER BY 1;
-- Pivot table with CROSSTAB
-- Requires the tablefunc extension: CREATE EXTENSION IF NOT EXISTS tablefunc;
-- The ct(...) column list must match: row key + one column per VALUES entry.
SELECT *
FROM CROSSTAB(
'SELECT department, quarter, SUM(sales)
FROM sales_data
GROUP BY 1, 2
ORDER BY 1, 2',
'VALUES (''Q1''), (''Q2''), (''Q3''), (''Q4'')'
) AS ct(department TEXT, q1 NUMERIC, q2 NUMERIC, q3 NUMERIC, q4 NUMERIC);
-- Cohort analysis
WITH cohorts AS (
-- Each user's first-order month.
SELECT
user_id,
DATE_TRUNC('month', MIN(created_at)) AS cohort_month
FROM orders
GROUP BY user_id
),
order_cohorts AS (
-- Whole months between the cohort month and each order's month.
-- FIX: DATE_PART('month', AGE(...)) alone wraps at 12 — an order
-- 13 months after the cohort reported months_since_first = 1
-- (AGE returns "1 year 1 mon"); fold in the year component.
SELECT
c.user_id,
c.cohort_month,
(DATE_PART('year', AGE(DATE_TRUNC('month', o.created_at), c.cohort_month)) * 12
+ DATE_PART('month', AGE(DATE_TRUNC('month', o.created_at), c.cohort_month))) AS months_since_first
FROM cohorts c
JOIN orders o ON o.user_id = c.user_id
)
SELECT
cohort_month,
COUNT(DISTINCT user_id) FILTER (WHERE months_since_first = 0) AS month_0,
COUNT(DISTINCT user_id) FILTER (WHERE months_since_first = 1) AS month_1,
COUNT(DISTINCT user_id) FILTER (WHERE months_since_first = 2) AS month_2,
COUNT(DISTINCT user_id) FILTER (WHERE months_since_first = 3) AS month_3
FROM order_cohorts
GROUP BY cohort_month
ORDER BY cohort_month;
Performance Patterns
-- UPSERT (PostgreSQL)
-- EXCLUDED refers to the row that was proposed for insertion.
-- ON CONFLICT (user_id) requires a unique index or constraint on user_id.
INSERT INTO user_stats (user_id, login_count, last_login)
VALUES ($1, 1, NOW())
ON CONFLICT (user_id)
DO UPDATE SET
login_count = user_stats.login_count + 1,
last_login = EXCLUDED.last_login;
-- Efficient pagination (keyset vs offset)
-- BAD: OFFSET 10000 LIMIT 20 (scans 10020 rows)
SELECT * FROM orders ORDER BY id LIMIT 20 OFFSET 10000;
-- GOOD: Keyset pagination (O(log n) with index)
-- Requires a unique (or tie-broken) ordering column; the client passes
-- back the last id it saw.
SELECT * FROM orders
WHERE id > $last_id
ORDER BY id ASC
LIMIT 20;
-- Bulk insert
-- One round trip for many rows: bind three parallel arrays and UNNEST
-- them into a rowset.
INSERT INTO events (type, data, created_at)
SELECT type, data, created_at
FROM UNNEST($1::TEXT[], $2::JSONB[], $3::TIMESTAMPTZ[])
AS t(type, data, created_at);
-- Vacuuming — prevent table bloat
VACUUM ANALYZE orders; -- Manual (usually auto)
VACUUM (VERBOSE, ANALYZE) orders; -- See what it does
-- Table statistics
-- High dead_pct or a stale last_autovacuum suggests tuning autovacuum
-- settings for this table.
SELECT
n_live_tup,
n_dead_tup,
ROUND(n_dead_tup * 100.0 / NULLIF(n_live_tup + n_dead_tup, 0), 2) AS dead_pct,
last_vacuum,
last_autovacuum,
last_analyze
FROM pg_stat_user_tables
WHERE relname = 'orders';