AWS data warehouse built for analytics on massive datasets, delivering fast query performance and cost-efficiency.
Overview
Amazon Redshift is a fully managed, petabyte-scale data warehouse service based on PostgreSQL. It's optimized for analytical workloads and provides rapid performance through columnar storage and massively parallel query execution. The Data Testing connector integrates seamlessly with Redshift for comprehensive data validation.
-- Create a dedicated, read-only login for the validation connector.
-- Redshift rejects passwords that are not 8-64 characters long or that lack
-- an uppercase letter, a lowercase letter, or a digit. The value below is a
-- placeholder that satisfies that policy; replace it with a real secret.
CREATE USER validator_user PASSWORD 'Secure_Passw0rd';
-- USAGE lets the user resolve objects in the schema; SELECT lets it read them.
GRANT USAGE ON SCHEMA public TO validator_user;
-- This grant covers only the tables that exist when the statement runs.
GRANT SELECT ON ALL TABLES IN SCHEMA public TO validator_user;
-- Orders placed in the last 30 days, newest first, capped at 50,000 rows.
-- The key column is order_id, matching how the orders table is referenced
-- everywhere else in this file. It also serves as the ORDER BY tie-breaker so
-- that the LIMIT cut-off is deterministic when several orders share a date.
SELECT
    order_id,
    customer_id,
    order_date,
    total_amount,
    status
FROM orders
WHERE order_date >= DATEADD(day, -30, CURRENT_DATE)
ORDER BY order_date DESC, order_id DESC
LIMIT 50000;
-- Per-day order count, revenue, and distinct-customer count for the last
-- three months, most recent day first.
SELECT
    CAST(DATE_TRUNC('day', order_date) AS DATE) AS order_day,
    COUNT(*) AS total_orders,
    SUM(total_amount) AS daily_revenue,
    COUNT(DISTINCT customer_id) AS unique_customers
FROM orders
WHERE order_date >= DATEADD(month, -3, CURRENT_DATE)
GROUP BY DATE_TRUNC('day', order_date)
ORDER BY order_day DESC;
-- One row per order from the last 7 days, with the customer's contact details
-- and the total quantity summed across the order's line items.
SELECT
    ord.order_id,
    cust.customer_name,
    cust.email,
    ord.order_date,
    SUM(item.quantity) AS total_items
FROM orders AS ord
INNER JOIN customers AS cust
    ON ord.customer_id = cust.id
INNER JOIN order_items AS item
    ON ord.order_id = item.order_id
WHERE ord.order_date >= DATEADD(day, -7, CURRENT_DATE)
GROUP BY
    ord.order_id,
    cust.customer_name,
    cust.email,
    ord.order_date
ORDER BY ord.order_date DESC;
-- Redshift stores and compresses data column by column, so naming only the
-- columns that are needed keeps the scan limited to those columns.
SELECT
    id,
    customer_name,
    email
FROM customers
WHERE status = 'active';
-- Create table with distribution key.
-- order_id identifies each row, so it carries the primary key; customer_id
-- repeats across a customer's orders and cannot be the key. Redshift does not
-- enforce primary keys, but the query planner trusts them, so declaring the
-- wrong column can lead to incorrect query results.
-- DISTKEY(customer_id) stores rows with the same customer_id on the same slice.
CREATE TABLE orders (
    order_id INT PRIMARY KEY,
    customer_id INT,
    order_date DATE,
    total_amount DECIMAL(10, 2)
)
DISTKEY(customer_id);