跳转至

第五章:SQL 查询

基本 SQL

SELECT

-- 基本查询
SELECT * FROM events LIMIT 10;

-- 选择列
SELECT event_id, event_type, event_time FROM events;

-- 过滤
SELECT * FROM events WHERE event_type = 'click';

-- 排序
SELECT * FROM events ORDER BY event_time DESC LIMIT 100;

-- 分页
SELECT * FROM events LIMIT 100 OFFSET 200;

WHERE 条件

-- 基本条件
SELECT * FROM events WHERE event_type = 'click';

-- 多条件
SELECT * FROM events
WHERE event_type = 'click'
  AND event_time > now() - INTERVAL 1 DAY;

-- IN 条件
SELECT * FROM events WHERE event_type IN ('click', 'purchase');

-- LIKE 条件
SELECT * FROM events WHERE event_type LIKE '%click%';

-- 正则匹配
SELECT * FROM events WHERE match(event_type, 'click|purchase');

聚合函数

基本聚合

-- COUNT
SELECT COUNT(*) FROM events;
SELECT COUNT(DISTINCT user_id) FROM events;

-- SUM
SELECT SUM(amount) FROM orders;

-- AVG
SELECT AVG(amount) FROM orders;

-- MIN/MAX
SELECT MIN(event_time), MAX(event_time) FROM events;

GROUP BY

-- 分组聚合
SELECT
    event_type,
    COUNT(*) as count
FROM events
GROUP BY event_type;

-- 多列分组
SELECT
    toDate(event_time) as date,
    event_type,
    COUNT(*) as count
FROM events
GROUP BY date, event_type;

-- WITH ROLLUP
SELECT
    toDate(event_time) as date,
    event_type,
    COUNT(*) as count
FROM events
GROUP BY date, event_type WITH ROLLUP;

-- WITH CUBE
SELECT
    toDate(event_time) as date,
    event_type,
    COUNT(*) as count
FROM events
GROUP BY date, event_type WITH CUBE;

HAVING

-- 过滤分组结果
SELECT
    event_type,
    COUNT(*) as count
FROM events
GROUP BY event_type
HAVING count > 100;

高级聚合

uniq 系列

-- uniq: 近似去重计数
SELECT uniq(user_id) FROM events;

-- uniqExact: 精确去重计数
SELECT uniqExact(user_id) FROM events;

-- uniqCombined: 组合去重
SELECT uniqCombined(user_id) FROM events;

-- uniqHLL: HyperLogLog 去重
SELECT uniqHLL(user_id) FROM events;

topK

-- Top K 元素
SELECT topK(10)(event_type) FROM events;

histogram

-- 直方图
SELECT histogram(10)(amount) FROM orders;

窗口函数

ROW_NUMBER

-- 行号
SELECT
    user_id,
    event_time,
    ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY event_time) as row_num
FROM events;

RANK

-- 排名
SELECT
    user_id,
    amount,
    RANK() OVER (ORDER BY amount DESC) as rank
FROM orders;

LAG/LEAD

-- 上一行/下一行
SELECT
    event_time,
    amount,
    LAG(amount) OVER (ORDER BY event_time) as prev_amount,
    LEAD(amount) OVER (ORDER BY event_time) as next_amount
FROM orders;

聚合窗口

-- 累计求和
SELECT
    event_time,
    amount,
    SUM(amount) OVER (ORDER BY event_time) as running_sum
FROM orders;

-- 移动平均
SELECT
    event_time,
    amount,
    AVG(amount) OVER (
        ORDER BY event_time
        ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
    ) as moving_avg
FROM orders;

JOIN

INNER JOIN

SELECT
    e.event_id,
    e.event_type,
    u.name
FROM events e
INNER JOIN users u ON e.user_id = u.user_id;

LEFT JOIN

SELECT
    e.event_id,
    e.event_type,
    u.name
FROM events e
LEFT JOIN users u ON e.user_id = u.user_id;

多表 JOIN

SELECT
    e.event_id,
    e.event_type,
    u.name,
    o.amount
FROM events e
LEFT JOIN users u ON e.user_id = u.user_id
LEFT JOIN orders o ON e.event_id = o.event_id;

JOIN 注意事项

-- ClickHouse JOIN 特点
-- 1. 不支持事务
-- 2. 大表 JOIN 小表性能好
-- 3. 建议使用字典代替 JOIN

-- 使用字典
SELECT
    event_id,
    dictGet('user_dict', 'name', user_id) as name
FROM events;

子查询

标量子查询

SELECT
    event_id,
    (SELECT COUNT(*) FROM events WHERE user_id = e.user_id) as user_event_count
FROM events e;

IN 子查询

SELECT * FROM events
WHERE user_id IN (
    SELECT user_id FROM users WHERE status = 'active'
);

WITH 子句

WITH
    (SELECT COUNT(*) FROM events) as total_count
SELECT
    event_type,
    COUNT(*) as count,
    COUNT(*) / total_count as ratio
FROM events
GROUP BY event_type;

小结

SQL 查询要点:

  • 基本 SQL:SELECT、WHERE、ORDER BY
  • 聚合函数:COUNT、SUM、AVG、GROUP BY
  • 高级聚合:uniq、topK、histogram
  • 窗口函数:ROW_NUMBER、RANK、LAG/LEAD
  • JOIN:INNER JOIN、LEFT JOIN

下一章我们将学习数据导入导出。