star-schema.md 8.1 KB
Newer Older
I
Ivan Blinkov 已提交
1
---
2
toc_priority: 16
I
Ivan Blinkov 已提交
3 4 5
toc_title: Star Schema Benchmark
---

6
# Star Schema Benchmark {#star-schema-benchmark}
7

8
Compiling dbgen:
9

10
``` bash
11 12 13
$ git clone git@github.com:vadimtk/ssb-dbgen.git
$ cd ssb-dbgen
$ make
14 15 16 17
```

Generating data:

D
Denis Zhuravlev 已提交
18
!!! warning "Attention"
19
    With `-s 100` dbgen generates 600 million rows (67 GB), while while `-s 1000` it generates 6 billion rows (which takes a lot of time)
D
Denis Zhuravlev 已提交
20

21
``` bash
22 23 24 25 26
$ ./dbgen -s 1000 -T c
$ ./dbgen -s 1000 -T l
$ ./dbgen -s 1000 -T p
$ ./dbgen -s 1000 -T s
$ ./dbgen -s 1000 -T d
27 28 29 30
```

Creating tables in ClickHouse:

31
``` sql
32 33
CREATE TABLE customer
(
34 35 36
        C_CUSTKEY       UInt32,
        C_NAME          String,
        C_ADDRESS       String,
37 38 39
        C_CITY          LowCardinality(String),
        C_NATION        LowCardinality(String),
        C_REGION        LowCardinality(String),
40
        C_PHONE         String,
41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
        C_MKTSEGMENT    LowCardinality(String)
)
ENGINE = MergeTree ORDER BY (C_CUSTKEY);

CREATE TABLE lineorder
(
    LO_ORDERKEY             UInt32,
    LO_LINENUMBER           UInt8,
    LO_CUSTKEY              UInt32,
    LO_PARTKEY              UInt32,
    LO_SUPPKEY              UInt32,
    LO_ORDERDATE            Date,
    LO_ORDERPRIORITY        LowCardinality(String),
    LO_SHIPPRIORITY         UInt8,
    LO_QUANTITY             UInt8,
    LO_EXTENDEDPRICE        UInt32,
    LO_ORDTOTALPRICE        UInt32,
    LO_DISCOUNT             UInt8,
    LO_REVENUE              UInt32,
    LO_SUPPLYCOST           UInt32,
    LO_TAX                  UInt8,
    LO_COMMITDATE           Date,
    LO_SHIPMODE             LowCardinality(String)
)
ENGINE = MergeTree PARTITION BY toYear(LO_ORDERDATE) ORDER BY (LO_ORDERDATE, LO_ORDERKEY);

CREATE TABLE part
(
69 70
        P_PARTKEY       UInt32,
        P_NAME          String,
71 72 73 74 75
        P_MFGR          LowCardinality(String),
        P_CATEGORY      LowCardinality(String),
        P_BRAND         LowCardinality(String),
        P_COLOR         LowCardinality(String),
        P_TYPE          LowCardinality(String),
76
        P_SIZE          UInt8,
77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94
        P_CONTAINER     LowCardinality(String)
)
ENGINE = MergeTree ORDER BY P_PARTKEY;

CREATE TABLE supplier
(
        S_SUPPKEY       UInt32,
        S_NAME          String,
        S_ADDRESS       String,
        S_CITY          LowCardinality(String),
        S_NATION        LowCardinality(String),
        S_REGION        LowCardinality(String),
        S_PHONE         String
)
ENGINE = MergeTree ORDER BY S_SUPPKEY;
```

Inserting data:
95

96
``` bash
97 98 99 100
$ clickhouse-client --query "INSERT INTO customer FORMAT CSV" < customer.tbl
$ clickhouse-client --query "INSERT INTO part FORMAT CSV" < part.tbl
$ clickhouse-client --query "INSERT INTO supplier FORMAT CSV" < supplier.tbl
$ clickhouse-client --query "INSERT INTO lineorder FORMAT CSV" < lineorder.tbl
101 102
```

103
Converting “star schema” to denormalized “flat schema”:
104

105
``` sql
106
SET max_memory_usage = 20000000000;
107 108 109 110 111

CREATE TABLE lineorder_flat
ENGINE = MergeTree
PARTITION BY toYear(LO_ORDERDATE)
ORDER BY (LO_ORDERDATE, LO_ORDERKEY) AS
D
Denis Zhuravlev 已提交
112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153
SELECT
    l.LO_ORDERKEY AS LO_ORDERKEY,
    l.LO_LINENUMBER AS LO_LINENUMBER,
    l.LO_CUSTKEY AS LO_CUSTKEY,
    l.LO_PARTKEY AS LO_PARTKEY,
    l.LO_SUPPKEY AS LO_SUPPKEY,
    l.LO_ORDERDATE AS LO_ORDERDATE,
    l.LO_ORDERPRIORITY AS LO_ORDERPRIORITY,
    l.LO_SHIPPRIORITY AS LO_SHIPPRIORITY,
    l.LO_QUANTITY AS LO_QUANTITY,
    l.LO_EXTENDEDPRICE AS LO_EXTENDEDPRICE,
    l.LO_ORDTOTALPRICE AS LO_ORDTOTALPRICE,
    l.LO_DISCOUNT AS LO_DISCOUNT,
    l.LO_REVENUE AS LO_REVENUE,
    l.LO_SUPPLYCOST AS LO_SUPPLYCOST,
    l.LO_TAX AS LO_TAX,
    l.LO_COMMITDATE AS LO_COMMITDATE,
    l.LO_SHIPMODE AS LO_SHIPMODE,
    c.C_NAME AS C_NAME,
    c.C_ADDRESS AS C_ADDRESS,
    c.C_CITY AS C_CITY,
    c.C_NATION AS C_NATION,
    c.C_REGION AS C_REGION,
    c.C_PHONE AS C_PHONE,
    c.C_MKTSEGMENT AS C_MKTSEGMENT,
    s.S_NAME AS S_NAME,
    s.S_ADDRESS AS S_ADDRESS,
    s.S_CITY AS S_CITY,
    s.S_NATION AS S_NATION,
    s.S_REGION AS S_REGION,
    s.S_PHONE AS S_PHONE,
    p.P_NAME AS P_NAME,
    p.P_MFGR AS P_MFGR,
    p.P_CATEGORY AS P_CATEGORY,
    p.P_BRAND AS P_BRAND,
    p.P_COLOR AS P_COLOR,
    p.P_TYPE AS P_TYPE,
    p.P_SIZE AS P_SIZE,
    p.P_CONTAINER AS P_CONTAINER
FROM lineorder AS l
INNER JOIN customer AS c ON c.C_CUSTKEY = l.LO_CUSTKEY
INNER JOIN supplier AS s ON s.S_SUPPKEY = l.LO_SUPPKEY
D
Denis Zhuravlev 已提交
154
INNER JOIN part AS p ON p.P_PARTKEY = l.LO_PARTKEY;
155
```
156

157
Running the queries:
158

159
Q1.1
160 161

``` sql
D
Denis Zhuravlev 已提交
162 163
SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue
FROM lineorder_flat
D
Denis Zhuravlev 已提交
164
WHERE toYear(LO_ORDERDATE) = 1993 AND LO_DISCOUNT BETWEEN 1 AND 3 AND LO_QUANTITY < 25;
165
```
166

167
Q1.2
168 169

``` sql
D
Denis Zhuravlev 已提交
170 171
SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue
FROM lineorder_flat
D
Denis Zhuravlev 已提交
172
WHERE toYYYYMM(LO_ORDERDATE) = 199401 AND LO_DISCOUNT BETWEEN 4 AND 6 AND LO_QUANTITY BETWEEN 26 AND 35;
173
```
174

175
Q1.3
176 177

``` sql
D
Denis Zhuravlev 已提交
178 179
SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue
FROM lineorder_flat
180
WHERE toISOWeek(LO_ORDERDATE) = 6 AND toYear(LO_ORDERDATE) = 1994
D
Denis Zhuravlev 已提交
181
  AND LO_DISCOUNT BETWEEN 5 AND 7 AND LO_QUANTITY BETWEEN 26 AND 35;
182
```
183

184
Q2.1
185 186

``` sql
D
Denis Zhuravlev 已提交
187 188 189 190 191
SELECT
    sum(LO_REVENUE),
    toYear(LO_ORDERDATE) AS year,
    P_BRAND
FROM lineorder_flat
D
Denis Zhuravlev 已提交
192
WHERE P_CATEGORY = 'MFGR#12' AND S_REGION = 'AMERICA'
D
Denis Zhuravlev 已提交
193 194 195 196
GROUP BY
    year,
    P_BRAND
ORDER BY
D
Denis Zhuravlev 已提交
197 198
    year,
    P_BRAND;
199
```
200

201
Q2.2
202 203

``` sql
D
Denis Zhuravlev 已提交
204 205 206 207 208
SELECT
    sum(LO_REVENUE),
    toYear(LO_ORDERDATE) AS year,
    P_BRAND
FROM lineorder_flat
D
Denis Zhuravlev 已提交
209
WHERE P_BRAND >= 'MFGR#2221' AND P_BRAND <= 'MFGR#2228' AND S_REGION = 'ASIA'
D
Denis Zhuravlev 已提交
210 211 212 213
GROUP BY
    year,
    P_BRAND
ORDER BY
D
Denis Zhuravlev 已提交
214 215
    year,
    P_BRAND;
216
```
217

218
Q2.3
219 220

``` sql
D
Denis Zhuravlev 已提交
221 222 223 224 225
SELECT
    sum(LO_REVENUE),
    toYear(LO_ORDERDATE) AS year,
    P_BRAND
FROM lineorder_flat
D
Denis Zhuravlev 已提交
226
WHERE P_BRAND = 'MFGR#2239' AND S_REGION = 'EUROPE'
D
Denis Zhuravlev 已提交
227 228 229 230
GROUP BY
    year,
    P_BRAND
ORDER BY
D
Denis Zhuravlev 已提交
231 232
    year,
    P_BRAND;
233
```
234

235
Q3.1
236 237

``` sql
D
Denis Zhuravlev 已提交
238 239 240 241 242 243
SELECT
    C_NATION,
    S_NATION,
    toYear(LO_ORDERDATE) AS year,
    sum(LO_REVENUE) AS revenue
FROM lineorder_flat
D
Denis Zhuravlev 已提交
244
WHERE C_REGION = 'ASIA' AND S_REGION = 'ASIA' AND year >= 1992 AND year <= 1997
D
Denis Zhuravlev 已提交
245 246 247 248 249 250
GROUP BY
    C_NATION,
    S_NATION,
    year
ORDER BY
    year ASC,
D
Denis Zhuravlev 已提交
251
    revenue DESC;
252
```
253

254
Q3.2
255 256

``` sql
D
Denis Zhuravlev 已提交
257 258 259 260 261 262
SELECT
    C_CITY,
    S_CITY,
    toYear(LO_ORDERDATE) AS year,
    sum(LO_REVENUE) AS revenue
FROM lineorder_flat
D
Denis Zhuravlev 已提交
263
WHERE C_NATION = 'UNITED STATES' AND S_NATION = 'UNITED STATES' AND year >= 1992 AND year <= 1997
D
Denis Zhuravlev 已提交
264 265 266 267 268 269
GROUP BY
    C_CITY,
    S_CITY,
    year
ORDER BY
    year ASC,
D
Denis Zhuravlev 已提交
270
    revenue DESC;
271
```
272

273
Q3.3
274 275

``` sql
D
Denis Zhuravlev 已提交
276 277 278 279 280 281
SELECT
    C_CITY,
    S_CITY,
    toYear(LO_ORDERDATE) AS year,
    sum(LO_REVENUE) AS revenue
FROM lineorder_flat
D
Denis Zhuravlev 已提交
282
WHERE (C_CITY = 'UNITED KI1' OR C_CITY = 'UNITED KI5') AND (S_CITY = 'UNITED KI1' OR S_CITY = 'UNITED KI5') AND year >= 1992 AND year <= 1997
D
Denis Zhuravlev 已提交
283 284 285 286 287 288 289
GROUP BY
    C_CITY,
    S_CITY,
    year
ORDER BY
    year ASC,
    revenue DESC;
290
```
291

292
Q3.4
293 294

``` sql
D
Denis Zhuravlev 已提交
295 296 297 298 299 300
SELECT
    C_CITY,
    S_CITY,
    toYear(LO_ORDERDATE) AS year,
    sum(LO_REVENUE) AS revenue
FROM lineorder_flat
D
Denis Zhuravlev 已提交
301
WHERE (C_CITY = 'UNITED KI1' OR C_CITY = 'UNITED KI5') AND (S_CITY = 'UNITED KI1' OR S_CITY = 'UNITED KI5') AND toYYYYMM(LO_ORDERDATE) = 199712
D
Denis Zhuravlev 已提交
302 303 304 305 306 307 308
GROUP BY
    C_CITY,
    S_CITY,
    year
ORDER BY
    year ASC,
    revenue DESC;
309
```
310

311
Q4.1
312 313

``` sql
D
Denis Zhuravlev 已提交
314 315 316 317 318
SELECT
    toYear(LO_ORDERDATE) AS year,
    C_NATION,
    sum(LO_REVENUE - LO_SUPPLYCOST) AS profit
FROM lineorder_flat
D
Denis Zhuravlev 已提交
319
WHERE C_REGION = 'AMERICA' AND S_REGION = 'AMERICA' AND (P_MFGR = 'MFGR#1' OR P_MFGR = 'MFGR#2')
D
Denis Zhuravlev 已提交
320 321 322 323 324 325
GROUP BY
    year,
    C_NATION
ORDER BY
    year ASC,
    C_NATION ASC;
326
```
327

328
Q4.2
329 330

``` sql
D
Denis Zhuravlev 已提交
331 332 333 334 335 336
SELECT
    toYear(LO_ORDERDATE) AS year,
    S_NATION,
    P_CATEGORY,
    sum(LO_REVENUE - LO_SUPPLYCOST) AS profit
FROM lineorder_flat
D
Denis Zhuravlev 已提交
337
WHERE C_REGION = 'AMERICA' AND S_REGION = 'AMERICA' AND (year = 1997 OR year = 1998) AND (P_MFGR = 'MFGR#1' OR P_MFGR = 'MFGR#2')
D
Denis Zhuravlev 已提交
338 339 340 341 342 343 344 345
GROUP BY
    year,
    S_NATION,
    P_CATEGORY
ORDER BY
    year ASC,
    S_NATION ASC,
    P_CATEGORY ASC;
346
```
347

348
Q4.3
349 350

``` sql
D
Denis Zhuravlev 已提交
351 352 353 354 355 356
SELECT
    toYear(LO_ORDERDATE) AS year,
    S_CITY,
    P_BRAND,
    sum(LO_REVENUE - LO_SUPPLYCOST) AS profit
FROM lineorder_flat
D
Denis Zhuravlev 已提交
357
WHERE S_NATION = 'UNITED STATES' AND (year = 1997 OR year = 1998) AND P_CATEGORY = 'MFGR#14'
D
Denis Zhuravlev 已提交
358 359 360 361 362 363 364 365
GROUP BY
    year,
    S_CITY,
    P_BRAND
ORDER BY
    year ASC,
    S_CITY ASC,
    P_BRAND ASC;
366
```
I
Ivan Blinkov 已提交
367

I
Ivan Blinkov 已提交
368
[Original article](https://clickhouse.tech/docs/en/getting_started/example_datasets/star_schema/) <!--hide-->