前言
如果说让你比较一下两个olap引擎的查询性能怎么样,你首先想到的测试方式是什么?导入部分数据,写几个sql测试一下,那该写什么样的sql呢?测试比较权威的测试方式就出来,那就是今天讲的TPC-H基准测试。
TPC-H基准测试
此测试提供数据,而且是根据你的需求进行生产不同数量级的数据,同时提供22条sql,包含简单查询、排序、分组、关联、子查询等表达方式
安装
https://download.youkuaiyun.com/download/jklcl/47667939
不需要C币,如果csdn设置成需要的,可以给我留言,再私发
# 1.解压
unzip tpch.zip
# 2.配置
cd tpch/dbgen/
cp makefile.suite makefile
# 生成ORACLE数据库的脚本和数据
# 主要修改以下参数值
vim conf/makefile
-----
CC = gcc
DATABASE= ORACLE
MACHINE = LINUX
WORKLOAD = TPCH
-----
# 3. 生成执行命令,生成dbgen
make --生成dbgen
./dbgen -s 10 -f --生成10GB数据
# 4. 此时生产8个*.tbl的文件,转移到其他目录下
mv ./*.tbl /tmp/file/data5g
导入hive
Create external table lineitem (L_ORDERKEY INT, L_PARTKEY INT,L_SUPPKEY INT,L_LINENUMBER INT,L_QUANTITY DOUBLE,L_EXTENDEDPRICE DOUBLE,L_DISCOUNT DOUBLE,L_TAX DOUBLE,L_RETURNFLAG STRING,L_LINESTATUS STRING,L_SHIPDATE STRING,L_COMMITDATE STRING,L_RECEIPTDATE STRING,L_SHIPINSTRUCT STRING,L_SHIPMODE STRING,L_COMMENT STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXTFILE LOCATION '/tmp/lineitem';
Create external table nation (N_NATIONKEY INT,N_NAME STRING,N_REGIONKEY INT,N_COMMENT STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXTFILE LOCATION '/tpch/nation';
Create external table region (R_REGIONKEY INT,R_NAME STRING,R_COMMENT STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXTFILE LOCATION '/tpch/region';
Create external table part (P_PARTKEY INT , P_NAME STRING , P_MFGR STRING, P_BRAND STRING, P_TYPE STRING , P_SIZE INT , P_CONTAINER STRING, P_RETAILPRICE DOUBLE, P_COMMENT STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXTFILE LOCATION '/tpch/part';
Create external table supplier (S_SUPPKEY INT , S_NAME STRING ,S_ADDRESS STRING , S_NATIONKEY INT , S_PHONE STRING, S_ACCTBAL DOUBLE, S_COMMENT STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXTFILE LOCATION '/tpch/supplier';
Create external table partsupp (PS_PARTKEY INT , PS_SUPPKEY INT , PS_AVAILQTY INT , PS_SUPPLYCOST DOUBLE, PS_COMMENT STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXTFILE LOCATION '/tpch/partsupp';
Create external table customer (C_CUSTKEY INT , C_NAME STRING , C_ADDRESS STRING , C_NATIONKEY INT , C_PHONE STRING , C_ACCTBAL DOUBLE , C_MKTSEGMENT STRING , C_COMMENT STRING ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXTFILE LOCATION '/tpch/customer';
Create external table orders (O_ORDERKEY INT , O_CUSTKEY INT ,O_ORDERSTATUS STRING,O_TOTALPRICE DOUBLE, O_ORDERDATE DATE , O_ORDERPRIORITY STRING, O_CLERK STRING, O_SHIPPRIORITY INT , O_COMMENT STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXTFILE LOCATION '/tpch/orders';
LOAD DATA LOCAL INPATH '/tmp/file/data5g/customer.tbl' OVERWRITE INTO TABLE customer;
LOAD DATA LOCAL INPATH '/tmp/file/data5g/region.tbl' OVERWRITE INTO TABLE region;
LOAD DATA LOCAL INPATH '/tmp/file/data5g/nation.tbl' OVERWRITE INTO TABLE nation;
LOAD DATA LOCAL INPATH '/tmp/file/data5g/part.tbl' OVERWRITE INTO TABLE part;
LOAD DATA LOCAL INPATH '/tmp/file/data5g/supplier.tbl' OVERWRITE INTO TABLE supplier;
LOAD DATA LOCAL INPATH '/tmp/file/data5g/partsupp.tbl' OVERWRITE INTO TABLE partsupp;
LOAD DATA LOCAL INPATH '/tmp/file/data5g/orders.tbl' OVERWRITE INTO TABLE orders;
LOAD DATA LOCAL INPATH '/tmp/file/data5g/lineitem.tbl' OVERWRITE INTO TABLE lineitem;
SQL
=========================================================1
select
l_returnflag,
l_linestatus,
sum(l_quantity) as sum_qty,
sum(l_extendedprice) as sum_base_price,
sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
avg(l_quantity) as avg_qty,
avg(l_extendedprice) as avg_price,
avg(l_discount) as avg_disc,
count(*) as count_order
from
tpch.lineitem
where
l_shipdate <= date '1998-12-01' - interval '120' day
group by
l_returnflag,
l_linestatus
order by
l_returnflag,
l_linestatus;
=========================================================2
select
s_acctbal,
s_name,
n_name,
p_partkey,
p_mfgr,
s_address,
s_phone,
s_comment
from
tpch.part,
tpch.supplier,
tpch.partsupp,
tpch.nation,
tpch.region
where
p_partkey = ps_partkey
and s_suppkey = ps_suppkey
and p_size = 48
and p_type like '%STEEL'
and s_nationkey = n_nationkey
and n_regionkey = r_regionkey
and r_name = 'EUROPE'
and ps_supplycost = (
select
min(ps_supplycost)
from
tpch.partsupp,
tpch.supplier,
tpch.nation,
tpch.region
where
p_partkey = ps_partkey
and s_suppkey = ps_suppkey
and s_nationkey = n_nationkey
and n_regionkey = r_regionkey
and r_name = 'EUROPE'
)
order by
s_acctbal desc,
n_name,
s_name,
p_partkey
limit 100;
=========================================================3
select
l_orderkey,
sum(l_extendedprice * (1 - l_discount)) as revenue,
o_orderdate,
o_shippriority
from
tpch.customer,
tpch.orders,
tpch.lineitem
where
c_mktsegment = 'MACHINERY'
and c_custkey = o_custkey
and l_orderkey = o_orderkey
and o_orderdate < date '1995-03-23'
and l_shipdate > date '1995-03-23'
group by
l_orderkey,
o_orderdate,
o_shippriority
order by
revenue desc,
o_orderdate
limit 10;
=========================================================4
select
o_orderpriority,
count(*) as order_count
from
tpch.orders
where
o_orderdate >= date '1996-07-01'
and o_orderdate < date '1996-07-01' + interval '3' month
and exists (
select
*
from
tpch.lineitem
where
l_orderkey = o_orderkey
and l_commitdate < l_receiptdate
)
group by
o_orderpriority
order by
o_orderpriority;
=========================================================5
select
n_name,
sum(l_extendedprice * (1 - l_discount)) as revenue
from
tpch.customer,
tpch.orders,
tpch.lineitem,
tpch.supplier,
tpch.nation,
tpch.region
where
c_custkey = o_custkey
and l_orderkey = o_orderkey
and l_suppkey = s_suppkey
and c_nationkey = s_nationkey
and s_nationkey = n_nationkey
and n_regionkey = r_regionkey
and r_name = 'EUROPE'
and o_orderdate >= date '1996-01-01'
and o_orderdate < date '1996-01-01' + interval '1' year
group by
n_name
order by
revenue desc;
=========================================================6
select
sum(l_extendedprice * l_discount) as revenue
from
tpch.lineitem
where
l_shipdate >= date '1996-01-01'
and l_shipdate < date '1996-01-01' + interval '1' year
and l_discount between 0.02 - 0.01 and 0.02 + 0.01
and l_quantity < 24;
=========================================================7
select
supp_nation,
cust_nation,
l_year,
sum(volume) as revenue
from
(
select
n1.n_name as supp_nation,
n2.n_name as cust_nation,
extract(year from l_shipdate) as l_year,
l_extendedprice * (1 - l_discount) as volume
from
tpch.supplier,
tpch.lineitem,
tpch.orders,
tpch.customer,
tpch.nation n1,
tpch.nation n2
where
s_suppkey = l_suppkey
and o_orderkey = l_orderkey
and c_custkey = o_custkey
and s_nationkey = n1.n_nationkey
and c_nationkey = n2.n_nationkey
and (
(n1.n_name = 'CANADA' and n2.n_name = 'BRAZIL')
or (n1.n_name = 'BRAZIL' and n2.n_name = 'CANADA')
)
and l_shipdate between date '1995-01-01' and date '1996-12-31'
) as shipping
group by
supp_nation,
cust_nation,
l_year
order by
supp_nation,
cust_nation,
l_year;
=========================================================8
select
o_year,
sum(case
when nation = 'BRAZIL' then volume
else 0
end) / sum(volume) as mkt_share
from
(
select
extract(year from o_orderdate) as o_year,
l_extendedprice * (1 - l_discount) as volume,
n2.n_name as nation
from
tpch.part,
tpch.supplier,
tpch.lineitem,
tpch.orders,
tpch.customer,
tpch.nation n1,
tpch.nation n2,
tpch.region
where
p_partkey = l_partkey
and s_suppkey = l_suppkey
and l_orderkey = o_orderkey
and o_custkey = c_custkey
and c_nationkey = n1.n_nationkey
and n1.n_regionkey = r_regionkey
and r_name = 'AMERICA'
and s_nationkey = n2.n_nationkey
and o_orderdate between date '1995-01-01' and date '1996-12-31'
and p_type = 'LARGE ANODIZED COPPER'
) as all_nations
group by
o_year
order by
o_year;
=========================================================9
select
nation,
o_year,
sum(amount) as sum_profit
from
(
select
n_name as nation,
extract(year from o_orderdate) as o_year,
l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount
from
tpch.part,
tpch.supplier,
tpch.lineitem,
tpch.partsupp,
tpch.orders,
tpch.nation
where
s_suppkey = l_suppkey
and ps_suppkey = l_suppkey
and ps_partkey = l_partkey
and p_partkey = l_partkey
and o_orderkey = l_orderkey
and s_nationkey = n_nationkey
and p_name like '%maroon%'
) as profit
group by
nation,
o_year
order by
nation,
o_year desc;
=========================================================10
select
c_custkey,
c_name,
sum(l_extendedprice * (1 - l_discount)) as revenue,
c_acctbal,
n_name,
c_address,
c_phone,
c_comment
from
tpch.customer,
tpch.orders,
tpch.lineitem,
tpch.nation
where
c_custkey = o_custkey
and l_orderkey = o_orderkey
and o_orderdate >= date '1993-02-01'
and o_orderdate < date '1993-02-01' + interval '3' month
and l_returnflag = 'R'
and c_nationkey = n_nationkey
group by
c_custkey,
c_name,
c_acctbal,
c_phone,
n_name,
c_address,
c_comment
order by
revenue desc
limit 20;
=========================================================11
select
ps_partkey,
sum(ps_supplycost * ps_availqty) as value
from
tpch.partsupp,
tpch.supplier,
tpch.nation
where
ps_suppkey = s_suppkey
and s_nationkey = n_nationkey
and n_name = 'EGYPT'
group by
ps_partkey having
sum(ps_supplycost * ps_availqty) > (
select
sum(ps_supplycost * ps_availqty) * 0.0001000000
from
tpch.partsupp,
tpch.supplier,
tpch.nation
where
ps_suppkey = s_suppkey
and s_nationkey = n_nationkey
and n_name = 'EGYPT'
)
order by
value desc;
=========================================================12
select
l_shipmode,
sum(case
when o_orderpriority = '1-URGENT'
or o_orderpriority = '2-HIGH'
then 1
else 0
end) as high_line_count,
sum(case
when o_orderpriority <> '1-URGENT'
and o_orderpriority <> '2-HIGH'
then 1
else 0
end) as low_line_count
from
tpch.orders,
tpch.lineitem
where
o_orderkey = l_orderkey
and l_shipmode in ('FOB', 'AIR')
and l_commitdate < l_receiptdate
and l_shipdate < l_commitdate
and l_receiptdate >= date '1997-01-01'
and l_receiptdate < date '1997-01-01' + interval '1' year
group by
l_shipmode
order by
l_shipmode;
=========================================================13
select
c_count,
count(*) as custdist
from
(
select
c_custkey,
count(o_orderkey) as c_count
from
tpch.customer left outer join tpch.orders on
c_custkey = o_custkey
and o_comment not like '%special%deposits%'
group by
c_custkey
) c_orders
group by
c_count
order by
custdist desc,
c_count desc;
=========================================================14
select
100.00 * sum(case
when p_type like 'PROMO%'
then l_extendedprice * (1 - l_discount)
else 0
end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue
from
tpch.lineitem,
tpch.part
where
l_partkey = p_partkey
and l_shipdate >= date '1997-06-01'
and l_shipdate < date '1997-06-01' + interval '1' month;
=========================================================16
select
p_brand,
p_type,
p_size,
count(distinct ps_suppkey) as supplier_cnt
from
tpch.partsupp,
tpch.part
where
p_partkey = ps_partkey
and p_brand <> 'Brand#45'
and p_type not like 'SMALL ANODIZED%'
and p_size in (47, 15, 37, 30, 46, 16, 18, 6)
and ps_suppkey not in (
select
s_suppkey
from
tpch.supplier
where
s_comment like '%Customer%Complaints%'
)
group by
p_brand,
p_type,
p_size
order by
supplier_cnt desc,
p_brand,
p_type,
p_size;
=========================================================17
select
sum(l_extendedprice) / 7.0 as avg_yearly
from
tpch.lineitem,
tpch.part
where
p_partkey = l_partkey
and p_brand = 'Brand#51'
and p_container = 'WRAP PACK'
and l_quantity < (
select
0.2 * avg(l_quantity)
from
tpch.lineitem
where
l_partkey = p_partkey
);
=========================================================18
select
c_name,
c_custkey,
o_orderkey,
o_orderdate,
o_totalprice,
sum(l_quantity)
from
tpch.customer,
tpch.orders,
tpch.lineitem
where
o_orderkey in (
select
l_orderkey
from
tpch.lineitem
group by
l_orderkey having
sum(l_quantity) > 312
)
and c_custkey = o_custkey
and o_orderkey = l_orderkey
group by
c_name,
c_custkey,
o_orderkey,
o_orderdate,
o_totalprice
order by
o_totalprice desc,
o_orderdate
limit 100;
=========================================================19
select
sum(l_extendedprice* (1 - l_discount)) as revenue
from
tpch.lineitem,
tpch.part
where
(
p_partkey = l_partkey
and p_brand = 'Brand#52'
and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG')
and l_quantity >= 3 and l_quantity <= 3 + 10
and p_size between 1 and 5
and l_shipmode in ('AIR', 'AIR REG')
and l_shipinstruct = 'DELIVER IN PERSON'
)
or
(
p_partkey = l_partkey
and p_brand = 'Brand#43'
and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK')
and l_quantity >= 12 and l_quantity <= 12 + 10
and p_size between 1 and 10
and l_shipmode in ('AIR', 'AIR REG')
and l_shipinstruct = 'DELIVER IN PERSON'
)
or
(
p_partkey = l_partkey
and p_brand = 'Brand#52'
and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG')
and l_quantity >= 21 and l_quantity <= 21 + 10
and p_size between 1 and 15
and l_shipmode in ('AIR', 'AIR REG')
and l_shipinstruct = 'DELIVER IN PERSON'
);
=========================================================20
select
s_name,
s_address
from
tpch.supplier,
tpch.nation
where
s_suppkey in (
select
ps_suppkey
from
tpch.partsupp
where
ps_partkey in (
select
p_partkey
from
tpch.part
where
p_name like 'drab%'
)
and ps_availqty > (
select
0.5 * sum(l_quantity)
from
tpch.lineitem
where
l_partkey = ps_partkey
and l_suppkey = ps_suppkey
and l_shipdate >= date '1996-01-01'
and l_shipdate < date '1996-01-01' + interval '1' year
)
)
and s_nationkey = n_nationkey
and n_name = 'KENYA'
order by
s_name;
=========================================================21
select
s_name,
count(*) as numwait
from
tpch.supplier,
tpch.lineitem l1,
tpch.orders,
tpch.nation
where
s_suppkey = l1.l_suppkey
and o_orderkey = l1.l_orderkey
and o_orderstatus = 'F'
and l1.l_receiptdate > l1.l_commitdate
and exists (
select
*
from
tpch.lineitem l2
where
l2.l_orderkey = l1.l_orderkey
and l2.l_suppkey <> l1.l_suppkey
)
and not exists (
select
*
from
tpch.lineitem l3
where
l3.l_orderkey = l1.l_orderkey
and l3.l_suppkey <> l1.l_suppkey
and l3.l_receiptdate > l3.l_commitdate
)
and s_nationkey = n_nationkey
and n_name = 'PERU'
group by
s_name
order by
numwait desc,
s_name
limit 100;
=========================================================22
select
cntrycode,
count(*) as numcust,
sum(c_acctbal) as totacctbal
from
(
select
substring(c_phone from 1 for 2) as cntrycode,
c_acctbal
from
tpch.customer
where
substring(c_phone from 1 for 2) in
('24', '32', '17', '18', '12', '14', '22')
and c_acctbal > (
select
avg(c_acctbal)
from
tpch.customer
where
c_acctbal > 0.00
and substring(c_phone from 1 for 2) in
('24', '32', '17', '18', '12', '14', '22')
)
and not exists (
select
*
from
tpch.orders
where
o_custkey = c_custkey
)
) as custsale
group by
cntrycode
order by
cntrycode;