1. 程式人生 > >PostgreSQL Limit對索引的影響

PostgreSQL Limit對索引的影響

伺服器CPU排行榜

相關行業的同學如看不懂,應該好好反思一下自己,思考人生了.

1.建立測試表

-- Rebuild the benchmark table from scratch so the script can be re-run.
drop table if exists test;

create table test (
    objectid serial    not null,  -- surrogate key, fed by the implicit serial sequence
    num      integer   not null,  -- scalar test column
    ref      integer[] not null,  -- integer-array test column
    constraint pk_test_objectid primary key (objectid)
)
with (fillfactor = 100);          -- fill heap pages completely

-- Mark the primary-key index as the one a later CLUSTER would order the table by.
alter table test cluster on pk_test_objectid;

為加快插入速度,其它索引在生成資料完成後再建立.

2.建立函式

函式用於控制num和ref的值分佈,以便num和ref欄位上的索引具有較高的可選擇性.

-- Remove earlier versions of the helper functions so the script can be re-run.
-- The functions that take tweights[] arguments must be gone before the tweights
-- type itself can be dropped.
drop function if exists saveAsTest(integer, integer[]);
drop function if exists gen_row(integer[], tweights[], tweights[]);
drop function if exists gen_array(integer[], tweights[]);
drop function if exists get_next_index(tweights[]);
drop type if exists tweights;

/****************************************************************************************
    Coefficient type for smooth weighted round-robin selection.
      weight    : the configured coefficient
      curweight : the coefficient currently in use; initialise it to 0
****************************************************************************************/
create type tweights as (
    weight    integer,
    curweight integer
);
/****************************************************************************************
    Smooth weighted round-robin balancing.

    Example:
        array[((50,0)::tweights),((30,0)::tweights),((15,0)::tweights),((5,0)::tweights)]
    configures four coefficients. Note that the coefficients add up to 100, so over
    every hundred calls
        index 1 is returned for 50% of the calls,
        index 2 for 30%,
        index 3 for 15%,
        index 4 for 5%.

    $1      : the current coefficient array
    returns : exactly one row - the selected 1-based index and the updated coefficient
              array, which the caller must pass back in on the next call
****************************************************************************************/
create or replace function get_next_index(tweights[])
    returns table(index integer, weights tweights[])
as $$
declare
    v_len     integer;
    v_index   integer;
    v_total   integer;
    v_tmp     tweights;
    v_weights tweights[];
begin
    -- Work on a local copy rather than assigning to the input parameter.
    v_weights := $1;
    v_len := array_length(v_weights, 1);

    if (1 = v_len) then
        return query select 1, v_weights;
        -- RETURN QUERY only appends to the result set; without this bare RETURN
        -- execution would fall through and emit a second, duplicate row.
        return;
    end if;

    v_index := -1;
    v_total := 0;
    for v_i in 1..v_len loop
        -- Raise every entry's running weight by its configured weight ...
        v_tmp := v_weights[v_i];
        v_tmp.curweight := (v_tmp.curweight + v_tmp.weight);
        v_total := (v_total + v_tmp.weight);
        v_weights[v_i] := v_tmp;
        -- ... and remember the entry with the largest running weight (first one wins ties).
        if (-1 = v_index or (v_weights[v_index]).curweight < v_tmp.curweight) then
            v_index := v_i;
        end if;
    end loop;

    -- The selected entry pays back the sum of all configured weights.
    v_tmp := v_weights[v_index];
    v_tmp.curweight := v_tmp.curweight - v_total;
    v_weights[v_index] := v_tmp;

    return query select v_index, v_weights;
end;
$$ language plpgsql strict;

/****************************************************************************************
    Generate an array with a random number (1-4) of elements.

    $1      : candidate values
    $2      : smooth weighted round-robin coefficients used to pick from $1
    returns : the generated array and the coefficient array after the last pick

    NOTE: random() * 3 + 1 is rounded by the cast to integer, so sizes 1 and 4 are each
    about half as likely as sizes 2 and 3.

    drop function if exists gen_array(integer[],tweights[]);
****************************************************************************************/
create or replace function gen_array(integer[], tweights[])
    returns table(vals integer[], weights tweights[])
as $$
    with recursive cte(id, val, weights, count) as (
        (
            -- first element; also fixes the target array size for this call
            select 1,
                   $1[index],
                   weights,
                   ((random()*(4-1)+1)::integer)
            from get_next_index($2)
        )
        union all
        -- every further element continues from the previous row's coefficients
        select (p.id+1),
               $1[a.index],
               a.weights,
               p.count
        from cte as p, get_next_index(p.weights) as a
        where p.id < count
    )
    select array_agg(val),
           (select weights from cte where id=count)
    from cte;
$$ language sql strict;

/****************************************************************************************
    Generate one row.

    The arrays $1, $2 and $3 must all have the same size.
    $1      : candidate values
    $2      : smooth weighted round-robin coefficients for generating the integer (num)
    $3      : smooth weighted round-robin coefficients for generating the integer[] (ref)
    returns : num with its updated coefficients, and ref with its updated coefficients

    drop function if exists gen_row(integer[],tweights[],tweights[]);
****************************************************************************************/
create or replace function gen_row(integer[], tweights[], tweights[])
    returns table(num integer, weights1 tweights[], ref integer[], weights2 tweights[])
as $$
    select $1[num.index], num.weights, ref.*
    from get_next_index($2) as num, gen_array($1,$3) as ref;
$$ language sql strict;

/****************************************************************************************
    Check that the functions behave as expected.
****************************************************************************************/
/*
select * from gen_row(
    array[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20],
    array[
        (5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,
        (5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,
        (5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,
        (5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights
    ],
    array[
        (5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,
        (5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,
        (5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,
        (5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights,(5,0)::tweights
    ]);
*/

/****************************************************************************************
    Save one generated row into the test table.

    $1      : value for test.num
    $2      : value for test.ref
    returns : the objectid assigned to the new row

    drop function if exists saveAsTest(integer,integer[]);
****************************************************************************************/
create or replace function saveAsTest(integer, integer[])
    returns integer
as $$
    insert into test(num,ref) values($1,$2) returning objectid;
$$ language sql strict;

3.生成測試資料

  • num的值範圍為1-20,平均分佈(各個的值佔比為5%).
  • ref的值範圍為1-20,陣列大小控制在1-4(隨機大小),每生成100個數值各個值的佔比也為5%.
-- Empty the whole table on purpose and rewind the objectid sequence so that a fresh
-- load starts again at 1.
-- TRUNCATE reclaims the storage immediately, whereas an unfiltered DELETE would leave
-- every old row behind as a dead tuple until the next vacuum. RESTART IDENTITY restarts
-- the sequence owned by test.objectid.
truncate table test restart identity;
/****************************************************************************************
    Load the test data: open several terminals and run the script below in each of them
    (the original note says 10 terminals; the author actually used 16, see below).
    The author's test machine has two CPU sockets with 16 cores, so 16 terminals were
    opened. The CPU is an Intel(R) Xeon(R) CPU E5530 @ 2.40GHz, an outdated model that
    sits near the bottom of the rankings...
    Because the table is simple, the load writes little to disk (about 16MB/s at peak,
    below 2MB/s most of the time).
    The work is mainly CPU-bound, so running 16 terminals at once drove the CPU to 100%
    and, after a while, the fans were howling.
****************************************************************************************/
\timing on
do $$
declare
    candidates    integer[];   -- values that num and the elements of ref are drawn from
    num_weights   tweights[];  -- round-robin state for num, carried from row to row
    ref_weights   tweights[];  -- round-robin state for ref, carried from row to row
    next_num      integer;
    next_ref      integer[];
    progress_mark integer := 1;
begin
    candidates := array[
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
        11, 12, 13, 14, 15, 16, 17, 18, 19, 20
    ];
    -- twenty equal weights of 5, one per candidate
    num_weights := array[
        (5,0)::tweights, (5,0)::tweights, (5,0)::tweights, (5,0)::tweights,
        (5,0)::tweights, (5,0)::tweights, (5,0)::tweights, (5,0)::tweights,
        (5,0)::tweights, (5,0)::tweights, (5,0)::tweights, (5,0)::tweights,
        (5,0)::tweights, (5,0)::tweights, (5,0)::tweights, (5,0)::tweights,
        (5,0)::tweights, (5,0)::tweights, (5,0)::tweights, (5,0)::tweights
    ];
    ref_weights := array[
        (5,0)::tweights, (5,0)::tweights, (5,0)::tweights, (5,0)::tweights,
        (5,0)::tweights, (5,0)::tweights, (5,0)::tweights, (5,0)::tweights,
        (5,0)::tweights, (5,0)::tweights, (5,0)::tweights, (5,0)::tweights,
        (5,0)::tweights, (5,0)::tweights, (5,0)::tweights, (5,0)::tweights,
        (5,0)::tweights, (5,0)::tweights, (5,0)::tweights, (5,0)::tweights
    ];

    for i in 1..1000000 loop
        -- Generate one row; the updated weights feed the next iteration.
        select g.num, g.weights1, g.ref, g.weights2
          into next_num, num_weights, next_ref, ref_weights
          from gen_row(candidates, num_weights, ref_weights) as g;

        perform saveAsTest(next_num, next_ref);
        --raise notice  '%  %', next_num, next_ref;

        -- Report progress once per 1000 inserted rows.
        if i % 1000 = 0 then
            raise notice  '%', progress_mark;
            progress_mark := progress_mark + 1;
        end if;
    end loop;
end;
$$;
序號 耗時(ms)
1 1491206.016
2 1511390.919
3 1517245.568
4 1509241.432
5 1519552.252
6 1514420.896
7 1520820.174
8 1512984.280
9 1519851.215
10 1514590.502
11 1505463.332
12 1503091.390
13 1503749.024
14 1501670.722
15 1500027.669
16 1503459.150

4.建立索引

插入完成後vacuum表,測試時結果更準確.

-- Vacuum (with freeze) the freshly loaded table and refresh the planner statistics in
-- the same pass; verbose prints a progress report.
vacuum (freeze, verbose, analyze) test;

-- Sanity check: 16 sessions of 1,000,000 rows each.
select count(*) from test;
/*
count   
----------
16000000
(1 row)

Time: 587.956 ms
*/

/* B-tree index on the scalar column */
create index idx_test_num on test(num);

/*
 Array index.
 gin__int_ops is used because, for the author's requirements, it has given the best
 results of the array index options tested so far.
 gin__int_ops is provided by the intarray extension, so the extension is installed
 first (a no-op when it is already present).
*/
create extension if not exists intarray;
create index idx_test_ref on test using gin(ref gin__int_ops);

/* Other kinds of array index; each needs its own extension. */
--create index idx_test_ref on test using gist(ref gist__int_ops);
--create index idx_test_ref on test using rum(ref rum_anyarray_ops);

/* Take a look at the resulting table definition. */
\dS+ test;

5.查詢測試

注意不要加order by,order by會影響執行計劃,目前只單純的測試limit和索引之間的關係.

執行查詢時多執行幾次,直至不讀取磁碟(沒有Buffers: shared read).

因為資料在表中的佔比一樣,因此只要查詢一個值就可以了.

/* Value present in the table, b-tree index */
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where num=1;
--Execution time: 2568.059 ms

/* Value absent from the table, b-tree index */
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where num=21;
--Execution time: 0.044 ms

/* Values present in the table, array index.
   @> : ref contains every listed element; && : ref shares at least one listed element. */
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref @> array[1];
--Execution time: 6589.734 ms

explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref&&array[1,2];
--Execution time: 9037.726 ms

explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref&&array[1,2,3];
--Execution time: 11621.418 ms

/* Values absent from the table, array index */
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref @> array[21];
--Execution time: 0.065 ms

explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref&&array[21,22];
--Execution time: 0.056 ms

explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref&&array[21,22,23];
--Execution time: 0.060 ms

6.常規limit測試

/* Value present in the table, b-tree index */
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where num=1 limit 50;
--Execution time: 0.535 ms

/* Value absent from the table, b-tree index */
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where num=21 limit 50;
--Execution time: 0.050 ms


/* Values present in the table, array index.
   @> : ref contains every listed element; && : ref shares at least one listed element. */
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref @> array[1] limit 50;
--Execution time: 0.585 ms

explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref&&array[1,2] limit 50;
--Execution time: 0.561 ms

explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref&&array[1,2,3] limit 50;
--Execution time: 0.537 ms

/* Values absent from the table, array index */
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref @> array[21]  limit 50;
--Execution time: 3572.286 ms

explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref&&array[21,22] limit 50;
--Execution time: 3944.530 ms

explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref&&array[21,22,23] limit 50;
--Execution time: 4130.662 ms

通過對比可以看到B樹索引新增limit效能更高,只返回limit限定的資料,無論表中是否包含條件值.

陣列索引分兩種情況,表中包含條件值、表中不包含條件值.

6.1 陣列索引和limit

6.1.1 表中包含條件值

不會使用陣列索引,使用全表掃描,但是有limit限定,所以速度很快.

6.1.2 表中不包含條件值

不會使用陣列索引,使用全表掃描,因為值不包含在表中,所以需要全表掃描,然後過濾所有資料,速度非常慢.

6.1.2.1 解決方案-使用with

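with會使用陣列索引(在PostgreSQL 12之前,CTE是優化屏障,with內的查詢會先被完整執行,然後才套用limit;PostgreSQL 12及以後的版本預設會把這類CTE內聯,需寫成 with cte as materialized(...) 才能保留此效果).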

/* Values present in the table, array index.
   The filter sits inside the CTE and the limit is applied to the CTE's result.
   NOTE: PostgreSQL 12 and later inline a CTE like this one by default; write
   "with cte as materialized (...)" there so the limit is not pushed into the inner query.
   @> : ref contains every listed element; && : ref shares at least one listed element. */
explain (analyze,verbose,costs,buffers,timing)
with cte as(
  select objectid from test where ref @> array[1]
)select * from cte limit 10;
--Execution time: 293.301 ms

explain (analyze,verbose,costs,buffers,timing)
with cte as(
  select objectid from test where ref&&array[1,2]
)select * from cte limit 10;
--Execution time: 464.427 ms

explain (analyze,verbose,costs,buffers,timing)
with cte as(
  select objectid from test where ref&&array[1,2,3]
)select * from cte limit 10;
--Execution time: 717.172 ms

/* Values absent from the table, array index */
explain (analyze,verbose,costs,buffers,timing)
with cte as(
  select objectid from test where ref @> array[21]
)select * from cte limit 10;
--Execution time: 0.075 ms

explain (analyze,verbose,costs,buffers,timing)
with cte as(
  select objectid from test where ref&&array[21,22]
)select * from cte limit 10;
--Execution time: 0.078 ms

explain (analyze,verbose,costs,buffers,timing)
with cte as(
  select objectid from test where ref&&array[21,22,23]
)select * from cte limit 10;
--Execution time: 0.079 ms

6.1.2.2 解決方案-禁用全表掃描

禁用全表掃描後,PostgreSQL會自動選擇合適的索引,在本例中使用了索引idx_test_ref.類似Oracle的強制索引.

set enable_seqscan只對當前會話有效,注意使用完成後要開啟.

-- Discourage sequential scans for this session only; the setting is switched back on
-- at the end of this block.
set enable_seqscan = off;
/* Values present in the table, array index.
   @> : ref contains every listed element; && : ref shares at least one listed element. */
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref @> array[1] limit 50;
--Execution time: 297.018 ms

explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref&&array[1,2] limit 50;
--Execution time: 466.661 ms

explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref&&array[1,2,3] limit 50;
--Execution time: 708.372 ms

/* Values absent from the table, array index */
explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref @> array[21]  limit 50;
--Planning time: 0.089 ms

explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref&&array[21,22] limit 50;
--Execution time: 0.065 ms

explain (analyze,verbose,costs,buffers,timing)
select objectid from test where ref&&array[21,22,23] limit 50;
--Execution time: 0.066 ms
set enable_seqscan = on;

6.1.3 小結

  • 索引掃描的成本較昂貴,但因返回的資料少,所以比較快.
  • limit會對查詢行為產生較大的影響,設定了limit後需重新檢視執行計劃.
  • order by也會對查詢行為產生較大的影響,需結合需求和執行計劃調整.
  • 如果是單個條件(例如本例),且大多數情況下表包含值,建議使用”6.常規limit測試”,偶爾有表不包含的值時對總體影響不大.
  • 如果是多個條件,建議使用”6.1.2.1 解決方案-使用with”,它和禁用全表掃描效果差不多.具體使用哪種需結合需求和執行計劃調整.如下:
-- Several predicates combined: the filter runs inside the CTE, the limit outside it.
-- Values present in the table.
explain (analyze, verbose, costs, buffers, timing)
with hits as (
    select objectid
    from test
    where num = 1
      and ref && array[1,2,3]
)
select objectid
from hits
limit 10;

-- Array values absent from the table.
explain (analyze, verbose, costs, buffers, timing)
with hits as (
    select objectid
    from test
    where num = 1
      and ref && array[21,22,23]
)
select objectid
from hits
limit 10;