From bc4b96d0539b5e0dc42a33c92590ffacda6bda59 Mon Sep 17 00:00:00 2001 From: Graham Christensen Date: Mon, 18 Jan 2021 11:11:09 -0500 Subject: [PATCH] BuildOutputs: index path with HASH Looking at AWS' Performance Insights for a Hydra instance, I found the hydra-queue-runner's query: select id, buildStatus, releaseName, closureSize, size from Builds b join BuildOutputs o on b.id = o.build where finished = ? and (buildStatus = ? or buildStatus = ?) and path = $1 was the slowest query by at least 10x. Running an explain on this showed why: hydra=> explain select id, buildStatus, releaseName, closureSize, size from Builds b join BuildOutputs o on b.id = o.build where finished = 1 and (buildStatus = 0 or buildStatus = 6) and path = '/nix/store/s93khs2dncf2cy273mbyr4fb4ns3db20-MIDIVisualizer-5.1'; QUERY PLAN ------------------------------------------------------------------------ Gather (cost=1000.43..33718.98 rows=2 width=56) Workers Planned: 2 -> Nested Loop (cost=0.43..32718.78 rows=1 width=56) -> Parallel Seq Scan on buildoutputs o (cost=0.00..32710.32 rows=1 width=4) Filter: (path = '/nix/store/s93kh...snip...'::text) -> Index Scan using indexbuildsonjobsetidfinishedid on builds b (cost=0.43..8.45 rows=1 width=56) Index Cond: ((id = o.build) AND (finished = 1)) Filter: ((buildstatus = 0) OR (buildstatus = 6)) (8 rows) A paralell sequential scan is definitely better than a sequential scan, but the cost ranging from 0 to 32710 is not great. Looking at the table, I saw the `path` column is completely unindex: hydra=> \d buildoutputs Table "public.buildoutputs" Column | Type | Collation | Nullable | Default --------+---------+-----------+----------+--------- build | integer | | not null | name | text | | not null | path | text | | not null | Indexes: "buildoutputs_pkey" PRIMARY KEY, btree (build, name) Foreign-key constraints: "buildoutputs_build_fkey" FOREIGN KEY (build) REFERENCES builds(id) ON DELETE CASCADE Since we always do exact matches on the path and don't care about ordering, and since the path column is very high cardinality a `hash` index is a good candidate. Note that I did test a btree index and it performed similarly well, but slightly worse. After creating the index (this took about 10 seconds) on a test database: create index IndexBuildOutputsPath on BuildOutputs using hash(path); We get a *significantly* reduced cost: hydra=> explain select id, buildStatus, releaseName, closureSize, size hydra-> from Builds b join BuildOutputs o on b.id = o.build where hydra-> finished = 1 and (buildStatus = 0 or buildStatus = 6) and hydra-> path = '/nix/store/s93khs2dncf2cy273mbyr4fb4ns3db20-MIDIVisualizer-5.1'; QUERY PLAN ------------------------------------------------------------------------------------------------------- Nested Loop (cost=0.43..41.41 rows=2 width=56) -> Index Scan using buildoutputs_path_hash on buildoutputs o (cost=0.00..16.05 rows=3 width=4) Index Cond: (path = '/nix/store/s93khs2dncf2cy273mbyr4fb4ns3db20-MIDIVisualizer-5.1'::text) -> Index Scan using indexbuildsonjobsetidfinishedid on builds b (cost=0.43..8.45 rows=1 width=56) Index Cond: ((id = o.build) AND (finished = 1)) Filter: ((buildstatus = 0) OR (buildstatus = 6)) (6 rows) For direct comparison, the overall query plan was changed: From: Gather (cost=1000.43..33718.98 rows=2 width=56) To: Nested Loop (cost= 0.43.....41.41 rows=2 width=56) and the query plan for buildoutputs changed from a maximum cost of 32,710 down to 16. In practical terms, the query's planning and execution time was reduced: Before (ms) | Try 1 | Try 2 | Try 3 ------------+---------+---------+-------- Planning | 0.898 | 0.416 | 0.383 Execution | 138.644 | 172.331 | 375.585 After (ms) | Try 1 | Try 2 | Try 3 ------------+---------+---------+-------- Planning | 0.298 | 0.290 | 0.296 Execution | 219.625 | 0.035 | 0.034 --- src/sql/hydra.sql | 2 ++ src/sql/upgrade-69.sql | 1 + 2 files changed, 3 insertions(+) create mode 100644 src/sql/upgrade-69.sql diff --git a/src/sql/hydra.sql b/src/sql/hydra.sql index 46e3827e..7f12ccfe 100644 --- a/src/sql/hydra.sql +++ b/src/sql/hydra.sql @@ -613,6 +613,8 @@ create index IndexJobsetEvalMembersOnEval on JobsetEvalMembers(eval); create index IndexJobsetInputAltsOnInput on JobsetInputAlts(project, jobset, input); create index IndexJobsetInputAltsOnJobset on JobsetInputAlts(project, jobset); create index IndexProjectsOnEnabled on Projects(enabled); +create index IndexBuildOutputsPath on BuildOutputs using hash(path); + -- For hydra-update-gc-roots. create index IndexBuildsOnKeep on Builds(keep) where keep = 1; diff --git a/src/sql/upgrade-69.sql b/src/sql/upgrade-69.sql new file mode 100644 index 00000000..b7acb10d --- /dev/null +++ b/src/sql/upgrade-69.sql @@ -0,0 +1 @@ +create index IndexBuildOutputsPath on BuildOutputs using hash(path);