Add healthcheck ping and dashboard for Gerrit

The healthcheck plugin for Gerrit provides a convenient way to determine
the health of different functionalities and components of Gerrit. If
the endpoint provided by the plugin is pinged, it will execute a set
of checks and return either 200 if all checks passed or 500 if at least
one failed. It will also provide metrics that can be scraped by
Prometheus.

For Gerrit installations outside of Kubernetes, this change adds the option
to deploy a sidecar container in the Prometheus deployment that pings the
healthcheck plugin's endpoint every 30 s, thereby triggering the checks.
This is not provided for Gerrit installations in Kubernetes, since there
the ping should be the task of the Kubernetes liveness probes.

The change additionally adds a dashboard displaying the status of the
healthcheck for each Gerrit instance over time.

Change-Id: Ieeedc4406b642e542c89679a8314d771ca0928af
This commit is contained in:
Thomas Draebing 2021-02-11 09:28:33 +01:00
parent 6813b84a99
commit 8e8a55e650
6 changed files with 171 additions and 7 deletions

View file

@ -167,13 +167,14 @@ configuration parameters:
Gerrit installations with just one replica that can run anywhere, where they Gerrit installations with just one replica that can run anywhere, where they
are reachable via HTTP. are reachable via HTTP.
| option | description | | option | description |
|------------------------------------------------|------------------------------------------------------------------------------------| |------------------------------------------------|----------------------------------------------------------------------------------------------|
| `gerritServers.other.[*].host` | Hostname (incl. port, if required) of the Gerrit server to monitor | | `gerritServers.other.[*].host` | Hostname (incl. port, if required) of the Gerrit server to monitor |
| `gerritServers.other.[*].username` | Username of Gerrit user with 'View Metrics' capabilities | | `gerritServers.other.[*].username` | Username of Gerrit user with 'View Metrics' capabilities |
| `gerritServers.other.[*].password` | Password of Gerrit user with 'View Metrics' capabilities | | `gerritServers.other.[*].password` | Password of Gerrit user with 'View Metrics' capabilities |
| `gerritServers.other.[*].promtail.storagePath` | Path to directory, where Promtail is allowed to save files (e.g. `positions.yaml`) | | `gerritServers.other.[*].healthcheck` | Whether to deploy a container that regularly pings the healthcheck plugin endpoint in Gerrit |
| `gerritServers.other.[*].promtail.logPath` | Path to directory containing the Gerrit logs (e.g. `/var/gerrit/logs`) | | `gerritServers.other.[*].promtail.storagePath` | Path to directory, where Promtail is allowed to save files (e.g. `positions.yaml`) |
| `gerritServers.other.[*].promtail.logPath` | Path to directory containing the Gerrit logs (e.g. `/var/gerrit/logs`) |
### Encryption ### Encryption

View file

@ -561,6 +561,25 @@ server:
name: server name: server
sidecarContainers: sidecarContainers:
#! Adds one "health-ping" sidecar per Gerrit server listed under
#! `gerritServers.other` that sets `healthcheck: true`. Each sidecar requests
#! the healthcheck plugin's status endpoint every 30 seconds.
#@ for instance in data.values.gerritServers.other:
#@ if instance.healthcheck:
#! `host` may carry a port (e.g. `gerrit.example.com:8080`), so both '.' and
#! ':' are replaced to keep the container name a valid DNS label.
- name: #@ "{}-health-ping".format(instance.host).replace('.', '-').replace(':', '-')
  image: curlimages/curl:7.73.0
  #! `watch -n 30 <cmd>` re-runs the curl command line below every 30 seconds.
  command:
  - "watch"
  - "-n"
  - "30"
  #! NOTE(review): curl is called without `--fail`, so it exits 0 on HTTP error
  #! responses; the `|| echo` fallback therefore reports transport-level
  #! failures only. `-k` skips TLS certificate verification.
  args:
  - #@ "curl -Lk https://{}/config/server/healthcheck~status || echo 'Healthcheck failed'".format(instance.host)
  resources:
    limits:
      cpu: 50m
      memory: 128Mi
    requests:
      cpu: 10m
      memory: 32Mi
#@ end
#@ end
## Prometheus server container image ## Prometheus server container image
## ##

View file

@ -12,6 +12,7 @@ gerritServers:
- host: gerrit.example.com - host: gerrit.example.com
username: admin username: admin
password: secret password: secret
healthcheck: false
promtail: promtail:
storagePath: /var/promtail storagePath: /var/promtail
logPath: /var/gerrit/logs logPath: /var/gerrit/logs

View file

@ -0,0 +1,71 @@
local grafana = import '../../../vendor/grafonnet/grafana.libsonnet';
local dashboard = grafana.dashboard;
local row = grafana.row;
local template = grafana.template;
local defaults = import '../../globals/defaults.libsonnet';
local gridPos = import '../../globals/grid_pos.libsonnet';
local publishVariables = import '../../globals/publish.libsonnet';
local variables = import '../globals/variables.libsonnet';
local current_healthcheck_panel = import './panels/current-healthcheck.libsonnet';
local timeseries_healthcheck_panel = import './panels/timeseries-healthcheck.libsonnet';
// Dashboard "Gerrit - Healthcheck": shows, per healthcheck selected in the
// `check` variable, a stat panel (row "CURRENT") and a graph panel (row
// "OVER TIME"). Both panels are repeated once per selected check.
dashboard.new(
  'Gerrit - Healthcheck',
  tags=['gerrit'],
  schemaVersion=defaults.dashboards.schemaVersion,
  editable=defaults.dashboards.editable,
  time_from=defaults.dashboards.timeFrom,
  time_to=defaults.dashboards.timeTo,
  refresh=defaults.dashboards.refresh,
  graphTooltip='shared_tooltip',
)
.addTemplate(variables.instance)
.addTemplate(variables.replica)
.addTemplate(
  // The available checks are discovered from the metric names: every metric
  // matching plugins_healthcheck_<check>_failure_total contributes <check>
  // as one option. The datasource comes from the shared defaults, as it does
  // for the panels.
  template.new(
    name='check',
    datasource=defaults.datasource,
    query='metrics(^plugins_healthcheck_.+_failure_total$)',
    regex='plugins_healthcheck_(.+)_failure_total',
    label='Check',
    multi=true,
    includeAll=true,
    refresh='time',
  )
)
.addPanel(
  row.new(title='CURRENT'),
  gridPos={ x: 0, y: 0 },
)
.addPanel(
  // Repeated horizontally, one stat panel per selected check.
  current_healthcheck_panel.new() + {
    repeat: 'check',
    repeatDirection: 'h',
    maxPerRow: 8,
  },
  gridPos={ w: 3, h: 6 }
)
.addPanel(
  row.new(title='OVER TIME'),
  gridPos={ x: 0, y: 6 },
)
.addPanel(
  // Repeated horizontally, one graph panel per selected check.
  timeseries_healthcheck_panel.new() + {
    repeat: 'check',
    repeatDirection: 'h',
    maxPerRow: 3,
  },
  gridPos={ x: 0, y: 6, w: 8, h: 6 }
)
// Merge in the publishing fields only when the `publish` external variable is
// set.
+ if std.extVar('publish') then publishVariables else {}

View file

@ -0,0 +1,50 @@
local grafana = import '../../../../vendor/grafonnet/grafana.libsonnet';
local statPanel = grafana.statPanel;
local prometheus = grafana.prometheus;
local defaults = import '../../../globals/defaults.libsonnet';
{
  // Query for the check selected by the `check` dashboard variable: 1 minus
  // the increase of its failure counter over the last 2 minutes, with the
  // increase capped at 1.
  local healthQuery =
    '1-clamp_max(increase(plugins_healthcheck_${check}_failure_total{instance="$instance",replica="$replica"}[2m]), 1)',

  // Base colour is dark red; values from 1 upwards are shown in green.
  local thresholdSteps = [
    { color: 'dark-red', value: null },
    { color: 'semi-dark-green', value: 1 },
  ],

  // Display text for the two exact values "1" and "0".
  local valueMappings = [
    { from: '', id: 1, text: 'ok', to: '', type: 1, value: '1' },
    { from: '', id: 2, text: 'failed', to: '', type: 1, value: '0' },
  ],

  // Builds the stat panel showing the current result of one healthcheck.
  // The panel has no title; the check name is used as display name instead.
  new()::
    statPanel.new(
      title='',
      datasource=defaults.datasource,
      displayName='${check}',
      colorMode='background',
      graphMode='none',
      decimals=2,
    )
    .addTarget(prometheus.target(healthQuery, instant=true))
    .addThresholds(thresholdSteps)
    .addMappings(valueMappings),
}

View file

@ -0,0 +1,22 @@
local grafana = import '../../../../vendor/grafonnet/grafana.libsonnet';
local graphPanel = grafana.graphPanel;
local prometheus = grafana.prometheus;
local defaults = import '../../../globals/defaults.libsonnet';
{
  // Query for the check selected by the `check` dashboard variable: 1 minus
  // the increase of its failure counter over the last 2 minutes, with the
  // increase capped at 1.
  local healthQuery =
    '1-clamp_max(increase(plugins_healthcheck_${check}_failure_total{instance="$instance",replica="$replica"}[2m]), 1)',

  // Builds the graph panel plotting the result of one healthcheck over time.
  // The y-axis is fixed to the range 0..1 and the panel is titled with the
  // check name.
  new()::
    graphPanel.new(
      title='${check}',
      datasource=defaults.datasource,
      min=0,
      max=1,
      decimals=0,
      fill=5,
    )
    .addTarget(
      prometheus.target(healthQuery, legendFormat='${check}')
    ),
}