Add healthcheck ping and dashboard for Gerrit
The healthcheck plugin for Gerrit provides a convenient way to determine
the health of different functionalities and components of Gerrit. If
the endpoint provided by the plugin is pinged, it will execute a set
of checks and return either 200 if all checks passed or 500 if at least
one failed. It will also provide metrics that can be scraped by
Prometheus.
This change adds an option for Gerrit installations outside of Kubernetes
to deploy a sidecar container in the Prometheus deployment that pings the
healthcheck plugin's endpoint every 30 s, thereby triggering the checks.
This is not provided for Gerrit installations in Kubernetes, since there
the ping should be the task of the Kubernetes liveness probes.
The change additionally adds a dashboard displaying the status of the
healthcheck for each Gerrit instance over time.
Change-Id: Ieeedc4406b642e542c89679a8314d771ca0928af
This commit is contained in:
parent
6813b84a99
commit
8e8a55e650
6 changed files with 171 additions and 7 deletions
15
README.md
15
README.md
|
@ -167,13 +167,14 @@ configuration parameters:
|
|||
Gerrit installations with just one replica that can run anywhere, where they
|
||||
are reachable via HTTP.
|
||||
|
||||
| option | description |
|
||||
|------------------------------------------------|------------------------------------------------------------------------------------|
|
||||
| `gerritServers.other.[*].host` | Hostname (incl. port, if required) of the Gerrit server to monitor |
|
||||
| `gerritServers.other.[*].username` | Username of Gerrit user with 'View Metrics' capabilities |
|
||||
| `gerritServers.other.[*].password` | Password of Gerrit user with 'View Metrics' capabilities |
|
||||
| `gerritServers.other.[*].promtail.storagePath` | Path to directory, where Promtail is allowed to save files (e.g. `positions.yaml`) |
|
||||
| `gerritServers.other.[*].promtail.logPath` | Path to directory containing the Gerrit logs (e.g. `/var/gerrit/logs`) |
|
||||
| option | description |
|
||||
|------------------------------------------------|----------------------------------------------------------------------------------------------|
|
||||
| `gerritServers.other.[*].host` | Hostname (incl. port, if required) of the Gerrit server to monitor |
|
||||
| `gerritServers.other.[*].username` | Username of Gerrit user with 'View Metrics' capabilities |
|
||||
| `gerritServers.other.[*].password` | Password of Gerrit user with 'View Metrics' capabilities |
|
||||
| `gerritServers.other.[*].healthcheck` | Whether to deploy a container that regularly pings the healthcheck plugin endpoint in Gerrit |
|
||||
| `gerritServers.other.[*].promtail.storagePath` | Path to directory, where Promtail is allowed to save files (e.g. `positions.yaml`) |
|
||||
| `gerritServers.other.[*].promtail.logPath` | Path to directory containing the Gerrit logs (e.g. `/var/gerrit/logs`) |
|
||||
|
||||
|
||||
### Encryption
|
||||
|
|
|
@ -561,6 +561,25 @@ server:
|
|||
|
||||
name: server
|
||||
sidecarContainers:
|
||||
#! One sidecar per entry in `gerritServers.other` that sets `healthcheck: true`.
#! The sidecar re-runs curl against the healthcheck plugin endpoint every 30 s
#! (`watch -n 30`); requesting the endpoint is what makes the plugin run its checks.
#@ for instance in data.values.gerritServers.other:
#@ if instance.healthcheck:
#! `host` may carry a port (e.g. `gerrit.example.com:8080`). Container names must
#! be DNS labels, so both '.' and ':' are replaced by '-'.
- name: #@ "{}-health-ping".format(instance.host).replace('.', '-').replace(':', '-')
  image: curlimages/curl:7.73.0
  command:
  - "watch"
  - "-n"
  - "30"
  args:
  #! NOTE(review): without `--fail`, curl exits 0 on an HTTP 500 response, so the
  #! `echo` fallback only fires on transport errors - confirm this is intended.
  - #@ "curl -Lk https://{}/config/server/healthcheck~status || echo 'Healthcheck failed'".format(instance.host)
  resources:
    limits:
      cpu: 50m
      memory: 128Mi
    requests:
      cpu: 10m
      memory: 32Mi
#@ end
#@ end
|
||||
|
||||
## Prometheus server container image
|
||||
##
|
||||
|
|
|
@ -12,6 +12,7 @@ gerritServers:
|
|||
- host: gerrit.example.com
|
||||
username: admin
|
||||
password: secret
|
||||
healthcheck: false
|
||||
promtail:
|
||||
storagePath: /var/promtail
|
||||
logPath: /var/gerrit/logs
|
||||
|
|
71
dashboards/gerrit/healthcheck/gerrit-healthcheck.jsonnet
Normal file
71
dashboards/gerrit/healthcheck/gerrit-healthcheck.jsonnet
Normal file
|
@ -0,0 +1,71 @@
|
|||
local grafana = import '../../../vendor/grafonnet/grafana.libsonnet';
|
||||
local dashboard = grafana.dashboard;
|
||||
local row = grafana.row;
|
||||
local template = grafana.template;
|
||||
|
||||
local defaults = import '../../globals/defaults.libsonnet';
|
||||
local gridPos = import '../../globals/grid_pos.libsonnet';
|
||||
local publishVariables = import '../../globals/publish.libsonnet';
|
||||
local variables = import '../globals/variables.libsonnet';
|
||||
|
||||
local current_healthcheck_panel = import './panels/current-healthcheck.libsonnet';
|
||||
local timeseries_healthcheck_panel = import './panels/timeseries-healthcheck.libsonnet';
|
||||
|
||||
// Dashboard showing the state of the Gerrit healthcheck plugin's checks.
//
// The set of checks is not hard-coded: the `check` template variable derives
// it from the `plugins_healthcheck_<check>_failure_total` metric names, and
// both panels are repeated once per selected value of `check`.
dashboard.new(
  'Gerrit - Healthcheck',
  tags=['gerrit'],
  schemaVersion=defaults.dashboards.schemaVersion,
  editable=defaults.dashboards.editable,
  time_from=defaults.dashboards.timeFrom,
  time_to=defaults.dashboards.timeTo,
  refresh=defaults.dashboards.refresh,
  graphTooltip='shared_tooltip',
)
.addTemplate(variables.instance)
.addTemplate(variables.replica)
.addTemplate(
  template.new(
    name='check',
    // Use the shared default datasource, as the healthcheck panels do,
    // instead of a hard-coded datasource name.
    datasource=defaults.datasource,
    query='metrics(^plugins_healthcheck_.+_failure_total$)',
    // Keep only the check name from the metric name.
    regex='plugins_healthcheck_(.+)_failure_total',
    label='Check',
    multi=true,
    includeAll=true,
    refresh='time',
  )
)
.addPanel(
  row.new(title='CURRENT'),
  gridPos={x: 0, y: 0},
)
.addPanel(
  // One stat panel per check, laid out horizontally.
  current_healthcheck_panel.new() + {
    repeat: 'check',
    repeatDirection: 'h',
    maxPerRow: 8,
  },
  gridPos={w: 3, h: 6})
.addPanel(
  row.new(title='OVER TIME'),
  gridPos={x: 0, y: 6},
)
.addPanel(
  // One graph panel per check, laid out horizontally.
  timeseries_healthcheck_panel.new() + {
    repeat: 'check',
    repeatDirection: 'h',
    maxPerRow: 3,
  },
  gridPos={x: 0, y: 6, w: 8, h: 6})
// Mix in the publish-specific fields only when the `publish` external variable is true.
+ if std.extVar('publish') then publishVariables else {}
|
|
@ -0,0 +1,50 @@
|
|||
local grafana = import '../../../../vendor/grafonnet/grafana.libsonnet';
|
||||
local statPanel = grafana.statPanel;
|
||||
local prometheus = grafana.prometheus;
|
||||
|
||||
local defaults = import '../../../globals/defaults.libsonnet';
|
||||
|
||||
{
  // Builds the stat panel for the current state of the check selected by the
  // `check` dashboard variable.
  new()::
    // 1 when the failure counter did not increase within the last 2 minutes,
    // 0 when it increased at least once (the increase is clamped to 1).
    local statusQuery = '1-clamp_max(increase(plugins_healthcheck_${check}_failure_total{instance="$instance",replica="$replica"}[2m]), 1)';

    // Maps a single panel value to the text displayed in its place.
    local valueMapping(id, value, text) = {
      from: '',
      id: id,
      text: text,
      to: '',
      type: 1,
      value: value,
    };

    // Red by default, green once the value reaches 1.
    local statusThresholds = [
      { color: 'dark-red', value: null },
      { color: 'semi-dark-green', value: 1 },
    ];

    statPanel.new(
      title='',
      displayName='${check}',
      datasource=defaults.datasource,
      colorMode='background',
      graphMode='none',
      decimals=2,
    )
    .addTarget(prometheus.target(statusQuery, instant=true))
    .addThresholds(statusThresholds)
    .addMappings([
      valueMapping(1, '1', 'ok'),
      valueMapping(2, '0', 'failed'),
    ]),
}
|
|
@ -0,0 +1,22 @@
|
|||
local grafana = import '../../../../vendor/grafonnet/grafana.libsonnet';
|
||||
local graphPanel = grafana.graphPanel;
|
||||
local prometheus = grafana.prometheus;
|
||||
|
||||
local defaults = import '../../../globals/defaults.libsonnet';
|
||||
|
||||
{
  // Builds the graph panel plotting the state over time of the check selected
  // by the `check` dashboard variable.
  new()::
    // 1 when the failure counter did not increase within the preceding
    // 2 minutes, 0 when it increased at least once (the increase is clamped to 1).
    local statusQuery = '1-clamp_max(increase(plugins_healthcheck_${check}_failure_total{instance="$instance",replica="$replica"}[2m]), 1)';

    graphPanel.new(
      title='${check}',
      datasource=defaults.datasource,
      // The series only takes the values 0 and 1, so fix the axis to that range.
      min=0,
      max=1,
      decimals=0,
      fill=5,
    )
    .addTarget(prometheus.target(statusQuery, legendFormat='${check}')),
}
|
Loading…
Reference in a new issue