diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1521c8b --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +dist diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..a8be45b --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, 
in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + +Copyright 2020 The Android Open Source Project + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/README.md b/README.md new file mode 100644 index 0000000..25566c7 --- /dev/null +++ b/README.md @@ -0,0 +1,189 @@ +# Monitoring setup for Gerrit + +This project provides a setup for monitoring Gerrit instances. The setup is +based on Prometheus and Grafana running in Kubernetes. In addition, logging will +be provided by Grafana Loki. + +The setup is provided as a helm chart. It can be installed using Helm +(This README expects Helm version 3.0 or higher). 
+ +The charts used in this setup are publicly available open-source charts, which can be +found on GitHub: + +- [Prometheus](https://github.com/helm/charts/tree/master/stable/prometheus) +- [Grafana](https://github.com/helm/charts/tree/master/stable/grafana) +- [Loki](https://github.com/grafana/loki/tree/master/production/helm/loki) + +This project just provides `values.yaml`-files that are already configured to +work with the `metrics-reporter-prometheus`-plugin of Gerrit to make the setup +easier. + +## Dependencies + +- Gerrit \ +Gerrit requires the following plugin to be installed: + - [metrics-reporter-prometheus](https://gerrit.googlesource.com/plugins/metrics-reporter-prometheus/) + +- Promtail \ +Promtail has to be installed with access to the `logs`-directory in the Gerrit- +site. A configuration-file for Promtail will be provided in this setup. Find +the documentation for Promtail +[here](https://github.com/grafana/loki/blob/master/docs/clients/promtail/README.md) + +- Helm \ +To install and configure Helm, follow the +[official guide](https://helm.sh/docs/intro/quickstart/#install-helm). + +- ytt \ +ytt is a templating tool for yaml-files. It is required for some last-minute +configuration. Installation instructions can be found +[here](https://k14s.io/#install-from-github-release). + +- yq \ +yq is a commandline processor for yaml-files. Installation instructions can be +found [here](https://mikefarah.gitbook.io/yq/). + +## Add dashboards + +To have dashboards deployed automatically during installation, export the dashboards +to a JSON-file or create JSON-files describing the dashboards in another way. +Put these dashboards into the `./dashboards`-directory of this repository. During +the installation the dashboards will be added to a configmap and thereby +automatically installed in Grafana. 
+ +## Configuration + +While this project is supposed to provide a specialized and opinionated monitoring +setup, some configuration is highly dependent on the specific installation. +These options have to be configured in the `./config.yaml` before installing and +are listed here: + +| option | description | +|-----------------------------------------|------------------------------------------------------------------------------------| +| `gerritServers.[0].host` | Hostname (incl. port, if required) of the Gerrit server to monitor | +| `gerritServers.[0].username` | Username of Gerrit user with 'View Metrics' capabilities | +| `gerritServers.[0].password` | Password of Gerrit user with 'View Metrics' capabilities | +| `namespace` | The namespace the charts are installed to | +| `tls.skipVerify` | Whether to skip TLS certificate verification | +| `tls.caCert` | CA certificate used for TLS certificate verification | +| `promtail.storagePath` | Path to directory, where Promtail is allowed to save files (e.g. `positions.yaml`) | +| `promtail.logPath` | Path to directory containing the Gerrit logs (e.g. 
`/var/gerrit/logs`) | +| `prometheus.server.host` | Prometheus server ingress hostname | +| `prometheus.server.username` | Username for Prometheus | +| `prometheus.server.password` | Password for Prometheus | +| `prometheus.server.tls.cert` | TLS certificate | +| `prometheus.server.tls.key` | TLS key | +| `prometheus.alertmanager.slack.apiUrl` | API URL of the Slack Webhook | +| `prometheus.alertmanager.slack.channel` | Channel to which the alerts should be posted | +| `loki.host` | Loki ingress hostname | +| `loki.username` | Username for Loki | +| `loki.password` | Password for Loki | +| `loki.tls.cert` | TLS certificate | +| `loki.tls.key` | TLS key | +| `grafana.host` | Grafana ingress hostname | +| `grafana.tls.cert` | TLS certificate | +| `grafana.tls.key` | TLS key | +| `grafana.admin.username` | Username for the admin user | +| `grafana.admin.password` | Password for the admin user | +| `grafana.ldap.enabled` | Whether to enable LDAP | +| `grafana.ldap.host` | Hostname of LDAP server | +| `grafana.ldap.port` | Port of LDAP server (Has to be `quoted`!) | +| `grafana.ldap.password` | Password of LDAP server | +| `grafana.ldap.bind_dn` | Bind DN (username) of the LDAP server | +| `grafana.ldap.accountBases` | List of base DNs to discover accounts (Has to have the format `"['a', 'b']"`) | +| `grafana.ldap.groupBases` | List of base DNs to discover groups (Has to have the format `"['a', 'b']"`) | +| `grafana.dashboards.editable` | Whether dashboards can be edited manually in the UI | + +### Encryption + +The configuration file contains secrets. Thus, to be able to share the configuration, +e.g. with the CI-system, it is meant to be encrypted. The encryption is explained +[here](./documentation/config-management.md). + +The `./install.sh`-script will decrypt the file before templating, if it was +encrypted with `sops`. 
+ +## Installation + +Before beginning with the installation, ensure that the local helm repository is +up-to-date: + +```sh +helm repo add loki https://grafana.github.io/loki/charts +helm repo update +``` + +This project provides a script to quickly install the monitoring setup. To use +it, run: + +```sh +./install.sh \ + [--output ./dist] \ + [--dryrun] \ + config.yaml +``` + +The command will use the given configuration to create the final +files in the directory given by `--output` (default `./dist`) and install/update +the Kubernetes resources and charts, if the `--dryrun` flag is not set. + +## Configure Promtail + +Promtail has to be installed with access to the directory containing the Gerrit +logs, e.g. on the same host. The installation as described above will create a +configuration file for Promtail, which can be found in `./dist/promtail.yaml`. +Use it to configure Promtail by using the `-config.file=./dist/promtail.yaml`- +parameter, when starting Promtail. Using the Promtail binary directly this would +result in the following command: + +```sh +$PATH_TO_PROMTAIL/promtail \ + -config.file=./dist/promtail.yaml \ + -client.external-labels=host=$(hostname) +``` + +The `-client.external-labels=host=$(hostname)` option will add a label to each job +that contains the hostname. This is useful if multiple hosts are scraped for logs +and only one Grafana is used to view the logs. + +If TLS-verification is activated, the CA-certificate used for verification +(usually the one configured for `tls.caCert`) has to be present in the +directory configured for `promtail.storagePath` in the `config.yaml` and has to +be called `promtail.ca.crt`. + +The Promtail configuration provided here expects the logs to be available in +JSON-format. This can be configured by setting `log.jsonLogging = true` in the +`gerrit.config`. 
+ +## Uninstallation + +To remove the Prometheus, Loki and Grafana charts from the cluster, run + +```sh +helm uninstall prometheus --namespace $NAMESPACE +helm uninstall loki --namespace $NAMESPACE +helm uninstall grafana --namespace $NAMESPACE +kubectl delete -f ./dist/configuration +``` + +To also release the volumes, run + +```sh +kubectl delete -f ./dist/storage +``` + +NOTE: Doing so will delete all data that was not backed up! + +Remove the namespace: + +```sh +kubectl delete -f ./dist/namespace.yaml +``` + +The `./uninstall.sh`-script will automatically remove the charts installed +by the `./install.sh`-script from the configured namespace and delete the +namespace as well: + +```sh +./uninstall.sh config.yaml +``` diff --git a/charts/grafana/Version b/charts/grafana/Version new file mode 100644 index 0000000..50e2274 --- /dev/null +++ b/charts/grafana/Version @@ -0,0 +1 @@ +5.0.3 diff --git a/charts/grafana/configuration/grafana.ca.secret.yaml b/charts/grafana/configuration/grafana.ca.secret.yaml new file mode 100644 index 0000000..9853ce7 --- /dev/null +++ b/charts/grafana/configuration/grafana.ca.secret.yaml @@ -0,0 +1,12 @@ +#@ load("@ytt:data", "data") +#@ load("@ytt:base64", "base64") +#@ if data.values.grafana.ldap.enabled and not data.values.tls.skipVerify: +apiVersion: v1 +kind: Secret +metadata: + name: grafana-ca + namespace: #@ data.values.namespace +data: + server.ca.crt: #@ base64.encode(data.values.tls.caCert) +type: Opaque +#@ end diff --git a/charts/grafana/configuration/grafana.secret.yaml b/charts/grafana/configuration/grafana.secret.yaml new file mode 100644 index 0000000..a6b0338 --- /dev/null +++ b/charts/grafana/configuration/grafana.secret.yaml @@ -0,0 +1,15 @@ +#@ load("@ytt:data", "data") +#@ load("@ytt:base64", "base64") +#@ load("ldap.lib.txt", "format_ldap_toml") +apiVersion: v1 +kind: Secret +metadata: + name: grafana-credentials + namespace: #@ data.values.namespace +data: + admin-user: #@ base64.encode(data.values.grafana.admin.username) 
+ admin-password: #@ base64.encode(data.values.grafana.admin.password) + #@ if data.values.grafana.ldap.enabled: + ldap-toml: #@ base64.encode(format_ldap_toml()) + #@ end +type: Opaque diff --git a/charts/grafana/configuration/grafana.tls.secret.yaml b/charts/grafana/configuration/grafana.tls.secret.yaml new file mode 100644 index 0000000..d32d491 --- /dev/null +++ b/charts/grafana/configuration/grafana.tls.secret.yaml @@ -0,0 +1,11 @@ +#@ load("@ytt:data", "data") +#@ load("@ytt:base64", "base64") +apiVersion: v1 +kind: Secret +metadata: + name: grafana-server-tls + namespace: #@ data.values.namespace +type: kubernetes.io/tls +data: + tls.crt: #@ base64.encode(data.values.grafana.tls.cert) + tls.key: #@ base64.encode(data.values.grafana.tls.key) diff --git a/charts/grafana/configuration/ldap.lib.txt b/charts/grafana/configuration/ldap.lib.txt new file mode 100644 index 0000000..67a3450 --- /dev/null +++ b/charts/grafana/configuration/ldap.lib.txt @@ -0,0 +1,27 @@ +(@ load("@ytt:data", "data") @) +(@ def format_ldap_toml(): -@) +[[servers]] + +host = "(@= data.values.grafana.ldap.host @)" +port = (@= data.values.grafana.ldap.port @) +use_ssl = true +start_tls = false +ssl_skip_verify = (@= "{}".format(data.values.tls.skipVerify).lower() @) +root_ca_cert = "/etc/secrets/server.ca.crt" +bind_dn = "(@= data.values.grafana.ldap.bind_dn @)" +bind_password = "(@= data.values.grafana.ldap.password @)" +search_filter = "(cn=%s)" +search_base_dns = (@= data.values.grafana.ldap.accountBases @) +group_search_filter = "(cn=%s)" +group_search_base_dns = (@= data.values.grafana.ldap.groupBases @) + +[[servers.group_mappings]] +group_dn = "*" +org_role = "Editor" + +[servers.attributes] +name = "givenName" +surname = "sn" +username = "cn" + +(@- end @) diff --git a/charts/grafana/grafana.yaml b/charts/grafana/grafana.yaml new file mode 100644 index 0000000..d9bee2a --- /dev/null +++ b/charts/grafana/grafana.yaml @@ -0,0 +1,503 @@ +#@ load("@ytt:data", "data") + +rbac: + create: 
true + pspEnabled: true + pspUseAppArmor: true + namespaced: false + extraRoleRules: [] + # - apiGroups: [] + # resources: [] + # verbs: [] + extraClusterRoleRules: [] + # - apiGroups: [] + # resources: [] + # verbs: [] +serviceAccount: + create: true + name: + nameTest: +# annotations: + +replicas: 1 + +## See `kubectl explain poddisruptionbudget.spec` for more +## ref: https://kubernetes.io/docs/tasks/run-application/configure-pdb/ +podDisruptionBudget: {} +# minAvailable: 1 +# maxUnavailable: 1 + +## See `kubectl explain deployment.spec.strategy` for more +## ref: https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#strategy +deploymentStrategy: + type: RollingUpdate + +readinessProbe: + httpGet: + path: /api/health + port: 3000 + +livenessProbe: + httpGet: + path: /api/health + port: 3000 + initialDelaySeconds: 60 + timeoutSeconds: 30 + failureThreshold: 10 + +## Use an alternate scheduler, e.g. "stork". +## ref: https://kubernetes.io/docs/tasks/administer-cluster/configure-multiple-schedulers/ +## +# schedulerName: "default-scheduler" + +image: + repository: grafana/grafana + tag: 6.6.2 + pullPolicy: IfNotPresent + + ## Optionally specify an array of imagePullSecrets. + ## Secrets must be manually created in the namespace. 
+ ## ref: https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/ + ## + # pullSecrets: + # - myRegistrKeySecretName + +testFramework: + enabled: true + image: "bats/bats" + tag: "v1.1.0" + securityContext: {} + +securityContext: + runAsUser: 472 + fsGroup: 472 + + +extraConfigmapMounts: [] + # - name: certs-configmap + # mountPath: /etc/grafana/ssl/ + # subPath: certificates.crt # (optional) + # configMap: certs-configmap + # readOnly: true + + +extraEmptyDirMounts: [] + # - name: provisioning-notifiers + # mountPath: /etc/grafana/provisioning/notifiers + + +## Assign a PriorityClassName to pods if set +# priorityClassName: + +downloadDashboardsImage: + repository: curlimages/curl + tag: 7.68.0 + pullPolicy: IfNotPresent + +downloadDashboards: + env: {} + +## Pod Annotations +# podAnnotations: {} + +## Pod Labels +# podLabels: {} + +podPortName: grafana + +## Deployment annotations +# annotations: {} + +## Expose the grafana service to be accessed from outside the cluster (LoadBalancer service). +## or access it from within the cluster (ClusterIP service). Set the service type and the port to serve it. +## ref: http://kubernetes.io/docs/user-guide/services/ +## +service: + type: ClusterIP + port: 80 + targetPort: 3000 + # targetPort: 4181 To be used with a proxy extraContainer + annotations: {} + labels: {} + portName: service + +ingress: + enabled: true + annotations: + kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: / + hosts: + - #@ data.values.grafana.host + ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services. 
+ extraPaths: [] + # - path: /* + # backend: + # serviceName: ssl-redirect + # servicePort: use-annotation + tls: + - secretName: grafana-server-tls + hosts: + - #@ data.values.grafana.host + +resources: + limits: + cpu: 100m + memory: 128Mi + requests: + cpu: 100m + memory: 128Mi + +## Node labels for pod assignment +## ref: https://kubernetes.io/docs/user-guide/node-selection/ +# +nodeSelector: {} + +## Tolerations for pod assignment +## ref: https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/ +## +tolerations: [] + +## Affinity for pod assignment +## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity +## +affinity: {} + +extraInitContainers: [] + +## Enable and specify a container in extraContainers. This is meant to allow adding an authentication proxy to a grafana pod +extraContainers: | +# - name: proxy +# image: quay.io/gambol99/keycloak-proxy:latest +# args: +# - -provider=github +# - -client-id= +# - -client-secret= +# - -github-org= +# - -email-domain=* +# - -cookie-secret= +# - -http-address=http://0.0.0.0:4181 +# - -upstream-url=http://127.0.0.1:3000 +# ports: +# - name: proxy-web +# containerPort: 4181 + +## Enable persistence using Persistent Volume Claims +## ref: http://kubernetes.io/docs/user-guide/persistent-volumes/ +## +persistence: + type: pvc + enabled: true + # storageClassName: default + accessModes: + - ReadWriteOnce + size: 10Gi + # annotations: {} + finalizers: + - kubernetes.io/pvc-protection + # subPath: "" + existingClaim: grafana-pvc + +initChownData: + ## If false, data ownership will not be reset at startup + ## This allows the Grafana container to be run with an arbitrary user + ## + enabled: true + + ## initChownData container image + ## + image: + repository: busybox + tag: "1.31.1" + pullPolicy: IfNotPresent + + ## initChownData resource requests and limits + ## Ref: http://kubernetes.io/docs/user-guide/compute-resources/ + ## + resources: + limits: + cpu: 100m + 
memory: 128Mi + requests: + cpu: 100m + memory: 128Mi + + +# Administrator credentials when not using an existing secret (see below) +adminUser: admin +# adminPassword: strongpassword + +# Use an existing secret for the admin user. +admin: + existingSecret: "grafana-credentials" + userKey: admin-user + passwordKey: admin-password + +## Define command to be executed at startup by grafana container +## Needed if using `vault-env` to manage secrets (ref: https://banzaicloud.com/blog/inject-secrets-into-pods-vault/) +## Default is "run.sh" as defined in grafana's Dockerfile +# command: +# - "sh" +# - "/run.sh" + +## Use an alternate scheduler, e.g. "stork". +## ref: https://kubernetes.io/docs/tasks/administer-cluster/configure-multiple-schedulers/ +## +# schedulerName: + +## Extra environment variables that will be passed on to deployment pods +env: {} + +## "valueFrom" environment variable references that will be added to deployment pods +## ref: https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.17/#envvarsource-v1-core +## Renders in container spec as: +## env: +## ... +## - name: +## valueFrom: +## +envValueFrom: {} + +## The name of a secret in the same kubernetes namespace which contains values to be added to the environment +## This can be useful for auth tokens, etc +envFromSecret: "" + +## Sensitive environment variables that will be rendered as a new secret object +## This can be useful for auth tokens, etc +envRenderSecret: {} + +## Additional grafana server secret mounts +# Defines additional mounts with secrets. Secrets must be manually created in the namespace. +extraSecretMounts: +#@ if data.values.grafana.ldap.enabled and not data.values.tls.skipVerify: + - name: tls-ca + mountPath: /etc/secrets + secretName: grafana-ca + readOnly: true +#@ end + +## Additional grafana server volume mounts +# Defines additional volume mounts. 
+extraVolumeMounts: [] + # - name: extra-volume + # mountPath: /mnt/volume + # readOnly: true + # existingClaim: volume-claim + +## Pass the plugins you want installed as a list. +## +plugins: [] + # - digrich-bubblechart-panel + # - grafana-clock-panel + +## Configure grafana datasources +## ref: http://docs.grafana.org/administration/provisioning/#datasources +## +datasources: + datasources.yaml: + apiVersion: 1 + datasources: + - name: Prometheus + type: prometheus + url: #@ "http://prometheus-{}-server.{}.svc.cluster.local".format(data.values.namespace, data.values.namespace) + access: proxy + isDefault: true + - name: LokiLogQL + type: loki + url: #@ "http://loki-{}.{}.svc.cluster.local:3100".format(data.values.namespace, data.values.namespace) + access: proxy + isDefault: false + - name: LokiPromQL + type: prometheus + url: #@ "http://loki-{}.{}.svc.cluster.local:3100/loki".format(data.values.namespace, data.values.namespace) + access: proxy + isDefault: false + +## Configure notifiers +## ref: http://docs.grafana.org/administration/provisioning/#alert-notification-channels +## +notifiers: {} +# notifiers.yaml: +# notifiers: +# - name: email-notifier +# type: email +# uid: email1 +# # either: +# org_id: 1 +# # or +# org_name: Main Org. 
+# is_default: true +# settings: +# addresses: an_email_address@example.com +# delete_notifiers: + +## Configure grafana dashboard providers +## ref: http://docs.grafana.org/administration/provisioning/#dashboards +## +## `path` must be /var/lib/grafana/dashboards/ +## +dashboardProviders: + dashboardproviders.yaml: + apiVersion: 1 + providers: + - name: 'gerrit' + orgId: 1 + folder: '' + type: file + disableDeletion: true + editable: #@ data.values.grafana.dashboards.editable + updateIntervalSeconds: 60 + allowUiUpdates: #@ data.values.grafana.dashboards.editable + options: + path: /var/lib/grafana/dashboards/gerrit + +## Configure grafana dashboard to import +## NOTE: To use dashboards you must also enable/configure dashboardProviders +## ref: https://grafana.com/dashboards +## +## dashboards per provider, use provider name as key. +## +dashboards: {} + # default: + # some-dashboard: + # json: | + # $RAW_JSON + # custom-dashboard: + # file: dashboards/custom-dashboard.json + # prometheus-stats: + # gnetId: 2 + # revision: 2 + # datasource: Prometheus + # local-dashboard: + # url: https://example.com/repository/test.json + # local-dashboard-base64: + # url: https://example.com/repository/test-b64.json + # b64content: true + +## Reference to external ConfigMap per provider. Use provider name as key and ConfigMap name as value. +## A provider's dashboards must be defined either by external ConfigMaps or in values.yaml, not in both. 
+## ConfigMap data example: +## +## data: +## example-dashboard.json: | +## RAW_JSON +## +dashboardsConfigMaps: + gerrit: "grafana-dashboards" + +## Grafana's primary configuration +## NOTE: values in map will be converted to ini format +## ref: http://docs.grafana.org/installation/configuration/ +## +grafana.ini: + paths: + data: /var/lib/grafana/data + logs: /var/log/grafana + plugins: /var/lib/grafana/plugins + provisioning: /etc/grafana/provisioning + analytics: + check_for_updates: true + log: + mode: console + grafana_net: + url: https://grafana.net + users: + auto_assign_org_role: Editor +## LDAP Authentication can be enabled with the following values on grafana.ini +## NOTE: Grafana will fail to start if the value for ldap.toml is invalid + auth.ldap: + enabled: #@ data.values.grafana.ldap.enabled + allow_sign_up: true + config_file: /etc/grafana/ldap.toml + +## Grafana's LDAP configuration +## Templated by the template in _helpers.tpl +## NOTE: To enable the grafana.ini must be configured with auth.ldap.enabled +## ref: http://docs.grafana.org/installation/configuration/#auth-ldap +## ref: http://docs.grafana.org/installation/ldap/#configuration +ldap: + enabled: #@ data.values.grafana.ldap.enabled + # `existingSecret` is a reference to an existing secret containing the ldap configuration + # for Grafana in a key `ldap-toml`. 
+ existingSecret: "grafana-credentials" + # `config` is the content of `ldap.toml` that will be stored in the created secret + # config: "" + # config: |- + # verbose_logging = true + + # [[servers]] + # host = "my-ldap-server" + # port = 636 + # use_ssl = true + # start_tls = false + # ssl_skip_verify = false + # bind_dn = "uid=%s,ou=users,dc=myorg,dc=com" + +## Grafana's SMTP configuration +## NOTE: To enable, grafana.ini must be configured with smtp.enabled +## ref: http://docs.grafana.org/installation/configuration/#smtp +smtp: + # `existingSecret` is a reference to an existing secret containing the smtp configuration + # for Grafana. + existingSecret: "" + userKey: "user" + passwordKey: "password" + +## Sidecars that collect the configmaps with the specified label and store the included files in the respective folders +## Requires at least Grafana 5 to work and can't be used together with parameters dashboardProviders, datasources and dashboards +sidecar: + image: kiwigrid/k8s-sidecar:0.1.99 + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: 100m + memory: 100Mi + requests: + cpu: 50m + memory: 50Mi + # skipTlsVerify Set to true to skip tls verification for kube api calls + # skipTlsVerify: true + dashboards: + enabled: false + ## Method to use to detect ConfigMap changes. With WATCH the sidecar will do a WATCH request, with SLEEP it will list all ConfigMaps, then sleep for 60 seconds. + watchMethod: WATCH + SCProvider: true + # label that the configmaps with dashboards are marked with + label: grafana_dashboard + # folder in the pod that should hold the collected dashboards (unless `defaultFolderName` is set) + folder: /tmp/dashboards + # The default folder name, it will create a subfolder under the `folder` and put dashboards in there instead + defaultFolderName: null + # If specified, the sidecar will search for dashboard config-maps inside this namespace. + # Otherwise the namespace in which the sidecar is running will be used.
+ # It's also possible to specify ALL to search in all namespaces + searchNamespace: null + # provider configuration that lets grafana manage the dashboards + provider: + # name of the provider, should be unique + name: sidecarProvider + # orgid as configured in grafana + orgid: 1 + # folder in which the dashboards should be imported in grafana + folder: '' + # type of the provider + type: file + # disableDelete to activate an import-only behaviour + disableDelete: false + # allow updating provisioned dashboards from the UI + allowUiUpdates: false + datasources: + enabled: false + ## Method to use to detect ConfigMap changes. With WATCH the sidecar will do a WATCH request, with SLEEP it will list all ConfigMaps, then sleep for 60 seconds. + watchMethod: WATCH + # label that the configmaps with datasources are marked with + label: grafana_datasource + # If specified, the sidecar will search for datasource config-maps inside this namespace. + # Otherwise the namespace in which the sidecar is running will be used.
+ # It's also possible to specify ALL to search in all namespaces + searchNamespace: null + +## Override the deployment namespace +## +namespaceOverride: "" diff --git a/charts/grafana/storage/grafana.pvc.yaml b/charts/grafana/storage/grafana.pvc.yaml new file mode 100644 index 0000000..9f4cfa2 --- /dev/null +++ b/charts/grafana/storage/grafana.pvc.yaml @@ -0,0 +1,15 @@ +#@ load("@ytt:data", "data") +kind: PersistentVolumeClaim +apiVersion: v1 +metadata: + name: grafana-pvc + namespace: #@ data.values.namespace + labels: + app: grafana +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi + storageClassName: default diff --git a/charts/loki/Version b/charts/loki/Version new file mode 100644 index 0000000..d21d277 --- /dev/null +++ b/charts/loki/Version @@ -0,0 +1 @@ +0.25.0 diff --git a/charts/loki/configuration/loki.basic-auth.secret.yaml b/charts/loki/configuration/loki.basic-auth.secret.yaml new file mode 100644 index 0000000..8bc0b5f --- /dev/null +++ b/charts/loki/configuration/loki.basic-auth.secret.yaml @@ -0,0 +1,10 @@ +#@ load("@ytt:data", "data") +#@ load("@ytt:base64", "base64") +apiVersion: v1 +kind: Secret +metadata: + name: loki-basic-auth + namespace: #@ data.values.namespace +data: + auth: #@ base64.encode(data.values.loki.htpasswd) +type: Opaque diff --git a/charts/loki/configuration/loki.tls.secret.yaml b/charts/loki/configuration/loki.tls.secret.yaml new file mode 100644 index 0000000..b1a0db8 --- /dev/null +++ b/charts/loki/configuration/loki.tls.secret.yaml @@ -0,0 +1,11 @@ +#@ load("@ytt:data", "data") +#@ load("@ytt:base64", "base64") +apiVersion: v1 +kind: Secret +metadata: + name: loki-server-tls + namespace: #@ data.values.namespace +type: kubernetes.io/tls +data: + tls.crt: #@ base64.encode(data.values.loki.tls.cert) + tls.key: #@ base64.encode(data.values.loki.tls.key) diff --git a/charts/loki/loki.yaml b/charts/loki/loki.yaml new file mode 100644 index 0000000..62645bb --- /dev/null +++ b/charts/loki/loki.yaml 
@@ -0,0 +1,242 @@ +#@ load("@ytt:data", "data") + +image: + repository: grafana/loki + tag: v1.3.0 + pullPolicy: IfNotPresent + +ingress: + enabled: true + annotations: + kubernetes.io/ingress.class: nginx + nginx.ingress.kubernetes.io/auth-type: basic + nginx.ingress.kubernetes.io/auth-secret: loki-basic-auth + nginx.ingress.kubernetes.io/auth-realm: 'Authentication Required' + # kubernetes.io/tls-acme: "true" + hosts: + - host: #@ data.values.loki.host + paths: + - / + tls: + - secretName: loki-server-tls + hosts: + - #@ data.values.loki.host + +## Affinity for pod assignment +## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity +affinity: {} +# podAntiAffinity: +# requiredDuringSchedulingIgnoredDuringExecution: +# - labelSelector: +# matchExpressions: +# - key: app +# operator: In +# values: +# - loki +# topologyKey: "kubernetes.io/hostname" + +## StatefulSet annotations +annotations: {} + +# Enable tracing for debugging; requires Jaeger to be installed and the correct jaeger_agent_host to be specified +tracing: + jaegerAgentHost: + +config: + auth_enabled: false + ingester: + chunk_idle_period: 3m + chunk_block_size: 262144 + chunk_retain_period: 1m + max_transfer_retries: 0 + lifecycler: + ring: + kvstore: + store: inmemory + replication_factor: 1 + + ## Different ring configs can be used. E.g.
Consul + # ring: + # store: consul + # replication_factor: 1 + # consul: + # host: "consul:8500" + # prefix: "" + # httpclienttimeout: "20s" + # consistentreads: true + limits_config: + enforce_metric_name: false + reject_old_samples: true + reject_old_samples_max_age: 168h + schema_config: + configs: + - from: 2018-04-15 + store: boltdb + object_store: filesystem + schema: v9 + index: + prefix: index_ + period: 24h + chunks: + prefix: chunk_ + period: 24h + server: + http_listen_port: 3100 + storage_config: + boltdb: + directory: /data/loki/index + filesystem: + directory: /data/loki/chunks + chunk_store_config: + max_look_back_period: 0 + table_manager: + retention_deletes_enabled: true + retention_period: 336h + +## Additional Loki container arguments, e.g. log level (debug, info, warn, error) +extraArgs: {} + # log.level: debug + +livenessProbe: + httpGet: + path: /ready + port: http-metrics + initialDelaySeconds: 45 + +## ref: https://kubernetes.io/docs/concepts/services-networking/network-policies/ +networkPolicy: + enabled: false + +## The app name of loki clients +client: {} + # name: + +## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ +nodeSelector: {} + +## ref: https://kubernetes.io/docs/concepts/storage/persistent-volumes/ +## If you set enabled as "True", you need : +## - create a pv which above 10Gi and has same namespace with loki +## - keep storageClassName same with below setting +persistence: + enabled: true + accessModes: + - ReadWriteOnce + size: 10Gi + annotations: {} + # subPath: "" + # existingClaim: + +## Pod Labels +podLabels: {} + +## Pod Annotations +podAnnotations: + prometheus.io/scrape: "true" + prometheus.io/port: "http-metrics" + +podManagementPolicy: OrderedReady + +## Assign a PriorityClassName to pods if set +# priorityClassName: + +rbac: + create: true + pspEnabled: true + +readinessProbe: + httpGet: + path: /ready + port: http-metrics + initialDelaySeconds: 45 + +replicas: 1 + +resources: {} +# limits: 
+# cpu: 200m +# memory: 256Mi +# requests: +# cpu: 100m +# memory: 128Mi + +securityContext: + fsGroup: 10001 + runAsGroup: 10001 + runAsNonRoot: true + runAsUser: 10001 + +service: + type: ClusterIP + nodePort: + port: 3100 + annotations: {} + labels: {} + +serviceAccount: + create: true + name: + annotations: {} + +terminationGracePeriodSeconds: 4800 + +## Tolerations for pod assignment +## ref: https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/ +tolerations: [] + +# The values to set in the PodDisruptionBudget spec +# If not set then a PodDisruptionBudget will not be created +podDisruptionBudget: {} +# minAvailable: 1 +# maxUnavailable: 1 + +updateStrategy: + type: RollingUpdate + +serviceMonitor: + enabled: false + interval: "" + additionalLabels: {} + # scrapeTimeout: 10s + +initContainers: [] +## Init containers to be added to the loki pod. +# - name: my-init-container +# image: busybox:latest +# command: ['sh', '-c', 'echo hello'] + +extraContainers: [] +## Additional containers to be added to the loki pod. +# - name: reverse-proxy +# image: angelbarrera92/basic-auth-reverse-proxy:dev +# args: +# - "serve" +# - "--upstream=http://localhost:3100" +# - "--auth-config=/etc/reverse-proxy-conf/authn.yaml" +# ports: +# - name: http +# containerPort: 11811 +# protocol: TCP +# volumeMounts: +# - name: reverse-proxy-auth-config +# mountPath: /etc/reverse-proxy-conf + + +extraVolumes: [] +## Additional volumes to the loki pod. +# - name: reverse-proxy-auth-config +# secret: +# secretName: reverse-proxy-auth-config + +## Extra volume mounts that will be added to the loki container +extraVolumeMounts: [] + +extraPorts: [] +## Additional ports to the loki services. Useful to expose extra container ports. 
+# - port: 11811 +# protocol: TCP +# name: http +# targetPort: http + +# Extra env variables to pass to the loki container +env: [] diff --git a/charts/namespace.yaml b/charts/namespace.yaml new file mode 100644 index 0000000..53a9f7e --- /dev/null +++ b/charts/namespace.yaml @@ -0,0 +1,5 @@ +#@ load("@ytt:data", "data") +apiVersion: v1 +kind: Namespace +metadata: + name: #@ data.values.namespace diff --git a/charts/prometheus/VERSION b/charts/prometheus/VERSION new file mode 100644 index 0000000..4471145 --- /dev/null +++ b/charts/prometheus/VERSION @@ -0,0 +1 @@ +9.5.4 diff --git a/charts/prometheus/configuration/prometheus.basic-auth.secret.yaml b/charts/prometheus/configuration/prometheus.basic-auth.secret.yaml new file mode 100644 index 0000000..9c213df --- /dev/null +++ b/charts/prometheus/configuration/prometheus.basic-auth.secret.yaml @@ -0,0 +1,10 @@ +#@ load("@ytt:data", "data") +#@ load("@ytt:base64", "base64") +apiVersion: v1 +kind: Secret +metadata: + name: prometheus-basic-auth + namespace: #@ data.values.namespace +data: + auth: #@ base64.encode(data.values.prometheus.server.htpasswd) +type: Opaque diff --git a/charts/prometheus/configuration/prometheus.secret.yaml b/charts/prometheus/configuration/prometheus.secret.yaml new file mode 100644 index 0000000..01de104 --- /dev/null +++ b/charts/prometheus/configuration/prometheus.secret.yaml @@ -0,0 +1,19 @@ +#@ load("@ytt:data", "data") +#@ load("@ytt:base64", "base64") +apiVersion: v1 +kind: Secret +metadata: + name: prometheus-secrets + namespace: #@ data.values.namespace +data: + #@yaml/text-templated-strings + #@ for gerrit in data.values.gerritServers: + .pwd_(@= gerrit.host @): #@ base64.encode(gerrit.password) + #@ end + + #@ if not data.values.tls.skipVerify: + server.ca.crt: #@ base64.encode(data.values.tls.caCert) + server.crt: #@ base64.encode(data.values.prometheus.server.tls.cert) + server.key: #@ base64.encode(data.values.prometheus.server.tls.key) + #@ end +type: Opaque diff --git 
a/charts/prometheus/configuration/prometheus.tls.secret.yaml b/charts/prometheus/configuration/prometheus.tls.secret.yaml new file mode 100644 index 0000000..c9a834a --- /dev/null +++ b/charts/prometheus/configuration/prometheus.tls.secret.yaml @@ -0,0 +1,11 @@ +#@ load("@ytt:data", "data") +#@ load("@ytt:base64", "base64") +apiVersion: v1 +kind: Secret +metadata: + name: prometheus-server-tls + namespace: #@ data.values.namespace +type: kubernetes.io/tls +data: + tls.crt: #@ base64.encode(data.values.prometheus.server.tls.cert) + tls.key: #@ base64.encode(data.values.prometheus.server.tls.key) diff --git a/charts/prometheus/prometheus.yaml b/charts/prometheus/prometheus.yaml new file mode 100644 index 0000000..4df43f8 --- /dev/null +++ b/charts/prometheus/prometheus.yaml @@ -0,0 +1,1433 @@ +#@ load("@ytt:data", "data") + +rbac: + create: true + +podSecurityPolicy: + enabled: true + +imagePullSecrets: +# - name: "image-pull-secret" + +## Define serviceAccount names for components. Defaults to component's fully qualified name. +## +serviceAccounts: + alertmanager: + create: true + name: + kubeStateMetrics: + create: false + name: + nodeExporter: + create: false + name: + pushgateway: + create: false + name: + server: + create: true + name: + +alertmanager: + ## If false, alertmanager will not be installed + ## + enabled: true + + ## alertmanager container name + ## + name: alertmanager + + ## alertmanager container image + ## + image: + repository: prom/alertmanager + tag: v0.18.0 + pullPolicy: IfNotPresent + + ## alertmanager priorityClassName + ## + priorityClassName: "" + + ## Additional alertmanager container arguments + ## + extraArgs: {} + + ## The URL prefix at which the container can be accessed. Useful in the case the '-web.external-url' includes a slug + ## so that the various internal URLs are still able to access as they are in the default case. 
+ ## (Optional) + prefixURL: "" + + ## External URL which can access alertmanager + ## Maybe same with Ingress host name + baseURL: "/" + + ## Additional alertmanager container environment variable + ## For instance to add a http_proxy + ## + extraEnv: {} + + ## Additional alertmanager Secret mounts + # Defines additional mounts with secrets. Secrets must be manually created in the namespace. + extraSecretMounts: [] + # - name: secret-files + # mountPath: /etc/secrets + # subPath: "" + # secretName: alertmanager-secret-files + # readOnly: true + + ## ConfigMap override where fullname is {{.Release.Name}}-{{.Values.alertmanager.configMapOverrideName}} + ## Defining configMapOverrideName will cause templates/alertmanager-configmap.yaml + ## to NOT generate a ConfigMap resource + ## + configMapOverrideName: "" + + ## The name of a secret in the same kubernetes namespace which contains the Alertmanager config + ## Defining configFromSecret will cause templates/alertmanager-configmap.yaml + ## to NOT generate a ConfigMap resource + ## + configFromSecret: "" + + ## The configuration file name to be loaded to alertmanager + ## Must match the key within configuration loaded from ConfigMap/Secret + ## + configFileName: alertmanager.yml + + ingress: + ## If true, alertmanager Ingress will be created + ## + enabled: false + + ## alertmanager Ingress annotations + ## + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: 'true' + + ## alertmanager Ingress additional labels + ## + extraLabels: {} + + ## alertmanager Ingress hostnames with optional path + ## Must be provided if Ingress is enabled + ## + hosts: [] + # - alertmanager.domain.com + # - domain.com/alertmanager + + ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services. 
+ extraPaths: [] + # - path: /* + # backend: + # serviceName: ssl-redirect + # servicePort: use-annotation + + ## alertmanager Ingress TLS configuration + ## Secrets must be manually created in the namespace + ## + tls: [] + # - secretName: prometheus-alerts-tls + # hosts: + # - alertmanager.domain.com + + ## Alertmanager Deployment Strategy type + # strategy: + # type: Recreate + + ## Node tolerations for alertmanager scheduling to nodes with taints + ## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ + ## + tolerations: [] + # - key: "key" + # operator: "Equal|Exists" + # value: "value" + # effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)" + + ## Node labels for alertmanager pod assignment + ## Ref: https://kubernetes.io/docs/user-guide/node-selection/ + ## + nodeSelector: {} + + ## Pod affinity + ## + affinity: {} + + ## Use an alternate scheduler, e.g. "stork". + ## ref: https://kubernetes.io/docs/tasks/administer-cluster/configure-multiple-schedulers/ + ## + # schedulerName: + + persistentVolume: + ## If true, alertmanager will create/use a Persistent Volume Claim + ## If false, use emptyDir + ## + enabled: true + + ## alertmanager data Persistent Volume access modes + ## Must match those of existing PV or dynamic provisioner + ## Ref: http://kubernetes.io/docs/user-guide/persistent-volumes/ + ## + accessModes: + - ReadWriteOnce + + ## alertmanager data Persistent Volume Claim annotations + ## + annotations: {} + + ## alertmanager data Persistent Volume existing claim name + ## Requires alertmanager.persistentVolume.enabled: true + ## If defined, PVC must be created manually before volume will be bound + existingClaim: "" + + ## alertmanager data Persistent Volume mount root path + ## + mountPath: /data + + ## alertmanager data Persistent Volume size + ## + size: 2Gi + + ## alertmanager data Persistent Volume Storage Class + ## If defined, storageClassName: + ## If set to "-", storageClassName: "", which disables dynamic 
provisioning + ## If undefined (the default) or set to null, no storageClassName spec is + ## set, choosing the default provisioner. (gp2 on AWS, standard on + ## GKE, AWS & OpenStack) + ## + # storageClass: "-" + + ## Subdirectory of alertmanager data Persistent Volume to mount + ## Useful if the volume's root directory is not empty + ## + subPath: "" + + ## Annotations to be added to alertmanager pods + ## + podAnnotations: {} + + ## Specify if a Pod Security Policy for node-exporter must be created + ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/ + ## + podSecurityPolicy: + annotations: {} + ## Specify pod annotations + ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#apparmor + ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#seccomp + ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#sysctl + ## + # seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*' + # seccomp.security.alpha.kubernetes.io/defaultProfileName: 'docker/default' + # apparmor.security.beta.kubernetes.io/defaultProfileName: 'runtime/default' + + ## Use a StatefulSet if replicaCount needs to be greater than 1 (see below) + ## + replicaCount: 1 + + statefulSet: + ## If true, use a statefulset instead of a deployment for pod management. 
+ ## This allows to scale replicas to more than 1 pod + ## + enabled: false + + podManagementPolicy: OrderedReady + + ## Alertmanager headless service to use for the statefulset + ## + headless: + annotations: {} + labels: {} + + ## Enabling peer mesh service end points for enabling the HA alert manager + ## Ref: https://github.com/prometheus/alertmanager/blob/master/README.md + # enableMeshPeer : true + + servicePort: 80 + + ## alertmanager resource requests and limits + ## Ref: http://kubernetes.io/docs/user-guide/compute-resources/ + ## + resources: + limits: + cpu: 10m + memory: 32Mi + requests: + cpu: 10m + memory: 32Mi + + ## Security context to be added to alertmanager pods + ## + securityContext: + runAsUser: 65534 + runAsNonRoot: true + runAsGroup: 65534 + fsGroup: 65534 + + service: + annotations: {} + labels: {} + clusterIP: "" + + ## Enabling peer mesh service end points for enabling the HA alert manager + ## Ref: https://github.com/prometheus/alertmanager/blob/master/README.md + # enableMeshPeer : true + + ## List of IP addresses at which the alertmanager service is available + ## Ref: https://kubernetes.io/docs/user-guide/services/#external-ips + ## + externalIPs: [] + + loadBalancerIP: "" + loadBalancerSourceRanges: [] + servicePort: 80 + # nodePort: 30000 + sessionAffinity: None + type: ClusterIP + +## Monitors ConfigMap changes and POSTs to a URL +## Ref: https://github.com/jimmidyson/configmap-reload +## +configmapReload: + ## configmap-reload container name + ## + name: configmap-reload + + ## configmap-reload container image + ## + image: + repository: jimmidyson/configmap-reload + tag: v0.2.2 + pullPolicy: IfNotPresent + + ## Additional configmap-reload container arguments + ## + extraArgs: {} + ## Additional configmap-reload volume directories + ## + extraVolumeDirs: [] + + + ## Additional configmap-reload mounts + ## + extraConfigmapMounts: [] + # - name: prometheus-alerts + # mountPath: /etc/alerts.d + # subPath: "" + # configMap: 
prometheus-alerts + # readOnly: true + + + ## configmap-reload resource requests and limits + ## Ref: http://kubernetes.io/docs/user-guide/compute-resources/ + ## + resources: {} + +kubeStateMetrics: + ## If false, kube-state-metrics will not be installed + ## + enabled: false + + ## kube-state-metrics container name + ## + name: kube-state-metrics + + ## kube-state-metrics container image + ## + image: + repository: quay.io/coreos/kube-state-metrics + tag: v1.6.0 + pullPolicy: IfNotPresent + + ## kube-state-metrics priorityClassName + ## + priorityClassName: "" + + ## kube-state-metrics container arguments + ## + args: {} + + ## Node tolerations for kube-state-metrics scheduling to nodes with taints + ## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ + ## + tolerations: [] + # - key: "key" + # operator: "Equal|Exists" + # value: "value" + # effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)" + + ## Node labels for kube-state-metrics pod assignment + ## Ref: https://kubernetes.io/docs/user-guide/node-selection/ + ## + nodeSelector: {} + + ## Annotations to be added to kube-state-metrics pods + ## + podAnnotations: {} + + ## Specify if a Pod Security Policy for node-exporter must be created + ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/ + ## + podSecurityPolicy: + annotations: {} + ## Specify pod annotations + ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#apparmor + ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#seccomp + ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#sysctl + ## + # seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*' + # seccomp.security.alpha.kubernetes.io/defaultProfileName: 'docker/default' + # apparmor.security.beta.kubernetes.io/defaultProfileName: 'runtime/default' + + pod: + labels: {} + + replicaCount: 1 + + ## kube-state-metrics resource requests and limits + ## Ref: 
http://kubernetes.io/docs/user-guide/compute-resources/ + ## + resources: {} + # limits: + # cpu: 10m + # memory: 16Mi + # requests: + # cpu: 10m + # memory: 16Mi + + ## Security context to be added to kube-state-metrics pods + ## + securityContext: + runAsUser: 65534 + runAsNonRoot: true + + service: + annotations: + prometheus.io/scrape: "true" + labels: {} + + # Exposed as a headless service: + # https://kubernetes.io/docs/concepts/services-networking/service/#headless-services + clusterIP: None + + ## List of IP addresses at which the kube-state-metrics service is available + ## Ref: https://kubernetes.io/docs/user-guide/services/#external-ips + ## + externalIPs: [] + + loadBalancerIP: "" + loadBalancerSourceRanges: [] + servicePort: 80 + type: ClusterIP + +nodeExporter: + ## If false, node-exporter will not be installed + ## + enabled: false + + ## If true, node-exporter pods share the host network namespace + ## + hostNetwork: true + + ## If true, node-exporter pods share the host PID namespace + ## + hostPID: true + + ## node-exporter container name + ## + name: node-exporter + + ## node-exporter container image + ## + image: + repository: prom/node-exporter + tag: v0.18.0 + pullPolicy: IfNotPresent + + ## Specify if a Pod Security Policy for node-exporter must be created + ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/ + ## + podSecurityPolicy: + annotations: {} + ## Specify pod annotations + ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#apparmor + ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#seccomp + ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#sysctl + ## + # seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*' + # seccomp.security.alpha.kubernetes.io/defaultProfileName: 'docker/default' + # apparmor.security.beta.kubernetes.io/defaultProfileName: 'runtime/default' + + ## node-exporter priorityClassName + ## + priorityClassName: "" + + ## 
Custom Update Strategy + ## + updateStrategy: + type: RollingUpdate + + ## Additional node-exporter container arguments + ## + extraArgs: {} + + ## Additional node-exporter hostPath mounts + ## + extraHostPathMounts: [] + # - name: textfile-dir + # mountPath: /srv/txt_collector + # hostPath: /var/lib/node-exporter + # readOnly: true + # mountPropagation: HostToContainer + + extraConfigmapMounts: [] + # - name: certs-configmap + # mountPath: /prometheus + # configMap: certs-configmap + # readOnly: true + + ## Node tolerations for node-exporter scheduling to nodes with taints + ## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ + ## + tolerations: [] + # - key: "key" + # operator: "Equal|Exists" + # value: "value" + # effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)" + + ## Node labels for node-exporter pod assignment + ## Ref: https://kubernetes.io/docs/user-guide/node-selection/ + ## + nodeSelector: {} + + ## Annotations to be added to node-exporter pods + ## + podAnnotations: {} + + ## Labels to be added to node-exporter pods + ## + pod: + labels: {} + + ## node-exporter resource limits & requests + ## Ref: https://kubernetes.io/docs/user-guide/compute-resources/ + ## + resources: {} + # limits: + # cpu: 200m + # memory: 50Mi + # requests: + # cpu: 100m + # memory: 30Mi + + ## Security context to be added to node-exporter pods + ## + securityContext: {} + # runAsUser: 0 + + service: + annotations: + prometheus.io/scrape: "true" + labels: {} + + # Exposed as a headless service: + # https://kubernetes.io/docs/concepts/services-networking/service/#headless-services + clusterIP: None + + ## List of IP addresses at which the node-exporter service is available + ## Ref: https://kubernetes.io/docs/user-guide/services/#external-ips + ## + externalIPs: [] + + hostPort: 9100 + loadBalancerIP: "" + loadBalancerSourceRanges: [] + servicePort: 9100 + type: ClusterIP + +server: + ## Prometheus server container name + ## + enabled: true + name: 
server + sidecarContainers: + + ## Prometheus server container image + ## + image: + repository: prom/prometheus + tag: v2.13.1 + pullPolicy: IfNotPresent + + ## prometheus server priorityClassName + ## + priorityClassName: "" + + ## The URL prefix at which the container can be accessed. Useful in the case the '-web.external-url' includes a slug + ## so that the various internal URLs are still able to access as they are in the default case. + ## (Optional) + prefixURL: "" + + ## External URL at which the Prometheus server can be accessed + ## May be the same as the Ingress host name + baseURL: "" + + ## Additional server container environment variables + ## + ## You specify this manually like you would a raw deployment manifest. + ## This means you can bind in environment variables from secrets. + ## + ## e.g. static environment variable: + ## - name: DEMO_GREETING + ## value: "Hello from the environment" + ## + ## e.g. secret environment variable: + ## - name: USERNAME + ## valueFrom: + ## secretKeyRef: + ## name: mysecret + ## key: username + env: {} + + ## This flag controls access to the administrative HTTP API which includes functionality such as deleting time + ## series. This is disabled by default.
+ enableAdminApi: false + + ## This flag controls TSDB locking + skipTSDBLock: false + + ## Path to a configuration file on prometheus server container FS + configPath: /etc/config/prometheus.yml + + global: + ## How frequently to scrape targets by default + ## + scrape_interval: 1m + ## How long until a scrape request times out + ## + scrape_timeout: 10s + ## How frequently to evaluate rules + ## + evaluation_interval: 1m + + ## Additional Prometheus server container arguments + ## + extraArgs: {} + + ## Additional InitContainers to initialize the pod + ## + extraInitContainers: [] + + ## Additional Prometheus server Volume mounts + ## + extraVolumeMounts: [] + + ## Additional Prometheus server Volumes + ## + extraVolumes: [] + + ## Additional Prometheus server hostPath mounts + ## + extraHostPathMounts: [] + # - name: certs-dir + # mountPath: /etc/kubernetes/certs + # subPath: "" + # hostPath: /etc/kubernetes/certs + # readOnly: true + + extraConfigmapMounts: [] + # - name: certs-configmap + # mountPath: /prometheus + # subPath: "" + # configMap: certs-configmap + # readOnly: true + + ## Additional Prometheus server Secret mounts + # Defines additional mounts with secrets. Secrets must be manually created in the namespace.
+ extraSecretMounts: + - name: prometheus-secrets + mountPath: /etc/secrets + secretName: prometheus-secrets + readOnly: true + + ## ConfigMap override where fullname is {{.Release.Name}}-{{.Values.server.configMapOverrideName}} + ## Defining configMapOverrideName will cause templates/server-configmap.yaml + ## to NOT generate a ConfigMap resource + ## + configMapOverrideName: "" + + ingress: + ## If true, Prometheus server Ingress will be created + ## + enabled: true + + ## Prometheus server Ingress annotations + ## + annotations: + kubernetes.io/ingress.class: nginx + nginx.ingress.kubernetes.io/auth-type: basic + nginx.ingress.kubernetes.io/auth-secret: prometheus-basic-auth + nginx.ingress.kubernetes.io/auth-realm: 'Authentication Required' + # kubernetes.io/tls-acme: 'true' + + ## Prometheus server Ingress additional labels + ## + extraLabels: {} + + ## Prometheus server Ingress hostnames with optional path + ## Must be provided if Ingress is enabled + ## + hosts: + - #@ data.values.prometheus.server.host + # - prometheus.domain.com + # - domain.com/prometheus + + ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services. 
+ extraPaths: [] + # - path: /* + # backend: + # serviceName: ssl-redirect + # servicePort: use-annotation + + ## Prometheus server Ingress TLS configuration + ## Secrets must be manually created in the namespace + ## + tls: + - secretName: prometheus-server-tls + hosts: + - #@ data.values.prometheus.server.host + + ## Server Deployment Strategy type + # strategy: + # type: Recreate + + ## Node tolerations for server scheduling to nodes with taints + ## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ + ## + tolerations: [] + # - key: "key" + # operator: "Equal|Exists" + # value: "value" + # effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)" + + ## Node labels for Prometheus server pod assignment + ## Ref: https://kubernetes.io/docs/user-guide/node-selection/ + ## + nodeSelector: {} + + ## Pod affinity + ## + affinity: {} + + ## Use an alternate scheduler, e.g. "stork". + ## ref: https://kubernetes.io/docs/tasks/administer-cluster/configure-multiple-schedulers/ + ## + # schedulerName: + + persistentVolume: + ## If true, Prometheus server will create/use a Persistent Volume Claim + ## If false, use emptyDir + ## + enabled: true + + ## Prometheus server data Persistent Volume access modes + ## Must match those of existing PV or dynamic provisioner + ## Ref: http://kubernetes.io/docs/user-guide/persistent-volumes/ + ## + accessModes: + - ReadWriteOnce + + ## Prometheus server data Persistent Volume annotations + ## + annotations: {} + + ## Prometheus server data Persistent Volume existing claim name + ## Requires server.persistentVolume.enabled: true + ## If defined, PVC must be created manually before volume will be bound + existingClaim: "prometheus-server-pvc" + + ## Prometheus server data Persistent Volume mount root path + ## + mountPath: /data + + ## Prometheus server data Persistent Volume size + ## + size: 8Gi + + ## Prometheus server data Persistent Volume Storage Class + ## If defined, storageClassName: + ## If set to "-", 
storageClassName: "", which disables dynamic provisioning + ## If undefined (the default) or set to null, no storageClassName spec is + ## set, choosing the default provisioner. (gp2 on AWS, standard on + ## GKE, AWS & OpenStack) + ## + # storageClass: "-" + + ## Subdirectory of Prometheus server data Persistent Volume to mount + ## Useful if the volume's root directory is not empty + ## + subPath: "" + + emptyDir: + sizeLimit: "" + + ## Annotations to be added to Prometheus server pods + ## + podAnnotations: {} + # iam.amazonaws.com/role: prometheus + + ## Labels to be added to Prometheus server pods + ## + podLabels: {} + + ## Prometheus AlertManager configuration + ## + alertmanagers: [] + + ## Specify if a Pod Security Policy for the Prometheus server must be created + ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/ + ## + podSecurityPolicy: + annotations: {} + ## Specify pod annotations + ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#apparmor + ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#seccomp + ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#sysctl + ## + # seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*' + # seccomp.security.alpha.kubernetes.io/defaultProfileName: 'docker/default' + # apparmor.security.beta.kubernetes.io/defaultProfileName: 'runtime/default' + + ## Use a StatefulSet if replicaCount needs to be greater than 1 (see below) + ## + replicaCount: 1 + + statefulSet: + ## If true, use a statefulset instead of a deployment for pod management. 
+ ## This allows scaling replicas to more than 1 pod + ## + enabled: false + + annotations: {} + labels: {} + podManagementPolicy: OrderedReady + + ## Prometheus server headless service to use for the statefulset + ## + headless: + annotations: {} + labels: {} + servicePort: 80 + + ## Prometheus server readiness and liveness probe initial delay and timeout + ## Ref: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/ + ## + readinessProbeInitialDelay: 30 + readinessProbeTimeout: 30 + readinessProbeFailureThreshold: 3 + readinessProbeSuccessThreshold: 1 + livenessProbeInitialDelay: 30 + livenessProbeTimeout: 30 + livenessProbeFailureThreshold: 3 + livenessProbeSuccessThreshold: 1 + + ## Prometheus server resource requests and limits + ## Ref: http://kubernetes.io/docs/user-guide/compute-resources/ + ## + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 500m + memory: 512Mi + + ## Security context to be added to server pods + ## + securityContext: + runAsUser: 65534 + runAsNonRoot: true + runAsGroup: 65534 + fsGroup: 65534 + + service: + annotations: {} + labels: {} + clusterIP: "" + + ## List of IP addresses at which the Prometheus server service is available + ## Ref: https://kubernetes.io/docs/user-guide/services/#external-ips + ## + externalIPs: [] + + loadBalancerIP: "" + loadBalancerSourceRanges: [] + servicePort: 80 + sessionAffinity: None + type: ClusterIP + + ## Prometheus server pod termination grace period + ## + terminationGracePeriodSeconds: 300 + + ## Prometheus data retention period (default if not specified is 15 days) + ## + retention: "15d" + +pushgateway: + ## If false, pushgateway will not be installed + ## + enabled: false + + ## Use an alternate scheduler, e.g. "stork". 
+ ## ref: https://kubernetes.io/docs/tasks/administer-cluster/configure-multiple-schedulers/ + ## + # schedulerName: + + ## pushgateway container name + ## + name: pushgateway + + ## pushgateway container image + ## + image: + repository: prom/pushgateway + tag: v0.8.0 + pullPolicy: IfNotPresent + + ## pushgateway priorityClassName + ## + priorityClassName: "" + + ## Additional pushgateway container arguments + ## + ## for example: persistence.file: /data/pushgateway.data + extraArgs: {} + + ingress: + ## If true, pushgateway Ingress will be created + ## + enabled: false + + ## pushgateway Ingress annotations + ## + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: 'true' + + ## pushgateway Ingress hostnames with optional path + ## Must be provided if Ingress is enabled + ## + hosts: [] + # - pushgateway.domain.com + # - domain.com/pushgateway + + ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services. 
+ extraPaths: [] + # - path: /* + # backend: + # serviceName: ssl-redirect + # servicePort: use-annotation + + ## pushgateway Ingress TLS configuration + ## Secrets must be manually created in the namespace + ## + tls: [] + # - secretName: prometheus-alerts-tls + # hosts: + # - pushgateway.domain.com + + ## Node tolerations for pushgateway scheduling to nodes with taints + ## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ + ## + tolerations: [] + # - key: "key" + # operator: "Equal|Exists" + # value: "value" + # effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)" + + ## Node labels for pushgateway pod assignment + ## Ref: https://kubernetes.io/docs/user-guide/node-selection/ + ## + nodeSelector: {} + + ## Annotations to be added to pushgateway pods + ## + podAnnotations: {} + + ## Specify if a Pod Security Policy for node-exporter must be created + ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/ + ## + podSecurityPolicy: + annotations: {} + ## Specify pod annotations + ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#apparmor + ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#seccomp + ## Ref: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#sysctl + ## + # seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*' + # seccomp.security.alpha.kubernetes.io/defaultProfileName: 'docker/default' + # apparmor.security.beta.kubernetes.io/defaultProfileName: 'runtime/default' + + replicaCount: 1 + + ## pushgateway resource requests and limits + ## Ref: http://kubernetes.io/docs/user-guide/compute-resources/ + ## + resources: {} + # limits: + # cpu: 10m + # memory: 32Mi + # requests: + # cpu: 10m + # memory: 32Mi + + ## Security context to be added to push-gateway pods + ## + securityContext: + runAsUser: 65534 + runAsNonRoot: true + + service: + annotations: + prometheus.io/probe: pushgateway + labels: {} + clusterIP: "" + + ## List of IP addresses at 
which the pushgateway service is available + ## Ref: https://kubernetes.io/docs/user-guide/services/#external-ips + ## + externalIPs: [] + + loadBalancerIP: "" + loadBalancerSourceRanges: [] + servicePort: 9091 + type: ClusterIP + + ## pushgateway Deployment Strategy type + # strategy: + # type: Recreate + + persistentVolume: + ## If true, pushgateway will create/use a Persistent Volume Claim + ## If false, use emptyDir + ## + enabled: false + + ## pushgateway data Persistent Volume access modes + ## Must match those of existing PV or dynamic provisioner + ## Ref: http://kubernetes.io/docs/user-guide/persistent-volumes/ + ## + accessModes: + - ReadWriteOnce + + ## pushgateway data Persistent Volume Claim annotations + ## + annotations: {} + + ## pushgateway data Persistent Volume existing claim name + ## Requires pushgateway.persistentVolume.enabled: true + ## If defined, PVC must be created manually before volume will be bound + existingClaim: "" + + ## pushgateway data Persistent Volume mount root path + ## + mountPath: /data + + ## pushgateway data Persistent Volume size + ## + size: 2Gi + + ## pushgateway data Persistent Volume Storage Class + ## If defined, storageClassName: <storageClass> + ## If set to "-", storageClassName: "", which disables dynamic provisioning + ## If undefined (the default) or set to null, no storageClassName spec is + ## set, choosing the default provisioner. 
(gp2 on AWS, standard on + ## GKE, AWS & OpenStack) + ## + # storageClass: "-" + + ## Subdirectory of pushgateway data Persistent Volume to mount + ## Useful if the volume's root directory is not empty + ## + subPath: "" + + +## alertmanager ConfigMap entries +## +alertmanagerFiles: + alertmanager.yml: + global: + slack_api_url: #@ data.values.prometheus.alertmanager.slack.apiUrl + + receivers: + - name: gerrit-admin + slack_configs: + - channel: #@ data.values.prometheus.alertmanager.slack.channel + send_resolved: true + title: "{{ range .Alerts }}{{ .Annotations.summary }}\n{{ end }}" + text: "{{ range .Alerts }}{{ .Annotations.description }}\n{{ end }}" + + route: + group_wait: 10s + group_interval: 5m + receiver: gerrit-admin + repeat_interval: 3h + +## Prometheus server ConfigMap entries +## +serverFiles: + + ## Alerts configuration + ## Ref: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/ + alerting_rules.yml: + groups: + - name: Instances + rules: + - alert: InstanceDown + expr: up == 0 + for: 3m + labels: + severity: critical + annotations: + description: '{{ $labels.instance }} has been unresponsive for 3 min.' + summary: '{{ $labels.instance }} down' + - alert: SSHBatchUserScheduledTasks + expr: queue_ssh_batch_worker_scheduled_tasks > 200 + for: 5m + labels: + severity: warning + annotations: + description: > + More than 200 scheduled tasks for SSH batch workers on + {{ $labels.instance }} for 5 min. + summary: '{{ $labels.instance }}: High SSH batch user workload' + - alert: SSHInteractiveUserScheduledTasks + expr: queue_ssh_interactive_worker_scheduled_tasks > 200 + for: 5m + labels: + severity: warning + annotations: + description: > + More than 200 scheduled tasks for SSH interactive workers on + {{ $labels.instance }} for 5 min. 
+ summary: '{{ $labels.instance }}: High SSH interactive user workload' + ## DEPRECATED DEFAULT VALUE, unless explicitly naming your files, please use alerting_rules.yml + alerts: {} + + ## Records configuration + ## Ref: https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/ + recording_rules.yml: {} + ## DEPRECATED DEFAULT VALUE, unless explicitly naming your files, please use recording_rules.yml + rules: {} + + prometheus.yml: + rule_files: + - /etc/config/recording_rules.yml + - /etc/config/alerting_rules.yml + ## Below two files are DEPRECATED will be removed from this default values file + - /etc/config/rules + - /etc/config/alerts + + scrape_configs: + #@ for gerrit in data.values.gerritServers: + - job_name: #@ "gerrit-{}".format(gerrit.host) + metrics_path: /a/plugins/metrics-reporter-prometheus/metrics + scheme: https + tls_config: + insecure_skip_verify: #@ data.values.tls.skipVerify + #@ if not data.values.tls.skipVerify: + ca_file: /etc/secrets/server.ca.crt + cert_file: /etc/secrets/server.crt + key_file: /etc/secrets/server.key + #@ end + static_configs: + - targets: + - #@ gerrit.host + basic_auth: + username: #@ gerrit.username + password_file: #@ "/etc/secrets/.pwd_{}".format(gerrit.host) + #@ end + + # - job_name: prometheus + # static_configs: + # - targets: + # - localhost:9090 + + # A scrape configuration for running Prometheus on a Kubernetes cluster. + # This uses separate scrape configs for cluster components (i.e. API server, node) + # and services to allow each to use different authentication configs. + # + # Kubernetes labels will be added as Prometheus labels on metrics via the + # `labelmap` relabeling action. + + # Scrape config for API servers. + # + # Kubernetes exposes API servers as endpoints to the default/kubernetes + # service so this uses `endpoints` role and uses relabelling to only keep + # the endpoints associated with the default/kubernetes service using the + # default named port `https`. 
This works for single API server deployments as + # well as HA API server deployments. + # - job_name: 'kubernetes-apiservers' + + # kubernetes_sd_configs: + # - role: endpoints + + # # Default to scraping over https. If required, just disable this or change to + # # `http`. + # scheme: https + + # # This TLS & bearer token file config is used to connect to the actual scrape + # # endpoints for cluster components. This is separate to discovery auth + # # configuration because discovery & scraping are two separate concerns in + # # Prometheus. The discovery auth config is automatic if Prometheus runs inside + # # the cluster. Otherwise, more config options have to be provided within the + # # <kubernetes_sd_config>. + # tls_config: + # ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # # If your node certificates are self-signed or use a different CA to the + # # master CA, then disable certificate verification below. Note that + # # certificate verification is an integral part of a secure infrastructure + # # so this should only be disabled in a controlled environment. You can + # # disable certificate verification by uncommenting the line below. + # # + # insecure_skip_verify: true + # bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + # # Keep only the default/kubernetes service endpoints for the https port. This + # # will add targets for each API server which Kubernetes adds an endpoint to + # # the default/kubernetes service. + # relabel_configs: + # - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + # action: keep + # regex: default;kubernetes;https + + # - job_name: 'kubernetes-nodes' + + # # Default to scraping over https. If required, just disable this or change to + # # `http`. + # scheme: https + + # # This TLS & bearer token file config is used to connect to the actual scrape + # # endpoints for cluster components. 
This is separate to discovery auth + # # configuration because discovery & scraping are two separate concerns in + # # Prometheus. The discovery auth config is automatic if Prometheus runs inside + # # the cluster. Otherwise, more config options have to be provided within the + # # . + # tls_config: + # ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # # If your node certificates are self-signed or use a different CA to the + # # master CA, then disable certificate verification below. Note that + # # certificate verification is an integral part of a secure infrastructure + # # so this should only be disabled in a controlled environment. You can + # # disable certificate verification by uncommenting the line below. + # # + # insecure_skip_verify: true + # bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + # kubernetes_sd_configs: + # - role: node + + # relabel_configs: + # - action: labelmap + # regex: __meta_kubernetes_node_label_(.+) + # - target_label: __address__ + # replacement: kubernetes.default.svc:443 + # - source_labels: [__meta_kubernetes_node_name] + # regex: (.+) + # target_label: __metrics_path__ + # replacement: /api/v1/nodes/$1/proxy/metrics + + + # - job_name: 'kubernetes-nodes-cadvisor' + + # # Default to scraping over https. If required, just disable this or change to + # # `http`. + # scheme: https + + # # This TLS & bearer token file config is used to connect to the actual scrape + # # endpoints for cluster components. This is separate to discovery auth + # # configuration because discovery & scraping are two separate concerns in + # # Prometheus. The discovery auth config is automatic if Prometheus runs inside + # # the cluster. Otherwise, more config options have to be provided within the + # # . 
+ # tls_config: + # ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # # If your node certificates are self-signed or use a different CA to the + # # master CA, then disable certificate verification below. Note that + # # certificate verification is an integral part of a secure infrastructure + # # so this should only be disabled in a controlled environment. You can + # # disable certificate verification by uncommenting the line below. + # # + # insecure_skip_verify: true + # bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + # kubernetes_sd_configs: + # - role: node + + # # This configuration will work only on kubelet 1.7.3+ + # # As the scrape endpoints for cAdvisor have changed + # # if you are using older version you need to change the replacement to + # # replacement: /api/v1/nodes/$1:4194/proxy/metrics + # # more info here https://github.com/coreos/prometheus-operator/issues/633 + # relabel_configs: + # - action: labelmap + # regex: __meta_kubernetes_node_label_(.+) + # - target_label: __address__ + # replacement: kubernetes.default.svc:443 + # - source_labels: [__meta_kubernetes_node_name] + # regex: (.+) + # target_label: __metrics_path__ + # replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor + + # Scrape config for service endpoints. + # + # The relabeling allows the actual service scrape endpoint to be configured + # via the following annotations: + # + # * `prometheus.io/scrape`: Only scrape services that have a value of `true` + # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need + # to set this to `https` & most likely set the `tls_config` of the scrape config. + # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. + # * `prometheus.io/port`: If the metrics are exposed on a different port to the + # service then set this appropriately. 
+ # - job_name: 'kubernetes-service-endpoints' + + # kubernetes_sd_configs: + # - role: endpoints + + # relabel_configs: + # - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] + # action: keep + # regex: true + # - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] + # action: replace + # target_label: __scheme__ + # regex: (https?) + # - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + # action: replace + # target_label: __metrics_path__ + # regex: (.+) + # - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + # action: replace + # target_label: __address__ + # regex: ([^:]+)(?::\d+)?;(\d+) + # replacement: $1:$2 + # - action: labelmap + # regex: __meta_kubernetes_service_label_(.+) + # - source_labels: [__meta_kubernetes_namespace] + # action: replace + # target_label: kubernetes_namespace + # - source_labels: [__meta_kubernetes_service_name] + # action: replace + # target_label: kubernetes_name + # - source_labels: [__meta_kubernetes_pod_node_name] + # action: replace + # target_label: kubernetes_node + + # - job_name: 'prometheus-pushgateway' + # honor_labels: true + + # kubernetes_sd_configs: + # - role: service + + # relabel_configs: + # - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe] + # action: keep + # regex: pushgateway + + # Example scrape config for probing services via the Blackbox Exporter. 
+ # + # The relabeling allows the actual service scrape endpoint to be configured + # via the following annotations: + # + # * `prometheus.io/probe`: Only probe services that have a value of `true` + # - job_name: 'kubernetes-services' + + # metrics_path: /probe + # params: + # module: [http_2xx] + + # kubernetes_sd_configs: + # - role: service + + # relabel_configs: + # - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe] + # action: keep + # regex: true + # - source_labels: [__address__] + # target_label: __param_target + # - target_label: __address__ + # replacement: blackbox + # - source_labels: [__param_target] + # target_label: instance + # - action: labelmap + # regex: __meta_kubernetes_service_label_(.+) + # - source_labels: [__meta_kubernetes_namespace] + # target_label: kubernetes_namespace + # - source_labels: [__meta_kubernetes_service_name] + # target_label: kubernetes_name + + # Example scrape config for pods + # + # The relabeling allows the actual pod scrape endpoint to be configured via the + # following annotations: + # + # * `prometheus.io/scrape`: Only scrape pods that have a value of `true` + # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. + # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. 
+ # - job_name: 'kubernetes-pods' + + # kubernetes_sd_configs: + # - role: pod + + # relabel_configs: + # - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + # action: keep + # regex: true + # - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + # action: replace + # target_label: __metrics_path__ + # regex: (.+) + # - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + # action: replace + # regex: ([^:]+)(?::\d+)?;(\d+) + # replacement: $1:$2 + # target_label: __address__ + # - action: labelmap + # regex: __meta_kubernetes_pod_label_(.+) + # - source_labels: [__meta_kubernetes_namespace] + # action: replace + # target_label: kubernetes_namespace + # - source_labels: [__meta_kubernetes_pod_name] + # action: replace + # target_label: kubernetes_pod_name + +# adds additional scrape configs to prometheus.yml +# must be a string so you have to add a | after extraScrapeConfigs: +# example adds prometheus-blackbox-exporter scrape config +extraScrapeConfigs: + # - job_name: 'prometheus-blackbox-exporter' + # metrics_path: /probe + # params: + # module: [http_2xx] + # static_configs: + # - targets: + # - https://example.com + # relabel_configs: + # - source_labels: [__address__] + # target_label: __param_target + # - source_labels: [__param_target] + # target_label: instance + # - target_label: __address__ + # replacement: prometheus-blackbox-exporter:9115 + +# Adds option to add alert_relabel_configs to avoid duplicate alerts in alertmanager +# useful in H/A prometheus with different external labels but the same alerts +alertRelabelConfigs: + # alert_relabel_configs: + # - source_labels: [dc] + # regex: (.+)\d+ + # target_label: dc + +networkPolicy: + ## Enable creation of NetworkPolicy resources. 
+ ## + enabled: true diff --git a/charts/prometheus/storage/prometheus.pvc.yaml b/charts/prometheus/storage/prometheus.pvc.yaml new file mode 100644 index 0000000..a6078f8 --- /dev/null +++ b/charts/prometheus/storage/prometheus.pvc.yaml @@ -0,0 +1,15 @@ +#@ load("@ytt:data", "data") +kind: PersistentVolumeClaim +apiVersion: v1 +metadata: + name: prometheus-server-pvc + namespace: #@ data.values.namespace + labels: + app: prometheus +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 8Gi + storageClassName: default diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000..5f1c301 --- /dev/null +++ b/config.yaml @@ -0,0 +1,48 @@ +gerritServers: + - host: gerrit.example.com + username: admin + password: secret +namespace: namespace +tls: + skipVerify: true + caCert: +promtail: + storagePath: /var/promtail + logPath: /var/gerrit/logs +prometheus: + server: + host: prometheus.example.com + username: + password: + tls: + cert: + key: + alertmanager: + slack: + apiUrl: https://hooks.slack.com/services/xxx/xxx + channel: '#alerts' +loki: + host: loki.example.com + username: + password: + tls: + cert: + key: +grafana: + host: grafana.example.com + tls: + cert: + key: + admin: + username: admin + password: secret + ldap: + enabled: false + host: + port: "" + password: + bind_dn: + accountBases: "[]" + groupBases: "[]" + dashboards: + editable: false diff --git a/dashboards/gerrit_caches.json b/dashboards/gerrit_caches.json new file mode 100644 index 0000000..449dc73 --- /dev/null +++ b/dashboards/gerrit_caches.json @@ -0,0 +1,683 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + 
"datasource": null, + "fill": 1, + "fillGradient": 1, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 4, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 150, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "100-caches_memory_hit_ratio_accounts{instance=\"$instance\"}", + "legendFormat": "accounts", + "refId": "A" + }, + { + "expr": "100-caches_memory_hit_ratio_groups{instance=\"$instance\"}", + "legendFormat": "groups", + "refId": "B" + }, + { + "expr": "100-caches_memory_hit_ratio_groups_byuuid{instance=\"$instance\"}", + "legendFormat": "groups_byuuid", + "refId": "C" + }, + { + "expr": "100-caches_memory_hit_ratio_ldap_groups_byinclude{instance=\"$instance\"}", + "legendFormat": "ldap_groups_byinclude", + "refId": "D" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "ACCOUNT cache misses", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": "Cache Misses", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "", + "fill": 1, + "fillGradient": 1, + 
"gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "hiddenSeries": false, + "id": 8, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 150, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "100-caches_memory_hit_ratio_conflicts{instance=\"$instance\"}", + "legendFormat": "conflicts", + "refId": "A" + }, + { + "expr": "100-caches_memory_hit_ratio_mergeability{instance=\"$instance\"}", + "legendFormat": "mergeability", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CONFLICT cache misses", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": "Cache Misses", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "", + "fill": 1, + "fillGradient": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 8 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 150, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + 
"nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "100-caches_memory_hit_ratio_change_kind{instance=\"$instance\"}", + "legendFormat": "change_kind", + "refId": "B" + }, + { + "expr": "100-caches_memory_hit_ratio_change_notes{instance=\"$instance\"}", + "legendFormat": "change_notes", + "refId": "C" + }, + { + "expr": "100-caches_memory_hit_ratio_changeid_project{instance=\"$instance\"}", + "legendFormat": "changeid_project", + "refId": "D" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CHANGE cache misses", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": "Cache Misses", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "fillGradient": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 8 + }, + "hiddenSeries": false, + "id": 10, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 150, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": 
false, + "targets": [ + { + "expr": "100-caches_memory_hit_ratio_project_list{instance=\"$instance\"}", + "legendFormat": "project list", + "refId": "A" + }, + { + "expr": "100-caches_memory_hit_ratio_projects{instance=\"$instance\"}", + "legendFormat": "projects", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "PROJECT cache misses", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 4, + "format": "percent", + "label": "Cache Misses", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "fillGradient": 1, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 17 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 150, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "100-caches_memory_hit_ratio_diff{instance=\"$instance\"}", + "legendFormat": "diff", + "refId": "A" + }, + { + "expr": "100-caches_memory_hit_ratio_diff_intraline{instance=\"$instance\"}", + "legendFormat": "diff intraline", + "refId": "B" + }, + { + "expr": "100-caches_memory_hit_ratio_diff_summary{instance=\"$instance\"}", + 
"legendFormat": "diff summary", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "DIFF cache misses", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": "Cache Misses", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "", + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 17 + }, + "hiddenSeries": false, + "id": 12, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 150, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "100-caches_memory_hit_ratio_web_sessions{instance=\"$instance\"}", + "legendFormat": "web sessions", + "refId": "A" + }, + { + "expr": "100-caches_memory_hit_ratio_sshkeys{instance=\"$instance\"}", + "legendFormat": "sshkeys", + "refId": "B" + }, + { + "expr": "100-caches_memory_hit_ratio_git_tags{instance=\"$instance\"}", + "legendFormat": "git tags", + "refId": "D" + }, + { + "expr": "100-caches_memory_hit_ratio_permission_sort{instance=\"$instance\"}", + "legendFormat": "permission_sort", + "refId": "C" + } + ], + "thresholds": [], 
+ "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "MISC cache misses", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 1, + "format": "percent", + "label": "Cache Misses", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "schemaVersion": 21, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "Prometheus", + "definition": "label_values(instance)", + "hide": 0, + "includeAll": false, + "label": "Gerrit Instance", + "multi": false, + "name": "instance", + "options": [], + "query": "label_values(instance)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Caches", + "uid": "vHQVaGsWk", + "version": 15 +} \ No newline at end of file diff --git a/dashboards/gerrit_fetch_clone.json b/dashboards/gerrit_fetch_clone.json new file mode 100644 index 0000000..cb3924e --- /dev/null +++ b/dashboards/gerrit_fetch_clone.json @@ -0,0 +1,466 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "links": [], + "panels": 
[ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "upload-pack requests", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 4, + "interval": "2m", + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(git_upload_pack_request_count_CLONE_total{instance=\"$instance\"}[2m])/2", + "legendFormat": "clone", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CLONE count", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "opm", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "upload-pack requests for fetches", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "hiddenSeries": false, + "id": 8, + "interval": "", + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + 
"dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(git_upload_pack_request_count_FETCH_total{instance=\"$instance\"}[2m])/2", + "legendFormat": "fetch", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "FETCH count", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "opm", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 13, + "w": 12, + "x": 0, + "y": 8 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "quantile: 0.75", + "hiddenSeries": true + }, + { + "alias": "quantile: 0.95", + "hiddenSeries": true + }, + { + "alias": "quantile: 0.98", + "hiddenSeries": true + }, + { + "alias": "quantile: 0.99", + "hiddenSeries": true + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "git_upload_pack_pack_bytes_CLONE{instance=\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + 
"legendFormat": "quantile: {{quantile}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "upload-pack pack bytes CLONE", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "decbytes", + "label": "", + "logBase": 10, + "max": "10000000000", + "min": "1000", + "show": true + }, + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 13, + "w": 12, + "x": 12, + "y": 8 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "quantile: 0.75", + "hiddenSeries": true + }, + { + "alias": "quantile: 0.95", + "hiddenSeries": true + }, + { + "alias": "quantile: 0.98", + "hiddenSeries": true + }, + { + "alias": "quantile: 0.99", + "hiddenSeries": true + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "git_upload_pack_pack_bytes_FETCH{instance=\"$instance\"}", + "legendFormat": "quantile: {{quantile}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "upload-pack pack bytes FETCH", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + 
"xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "decbytes", + "label": null, + "logBase": 10, + "max": "10000000000", + "min": "1000", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "1m", + "schemaVersion": 21, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "Prometheus", + "definition": "label_values(instance)", + "hide": 0, + "includeAll": false, + "label": "Gerrit Instance", + "multi": false, + "name": "instance", + "options": [], + "query": "label_values(instance)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Git fetch / clone", + "uid": "EV4ZCjEWz", + "version": 3 +} diff --git a/dashboards/gerrit_overview.json b/dashboards/gerrit_overview.json new file mode 100644 index 0000000..6d2d5ff --- /dev/null +++ b/dashboards/gerrit_overview.json @@ -0,0 +1,1206 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 46, + "panels": [], + "title": "", + "type": "row" + }, + { + "cacheTimeout": 
null, + "datasource": null, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 1 + }, + "id": 19, + "links": [], + "options": { + "fieldOptions": { + "calcs": [ + "last" + ], + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "N/A", + "type": 1, + "value": "null" + } + ], + "max": 100, + "min": 0, + "nullValueMode": "connected", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#299c46", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 60 + }, + { + "color": "#d44a3a", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [], + "values": false + }, + "orientation": "horizontal", + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "6.6.2", + "targets": [ + { + "expr": "(rate(proc_cpu_usage{instance=\"$instance\"}[5m])/proc_cpu_num_cores{instance=\"$instance\"})*100", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Usage", + "type": "gauge" + }, + { + "cacheTimeout": null, + "datasource": null, + "gridPos": { + "h": 6, + "w": 8, + "x": 8, + "y": 1 + }, + "id": 20, + "links": [], + "options": { + "fieldOptions": { + "calcs": [ + "last" + ], + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "N/A", + "type": 1, + "value": "null" + } + ], + "max": 100, + "min": 0, + "nullValueMode": "connected", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#299c46", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 60 + }, + { + "color": "#d44a3a", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [], + "values": false + }, + "orientation": "horizontal", + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "6.6.2", + "targets": [ + { + "expr": 
"(proc_jvm_memory_heap_used{instance=\"$instance\"}/proc_jvm_memory_heap_committed{instance=\"$instance\"})*100", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Heap Memory Usage", + "type": "gauge" + }, + { + "cacheTimeout": null, + "datasource": null, + "gridPos": { + "h": 6, + "w": 8, + "x": 16, + "y": 1 + }, + "id": 40, + "links": [], + "options": { + "displayMode": "basic", + "fieldOptions": { + "calcs": [ + "lastNotNull" + ], + "defaults": { + "mappings": [], + "min": 0, + "nullValueMode": "connected", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [], + "values": false + }, + "orientation": "vertical", + "showUnfilled": true + }, + "pluginVersion": "6.6.2", + "targets": [ + { + "expr": "proc_jvm_thread_num_daemon_live{instance=\"$instance\"}", + "legendFormat": "current live threads (daemon)", + "refId": "A" + }, + { + "expr": "proc_jvm_thread_num_live{instance=\"$instance\"}", + "legendFormat": "current live threads", + "refId": "B" + }, + { + "expr": "proc_jvm_thread_num_peak_live{instance=\"$instance\"}", + "legendFormat": "peak of live threads", + "refId": "C" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Live Threads", + "type": "bargauge" + }, + { + "cacheTimeout": null, + "datasource": null, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 7 + }, + "id": 11, + "links": [], + "options": { + "fieldOptions": { + "calcs": [ + "last" + ], + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 50, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 30 + } + ] + }, + "unit": "ms" + }, + "overrides": [], + "values": false + }, + "orientation": "auto", + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "6.6.2", + "targets": [ + { + "expr": 
"http_server_rest_api_server_latency_total{quantile=\"0.99\", instance=\"$instance\"}", + "intervalFactor": 4, + "legendFormat": "quantile {{quantile}}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "REST API latency (0.99 quantile)", + "type": "gauge" + }, + { + "cacheTimeout": null, + "datasource": null, + "description": "Excludes 404 and 401, since these error codes are caused by client behaviour and are overrepresented in the data.", + "gridPos": { + "h": 6, + "w": 8, + "x": 8, + "y": 7 + }, + "id": 13, + "links": [], + "options": { + "fieldOptions": { + "calcs": [ + "mean" + ], + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "N/A", + "type": 1, + "value": "null" + } + ], + "max": 100, + "min": 0, + "nullValueMode": "connected", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 5 + }, + { + "color": "red", + "value": 10 + } + ] + }, + "unit": "percent" + }, + "overrides": [], + "values": false + }, + "orientation": "horizontal", + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "6.6.2", + "targets": [ + { + "expr": "(increase(http_server_error_count_total_total{instance=\"$instance\"}[5m]) - increase(http_server_error_count_404_total{instance=\"$instance\"}[5m]) - increase(http_server_error_count_401_total{instance=\"$instance\"}[5m])) / (increase(http_server_success_count_total_total{instance=\"$instance\"}[5m]) + increase(http_server_error_count_total_total{instance=\"$instance\"}[5m]) - increase(http_server_error_count_404_total{instance=\"$instance\"}[5m]) - increase(http_server_error_count_401_total{instance=\"$instance\"}[5m]))*100", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "HTTP Request Error Rate (last 5 min)", + "type": "gauge" + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": 
false, + "datasource": "LokiPromQL", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 13 + }, + "hiddenSeries": false, + "id": 48, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": 500, + "total": false, + "values": false + }, + "lines": false, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 10, + "points": true, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "count_over_time({job=\"gerrit_error\",host=\"$instance\"} |~ \"Gerrit Code Review .* ready\" [2m])", + "legendFormat": "{{gerrit_version}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Gerrit Version Deployment", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 20 + }, + "id": 44, + "title": "", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 18, + "x": 0, + "y": 21 + }, + "hiddenSeries": false, + "id": 39, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + 
"lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": true, + "targets": [ + { + "expr": "up{instance=\"$instance\"}", + "format": "time_series", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Gerrit Availability", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": "", + "logBase": 1, + "max": "1", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cacheTimeout": null, + "colorBackground": true, + "colorValue": false, + "colors": [ + "#d44a3a", + "rgba(237, 129, 40, 0.89)", + "#299c46" + ], + "datasource": null, + "decimals": 2, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 9, + "w": 6, + "x": 18, + "y": 21 + }, + "id": 42, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "options": {}, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": 
"rgb(31, 120, 193)", + "show": false, + "ymax": null, + "ymin": null + }, + "tableColumn": "", + "targets": [ + { + "expr": "avg_over_time(up{instance=\"$instance\"}[1d])*100", + "format": "time_series", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": "98, 99", + "timeFrom": null, + "timeShift": null, + "title": "Gerrit availability [last 24h]", + "type": "singlestat", + "valueFontSize": "150%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "This excludes response code 404, since it is caused by an issue with the Gerrit Trigger plugin.", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 30 + }, + "hiddenSeries": false, + "id": 2, + "interval": "1m", + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": false, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "200", + "color": "#37872D" + }, + { + "alias": "201", + "color": "#56A64B" + }, + { + "alias": "204", + "color": "#73BF69" + }, + { + "alias": "301", + "color": "rgb(110, 210, 110)" + }, + { + "alias": "304", + "color": "rgb(150, 225, 150)" + }, + { + "alias": "400", + "color": "#FA6400" + }, + { + "alias": "401", + "color": "#FF780A", + "hiddenSeries": true + }, + { + "alias": "403", + "color": "#FF9830" + }, + { + "alias": "404", + "color": "#FFB357", + "hiddenSeries": true + }, + { + "alias": "409", + "color": "#FFCB7D" + }, + { + "alias": "412", + "color": "#E0B400" + }, + { + "alias": "422", + "color": "#F2CC0C" + }, + { + "alias": "500", + "color": "#C4162A" + }, + { + 
"alias": "501", + "color": "#E02F44" + }, + { + "alias": "503", + "color": "#F2495C" + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "(increase(http_server_success_count_200_total{instance=\"$instance\"}[5m]))/5", + "intervalFactor": 1, + "legendFormat": "200", + "refId": "A" + }, + { + "expr": "(increase(http_server_success_count_201_total{instance=\"$instance\"}[5m]))/5", + "intervalFactor": 1, + "legendFormat": "201", + "refId": "C" + }, + { + "expr": "(increase(http_server_success_count_204_total{instance=\"$instance\"}[5m]))/5", + "intervalFactor": 1, + "legendFormat": "204", + "refId": "D" + }, + { + "expr": "(increase(http_server_success_count_301_total{instance=\"$instance\"}[5m]))/5", + "intervalFactor": 1, + "legendFormat": "301", + "refId": "E" + }, + { + "expr": "(increase(http_server_success_count_304_total{instance=\"$instance\"}[5m]))/5", + "intervalFactor": 1, + "legendFormat": "304", + "refId": "F" + }, + { + "expr": "(increase(http_server_error_count_400_total{instance=\"$instance\"}[5m]))/5", + "intervalFactor": 1, + "legendFormat": "400", + "refId": "B" + }, + { + "expr": "(increase(http_server_error_count_401_total{instance=\"$instance\"}[5m]))/5", + "intervalFactor": 1, + "legendFormat": "401", + "refId": "G" + }, + { + "expr": "(increase(http_server_error_count_403_total{instance=\"$instance\"}[5m]))/5", + "intervalFactor": 1, + "legendFormat": "403", + "refId": "H" + }, + { + "expr": "(increase(http_server_error_count_404_total{instance=\"$instance\"}[5m]))/5", + "instant": false, + "intervalFactor": 1, + "legendFormat": "404", + "refId": "I" + }, + { + "expr": "(increase(http_server_error_count_405_total{instance=\"$instance\"}[5m]))/5", + "intervalFactor": 1, + "legendFormat": "405", + "refId": "J" + }, + { + "expr": "(increase(http_server_error_count_409_total{instance=\"$instance\"}[5m]))/5", + "intervalFactor": 1, + "legendFormat": "409", + "refId": "K" + }, + { + "expr": 
"(increase(http_server_error_count_412_total{instance=\"$instance\"}[5m]))/5", + "intervalFactor": 1, + "legendFormat": "412", + "refId": "L" + }, + { + "expr": "(increase(http_server_error_count_422_total{instance=\"$instance\"}[5m]))/5", + "intervalFactor": 1, + "legendFormat": "422", + "refId": "M" + }, + { + "expr": "(increase(http_server_error_count_500_total{instance=\"$instance\"}[5m]))/5", + "intervalFactor": 1, + "legendFormat": "500", + "refId": "N" + }, + { + "expr": "(increase(http_server_error_count_501_total{instance=\"$instance\"}[5m]))/5", + "intervalFactor": 1, + "legendFormat": "501", + "refId": "O" + }, + { + "expr": "(increase(http_server_error_count_503_total{instance=\"$instance\"}[5m]))/5", + "intervalFactor": 1, + "legendFormat": "503", + "refId": "P" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "HTTP response status", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Count", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 40 + }, + "hiddenSeries": false, + "id": 9, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": false, + "total": false, + "values": false + }, + "lines": false, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pluginVersion": "6.5.2", 
+ "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(events_assignee_changed_total{instance=\"$instance\"}[5m])", + "intervalFactor": 1, + "legendFormat": "assignee changed", + "refId": "A" + }, + { + "expr": "increase(events_change_abandoned_total{instance=\"$instance\"}[5m])", + "intervalFactor": 1, + "legendFormat": "change abandoned", + "refId": "B" + }, + { + "expr": "increase(events_change_merged_total{instance=\"$instance\"}[5m])", + "intervalFactor": 1, + "legendFormat": "change merged", + "refId": "C" + }, + { + "expr": "increase(events_comment_added_total{instance=\"$instance\"}[5m])", + "intervalFactor": 1, + "legendFormat": "comment added", + "refId": "D" + }, + { + "expr": "increase(events_patchset_created_total{instance=\"$instance\"}[5m])", + "intervalFactor": 1, + "legendFormat": "patchset created", + "refId": "E" + }, + { + "expr": "increase(events_ref_replicated_total{instance=\"$instance\"}[5m])", + "intervalFactor": 1, + "legendFormat": "ref replicated", + "refId": "F" + }, + { + "expr": "increase(events_ref_updated_total{instance=\"$instance\"}[5m])", + "intervalFactor": 1, + "legendFormat": "ref updated", + "refId": "G" + }, + { + "expr": "increase(events_reviewer_added_total{instance=\"$instance\"}[5m])", + "intervalFactor": 1, + "legendFormat": "reviewer added", + "refId": "H" + }, + { + "expr": "increase(events_reviewer_deleted_total{instance=\"$instance\"}[5m])", + "intervalFactor": 1, + "legendFormat": "reviewer deleted", + "refId": "I" + }, + { + "expr": "increase(events_topic_changed_total{instance=\"$instance\"}[5m])", + "intervalFactor": 1, + "legendFormat": "topic changed", + "refId": "J" + }, + { + "expr": "increase(events_vote_deleted_total{instance=\"$instance\"}[5m])", + "intervalFactor": 1, + "legendFormat": "vote deleted", + "refId": "K" + }, + { + "expr": 
"increase(events_wip_state_changed_total{instance=\"$instance\"}[5m])", + "intervalFactor": 1, + "legendFormat": "wip state changed", + "refId": "L" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Gerrit Events (last 5 min)", + "tooltip": { + "shared": false, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "series", + "name": null, + "show": true, + "values": [ + "current" + ] + }, + "yaxes": [ + { + "format": "short", + "label": "Count [5 min]", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 40 + }, + "hiddenSeries": false, + "id": 41, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": false, + "total": false, + "values": false + }, + "lines": false, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pluginVersion": "6.5.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(git_upload_pack_request_count_FETCH_total{instance=\"$instance\"}[5m])", + "intervalFactor": 4, + "legendFormat": "Fetch", + "refId": "B" + }, + { + "expr": "increase(git_upload_pack_request_count_CLONE_total{instance=\"$instance\"}[5m])", + "intervalFactor": 4, + "legendFormat": "Clone", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Git Fetch/Clone 
upload-pack requests (last 5 min)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "series", + "name": null, + "show": true, + "values": [ + "current" + ] + }, + "yaxes": [ + { + "format": "short", + "label": "Count [5 min]", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "1m", + "schemaVersion": 22, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "tags": [], + "text": "", + "value": "" + }, + "datasource": "Prometheus", + "definition": "label_values(instance)", + "hide": 0, + "includeAll": false, + "label": "Gerrit Instance", + "multi": false, + "name": "instance", + "options": [], + "query": "label_values(instance)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Gerrit Overview", + "uid": "uXZMn9PWz", + "version": 14 +} diff --git a/dashboards/gerrit_process.json b/dashboards/gerrit_process.json new file mode 100644 index 0000000..e2e3fe0 --- /dev/null +++ b/dashboards/gerrit_process.json @@ -0,0 +1,668 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + 
"dashLength": 10, + "dashes": false, + "datasource": null, + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 13, + "w": 12, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "proc_jvm_thread_num_live{instance=\"$instance\"}", + "legendFormat": "Java live threads", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Threads", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Live Threads", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 13, + "w": 12, + "x": 12, + "y": 0 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + 
"seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "proc_jvm_memory_heap_committed{instance=\"$instance\"}", + "legendFormat": "committed heap", + "refId": "C" + }, + { + "expr": "proc_jvm_memory_heap_used{instance=\"$instance\"}", + "legendFormat": "used heap", + "refId": "B" + }, + { + "expr": "jgit_block_cache_cache_used{instance=\"$instance\"}", + "instant": false, + "legendFormat": "JGit block cache", + "refId": "A" + }, + { + "expr": "proc_jvm_memory_non_heap_used{instance=\"$instance\"}", + "legendFormat": "used non-heap", + "refId": "D" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "decbytes", + "label": "Memory Consumption", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 13 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 100, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": 
"rate(proc_cpu_usage{instance=\"$instance\"}[5m])", + "legendFormat": "used CPUs", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "CPU cores", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": null, + "decimals": 2, + "fill": 6, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 13 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "gc time G1 old gen", + "color": "#F2CC0C" + }, + { + "alias": "gc time G1 young gen", + "color": "#3274D9" + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "increase(proc_jvm_gc_time_G1_Young_Generation{instance=\"$instance\"}[2m])/increase(proc_uptime{instance=\"$instance\"}[2m])", + "legendFormat": "gc time G1 young gen", + "refId": "B" + }, + { + "expr": "increase(proc_jvm_gc_time_G1_Old_Generation{instance=\"$instance\"}[2m])/increase(proc_uptime{instance=\"$instance\"}[2m])", + "legendFormat": "gc time G1 old gen", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": 
null, + "timeRegions": [], + "timeShift": null, + "title": "Java - % of time spent in GC", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": "GC Time", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 23 + }, + "hiddenSeries": false, + "id": 12, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "miss ratio", + "yaxis": 1 + }, + { + "alias": "eviction ratio", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(jgit_block_cache_miss_count{instance=\"$instance\"}[2m])/(increase(jgit_block_cache_hit_count{instance=\"$instance\"}[2m])+increase(jgit_block_cache_miss_count{instance=\"$instance\"}[2m]))", + "format": "time_series", + "instant": false, + "legendFormat": "miss ratio", + "refId": "A" + }, + { + "expr": "increase(jgit_block_cache_eviction_count{instance=\"$instance\"}[2m])/(increase(jgit_block_cache_hit_count{instance=\"$instance\"}[2m])+increase(jgit_block_cache_miss_count{instance=\"$instance\"}[2m]))", + "format": "time_series", + "instant": false, + "legendFormat": 
"eviction ratio", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "JGit block cache", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "percentunit", + "label": "miss ratio", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "percentunit", + "label": "eviction ratio", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 1 + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 6, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 23 + }, + "hiddenSeries": false, + "id": 10, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "jgit_block_cache_open_files{instance=\"$instance\"}", + "legendFormat": "jgit block cache", + "refId": "B" + }, + { + "expr": "proc_num_open_fds{instance=\"$instance\"}-jgit_block_cache_open_files{instance=\"$instance\"}", + "legendFormat": "other", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Java open file descriptors", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + 
"show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Open File Descriptors", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "1m", + "schemaVersion": 22, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "Prometheus", + "definition": "label_values(instance)", + "hide": 0, + "includeAll": false, + "label": "Gerrit Instance", + "multi": false, + "name": "instance", + "options": [], + "query": "label_values(instance)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Process", + "uid": "MeOVgCPWz", + "version": 4 +} \ No newline at end of file diff --git a/dashboards/gerrit_queues.json b/dashboards/gerrit_queues.json new file mode 100644 index 0000000..c3fe819 --- /dev/null +++ b/dashboards/gerrit_queues.json @@ -0,0 +1,1107 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 7, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 2, + "legend": 
{ + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": 150, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "batch threads", + "color": "#FFB357" + }, + { + "alias": "batch pool size", + "color": "#FA6400", + "fill": 0 + }, + { + "alias": "interactive threads", + "color": "#C0D8FF" + }, + { + "alias": "interactive pool size", + "color": "#1F60C4", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "queue_ssh_batch_worker_active_threads{instance=\"$instance\"}", + "legendFormat": "batch threads", + "refId": "C" + }, + { + "expr": "queue_ssh_batch_worker_pool_size{instance=\"$instance\"}", + "legendFormat": "batch pool size", + "refId": "D" + }, + { + "expr": "queue_ssh_interactive_worker_active_threads{instance=\"$instance\"}", + "legendFormat": "interactive threads", + "refId": "A" + }, + { + "expr": "queue_ssh_interactive_worker_pool_size{instance=\"$instance\"}", + "legendFormat": "interactive pool size", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "SSH threads", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Threads", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + 
"bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 7, + "x": 7, + "y": 0 + }, + "hiddenSeries": false, + "id": 20, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": 150, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "active threads", + "color": "#FFB357" + }, + { + "alias": "reserved threads", + "color": "#56A64B", + "fill": 0 + }, + { + "alias": "pool size", + "color": "#1F60C4", + "fill": 0 + }, + { + "alias": "max pool size", + "color": "#FA6400", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "http_server_jetty_threadpool_active_threads{instance=\"$instance\"}", + "legendFormat": "active threads", + "refId": "A" + }, + { + "expr": "http_server_jetty_threadpool_reserved_threads{instance=\"$instance\"}", + "legendFormat": "reserved threads", + "refId": "C" + }, + { + "expr": "http_server_jetty_threadpool_max_pool_size{instance=\"$instance\"}", + "legendFormat": "max pool size", + "refId": "B" + }, + { + "expr": "http_server_jetty_threadpool_pool_size{instance=\"$instance\"}", + "legendFormat": "pool size", + "refId": "D" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "HTTP threads", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "threads", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": 
"", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 7, + "x": 14, + "y": 0 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": 150, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "batch threads", + "color": "#FFB357" + }, + { + "alias": "batch pool size", + "color": "#FA6400", + "fill": 0 + }, + { + "alias": "interactive threads", + "color": "#C0D8FF" + }, + { + "alias": "interactive pool size", + "color": "#1F60C4", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "queue_index_batch_active_threads{instance=\"$instance\"}", + "legendFormat": "batch threads", + "refId": "A" + }, + { + "expr": "queue_index_batch_pool_size{instance=\"$instance\"}", + "legendFormat": "batch pool size", + "refId": "B" + }, + { + "expr": "queue_index_interactive_active_threads{instance=\"$instance\"}", + "legendFormat": "interactive threads", + "refId": "C" + }, + { + "expr": "queue_index_interactive_pool_size{instance=\"$instance\"}", + "legendFormat": "interactive pool size", + "refId": "D" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "INDEX threads", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": 
[] + }, + "yaxes": [ + { + "format": "short", + "label": "Threads", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 7, + "x": 0, + "y": 8 + }, + "hiddenSeries": false, + "id": 17, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": 150, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "batch", + "color": "#FFB357" + }, + { + "alias": "interactive", + "color": "#C0D8FF" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "queue_ssh_batch_worker_scheduled_tasks{instance=\"$instance\"}", + "legendFormat": "batch", + "refId": "C" + }, + { + "expr": "queue_ssh_interactive_worker_scheduled_tasks{instance=\"$instance\"}", + "legendFormat": "interactive", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "SSH queue", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Tasks", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": 
false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 7, + "x": 7, + "y": 8 + }, + "hiddenSeries": false, + "id": 22, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": 150, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "http", + "color": "#FFB357" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "http_server_jetty_threadpool_queue_size{instance=\"$instance\"}", + "legendFormat": "http", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "HTTP queue", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "short", + "label": "Tasks", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 7, + "x": 14, + "y": 8 + }, + "hiddenSeries": false, + "id": 18, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": 150, + "total": false, + "values": false + 
}, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "batch", + "color": "#FFB357" + }, + { + "alias": "interactive", + "color": "#C0D8FF" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "queue_index_batch_scheduled_tasks{instance=\"$instance\"}", + "legendFormat": "batch", + "refId": "C" + }, + { + "expr": "queue_index_interactive_scheduled_tasks{instance=\"$instance\"}", + "legendFormat": "interactive", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "INDEX queued", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Tasks", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 10, + "x": 0, + "y": 16 + }, + "hiddenSeries": false, + "id": 10, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 150, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "stream", + "color": "#C0D8FF" + }, + { + "alias": "stream pool", + 
"color": "#1F60C4", + "fill": 0 + }, + { + "alias": "email", + "color": "#96D98D" + }, + { + "alias": "email pool", + "color": "#37872D", + "fill": 0 + }, + { + "alias": "receive-commit", + "color": "#FFA6B0" + }, + { + "alias": "receive-commit pool", + "color": "#C4162A", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "queue_ssh_stream_worker_active_threads{instance=\"$instance\"}", + "legendFormat": "stream", + "refId": "A" + }, + { + "expr": "queue_ssh_stream_worker_pool_size{instance=\"$instance\"}", + "legendFormat": "stream pool", + "refId": "B" + }, + { + "expr": "queue_send_email_active_threads{instance=\"$instance\"}", + "legendFormat": "email", + "refId": "C" + }, + { + "expr": "queue_send_email_pool_size{instance=\"$instance\"}", + "legendFormat": "email pool", + "refId": "D" + }, + { + "expr": "queue_receive_commits_active_threads{instance=\"$instance\"}", + "legendFormat": "receive-commit", + "refId": "E" + }, + { + "expr": "queue_receive_commits_pool_size{instance=\"$instance\"}", + "legendFormat": "receive-commit pool", + "refId": "F" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "misc. 
threads", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Threads", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 11, + "x": 10, + "y": 16 + }, + "hiddenSeries": false, + "id": 16, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 150, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(http_server_rest_api_count_total_total{instance=\"$instance\"}[5m])", + "legendFormat": "REST API request rate", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "REST API request rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "reqps", + "label": "Requests/Second", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], 
+ "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 10, + "x": 0, + "y": 24 + }, + "hiddenSeries": false, + "id": 14, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 150, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "stream", + "color": "#8AB8FF" + }, + { + "alias": "email", + "color": "#96D98D" + }, + { + "alias": "receive-commit", + "color": "#FF7383" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "queue_ssh_stream_worker_scheduled_tasks{instance=\"$instance\"}", + "legendFormat": "stream", + "refId": "E" + }, + { + "expr": "queue_send_email_scheduled_tasks{instance=\"$instance\"}", + "legendFormat": "email", + "refId": "C" + }, + { + "expr": "queue_receive_commits_scheduled_tasks{instance=\"$instance\"}", + "legendFormat": "receive-commit", + "refId": "D" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "misc. 
queues", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Tasks", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "1m", + "schemaVersion": 21, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "Prometheus", + "definition": "label_values(instance)", + "hide": 0, + "includeAll": false, + "label": "Gerrit Instance", + "multi": false, + "name": "instance", + "options": [], + "query": "label_values(instance)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Queues", + "uid": "Zh_ncGsWk", + "version": 5 +} \ No newline at end of file diff --git a/dashboards/gerrit_replication.json b/dashboards/gerrit_replication.json new file mode 100644 index 0000000..1d78d32 --- /dev/null +++ b/dashboards/gerrit_replication.json @@ -0,0 +1,278 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + 
"datasource": null, + "description": "", + "fill": 1, + "fillGradient": 1, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 1, + "interval": "", + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "plugins_replication_replication_delay_$target{instance=\"$instance\"}", + "legendFormat": "quantile: {{quantile}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "replication delay $target", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ms", + "label": "delay", + "logBase": 10, + "max": null, + "min": "5000", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "", + "fill": 1, + "fillGradient": 1, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 0 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + 
"spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "plugins_replication_replication_latency_$target{instance=\"$instance\"}", + "legendFormat": "quantile: {{quantile}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "replication latency $target", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ms", + "label": "latency", + "logBase": 10, + "max": null, + "min": "0.1", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "schemaVersion": 21, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "Prometheus", + "definition": "label_values(instance)", + "hide": 0, + "includeAll": false, + "label": "Gerrit Instance", + "multi": false, + "name": "instance", + "options": [], + "query": "label_values(instance)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "tags": [], + "text": "", + "value": "" + }, + "datasource": "Prometheus", + "definition": "metrics(plugins_replication_replication_latency_.*_count)", + "hide": 0, + "includeAll": false, + "label": "Replication target", + "multi": false, + "name": "target", + "options": [], + "query": "metrics(plugins_replication_replication_latency_.*_count)", + "refresh": 1, + "regex": "plugins_replication_replication_latency_(.*)_count", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": 
"query", + "useTags": false + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Replication", + "uid": "RFLS1GsWk", + "version": 4 +} diff --git a/documentation/config-management.md b/documentation/config-management.md new file mode 100644 index 0000000..037f9af --- /dev/null +++ b/documentation/config-management.md @@ -0,0 +1,103 @@ +# Config Management + +The configuration in the `config.yaml` contains secrets and should not be openly +accessible. To secure the data contained within it, the values can be encrypted +using a tool called [`sops`](https://github.com/mozilla/sops). This tool will use +a GPG-key to encrypt the values of the yaml file. Having the GPG-key also allows +decrypting the values and working with the file. As long as the key is not compromised, +the encrypted file can be shared securely between collaborators. + +The process of using `sops` is described below. + +## Install `sops` + +On OSX, `sops` can be installed using brew: + +```sh +brew install sops +``` + +Install `gpg`: + +```sh +brew install gpg +``` + +You might need to add this to your `.bashrc` or `.zshrc` to enable `sops` to work +correctly with `gpg` [1]: + +```sh +GPG_TTY=$(tty) +export GPG_TTY +``` + +## Create GPG-key (first time only) + +Create a key by running the following command and following the instructions on +the screen: + +```sh +gpg --gen-key +``` + +## Encrypt the config-file + +Run the following command to encrypt the file: + +```sh +sops \ + --encrypt \ + --in-place \ + --encrypted-regex '(password|htpasswd|cert|key|apiUrl|caCert)$' \ + --pgp \ + `gpg --fingerprint "$EMAIL" | \ + grep pub -A 1 | \ + grep -v pub | \ + sed s/\ //g` \ + $FILE_TO_ENCODE +``` + +`$EMAIL` refers to the email used during the creation of the GPG key. 
+ +Alternatively, the `./encrypt.sh` script can be used to encrypt the file: + +```sh +./encrypt.sh \ + [--email $EMAIL] \ + [--fingerprint $FINGERPRINT] \ + $FILE_TO_ENCODE +``` + +The gpg-key used to encrypt the file can be selected by directly giving the key's +fingerprint using the `--fingerprint` option or giving the email used to identify +the key using the `--email` option. If both are given, the `--fingerprint` option takes precedence. +At least one of these options has to be set. + +## Decrypt file + +To decrypt the file, run: + +```sh +sops --in-place -d $FILE_TO_DECODE +``` + +## Export GPG-key + +For other developers or build servers to be able to decrypt the configuration, +the key has to be exported: + +```sh +gpg --export -a "$EMAIL" > public.key +gpg --export-secret-key -a "$EMAIL" > private.key +``` + +On the receiving computer the key has to be imported by running: + +```sh +gpg --import public.key +gpg --allow-secret-key-import --import private.key +``` + +## Links + +[1] https://github.com/mozilla/sops/issues/304 diff --git a/encrypt.sh b/encrypt.sh new file mode 100755 index 0000000..6b0172b --- /dev/null +++ b/encrypt.sh @@ -0,0 +1,58 @@ +#!/bin/bash -e + +# Copyright (C) 2020 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Print the usage message to stderr and exit with status 1. +usage() { + me=$(basename "$0") + echo >&2 "Usage: $me [--email EMAIL] [--fingerprint FINGERPRINT] CONFIG" + exit 1 +} + +# Parse the options. Each option needs a value: without the check, the second +# shift would fail on a trailing option and -e would end the script silently. +while test $# -gt 0 ; do + case "$1" in + --email) + test $# -ge 2 || usage + shift + EMAIL=$1 + shift + ;; + + --fingerprint) + test $# -ge 2 || usage + shift + FINGERPRINT=$1 + shift + ;; + + *) + break + esac +done + +CONFIG=$1 +test -z "$CONFIG" && usage + +# An explicit fingerprint takes precedence; otherwise it is looked up by email. +if test -z "$FINGERPRINT"; then + test -z "$EMAIL" && usage + # In the colon-separated listing, field 10 of the first "fpr" record holds + # the fingerprint of the primary key of the first matching key. + FINGERPRINT=$(gpg --with-colons --fingerprint "$EMAIL" | \ + awk -F: '$1 == "fpr" { print $10; exit }') + if test -z "$FINGERPRINT"; then + echo >&2 "No GPG key found for $EMAIL" + exit 1 + fi +fi + +# Encrypt in place; only values whose key matches the regex are encrypted. +sops \ + --encrypt \ + --in-place \ + --encrypted-regex '(password|htpasswd|cert|key|apiUrl|caCert)$' \ + --pgp "$FINGERPRINT" \ + "$CONFIG" diff --git a/install.sh b/install.sh new file mode 100755 index 0000000..aea85c1 --- /dev/null +++ b/install.sh @@ -0,0 +1,143 @@ +#!/bin/bash -e + +# Copyright (C) 2020 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Print the usage message to stderr and exit with status 1. +usage() { + me=$(basename "$0") + echo >&2 "Usage: $me [--output OUTPUT] [--dryrun] CONFIG" + exit 1 +} + +while test $# -gt 0 ; do + case "$1" in + --output) + # Without a value the second shift would fail and -e would end the + # script without any message. + test $# -ge 2 || usage + shift + OUTPUT=$1 + shift + ;; + + --dryrun) + DRYRUN="true" + shift + ;; + + *) + break + esac +done + +test -z "$OUTPUT" && OUTPUT="$(dirname "$0")/dist" + +CONFIG=$1 +test -z "$CONFIG" && usage + +NAMESPACE=$(yq r "$CONFIG" namespace) +TMP_CONFIG=$OUTPUT/$(basename "$CONFIG") + +# Echo "upgrade" if a helm release named exactly $1 is listed for $NAMESPACE, +# otherwise echo "install". +function updateOrInstall() { + # -F -x: compare whole lines literally, so a release whose name merely + # contains $1 is not mistaken for it. + if test -n "$(helm ls -n "$NAMESPACE" --short | grep -Fx -- "$1")"; then + echo "upgrade" + else + echo "install" + fi +} + +# Build the htpasswd entry from the username and password found under the yq +# path $1 of the plain-text $TMP_CONFIG and write it to <path>.htpasswd. +function addHtpasswdEntryUnencrypted() { + local COMPONENT=$1 + + # Declared separately: "local VAR=$(...)" would hide a failing htpasswd + # from -e. + local HTPASSWD + HTPASSWD=$(htpasswd -nb \ + "$(yq r "$TMP_CONFIG" "$COMPONENT.username")" \ + "$(yq r "$TMP_CONFIG" "$COMPONENT.password")") + + yq w -i "$TMP_CONFIG" "$COMPONENT.htpasswd" "$HTPASSWD" +} + +# Build the htpasswd entry for a sops-encrypted $TMP_CONFIG. +# $1 is the sops tree path of the component, e.g. "['loki']". +function addHtpasswdEntryEncrypted() { + local COMPONENT=$1 + + # Declared separately: "local VAR=$(...)" would hide a failing htpasswd + # from -e. + local HTPASSWD + HTPASSWD=$(htpasswd -nb \ + "$(sops -d --extract "$COMPONENT['username']" "$TMP_CONFIG")" \ + "$(sops -d --extract "$COMPONENT['password']" "$TMP_CONFIG")") + + sops --set "$COMPONENT['htpasswd'] \"$HTPASSWD\"" "$TMP_CONFIG" +} + +# Run ytt over the chart and promtail templates and write the result to +# $OUTPUT. $1 is passed as an additional -f input; the callers below pass "-" +# and pipe the data values in. +function runYtt() { + ytt \ + -f charts/namespace.yaml \ + -f charts/prometheus/ \ + -f charts/loki/ \ + -f charts/grafana/ \ + -f promtail/ \ + --output-directory "$OUTPUT" \ + --ignore-unknown-comments \ + -f "$1" +} + +mkdir -p "$OUTPUT" +cp "$CONFIG" "$TMP_CONFIG" + +# Fill in templates +# printf instead of echo -e: echo -e would also expand backslash sequences +# that occur inside the configuration itself. +if test -z "$(grep -o '^sops:$' "$TMP_CONFIG")"; then + addHtpasswdEntryUnencrypted loki + addHtpasswdEntryUnencrypted prometheus.server + printf '#@data/values\n---\n%s\n' "$(cat "$TMP_CONFIG")" | runYtt - +else + addHtpasswdEntryEncrypted "['loki']" "$TMP_CONFIG" + addHtpasswdEntryEncrypted "['prometheus']['server']" "$TMP_CONFIG" + printf '#@data/values\n---\n%s\n' "$(sops -d "$TMP_CONFIG")" | runYtt - +fi + +# Create configmap with dashboards +kubectl create configmap grafana-dashboards \ + --from-file=./dashboards \ + --dry-run=true \ + --namespace="$NAMESPACE" \ + -o yaml > 
"$OUTPUT/configuration/dashboards.cm.yaml" + +test -n "$DRYRUN" && exit 0 + +# Install loose components +kubectl apply -f "$OUTPUT/namespace.yaml" +kubectl apply -f "$OUTPUT/configuration" +kubectl apply -f "$OUTPUT/storage" + +# Add Loki helm repository +helm repo add loki https://grafana.github.io/loki/charts +helm repo update + +# Install or upgrade the helm release <component>-$NAMESPACE. +# $1: component; selects ./charts/<component>/VERSION and $OUTPUT/<component>.yaml +# $2: chart reference, e.g. stable/prometheus +function installChart() { + local CHART_NAME=$1-$NAMESPACE + helm "$(updateOrInstall "$CHART_NAME")" "$CHART_NAME" \ + "$2" \ + --version "$(cat "./charts/$1/VERSION")" \ + --values "$OUTPUT/$1.yaml" \ + --namespace "$NAMESPACE" +} + +# Install Prometheus +installChart prometheus stable/prometheus + +# Install Loki +installChart loki loki/loki + +# Install Grafana +installChart grafana stable/grafana diff --git a/promtail/promtail.yaml b/promtail/promtail.yaml new file mode 100644 index 0000000..e0b9042 --- /dev/null +++ b/promtail/promtail.yaml @@ -0,0 +1,124 @@ +#@ load("@ytt:data", "data") + +server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: #@ "{}/positions.yaml".format(data.values.promtail.storagePath) + +clients: + - url: #@ "https://{}/loki/api/v1/push".format(data.values.loki.host) + tls_config: + insecure_skip_verify: #@ data.values.tls.skipVerify + #@ if not data.values.tls.skipVerify: + ca_file: #@ "{}/promtail.ca.crt".format(data.values.promtail.storagePath) + #@ end + basic_auth: + username: #@ data.values.loki.username + password: #@ data.values.loki.password +scrape_configs: +- job_name: gerrit_error + static_configs: + - targets: + - localhost + labels: + job: gerrit_error + __path__: #@ "{}/error_log.json".format(data.values.promtail.logPath) + entry_parser: 
raw + pipeline_stages: + - json: + expressions: + level: + timestamp: '"@timestamp"' + exception: '"exception"' + thread: thread_name + logger: logger_name + class: + message: + - json: + source: exception + expressions: + exception_message: + exception_class: + - template: + source: timestamp + template: '{{ Replace .Value "," "." 1 }}' + - template: + source: timestamp + template: '{{ Replace .Value "Z" " +0000" 1 }}' + - template: + source: timestamp + template: '{{ Replace .Value "T" " " 1 }}' + - timestamp: + source: timestamp + format: "2006-01-02 15:04:05.999 -0700" + # The named capture group supplies the value for the gerrit_version label + # listed in the labels stage below. + - regex: + source: message + expression: "Gerrit Code Review (?P<gerrit_version>.*) ready" + - labels: + level: + exception_message: + exception_class: + thread: + logger: + class: + gerrit_version: +- job_name: gerrit_httpd + static_configs: + - targets: + - localhost + labels: + job: gerrit_httpd + __path__: #@ "{}/httpd_log.json".format(data.values.promtail.logPath) + entry_parser: raw + pipeline_stages: + - json: + expressions: + timestamp: + thread: + user: + method: + status: + protocol: + - template: + source: timestamp + template: '{{ Replace .Value "," "." 1 }}' + - timestamp: + source: timestamp + format: '02/Jan/2006:15:04:05.999 -0700' + - labels: + thread: + user: + method: + status: + protocol: +- job_name: gerrit_sshd + static_configs: + - targets: + - localhost + labels: + job: gerrit_sshd + __path__: #@ "{}/sshd_log.json".format(data.values.promtail.logPath) + entry_parser: raw + pipeline_stages: + - json: + expressions: + timestamp: + session: + thread: + user: + account_id: + status: + - template: + source: timestamp + template: '{{ Replace .Value "," "." 
1 }}' + - timestamp: + source: timestamp + format: 2006-01-02 15:04:05.999 -0700 + - labels: + session: + thread: + user: + account_id: + status: diff --git a/uninstall.sh b/uninstall.sh new file mode 100755 index 0000000..de5226a --- /dev/null +++ b/uninstall.sh @@ -0,0 +1,42 @@ +#!/bin/bash -e + +# Copyright (C) 2020 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Print the usage message to stderr and exit with status 1. +usage() { + me=$(basename "$0") + echo >&2 "Usage: $me CONFIG" + exit 1 +} + +test -z "$1" && usage +CONFIG=$1 + +NAMESPACE=$(yq r "$CONFIG" namespace) +# With an empty namespace the commands below would run with an empty release +# suffix and an empty namespace argument, so stop before anything is removed. +if test -z "$NAMESPACE"; then + echo >&2 "No namespace found in $CONFIG" + exit 1 +fi + +# Ask for confirmation and, on the answer "y", uninstall the helm release +# $1-$NAMESPACE. A failing uninstall is ignored so that the remaining +# components are still offered for removal. +function removeHelmDeployment() { + read -r -p "This will remove the deployment $1-$NAMESPACE. Continue (y/n)? " response + if [[ "$response" == "y" ]]; then + helm uninstall "$1-$NAMESPACE" -n "$NAMESPACE" || true + fi +} + +removeHelmDeployment grafana +removeHelmDeployment loki +removeHelmDeployment prometheus + +read -r -p "This will remove the namespace $NAMESPACE. Continue (y/n)? " response +if [[ "$response" == "y" ]]; then + kubectl delete ns "$NAMESPACE" +fi