better gerrit integration #5

Merged
puck merged 14 commits from better-gerrit into gerrit 2024-03-11 13:57:56 +00:00
3 changed files with 250 additions and 287 deletions

View file

@ -2,7 +2,7 @@ import json
import multiprocessing
import os
import sys
import uuid
import graphlib
from collections import defaultdict
from collections.abc import Generator
from dataclasses import dataclass
@ -20,6 +20,12 @@ from buildbot.util import asyncSleep
from buildbot.www.authz.endpointmatchers import EndpointMatcherBase, Match
from buildbot.www.oauth2 import OAuth2Auth
from buildbot.changes.gerritchangesource import GerritChangeSource
from buildbot.reporters.utils import getURLForBuild
from buildbot.reporters.utils import getURLForBuildrequest
from buildbot.process.buildstep import CANCELLED
from buildbot.process.buildstep import EXCEPTION
from buildbot.process.buildstep import SUCCESS
from buildbot.process.results import worst_status
if TYPE_CHECKING:
from buildbot.process.log import Log
@ -32,8 +38,6 @@ from .github_projects import (
slugify_project_name,
)
SKIPPED_BUILDER_NAME = "skipped-builds"
log = Logger()
class LixSystemsOAuth2(OAuth2Auth):
@ -52,82 +56,206 @@ class GerritProject:
# `project` field.
name: str
class BuildTrigger(Trigger):
"""Dynamic trigger that creates a build for every attribute."""
class BuildTrigger(steps.BuildStep):
def __init__(
self,
builds_scheduler: str,
skipped_builds_scheduler: str,
jobs: list[dict[str, Any]],
all_deps: dict[str, Any],
**kwargs: Any,
) -> None:
if "name" not in kwargs:
kwargs["name"] = "trigger"
self.jobs = jobs
self.all_deps = all_deps
self.config = None
self.builds_scheduler = builds_scheduler
self.skipped_builds_scheduler = skipped_builds_scheduler
Trigger.__init__(
self,
waitForFinish=True,
schedulerNames=[builds_scheduler, skipped_builds_scheduler],
haltOnFailure=True,
flunkOnFailure=True,
sourceStamps=[],
alwaysUseLatest=False,
updateSourceStamp=False,
**kwargs,
)
self._result_list = []
self.ended = False
self.waitForFinishDeferred = None
self.brids = []
self.description = f"building {len(jobs)} hydra jobs"
super().__init__(**kwargs)
def createTriggerProperties(self, props: Any) -> Any: # noqa: N802
return props
def interrupt(self, reason):
    """Cancel all triggered build requests and mark this step as ended.

    ``reason`` is accepted for interface compatibility but is not used.
    """
    # Cancelling the buildrequests is enough because the data api covers
    # both situations:
    # - the build already started: a stop is sent,
    # - the build was not created yet: the buildrequest becomes CANCELLED.
    # There is a known race condition here (see buildbot.data.buildrequests).
    for brid in self.brids:
        target = ("buildrequests", brid)
        self.master.data.control(
            "cancel", {'reason': 'parent build was interrupted'}, target
        )

    if not self.running or self.ended:
        return
    self.ended = True

    # When the interruption is caused by a lost connection, the pending
    # wait is cancelled synchronously.
    if self.build.conn is not None:
        return
    if self.waitForFinishDeferred is None:
        return
    self.waitForFinishDeferred.cancel()
def getSchedulersAndProperties(self) -> list[tuple[str, Properties]]: # noqa: N802
def getSchedulerByName(self, name):
    """Look up a scheduler service on the master by its name.

    Raises:
        ValueError: if no scheduler called ``name`` is registered.
    """
    registered = self.master.scheduler_manager.namedServices
    if name in registered:
        # todo: check ITriggerableScheduler
        return registered[name]
    raise ValueError(f"unknown triggered scheduler: {repr(name)}")
def schedule_one(self, build_props, job):
    """Assemble the trigger properties for a single evaluated job.

    Returns a ``(scheduler_name, properties)`` tuple; the scheduler name is
    always ``self.builds_scheduler``. For a job carrying an ``error`` entry
    only the display properties and the error are set. Otherwise the job's
    derivation and output paths are also recorded on ``build_props`` under
    ``<attr>-out_path`` / ``<attr>-drv_path``.
    """
    origin = "nix-eval-lix"
    attr = job.get("attr", "eval-error")
    failure = job.get("error")

    props = Properties()
    props.setProperty("virtual_builder_name", f"hydraJobs.{attr}", origin)
    props.setProperty("status_name", f"nix-build .#hydraJobs.{attr}", origin)
    props.setProperty("virtual_builder_tags", "", origin)

    if failure is not None:
        # Evaluation failed: there is no derivation to describe.
        props.setProperty("error", failure, origin)
    else:
        drv_path = job.get("drvPath")
        system = job.get("system")
        out_path = job.get("outputs", {}).get("out")

        # Expose the paths on the parent build as well.
        build_props.setProperty(f"{attr}-out_path", out_path, origin)
        build_props.setProperty(f"{attr}-drv_path", drv_path, origin)

        per_job = (
            ("attr", attr),
            ("system", system),
            ("drv_path", drv_path),
            ("out_path", out_path),
            ("isCached", job.get("isCached")),
        )
        for key, value in per_job:
            props.setProperty(key, value, origin)

    return (self.builds_scheduler, props)
@defer.inlineCallbacks
def _add_results(self, brid):
    """Wait for build request ``brid`` to complete, then record its results.

    Once the request is complete, the ``results`` value of every build
    attached to it is appended to ``self._result_list`` and the step
    summary is refreshed.
    """

    @defer.inlineCallbacks
    def already_complete():
        # Passed to waitUntilEvent as its check callback.
        row = yield self.master.db.buildrequests.getBuildRequest(brid)
        return row['complete']

    completion_event = ('buildrequests', str(brid), 'complete')
    yield self.master.mq.waitUntilEvent(completion_event, already_complete)

    finished = yield self.master.db.builds.getBuilds(buildrequestid=brid)
    self._result_list.extend(entry["results"] for entry in finished)
    self.updateSummary()
def prepareSourcestampListForTrigger(self):
    """Return the build's source stamps as dicts, sorted by codebase.

    When several source stamps share a codebase, the last one returned by
    ``self.build.getAllSourceStamps()`` is the one that is kept.
    """
    by_codebase = {
        stamp.codebase: stamp.asDict()
        for stamp in self.build.getAllSourceStamps()
    }
    return [by_codebase[codebase] for codebase in sorted(by_codebase)]
@defer.inlineCallbacks
def run(self):
self.running = True
build_props = self.build.getProperties()
source = f"nix-eval-lix"
triggered_schedulers = []
for job in self.jobs:
attr = job.get("attr", "eval-error")
name = attr
name = f"hydraJobs.{name}"
error = job.get("error")
props = Properties()
props.setProperty("virtual_builder_name", name, source)
props.setProperty("status_name", f"nix-build .#hydraJobs.{attr}", source)
props.setProperty("virtual_builder_tags", "", source)
builds_to_schedule = list(self.jobs)
build_schedule_order = []
sorter = graphlib.TopologicalSorter(self.all_deps)
for item in sorter.static_order():
i = 0
while i < len(builds_to_schedule):
if item == builds_to_schedule[i].get("drvPath"):
build_schedule_order.append(builds_to_schedule[i])
del builds_to_schedule[i]
else:
i += 1
if error is not None:
props.setProperty("error", error, source)
triggered_schedulers.append((self.skipped_builds_scheduler, props))
continue
done = []
scheduled = []
failed = {}
all_results = SUCCESS
ss_for_trigger = self.prepareSourcestampListForTrigger()
while not self.ended and (len(build_schedule_order) > 0 or len(scheduled) > 0):
print('Scheduling..')
schedule_now = []
for build in list(build_schedule_order):
if self.all_deps.get(build.get("drvPath"), []) == []:
build_schedule_order.remove(build)
schedule_now.append(build)
if len(schedule_now) == 0:
print(' No builds to schedule found.')
for job in schedule_now:
print(f" - {job.get('attr')}")
(scheduler, props) = self.schedule_one(build_props, job)
scheduler = self.getSchedulerByName(scheduler)
if job.get("isCached"):
triggered_schedulers.append((self.skipped_builds_scheduler, props))
continue
idsDeferred, resultsDeferred = scheduler.trigger(
waited_for = True,
sourcestamps = ss_for_trigger,
set_props = props,
parent_buildid = self.build.buildid,
parent_relationship = "Triggered from",
)
drv_path = job.get("drvPath")
system = job.get("system")
out_path = job.get("outputs", {}).get("out")
brids = {}
try:
_, brids = yield idsDeferred
except Exception as e:
yield self.addLogWithException(e)
results = EXCEPTION
scheduled.append((job, brids, resultsDeferred))
build_props.setProperty(f"{attr}-out_path", out_path, source)
build_props.setProperty(f"{attr}-drv_path", drv_path, source)
for brid in brids.values():
url = getURLForBuildrequest(self.master, brid)
yield self.addURL(f"{scheduler.name} #{brid}", url)
self._add_results(brid)
self.brids.append(brid)
print('Waiting..')
wait_for_next = defer.DeferredList([results for _, _, results in scheduled], fireOnOneCallback = True, fireOnOneErrback=True)
self.waitForFinishDeferred = wait_for_next
results, index = yield wait_for_next
job, brids, _ = scheduled[index]
done.append((job, brids, results))
del scheduled[index]
result = results[0]
print(f' Found finished build {job.get("attr")}, result {util.Results[result].upper()}')
if result != SUCCESS:
failed_checks = []
failed_paths = [job.get('drvPath')]
removed = []
while True:
old_paths = list(failed_paths)
for build in list(build_schedule_order):
deps = self.all_deps.get(build.get("drvPath"), [])
for path in old_paths:
if path in deps:
failed_checks.append(build)
failed_paths.append(build.get("drvPath"))
build_schedule_order.remove(build)
removed.append(build.get("attr"))
failed[build.get("attr")] = (f"dependency {job.get('attr')} failed", [])
props.setProperty("attr", attr, source)
props.setProperty("system", system, source)
props.setProperty("drv_path", drv_path, source)
props.setProperty("out_path", out_path, source)
# we use this to identify builds when running a retry
props.setProperty("build_uuid", str(uuid.uuid4()), source)
triggered_schedulers.append((self.builds_scheduler, props))
return triggered_schedulers
break
if old_paths == failed_paths:
break
failed[job.get("attr")] = (
"failed",
[ getURLForBuildrequest(self.master, brid) for brid in brids.values() ]
)
print(' Removed jobs: ' + ', '.join(removed))
all_results = worst_status(result, all_results)
print(f' New result: {util.Results[all_results].upper()}')
for dep in self.all_deps:
if job.get("drvPath") in self.all_deps[dep]:
self.all_deps[dep].remove(job.get("drvPath"))
print('Done!')
build_props.setProperty("failed_builds", failed, "nix-eval-lix")
if self.ended:
return util.CANCELLED
return all_results
def getCurrentSummary(self) -> dict[str, str]: # noqa: N802
"""The original build trigger will the generic builder name `nix-build` in this case, which is not helpful"""
if not self.triggeredNames:
return {"step": "running"}
summary = []
if self._result_list:
for status in ALL_RESULTS:
@ -178,13 +306,42 @@ class NixEvalCommand(buildstep.ShellMixin, steps.BuildStep):
if not system or system in self.supported_systems: # report eval errors
filtered_jobs.append(job)
drv_show_log: Log = yield self.getLog("stdio")
drv_show_log.addStdout(f"getting derivation infos\n")
cmd = yield self.makeRemoteShellCommand(
stdioLogName=None,
collectStdout=True,
command=(
["nix", "derivation", "show", "--recursive"]
+ [ drv for drv in (job.get("drvPath") for job in filtered_jobs) if drv ]
),
)
yield self.runCommand(cmd)
drv_show_log.addStdout(f"done\n")
try:
drv_info = json.loads(cmd.stdout)
except json.JSONDecodeError as e:
msg = f"Failed to parse `nix derivation show` output for {cmd.command}"
raise BuildbotNixError(msg) from e
all_deps = dict()
for drv, info in drv_info.items():
all_deps[drv] = set(info.get("inputDrvs").keys())
def closure_of(key, deps):
    """Return the transitive dependencies of ``key``, excluding ``key`` itself.

    Args:
        key: the node whose dependency closure is wanted.
        deps: mapping from a node to an iterable of its direct dependencies.
            A node with no entry in the mapping is treated as having no
            dependencies instead of raising ``KeyError``.

    Returns:
        A set of every node reachable from ``key`` through ``deps``.
        ``key`` is never part of the result, even when it sits on a cycle.
    """
    seen = {key}
    pending = [key]
    # Worklist traversal: each node is expanded exactly once.
    while pending:
        node = pending.pop()
        for dep in deps.get(node, ()):
            if dep not in seen:
                seen.add(dep)
                pending.append(dep)
    seen.discard(key)
    return seen
job_set = set(( drv for drv in ( job.get("drvPath") for job in filtered_jobs ) if drv ))
all_deps = { k: list(closure_of(k, all_deps).intersection(job_set)) for k in job_set }
self.build.addStepsAfterCurrentStep(
[
BuildTrigger(
builds_scheduler=f"lix-nix-build",
skipped_builds_scheduler=f"lix-nix-skipped-build",
name="build flake",
jobs=filtered_jobs,
all_deps=all_deps,
),
],
)
@ -192,37 +349,6 @@ class NixEvalCommand(buildstep.ShellMixin, steps.BuildStep):
return result
# FIXME this leaks memory... but probably not enough that we care
class RetryCounter:
    """Per-build retry budget, keyed by build id.

    Every build id starts with ``retries`` as its budget the first time it
    is looked up. Entries are never removed.
    """

    def __init__(self, retries: int) -> None:
        def initial_budget() -> int:
            return retries

        self.builds: dict[uuid.UUID, int] = defaultdict(initial_budget)

    def retry_build(self, build_id: uuid.UUID) -> int:
        """Return the current budget for ``build_id`` and consume one retry.

        While the stored budget is greater than 1 it is returned and then
        decremented; once it has reached 1 (or lower) the result is 0.
        """
        remaining = self.builds[build_id]
        if remaining <= 1:
            return 0
        self.builds[build_id] = remaining - 1
        return remaining
# For now we limit this to two. Often this allows us to make the error log
# shorter because we won't see the logs for all previous succeeded builds
RETRY_COUNTER = RetryCounter(retries=2)
class EvalErrorStep(steps.BuildStep):
    """Build step that reports a failed Nix evaluation.

    It writes the ``error`` property, prefixed with the ``attr`` property,
    to the stderr channel of a step log named ``nix_error`` and always
    finishes with ``util.FAILURE``.
    """

    @defer.inlineCallbacks
    def run(self) -> Generator[Any, object, Any]:
        error, attr = self.getProperty("error"), self.getProperty("attr")
        # Surface the evaluation error in its own log.
        report: Log = yield self.addLog("nix_error")
        report.addStderr(f"{attr} failed to evaluate:\n{error}")
        return util.FAILURE
class NixBuildCommand(buildstep.ShellMixin, steps.BuildStep):
"""Builds a nix derivation."""
@ -232,16 +358,25 @@ class NixBuildCommand(buildstep.ShellMixin, steps.BuildStep):
@defer.inlineCallbacks
def run(self) -> Generator[Any, object, Any]:
if error := self.getProperty("error"):
attr = self.getProperty("attr")
# show eval error
error_log: Log = yield self.addLog("nix_error")
error_log.addStderr(f"{attr} failed to evaluate:\n{error}")
return util.FAILURE
if self.getProperty("isCached"):
yield self.addCompleteLog(
"cached outpath from previous builds",
# buildbot apparently hides the first line in the ui?
f'\n{self.getProperty("out_path")}\n')
return util.SKIPPED
# run `nix build`
cmd: remotecommand.RemoteCommand = yield self.makeRemoteShellCommand()
yield self.runCommand(cmd)
res = cmd.results()
if res == util.FAILURE:
retries = RETRY_COUNTER.retry_build(self.getProperty("build_uuid"))
if retries > 0:
return util.RETRY
return res
return cmd.results()
class UpdateBuildOutput(steps.BuildStep):
@ -269,32 +404,6 @@ class UpdateBuildOutput(steps.BuildStep):
return util.SUCCESS
# The builtin retry mechanism doesn't seem to work for github,
# since github is sometimes not delivering the pull request ref fast enough.
class GitWithRetry(steps.Git):
    """Git step whose ``run_vc`` is attempted up to three times.

    After the first and second failure the exception is written to a step
    log named ``log`` and the step waits ``asyncSleep(2 << attempt)``, i.e.
    4 and then 8 (presumably seconds), before trying again. A third failure
    raises ``BuildbotNixError`` chained to the underlying exception.
    """

    @defer.inlineCallbacks
    def run_vc(
        self,
        branch: str,
        revision: str,
        patch: str,
    ) -> Generator[Any, object, Any]:
        # The last attempt always either returns or raises, so the loop
        # never falls through.
        for attempt in (1, 2, 3):
            try:
                outcome = yield super().run_vc(branch, revision, patch)
            except Exception as e:  # noqa: BLE001
                if attempt == 3:
                    msg = "Failed to clone"
                    raise BuildbotNixError(msg) from e
                retry_log: Log = yield self.addLog("log")
                yield retry_log.addStderr(f"Retrying git clone (error: {e})\n")
                yield asyncSleep(2 << attempt)  # 4, then 8
            else:
                return outcome
def nix_eval_config(
project: GerritProject,
gerrit_private_key: str,
@ -318,8 +427,12 @@ def nix_eval_config(
sshPrivateKey=gerrit_private_key
),
)
# use one gcroots directory per worker. this should be scoped to the largest unique resource
# in charge of builds (ie, buildnumber is too narrow) to not litter the system with permanent
# gcroots in case of worker restarts.
# TODO perhaps we should clean the entire /drvs/ directory up too during startup.
drv_gcroots_dir = util.Interpolate(
"/nix/var/nix/gcroots/per-user/buildbot-worker/%(prop:project)s/drvs/",
"/nix/var/nix/gcroots/per-user/buildbot-worker/%(prop:project)s/drvs/%(prop:workername)s/",
)
factory.addStep(
@ -356,6 +469,7 @@ def nix_eval_config(
"-rf",
drv_gcroots_dir,
],
alwaysRun=True,
),
)
@ -368,25 +482,9 @@ def nix_eval_config(
)
@dataclass
class CachixConfig:
name: str
signing_key_secret_name: str | None = None
auth_token_secret_name: str | None = None
def cachix_env(self) -> dict[str, str]:
env = {}
if self.signing_key_secret_name is not None:
env["CACHIX_SIGNING_KEY"] = util.Secret(self.signing_key_secret_name)
if self.auth_token_secret_name is not None:
env["CACHIX_AUTH_TOKEN"] = util.Secret(self.auth_token_secret_name)
return env
def nix_build_config(
project: GerritProject,
worker_names: list[str],
cachix: CachixConfig | None = None,
outputs_path: Path | None = None,
) -> util.BuilderConfig:
"""Builds one nix flake attribute."""
@ -417,19 +515,6 @@ def nix_build_config(
haltOnFailure=True,
),
)
if cachix:
factory.addStep(
steps.ShellCommand(
name="Upload cachix",
env=cachix.cachix_env(),
command=[
"cachix",
"push",
cachix.name,
util.Interpolate("result-%(prop:attr)s"),
],
),
)
factory.addStep(
steps.ShellCommand(
@ -471,38 +556,6 @@ def nix_build_config(
)
def nix_skipped_build_config(
    project: GerritProject,
    worker_names: list[str],
) -> util.BuilderConfig:
    """Create the placeholder builder that is triggered for skipped builds.

    The builder is named ``<project.name>/nix-skipped-build`` and has two
    steps: an ``EvalErrorStep`` that runs and is shown only when the
    ``error`` property is set, and a step that never runs and is hidden
    when the ``error`` property is set, standing in for a cached build.
    """

    def eval_error(step):
        return step.getProperty("error")

    factory = util.BuildFactory()

    report_eval_error = EvalErrorStep(
        name="Nix evaluation",
        doStepIf=eval_error,
        hideStepIf=lambda _, step: not eval_error(step),
    )
    # Placeholder only: it exists to display the cached build.
    cached_placeholder = steps.BuildStep(
        name="Nix build (cached)",
        doStepIf=lambda _: False,
        hideStepIf=lambda _, step: eval_error(step),
    )
    for step in (report_eval_error, cached_placeholder):
        factory.addStep(step)

    builder_name = f"{project.name}/nix-skipped-build"
    return util.BuilderConfig(
        name=builder_name,
        project=project.name,
        workernames=worker_names,
        collapseRequests=False,
        env={},
        factory=factory,
    )
def read_secret_file(secret_name: str) -> str:
directory = os.environ.get("CREDENTIALS_DIRECTORY")
if directory is None:
@ -520,7 +573,6 @@ def config_for_project(
nix_eval_worker_count: int,
nix_eval_max_memory_size: int,
eval_lock: util.MasterLock,
cachix: CachixConfig | None = None,
outputs_path: Path | None = None,
) -> Project:
config["projects"].append(Project(project.name))
@ -540,11 +592,6 @@ def config_for_project(
name=f"{project.name}-nix-build",
builderNames=[f"{project.name}/nix-build"],
),
# this is triggered from `nix-eval` when the build is skipped
schedulers.Triggerable(
name=f"{project.name}-nix-skipped-build",
builderNames=[f"{project.name}/nix-skipped-build"],
),
# allow to manually trigger a nix-build
schedulers.ForceScheduler(
name=f"{project.name}-force",
@ -582,10 +629,8 @@ def config_for_project(
nix_build_config(
project,
worker_names,
cachix=cachix,
outputs_path=outputs_path,
),
nix_skipped_build_config(project, [SKIPPED_BUILDER_NAME]),
],
)
@ -608,62 +653,22 @@ def gerritReviewCB(builderName, build, result, master, arg):
if builderName != 'lix/nix-eval':
return dict()
all_checks = {}
for step in build['steps']:
if step['name'] != 'build flake':
continue
failed = build['properties'].get('failed_builds', [{}])[0]
for url in step['urls']:
if url['name'].startswith('success: hydraJobs.'):
path = url['name'].split(' ')[1]
all_checks[path] = (True, url['url'])
elif url['name'].startswith('failure: hydraJobs.'):
path = url['name'].split(' ')[1]
all_checks[path] = (False, url['url'])
collected_oses = {}
for check in all_checks:
arch = check.split('.')[-1]
if not arch.endswith('-linux') and not arch.endswith('-darwin'):
# Not an architecture-specific job, just a test
os = "test"
else:
os = arch.split('-')[1]
(success, failure) = collected_oses.get(os, (0, 0))
if all_checks[check][0]:
success += 1
else:
failure += 1
collected_oses[os] = (success, failure)
labels = {}
if 'linux' in collected_oses:
(success, failure) = collected_oses['linux']
if success > 0 and failure == 0:
labels['Verified-On-Linux'] = 1
elif failure > 0:
labels['Verified-On-Linux'] = -1
if 'darwin' in collected_oses:
(success, failure) = collected_oses['darwin']
if success > 0 and failure == 0:
labels['Verified-On-Darwin'] = 1
elif failure > 0:
labels['Verified-On-Darwin'] = -1
labels = {
'Verified': -1 if failed else 1,
}
message = "Buildbot finished compiling your patchset!\n"
message += "The result is: %s\n" % util.Results[result].upper()
if result != util.SUCCESS:
successful_checks = []
failed_checks = []
for check in all_checks:
if not all_checks[check][0]:
failed_checks.append(f" - {check} (see {all_checks[check][1]})")
if len(failed_checks) > 0:
message += "Failed checks:\n" + "\n".join(failed_checks) + "\n"
message += "\nFailed checks:\n"
for check, context in sorted(failed.items()):
how, urls = context
message += f" - {check}: {how}"
if urls:
message += f" (see {', '.join(urls)})"
message += "\n"
if arg:
message += "\nFor more details visit:\n"
@ -724,7 +729,6 @@ class GerritNixConfigurator(ConfiguratorBase):
nix_eval_worker_count: int | None,
nix_eval_max_memory_size: int,
nix_workers_secret_name: str = "buildbot-nix-workers", # noqa: S107
cachix: CachixConfig | None = None,
outputs_path: str | None = None,
) -> None:
super().__init__()
@ -737,7 +741,6 @@ class GerritNixConfigurator(ConfiguratorBase):
self.nix_supported_systems = nix_supported_systems
self.gerrit_change_source = GerritChangeSource(gerrit_server, gerrit_user, gerritport=gerrit_port, identity_file=gerrit_sshkey_path)
self.url = url
self.cachix = cachix
if outputs_path is None:
self.outputs_path = None
else:
@ -769,12 +772,10 @@ class GerritNixConfigurator(ConfiguratorBase):
self.nix_eval_worker_count or multiprocessing.cpu_count(),
self.nix_eval_max_memory_size,
eval_lock,
self.cachix,
self.outputs_path,
)
config["change_source"] = self.gerrit_change_source
config["workers"].append(worker.LocalWorker(SKIPPED_BUILDER_NAME))
config["services"].append(
reporters.GerritStatusPush(self.gerrit_server, self.gerrit_user,
port=2022,

View file

@ -46,14 +46,6 @@ in
# optional nix-eval-jobs settings
# evalWorkerCount = 8; # limit number of concurrent evaluations
# evalMaxMemorySize = "2048"; # limit memory usage per evaluation
# optional cachix
#cachix = {
# name = "my-cachix";
# # One of the following is required:
# signingKey = "/var/lib/secrets/cachix-key";
# authToken = "/var/lib/secrets/cachix-token";
#};
};
})
buildbot-nix.nixosModules.buildbot-master

View file

@ -15,25 +15,6 @@ in
default = "postgresql://@/buildbot";
description = "Postgresql database url";
};
cachix = {
name = lib.mkOption {
type = lib.types.nullOr lib.types.str;
default = null;
description = "Cachix name";
};
signingKeyFile = lib.mkOption {
type = lib.types.nullOr lib.types.path;
default = null;
description = "Cachix signing key";
};
authTokenFile = lib.mkOption {
type = lib.types.nullOr lib.types.str;
default = null;
description = "Cachix auth token";
};
};
workersFile = lib.mkOption {
type = lib.types.path;
description = "File containing a list of nix workers";
@ -88,13 +69,6 @@ in
isSystemUser = true;
};
assertions = [
{
assertion = cfg.cachix.name != null -> cfg.cachix.signingKeyFile != null || cfg.cachix.authTokenFile != null;
message = "if cachix.name is provided, then cachix.signingKeyFile and cachix.authTokenFile must be set";
}
];
services.buildbot-master = {
enable = true;
@ -106,7 +80,7 @@ in
home = "/var/lib/buildbot";
extraImports = ''
from datetime import timedelta
from buildbot_nix import GerritNixConfigurator, CachixConfig
from buildbot_nix import GerritNixConfigurator
'';
configurators = [
''
@ -150,11 +124,7 @@ in
LoadCredential = [
"buildbot-nix-workers:${cfg.workersFile}"
"buildbot-oauth2-secret:${cfg.oauth2SecretFile}"
]
++ lib.optional (cfg.cachix.signingKeyFile != null)
"cachix-signing-key:${builtins.toString cfg.cachix.signingKeyFile}"
++ lib.optional (cfg.cachix.authTokenFile != null)
"cachix-auth-token:${builtins.toString cfg.cachix.authTokenFile}";
];
};
};