From fdfeef8ad415552f8866ec2f79847bed98e2b142 Mon Sep 17 00:00:00 2001 From: eldritch horrors Date: Sun, 10 Mar 2024 17:34:22 +0100 Subject: [PATCH 01/14] remove retry logic retries don't help us very much, in fact they mostly hurt by repeating builds that failed for non-transient reasons. retries could help with workers dropping while running a build, but those rare cases are better to restart manually than to pend at least twice the ci time for commits that simply do not build cleanly. --- buildbot_nix/__init__.py | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/buildbot_nix/__init__.py b/buildbot_nix/__init__.py index 2f50695..dc17dce 100644 --- a/buildbot_nix/__init__.py +++ b/buildbot_nix/__init__.py @@ -2,7 +2,6 @@ import json import multiprocessing import os import sys -import uuid from collections import defaultdict from collections.abc import Generator from dataclasses import dataclass @@ -118,8 +117,6 @@ class BuildTrigger(Trigger): props.setProperty("system", system, source) props.setProperty("drv_path", drv_path, source) props.setProperty("out_path", out_path, source) - # we use this to identify builds when running a retry - props.setProperty("build_uuid", str(uuid.uuid4()), source) triggered_schedulers.append((self.builds_scheduler, props)) return triggered_schedulers @@ -192,24 +189,6 @@ class NixEvalCommand(buildstep.ShellMixin, steps.BuildStep): return result -# FIXME this leaks memory... but probably not enough that we care -class RetryCounter: - def __init__(self, retries: int) -> None: - self.builds: dict[uuid.UUID, int] = defaultdict(lambda: retries) - - def retry_build(self, build_id: uuid.UUID) -> int: - retries = self.builds[build_id] - if retries > 1: - self.builds[build_id] = retries - 1 - return retries - return 0 - - -# For now we limit this to two. Often this allows us to make the error log -# shorter because we won't see the logs for all previous succeeded builds -RETRY_COUNTER = RetryCounter(retries=2) - - class EvalErrorStep(steps.BuildStep): """Shows the error message of a failed evaluation.""" @@ -236,12 +215,7 @@ class NixBuildCommand(buildstep.ShellMixin, steps.BuildStep): cmd: remotecommand.RemoteCommand = yield self.makeRemoteShellCommand() yield self.runCommand(cmd) - res = cmd.results() - if res == util.FAILURE: - retries = RETRY_COUNTER.retry_build(self.getProperty("build_uuid")) - if retries > 0: - return util.RETRY - return res + return cmd.results() class UpdateBuildOutput(steps.BuildStep): From 0b2545b0361a07e0eb100e3df9d27654f1f285b1 Mon Sep 17 00:00:00 2001 From: eldritch horrors Date: Mon, 11 Mar 2024 05:37:00 +0100 Subject: [PATCH 02/14] remove unused GitWithRetry --- buildbot_nix/__init__.py | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/buildbot_nix/__init__.py b/buildbot_nix/__init__.py index dc17dce..a858cd1 100644 --- a/buildbot_nix/__init__.py +++ b/buildbot_nix/__init__.py @@ -243,32 +243,6 @@ class UpdateBuildOutput(steps.BuildStep): return util.SUCCESS -# The builtin retry mechanism doesn't seem to work for github, -# since github is sometimes not delivering the pull request ref fast enough. -class GitWithRetry(steps.Git): - @defer.inlineCallbacks - def run_vc( - self, - branch: str, - revision: str, - patch: str, - ) -> Generator[Any, object, Any]: - retry_counter = 0 - while True: - try: - res = yield super().run_vc(branch, revision, patch) - except Exception as e: # noqa: BLE001 - retry_counter += 1 - if retry_counter == 3: - msg = "Failed to clone" - raise BuildbotNixError(msg) from e - log: Log = yield self.addLog("log") - yield log.addStderr(f"Retrying git clone (error: {e})\n") - yield asyncSleep(2 << retry_counter) # 2, 4, 8 - else: - return res - - def nix_eval_config( project: GerritProject, gerrit_private_key: str, From 753df8e34033fc4b5c237f5135b9222ed30320b4 Mon Sep 17 00:00:00 2001 From: eldritch horrors Date: Mon, 11 Mar 2024 05:45:33 +0100 Subject: [PATCH 03/14] remove cachix we aren't using it and it's somewhat in the way of our efforts to improve scheduling and stuff. --- buildbot_nix/__init__.py | 34 ---------------------------------- examples/default.nix | 8 -------- nix/coordinator.nix | 34 ++-------------------------------- 3 files changed, 2 insertions(+), 74 deletions(-) diff --git a/buildbot_nix/__init__.py b/buildbot_nix/__init__.py index a858cd1..43d9fe0 100644 --- a/buildbot_nix/__init__.py +++ b/buildbot_nix/__init__.py @@ -316,25 +316,9 @@ def nix_eval_config( ) -@dataclass -class CachixConfig: - name: str - signing_key_secret_name: str | None = None - auth_token_secret_name: str | None = None - - def cachix_env(self) -> dict[str, str]: - env = {} - if self.signing_key_secret_name is not None: - env["CACHIX_SIGNING_KEY"] = util.Secret(self.signing_key_secret_name) - if self.auth_token_secret_name is not None: - env["CACHIX_AUTH_TOKEN"] = util.Secret(self.auth_token_secret_name) - return env - - def nix_build_config( project: GerritProject, worker_names: list[str], - cachix: CachixConfig | None = None, outputs_path: Path | None = None, ) -> util.BuilderConfig: """Builds one nix flake attribute.""" @@ -365,19 +349,6 @@ def nix_build_config( haltOnFailure=True, ), ) - if cachix: - factory.addStep( - steps.ShellCommand( - name="Upload cachix", - env=cachix.cachix_env(), - command=[ - "cachix", - "push", - cachix.name, - util.Interpolate("result-%(prop:attr)s"), - ], - ), - ) factory.addStep( steps.ShellCommand( @@ -468,7 +439,6 @@ def config_for_project( nix_eval_worker_count: int, nix_eval_max_memory_size: int, eval_lock: util.MasterLock, - cachix: CachixConfig | None = None, outputs_path: Path | None = None, ) -> Project: config["projects"].append(Project(project.name)) @@ -530,7 +500,6 @@ def config_for_project( nix_build_config( project, worker_names, - cachix=cachix, outputs_path=outputs_path, ), nix_skipped_build_config(project, [SKIPPED_BUILDER_NAME]), @@ -672,7 +641,6 @@ class GerritNixConfigurator(ConfiguratorBase): nix_eval_worker_count: int | None, nix_eval_max_memory_size: int, nix_workers_secret_name: str = "buildbot-nix-workers", # noqa: S107 - cachix: CachixConfig | None = None, outputs_path: str | None = None, ) -> None: super().__init__() @@ -685,7 +653,6 @@ class GerritNixConfigurator(ConfiguratorBase): self.nix_supported_systems = nix_supported_systems self.gerrit_change_source = GerritChangeSource(gerrit_server, gerrit_user, gerritport=gerrit_port, identity_file=gerrit_sshkey_path) self.url = url - self.cachix = cachix if outputs_path is None: self.outputs_path = None else: @@ -717,7 +684,6 @@ class GerritNixConfigurator(ConfiguratorBase): self.nix_eval_worker_count or multiprocessing.cpu_count(), self.nix_eval_max_memory_size, eval_lock, - self.cachix, self.outputs_path, ) diff --git a/examples/default.nix b/examples/default.nix index f9fb42e..f59a01a 100644 --- a/examples/default.nix +++ b/examples/default.nix @@ -46,14 +46,6 @@ in # optional nix-eval-jobs settings # evalWorkerCount = 8; # limit number of concurrent evaluations # evalMaxMemorySize = "2048"; # limit memory usage per evaluation - - # optional cachix - #cachix = { - # name = "my-cachix"; - # # One of the following is required: - # signingKey = "/var/lib/secrets/cachix-key"; - # authToken = "/var/lib/secrets/cachix-token"; - #}; }; }) buildbot-nix.nixosModules.buildbot-master diff --git a/nix/coordinator.nix b/nix/coordinator.nix index 80a21f0..797d339 100644 --- a/nix/coordinator.nix +++ b/nix/coordinator.nix @@ -15,25 +15,6 @@ in default = "postgresql://@/buildbot"; description = "Postgresql database url"; }; - cachix = { - name = lib.mkOption { - type = lib.types.nullOr lib.types.str; - default = null; - description = "Cachix name"; - }; - - signingKeyFile = lib.mkOption { - type = lib.types.nullOr lib.types.path; - default = null; - description = "Cachix signing key"; - }; - - authTokenFile = lib.mkOption { - type = lib.types.nullOr lib.types.str; - default = null; - description = "Cachix auth token"; - }; - }; workersFile = lib.mkOption { type = lib.types.path; description = "File containing a list of nix workers"; @@ -88,13 +69,6 @@ in isSystemUser = true; }; - assertions = [ - { - assertion = cfg.cachix.name != null -> cfg.cachix.signingKeyFile != null || cfg.cachix.authTokenFile != null; - message = "if cachix.name is provided, then cachix.signingKeyFile and cachix.authTokenFile must be set"; - } - ]; - services.buildbot-master = { enable = true; @@ -106,7 +80,7 @@ in home = "/var/lib/buildbot"; extraImports = '' from datetime import timedelta - from buildbot_nix import GerritNixConfigurator, CachixConfig + from buildbot_nix import GerritNixConfigurator ''; configurators = [ '' @@ -150,11 +124,7 @@ in LoadCredential = [ "buildbot-nix-workers:${cfg.workersFile}" "buildbot-oauth2-secret:${cfg.oauth2SecretFile}" - ] - ++ lib.optional (cfg.cachix.signingKeyFile != null) - "cachix-signing-key:${builtins.toString cfg.cachix.signingKeyFile}" - ++ lib.optional (cfg.cachix.authTokenFile != null) - "cachix-auth-token:${builtins.toString cfg.cachix.authTokenFile}"; + ]; }; }; From 156e6e3deae3edd06de7de1dc1dd20de0b6a4c74 Mon Sep 17 00:00:00 2001 From: eldritch horrors Date: Mon, 11 Mar 2024 06:24:51 +0100 Subject: [PATCH 04/14] remove skipped-builds builder run all of them on the normal build worker. this significantly simplifies the overall scheduler/builder config and removes a triplication of possible builds paths. --- buildbot_nix/__init__.py | 80 +++++++++------------------------------- 1 file changed, 17 insertions(+), 63 deletions(-) diff --git a/buildbot_nix/__init__.py b/buildbot_nix/__init__.py index 43d9fe0..b795afe 100644 --- a/buildbot_nix/__init__.py +++ b/buildbot_nix/__init__.py @@ -31,8 +31,6 @@ from .github_projects import ( slugify_project_name, ) -SKIPPED_BUILDER_NAME = "skipped-builds" - log = Logger() class LixSystemsOAuth2(OAuth2Auth): @@ -57,7 +55,6 @@ class BuildTrigger(Trigger): def __init__( self, builds_scheduler: str, - skipped_builds_scheduler: str, jobs: list[dict[str, Any]], **kwargs: Any, ) -> None: @@ -66,11 +63,10 @@ class BuildTrigger(Trigger): self.jobs = jobs self.config = None self.builds_scheduler = builds_scheduler - self.skipped_builds_scheduler = skipped_builds_scheduler Trigger.__init__( self, waitForFinish=True, - schedulerNames=[builds_scheduler, skipped_builds_scheduler], + schedulerNames=[builds_scheduler], haltOnFailure=True, flunkOnFailure=True, sourceStamps=[], @@ -99,11 +95,7 @@ class BuildTrigger(Trigger): if error is not None: props.setProperty("error", error, source) - triggered_schedulers.append((self.skipped_builds_scheduler, props)) - continue - - if job.get("isCached"): - triggered_schedulers.append((self.skipped_builds_scheduler, props)) + triggered_schedulers.append((self.builds_scheduler, props)) continue drv_path = job.get("drvPath") @@ -117,6 +109,7 @@ class BuildTrigger(Trigger): props.setProperty("system", system, source) props.setProperty("drv_path", drv_path, source) props.setProperty("out_path", out_path, source) + props.setProperty("isCached", job.get("isCached"), source) triggered_schedulers.append((self.builds_scheduler, props)) return triggered_schedulers @@ -179,7 +172,6 @@ class NixEvalCommand(buildstep.ShellMixin, steps.BuildStep): [ BuildTrigger( builds_scheduler=f"lix-nix-build", - skipped_builds_scheduler=f"lix-nix-skipped-build", name="build flake", jobs=filtered_jobs, ), @@ -189,19 +181,6 @@ class NixEvalCommand(buildstep.ShellMixin, steps.BuildStep): return result -class EvalErrorStep(steps.BuildStep): - """Shows the error message of a failed evaluation.""" - - @defer.inlineCallbacks - def run(self) -> Generator[Any, object, Any]: - error = self.getProperty("error") - attr = self.getProperty("attr") - # show eval error - error_log: Log = yield self.addLog("nix_error") - error_log.addStderr(f"{attr} failed to evaluate:\n{error}") - return util.FAILURE - - class NixBuildCommand(buildstep.ShellMixin, steps.BuildStep): """Builds a nix derivation.""" @@ -211,6 +190,20 @@ class NixBuildCommand(buildstep.ShellMixin, steps.BuildStep): @defer.inlineCallbacks def run(self) -> Generator[Any, object, Any]: + if error := self.getProperty("error"): + attr = self.getProperty("attr") + # show eval error + error_log: Log = yield self.addLog("nix_error") + error_log.addStderr(f"{attr} failed to evaluate:\n{error}") + return util.FAILURE + + if self.getProperty("isCached"): + yield self.addCompleteLog( + "cached outpath from previous builds", + # buildbot apparently hides the first line in the ui? + f'\n{self.getProperty("out_path")}\n') + return util.SKIPPED + # run `nix build` cmd: remotecommand.RemoteCommand = yield self.makeRemoteShellCommand() yield self.runCommand(cmd) @@ -390,38 +383,6 @@ def nix_build_config( ) -def nix_skipped_build_config( - project: GerritProject, - worker_names: list[str], -) -> util.BuilderConfig: - """Dummy builder that is triggered when a build is skipped.""" - factory = util.BuildFactory() - factory.addStep( - EvalErrorStep( - name="Nix evaluation", - doStepIf=lambda s: s.getProperty("error"), - hideStepIf=lambda _, s: not s.getProperty("error"), - ), - ) - - # This is just a dummy step showing the cached build - factory.addStep( - steps.BuildStep( - name="Nix build (cached)", - doStepIf=lambda _: False, - hideStepIf=lambda _, s: s.getProperty("error"), - ), - ) - return util.BuilderConfig( - name=f"{project.name}/nix-skipped-build", - project=project.name, - workernames=worker_names, - collapseRequests=False, - env={}, - factory=factory, - ) - - def read_secret_file(secret_name: str) -> str: directory = os.environ.get("CREDENTIALS_DIRECTORY") if directory is None: @@ -458,11 +419,6 @@ def config_for_project( name=f"{project.name}-nix-build", builderNames=[f"{project.name}/nix-build"], ), - # this is triggered from `nix-eval` when the build is skipped - schedulers.Triggerable( - name=f"{project.name}-nix-skipped-build", - builderNames=[f"{project.name}/nix-skipped-build"], - ), # allow to manually trigger a nix-build schedulers.ForceScheduler( name=f"{project.name}-force", @@ -502,7 +458,6 @@ def config_for_project( worker_names, outputs_path=outputs_path, ), - nix_skipped_build_config(project, [SKIPPED_BUILDER_NAME]), ], ) @@ -688,7 +643,6 @@ class GerritNixConfigurator(ConfiguratorBase): ) config["change_source"] = self.gerrit_change_source - config["workers"].append(worker.LocalWorker(SKIPPED_BUILDER_NAME)) config["services"].append( reporters.GerritStatusPush(self.gerrit_server, self.gerrit_user, port=2022, From f869b52a8d4ecb2c8d1076639c3286d6e3956cfd Mon Sep 17 00:00:00 2001 From: eldritch horrors Date: Mon, 11 Mar 2024 06:40:14 +0100 Subject: [PATCH 05/14] use build-local gc-root directory without this two builds can interfere with each other if: - builds 1 and 2 start - build 1 is starved of workers - build 2 finishes, removes the shared gcroots directory - gc runs - build 1 schedules more builds whose .drvs have now been removed using a dedicated directory for each build fixes this. we now also need to set alwaysRun on the cleanup command or we risk littering the system with stale gc roots when a build fails. --- buildbot_nix/__init__.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/buildbot_nix/__init__.py b/buildbot_nix/__init__.py index b795afe..fc58e34 100644 --- a/buildbot_nix/__init__.py +++ b/buildbot_nix/__init__.py @@ -259,8 +259,12 @@ def nix_eval_config( sshPrivateKey=gerrit_private_key ), ) + # use one gcroots directory per worker. this should be scoped to the largest unique resource + # in charge of builds (ie, buildnumber is too narrow) to not litter the system with permanent + # gcroots in case of worker restarts. + # TODO perhaps we should clean the entire /drvs/ directory up too during startup. drv_gcroots_dir = util.Interpolate( - "/nix/var/nix/gcroots/per-user/buildbot-worker/%(prop:project)s/drvs/", + "/nix/var/nix/gcroots/per-user/buildbot-worker/%(prop:project)s/drvs/%(prop:workername)s/", ) factory.addStep( @@ -297,6 +301,7 @@ def nix_eval_config( "-rf", drv_gcroots_dir, ], + alwaysRun=True, ), ) From e9874c3d987eebe77e0bd6c665f4069eb067e711 Mon Sep 17 00:00:00 2001 From: eldritch horrors Date: Sun, 10 Mar 2024 21:27:45 +0100 Subject: [PATCH 06/14] wip: dependency-tracked build triggering --- buildbot_nix/__init__.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/buildbot_nix/__init__.py b/buildbot_nix/__init__.py index fc58e34..5e28500 100644 --- a/buildbot_nix/__init__.py +++ b/buildbot_nix/__init__.py @@ -56,11 +56,13 @@ class BuildTrigger(Trigger): self, builds_scheduler: str, jobs: list[dict[str, Any]], + drv_info: dict[str, Any], **kwargs: Any, ) -> None: if "name" not in kwargs: kwargs["name"] = "trigger" self.jobs = jobs + self.drv_info = drv_info self.config = None self.builds_scheduler = builds_scheduler Trigger.__init__( @@ -82,6 +84,20 @@ class BuildTrigger(Trigger): build_props = self.build.getProperties() source = f"nix-eval-lix" + all_deps = dict() + for drv, info in self.drv_info.items(): + all_deps[drv] = set(info.get("inputDrvs").keys()) + def closure_of(key, deps): + r, size = set([key]), 0 + while len(r) != size: + size = len(r) + r.update(*[ deps[k] for k in r ]) + return r.difference([key]) + job_set = set(( drv for drv in ( job.get("drvPath") for job in self.jobs ) if drv )) + all_deps = { k: list(closure_of(k, all_deps).intersection(job_set)) for k in job_set } + + build_props.setProperty("sched_state", all_deps, source, True) + triggered_schedulers = [] for job in self.jobs: attr = job.get("attr", "eval-error") @@ -168,12 +184,31 @@ class NixEvalCommand(buildstep.ShellMixin, steps.BuildStep): if not system or system in self.supported_systems: # report eval errors filtered_jobs.append(job) + drv_show_log: Log = yield self.getLog("stdio") + drv_show_log.addStdout(f"getting derivation infos\n") + cmd = yield self.makeRemoteShellCommand( + stdioLogName=None, + collectStdout=True, + command=( + ["nix", "derivation", "show", "--recursive"] + + [ drv for drv in (job.get("drvPath") for job in filtered_jobs) if drv ] + ), + ) + yield self.runCommand(cmd) + drv_show_log.addStdout(f"done\n") + try: + drv_info = json.loads(cmd.stdout) + except json.JSONDecodeError as e: + msg = f"Failed to parse `nix derivation show` output for {cmd.command}" + raise BuildbotNixError(msg) from e + self.build.addStepsAfterCurrentStep( [ BuildTrigger( builds_scheduler=f"lix-nix-build", name="build flake", jobs=filtered_jobs, + drv_info=drv_info, ), ], ) From 28ca39af258ccd05668ffdbfb4f227bda5f2c145 Mon Sep 17 00:00:00 2001 From: Puck Meerburg Date: Sun, 10 Mar 2024 21:27:24 +0000 Subject: [PATCH 07/14] WIP: Replace Trigger with custom logic --- buildbot_nix/__init__.py | 153 +++++++++++++++++++++++++++------------ 1 file changed, 106 insertions(+), 47 deletions(-) diff --git a/buildbot_nix/__init__.py b/buildbot_nix/__init__.py index 5e28500..3a5b4d3 100644 --- a/buildbot_nix/__init__.py +++ b/buildbot_nix/__init__.py @@ -2,6 +2,7 @@ import json import multiprocessing import os import sys +import graphlib from collections import defaultdict from collections.abc import Generator from dataclasses import dataclass @@ -19,6 +20,11 @@ from buildbot.util import asyncSleep from buildbot.www.authz.endpointmatchers import EndpointMatcherBase, Match from buildbot.www.oauth2 import OAuth2Auth from buildbot.changes.gerritchangesource import GerritChangeSource +from buildbot.reporters.utils import getURLForBuild +from buildbot.reporters.utils import getURLForBuildrequest +from buildbot.process.buildstep import CANCELLED +from buildbot.process.buildstep import EXCEPTION +from buildbot.process.buildstep import SUCCESS if TYPE_CHECKING: from buildbot.process.log import Log @@ -49,9 +55,7 @@ class GerritProject: # `project` field. name: str -class BuildTrigger(Trigger): - """Dynamic trigger that creates a build for every attribute.""" - +class BuildTrigger(steps.BuildStep): def __init__( self, builds_scheduler: str, @@ -59,28 +63,64 @@ class BuildTrigger(Trigger): drv_info: dict[str, Any], **kwargs: Any, ) -> None: - if "name" not in kwargs: - kwargs["name"] = "trigger" self.jobs = jobs self.drv_info = drv_info self.config = None self.builds_scheduler = builds_scheduler - Trigger.__init__( - self, - waitForFinish=True, - schedulerNames=[builds_scheduler], - haltOnFailure=True, - flunkOnFailure=True, - sourceStamps=[], - alwaysUseLatest=False, - updateSourceStamp=False, - **kwargs, - ) - def createTriggerProperties(self, props: Any) -> Any: # noqa: N802 - return props + def getSchedulerByName(self, name): + schedulers = self.master.scheduler_manager.namedServices + if name not in schedulers: + raise ValueError(f"unknown triggered scheduler: {repr(name)}") + sch = schedulers[name] + # todo: check ITriggerableScheduler + return sch - def getSchedulersAndProperties(self) -> list[tuple[str, Properties]]: # noqa: N802 + def schedule_one(self, build_props, job): + attr = job.get("attr", "eval-error") + name = attr + name = f"hydraJobs.{name}" + error = job.get("error") + props = Properties() + props.setProperty("virtual_builder_name", name, source) + props.setProperty("status_name", f"nix-build .#hydraJobs.{attr}", source) + props.setProperty("virtual_builder_tags", "", source) + + if error is not None: + props.setProperty("error", error, source) + return (self.builds_scheduler, props) + + drv_path = job.get("drvPath") + system = job.get("system") + out_path = job.get("outputs", {}).get("out") + + build_props.setProperty(f"{attr}-out_path", out_path, source) + build_props.setProperty(f"{attr}-drv_path", drv_path, source) + + props.setProperty("attr", attr, source) + props.setProperty("system", system, source) + props.setProperty("drv_path", drv_path, source) + props.setProperty("out_path", out_path, source) + props.setProperty("isCached", job.get("isCached"), source) + + return (self.builds_scheduler, props) + + @defer.inlineCallbacks + def _add_results(self, brid, results): + @defer.inlineCallbacks + def _is_buildrequest_complete(brid): + buildrequest = yield self.master.db.buildrequests.getBuildRequest(brid) + return buildrequest['complete'] + + event = ('buildrequests', str(brid), 'complete') + yield self.master.mq.waitUntilEvent(event, lambda: _is_buildrequest_complete(brid)) + builds = yield self.master.db.builds.getBuilds(buildrequestid=brid) + for build in builds: + self._result_list.append(build["results"]) + self.updateSummary() + + @defer.inlineCallbacks + def run(self): build_props = self.build.getProperties() source = f"nix-eval-lix" @@ -95,40 +135,59 @@ class BuildTrigger(Trigger): return r.difference([key]) job_set = set(( drv for drv in ( job.get("drvPath") for job in self.jobs ) if drv )) all_deps = { k: list(closure_of(k, all_deps).intersection(job_set)) for k in job_set } + builds_to_schedule = list(self.jobs) + build_schedule_order = [] + sorter = graphlib.TopologicalSorter(all_deps) + for item in sorter.static_order(): + i = 0 + while i < builds_to_schedule.len(): + if item == builds_to_schedule[i].get("drvPath"): + build_schedule_order.append(builds_to_schedule[i]) + del builds_to_schedule[i] + else: + i += 1 - build_props.setProperty("sched_state", all_deps, source, True) + done = [] + scheduled = [] + while len(build_schedule_order) > 0 and len(scheduled) > 0: + schedule_now = [] + for build in list(build_schedule_order): + if all_deps.get(build.get("drvPath"), []) == []: + build_schedule_order.remove(build) + schedule_now.append(build) - triggered_schedulers = [] - for job in self.jobs: - attr = job.get("attr", "eval-error") - name = attr - name = f"hydraJobs.{name}" - error = job.get("error") - props = Properties() - props.setProperty("virtual_builder_name", name, source) - props.setProperty("status_name", f"nix-build .#hydraJobs.{attr}", source) - props.setProperty("virtual_builder_tags", "", source) + for job in schedule_now: + (scheduler, props) = self.schedule_one(build_props, job) + scheduler = self.getSchedulerByName(scheduler) - if error is not None: - props.setProperty("error", error, source) - triggered_schedulers.append((self.builds_scheduler, props)) - continue + idsDeferred, resultsDeferred = scheduler.trigger( + waited_for = True, + sourcestamps = ss_for_trigger, + set_props = props, + parent_buildid = self.build.buildid, + parent_relationship = "Triggered from", + ) - drv_path = job.get("drvPath") - system = job.get("system") - out_path = job.get("outputs", {}).get("out") + brids = {} + try: + _, brids = yield idsDeferred + except Exception as e: + yield self.addLogWithException(e) + results = EXCEPTION + scheduled.append((job, brids, resultsDeferred)) - build_props.setProperty(f"{attr}-out_path", out_path, source) - build_props.setProperty(f"{attr}-drv_path", drv_path, source) + for brid in brids.values(): + url = getURLForBuildrequest(self.master, brid) + yield self.addURL(f"{sch.name} #{brid}", url) + self._add_results(brid) - props.setProperty("attr", attr, source) - props.setProperty("system", system, source) - props.setProperty("drv_path", drv_path, source) - props.setProperty("out_path", out_path, source) - props.setProperty("isCached", job.get("isCached"), source) - - triggered_schedulers.append((self.builds_scheduler, props)) - return triggered_schedulers + wait_for_next = defer.DeferredList([results for _, _, results in scheduled], fireOnOneCallback = True, fireOnOneErrback=True) + results, index = yield wait_for_next + job, brids, _ = scheduled[index] + done.append((job, brids, results)) + del scheduled[index] + # TODO: remove dep from all_deps + # TODO: calculate final result def getCurrentSummary(self) -> dict[str, str]: # noqa: N802 """The original build trigger will the generic builder name `nix-build` in this case, which is not helpful""" From 4d73275123fdd18603965c2783104cc21fc21445 Mon Sep 17 00:00:00 2001 From: Puck Meerburg Date: Sun, 10 Mar 2024 22:55:38 +0000 Subject: [PATCH 08/14] Add build result tracking, schedule newly available builds --- buildbot_nix/__init__.py | 87 +++++++++++++++++++++++++++++++++++----- 1 file changed, 76 insertions(+), 11 deletions(-) diff --git a/buildbot_nix/__init__.py b/buildbot_nix/__init__.py index 3a5b4d3..3b7a394 100644 --- a/buildbot_nix/__init__.py +++ b/buildbot_nix/__init__.py @@ -25,6 +25,7 @@ from buildbot.reporters.utils import getURLForBuildrequest from buildbot.process.buildstep import CANCELLED from buildbot.process.buildstep import EXCEPTION from buildbot.process.buildstep import SUCCESS +from buildbot.process.results import worst_status if TYPE_CHECKING: from buildbot.process.log import Log @@ -67,6 +68,27 @@ class BuildTrigger(steps.BuildStep): self.drv_info = drv_info self.config = None self.builds_scheduler = builds_scheduler + self._result_list = [] + self.ended = False + self.waitForFinishDeferred = None + super().__init__(**kwargs) + + def interrupt(self, reason): + # We cancel the buildrequests, as the data api handles + # both cases: + # - build started: stop is sent, + # - build not created yet: related buildrequests are set to CANCELLED. + # Note that there is an identified race condition though (more details + # are available at buildbot.data.buildrequests). + for brid in self.brids: + self.master.data.control( + "cancel", {'reason': 'parent build was interrupted'}, ("buildrequests", brid) + ) + if self.running and not self.ended: + self.ended = True + # if we are interrupted because of a connection lost, we interrupt synchronously + if self.build.conn is None and self.waitForFinishDeferred is not None: + self.waitForFinishDeferred.cancel() def getSchedulerByName(self, name): schedulers = self.master.scheduler_manager.namedServices @@ -77,6 +99,7 @@ class BuildTrigger(steps.BuildStep): return sch def schedule_one(self, build_props, job): + source = f"nix-eval-lix" attr = job.get("attr", "eval-error") name = attr name = f"hydraJobs.{name}" @@ -106,7 +129,7 @@ class BuildTrigger(steps.BuildStep): return (self.builds_scheduler, props) @defer.inlineCallbacks - def _add_results(self, brid, results): + def _add_results(self, brid): @defer.inlineCallbacks def _is_buildrequest_complete(brid): buildrequest = yield self.master.db.buildrequests.getBuildRequest(brid) @@ -119,6 +142,15 @@ class BuildTrigger(steps.BuildStep): self._result_list.append(build["results"]) self.updateSummary() + def prepareSourcestampListForTrigger(self): + ss_for_trigger = {} + objs_from_build = self.build.getAllSourceStamps() + for ss in objs_from_build: + ss_for_trigger[ss.codebase] = ss.asDict() + + trigger_values = [ss_for_trigger[k] for k in sorted(ss_for_trigger.keys())] + return trigger_values + @defer.inlineCallbacks def run(self): build_props = self.build.getProperties() @@ -140,7 +172,7 @@ class BuildTrigger(steps.BuildStep): sorter = graphlib.TopologicalSorter(all_deps) for item in sorter.static_order(): i = 0 - while i < builds_to_schedule.len(): + while i < len(builds_to_schedule): if item == builds_to_schedule[i].get("drvPath"): build_schedule_order.append(builds_to_schedule[i]) del builds_to_schedule[i] @@ -149,14 +181,20 @@ class BuildTrigger(steps.BuildStep): done = [] scheduled = [] - while len(build_schedule_order) > 0 and len(scheduled) > 0: + failed = [] + all_results = SUCCESS + ss_for_trigger = self.prepareSourcestampListForTrigger() + while len(build_schedule_order) > 0 or len(scheduled) > 0: + print('Scheduling..') schedule_now = [] for build in list(build_schedule_order): if all_deps.get(build.get("drvPath"), []) == []: build_schedule_order.remove(build) schedule_now.append(build) - + if len(schedule_now) == 0: + print(' No builds to schedule found.') for job in schedule_now: + print(f" - {job.get('attr')}") (scheduler, props) = self.schedule_one(build_props, job) scheduler = self.getSchedulerByName(scheduler) @@ -178,21 +216,46 @@ class BuildTrigger(steps.BuildStep): for brid in brids.values(): url = getURLForBuildrequest(self.master, brid) - yield self.addURL(f"{sch.name} #{brid}", url) + yield self.addURL(f"{scheduler.name} #{brid}", url) self._add_results(brid) - + print('Waiting..') wait_for_next = defer.DeferredList([results for _, _, results in scheduled], fireOnOneCallback = True, fireOnOneErrback=True) + self.waitForFinishDeferred = wait_for_next results, index = yield wait_for_next job, brids, _ = scheduled[index] done.append((job, brids, results)) del scheduled[index] - # TODO: remove dep from all_deps - # TODO: calculate final result + result = results[0] + print(f' Found finished build {job.get("attr")}, result {util.Results[result].upper()}') + if result != SUCCESS: + failed_checks = [] + failed_paths = [] + removed = [] + while True: + old_paths = list(failed_paths) + print(failed_checks, old_paths) + for build in list(build_schedule_order): + deps = all_deps.get(build.get("drvPath"), []) + for path in old_paths: + if path in deps: + failed_checks.append(build) + failed_paths.append(build.get("drvPath")) + build_schedule_order.remove(build) + removed.append(build.get("attr")) + + break + if old_paths == failed_paths: + break + print(' Removed jobs: ' + ', '.join(removed)) + all_results = worst_status(result, all_results) + print(f' New result: {util.Results[all_results].upper()}') + for dep in all_deps: + if job.get("drvPath") in all_deps[dep]: + all_deps[dep].remove(job.get("drvPath")) + print('Done!') + return all_results def getCurrentSummary(self) -> dict[str, str]: # noqa: N802 - """The original build trigger will the generic builder name `nix-build` in this case, which is not helpful""" - if not self.triggeredNames: - return {"step": "running"} summary = [] if self._result_list: for status in ALL_RESULTS: @@ -742,6 +805,7 @@ class GerritNixConfigurator(ConfiguratorBase): ) config["change_source"] = self.gerrit_change_source + """ config["services"].append( reporters.GerritStatusPush(self.gerrit_server, self.gerrit_user, port=2022, @@ -757,6 +821,7 @@ class GerritNixConfigurator(ConfiguratorBase): # summaryArg=self.url) ) + """ systemd_secrets = secrets.SecretInAFile( dirname=os.environ["CREDENTIALS_DIRECTORY"], From 9a15348984a63e965a5b931ce7e51d9f0751c9d6 Mon Sep 17 00:00:00 2001 From: Puck Meerburg Date: Sun, 10 Mar 2024 23:09:27 +0000 Subject: [PATCH 09/14] Fix up a few loose ends --- buildbot_nix/__init__.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/buildbot_nix/__init__.py b/buildbot_nix/__init__.py index 3b7a394..5552b91 100644 --- a/buildbot_nix/__init__.py +++ b/buildbot_nix/__init__.py @@ -153,6 +153,7 @@ class BuildTrigger(steps.BuildStep): @defer.inlineCallbacks def run(self): + self.running = True build_props = self.build.getProperties() source = f"nix-eval-lix" @@ -184,7 +185,7 @@ class BuildTrigger(steps.BuildStep): failed = [] all_results = SUCCESS ss_for_trigger = self.prepareSourcestampListForTrigger() - while len(build_schedule_order) > 0 or len(scheduled) > 0: + while not self.ended and (len(build_schedule_order) > 0 or len(scheduled) > 0): print('Scheduling..') schedule_now = [] for build in list(build_schedule_order): @@ -229,11 +230,10 @@ class BuildTrigger(steps.BuildStep): print(f' Found finished build {job.get("attr")}, result {util.Results[result].upper()}') if result != SUCCESS: failed_checks = [] - failed_paths = [] + failed_paths = [job.get('drvPath')] removed = [] while True: old_paths = list(failed_paths) - print(failed_checks, old_paths) for build in list(build_schedule_order): deps = all_deps.get(build.get("drvPath"), []) for path in old_paths: @@ -253,6 +253,8 @@ class BuildTrigger(steps.BuildStep): if job.get("drvPath") in all_deps[dep]: all_deps[dep].remove(job.get("drvPath")) print('Done!') + if self.ended: + return util.CANCELLED return all_results def getCurrentSummary(self) -> dict[str, str]: # noqa: N802 From 29a2ef63e23312af0c13ee1c562b0aeec198c1fd Mon Sep 17 00:00:00 2001 From: eldritch horrors Date: Mon, 11 Mar 2024 09:05:28 +0100 Subject: [PATCH 10/14] show hydra job count in trigger step previously we immediately triggered all jobs, now we no longer do. showing the total count at least somewhere is nice to have a rough indication of how much longer a build may still need to run. --- buildbot_nix/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/buildbot_nix/__init__.py b/buildbot_nix/__init__.py index 5552b91..456bf65 100644 --- a/buildbot_nix/__init__.py +++ b/buildbot_nix/__init__.py @@ -71,6 +71,7 @@ class BuildTrigger(steps.BuildStep): self._result_list = [] self.ended = False self.waitForFinishDeferred = None + self.description = f"building {len(jobs)} hydra jobs" super().__init__(**kwargs) def interrupt(self, reason): From 9933971ab0384f7e8baaa9460c9db9614ee16d6d Mon Sep 17 00:00:00 2001 From: eldritch horrors Date: Mon, 11 Mar 2024 09:06:29 +0100 Subject: [PATCH 11/14] re-enable the gerrit status reporter --- buildbot_nix/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/buildbot_nix/__init__.py b/buildbot_nix/__init__.py index 456bf65..19789a2 100644 --- a/buildbot_nix/__init__.py +++ b/buildbot_nix/__init__.py @@ -808,7 +808,6 @@ class GerritNixConfigurator(ConfiguratorBase): ) config["change_source"] = self.gerrit_change_source - """ config["services"].append( reporters.GerritStatusPush(self.gerrit_server, self.gerrit_user, port=2022, @@ -824,7 +823,6 @@ class GerritNixConfigurator(ConfiguratorBase): # summaryArg=self.url) ) - """ systemd_secrets = secrets.SecretInAFile( dirname=os.environ["CREDENTIALS_DIRECTORY"], From 13a67b483a2fe474409a1f01840f4172e5f63a07 Mon Sep 17 00:00:00 2001 From: eldritch horrors Date: Mon, 11 Mar 2024 13:05:12 +0100 Subject: [PATCH 12/14] fix interrupt() can't interrupt with things to interrupt. this is technically duplicated information but keeping parts of the code close to Trigger seems useful. --- buildbot_nix/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/buildbot_nix/__init__.py b/buildbot_nix/__init__.py index 19789a2..84f1b2e 100644 --- a/buildbot_nix/__init__.py +++ b/buildbot_nix/__init__.py @@ -71,6 +71,7 @@ class BuildTrigger(steps.BuildStep): self._result_list = [] self.ended = False self.waitForFinishDeferred = None + self.brids = [] self.description = f"building {len(jobs)} hydra jobs" super().__init__(**kwargs) @@ -220,6 +221,7 @@ class BuildTrigger(steps.BuildStep): url = getURLForBuildrequest(self.master, brid) yield self.addURL(f"{scheduler.name} #{brid}", url) self._add_results(brid) + self.brids.append(brid) print('Waiting..') wait_for_next = defer.DeferredList([results for _, _, results in scheduled], fireOnOneCallback = True, fireOnOneErrback=True) self.waitForFinishDeferred = wait_for_next From 51f7b52149a8a2c8e2345d7a6a7ed06d97ffe01f Mon Sep 17 00:00:00 2001 From: eldritch horrors Date: Mon, 11 Mar 2024 13:07:35 +0100 Subject: [PATCH 13/14] pre-filter drv_info into all_deps otherwise failure reporting is *enormous* with the entirety of a full derivation info dump in there --- buildbot_nix/__init__.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/buildbot_nix/__init__.py b/buildbot_nix/__init__.py index 84f1b2e..90dd947 100644 --- a/buildbot_nix/__init__.py +++ b/buildbot_nix/__init__.py @@ -61,11 +61,11 @@ class BuildTrigger(steps.BuildStep): self, builds_scheduler: str, jobs: list[dict[str, Any]], - drv_info: dict[str, Any], + all_deps: dict[str, Any], **kwargs: Any, ) -> None: self.jobs = jobs - self.drv_info = drv_info + self.all_deps = all_deps self.config = None self.builds_scheduler = builds_scheduler self._result_list = [] @@ -159,20 +159,9 @@ class BuildTrigger(steps.BuildStep): build_props = self.build.getProperties() source = f"nix-eval-lix" - all_deps = dict() - for drv, info in self.drv_info.items(): - all_deps[drv] = set(info.get("inputDrvs").keys()) - def closure_of(key, deps): - r, size = set([key]), 0 - while len(r) != size: - size = len(r) - r.update(*[ deps[k] for k in r ]) - return r.difference([key]) - job_set = set(( drv for drv in ( job.get("drvPath") for job in self.jobs ) if drv )) - all_deps = { k: list(closure_of(k, all_deps).intersection(job_set)) for k in job_set } builds_to_schedule = list(self.jobs) build_schedule_order = [] - sorter = graphlib.TopologicalSorter(all_deps) + sorter = graphlib.TopologicalSorter(self.all_deps) for item in sorter.static_order(): i = 0 while i < len(builds_to_schedule): @@ -191,7 +180,7 @@ class BuildTrigger(steps.BuildStep): print('Scheduling..') schedule_now = [] for build in list(build_schedule_order): - if all_deps.get(build.get("drvPath"), []) == []: + if self.all_deps.get(build.get("drvPath"), []) == []: build_schedule_order.remove(build) schedule_now.append(build) if len(schedule_now) == 0: @@ -238,7 +227,7 @@ class BuildTrigger(steps.BuildStep): while True: old_paths = list(failed_paths) for build in list(build_schedule_order): - deps = all_deps.get(build.get("drvPath"), []) + deps = self.all_deps.get(build.get("drvPath"), []) for path in old_paths: if path in deps: failed_checks.append(build) @@ -252,9 +241,9 @@ class BuildTrigger(steps.BuildStep): print(' Removed jobs: ' + ', '.join(removed)) all_results = worst_status(result, all_results) print(f' New result: {util.Results[all_results].upper()}') - for dep in all_deps: - if job.get("drvPath") in all_deps[dep]: - all_deps[dep].remove(job.get("drvPath")) + for dep in self.all_deps: + if job.get("drvPath") in self.all_deps[dep]: + self.all_deps[dep].remove(job.get("drvPath")) print('Done!') if self.ended: return util.CANCELLED @@ -328,6 +317,17 @@ class NixEvalCommand(buildstep.ShellMixin, steps.BuildStep): except json.JSONDecodeError as e: msg = f"Failed to parse `nix derivation show` output for {cmd.command}" raise BuildbotNixError(msg) from e + all_deps = dict() + for drv, info in drv_info.items(): + all_deps[drv] = set(info.get("inputDrvs").keys()) + def closure_of(key, deps): + r, size = set([key]), 0 + while len(r) != size: + size = len(r) + r.update(*[ deps[k] for k in r ]) + return r.difference([key]) + job_set = set(( drv for drv in ( job.get("drvPath") for job in filtered_jobs ) if drv )) + all_deps = { k: list(closure_of(k, all_deps).intersection(job_set)) for k in job_set } self.build.addStepsAfterCurrentStep( [ @@ -335,7 +335,7 @@ class NixEvalCommand(buildstep.ShellMixin, steps.BuildStep): builds_scheduler=f"lix-nix-build", name="build flake", jobs=filtered_jobs, - drv_info=drv_info, + all_deps=all_deps, ), ], ) From 5cdef7efb6967575a655b558288e059a3a638f6d Mon Sep 17 00:00:00 2001 From: eldritch horrors Date: Mon, 11 Mar 2024 14:44:09 +0100 Subject: [PATCH 14/14] fix status reporting to gerrit also adjust labels from split verified to single verified, split labels were only useful during the pre-ci hours --- buildbot_nix/__init__.py | 70 +++++++++++----------------------------- 1 file changed, 18 insertions(+), 52 deletions(-) diff --git a/buildbot_nix/__init__.py b/buildbot_nix/__init__.py index 90dd947..1fe77d8 100644 --- a/buildbot_nix/__init__.py +++ b/buildbot_nix/__init__.py @@ -173,7 +173,7 @@ class BuildTrigger(steps.BuildStep): done = [] scheduled = [] - failed = [] + failed = {} all_results = SUCCESS ss_for_trigger = self.prepareSourcestampListForTrigger() while not self.ended and (len(build_schedule_order) > 0 or len(scheduled) > 0): @@ -234,10 +234,15 @@ class BuildTrigger(steps.BuildStep): failed_paths.append(build.get("drvPath")) build_schedule_order.remove(build) removed.append(build.get("attr")) + failed[build.get("attr")] = (f"dependency {job.get('attr')} failed", []) break if old_paths == failed_paths: break + failed[job.get("attr")] = ( + "failed", + [ getURLForBuildrequest(self.master, brid) for brid in brids.values() ] + ) print(' Removed jobs: ' + ', '.join(removed)) all_results = worst_status(result, all_results) print(f' New result: {util.Results[all_results].upper()}') @@ -245,6 +250,7 @@ class BuildTrigger(steps.BuildStep): if job.get("drvPath") in self.all_deps[dep]: self.all_deps[dep].remove(job.get("drvPath")) print('Done!') + build_props.setProperty("failed_builds", failed, "nix-eval-lix") if self.ended: return util.CANCELLED return all_results @@ -647,62 +653,22 @@ def gerritReviewCB(builderName, build, result, master, arg): if builderName != 'lix/nix-eval': return dict() - all_checks = {} - for step in build['steps']: - if step['name'] != 'build flake': - continue + failed = build['properties'].get('failed_builds', [{}])[0] - for url in step['urls']: - if url['name'].startswith('success: hydraJobs.'): - path = url['name'].split(' ')[1] - all_checks[path] = (True, url['url']) - elif url['name'].startswith('failure: hydraJobs.'): - path = url['name'].split(' ')[1] - all_checks[path] = (False, url['url']) - - collected_oses = {} - for check in all_checks: - arch = check.split('.')[-1] - if not arch.endswith('-linux') and not arch.endswith('-darwin'): - # Not an architecture-specific job, just a test - os = "test" - else: - os = arch.split('-')[1] - (success, failure) = collected_oses.get(os, (0, 0)) - if all_checks[check][0]: - success += 1 - else: - failure += 1 - - collected_oses[os] = (success, failure) - labels = {} - - if 'linux' in collected_oses: - (success, failure) = collected_oses['linux'] - if success > 0 and failure == 0: - labels['Verified-On-Linux'] = 1 - elif failure > 0: - labels['Verified-On-Linux'] = -1 - - if 'darwin' in collected_oses: - (success, failure) = collected_oses['darwin'] - if success > 0 and failure == 0: - labels['Verified-On-Darwin'] = 1 - elif failure > 0: - labels['Verified-On-Darwin'] = -1 + labels = { + 'Verified': -1 if failed else 1, + } message = "Buildbot finished compiling your patchset!\n" message += "The result is: %s\n" % util.Results[result].upper() if result != util.SUCCESS: - successful_checks = [] - failed_checks = [] - for check in all_checks: - if not all_checks[check][0]: - failed_checks.append(f" - {check} (see {all_checks[check][1]})") - - if len(failed_checks) > 0: - message += "Failed checks:\n" + "\n".join(failed_checks) + "\n" - + message += "\nFailed checks:\n" + for check, context in sorted(failed.items()): + how, urls = context + message += f" - {check}: {how}" + if urls: + message += f" (see {', '.join(urls)})" + message += "\n" if arg: message += "\nFor more details visit:\n"