diff --git a/buildbot_nix/__init__.py b/buildbot_nix/__init__.py index 2f50695..1fe77d8 100644 --- a/buildbot_nix/__init__.py +++ b/buildbot_nix/__init__.py @@ -2,7 +2,7 @@ import json import multiprocessing import os import sys -import uuid +import graphlib from collections import defaultdict from collections.abc import Generator from dataclasses import dataclass @@ -20,6 +20,12 @@ from buildbot.util import asyncSleep from buildbot.www.authz.endpointmatchers import EndpointMatcherBase, Match from buildbot.www.oauth2 import OAuth2Auth from buildbot.changes.gerritchangesource import GerritChangeSource +from buildbot.reporters.utils import getURLForBuild +from buildbot.reporters.utils import getURLForBuildrequest +from buildbot.process.buildstep import CANCELLED +from buildbot.process.buildstep import EXCEPTION +from buildbot.process.buildstep import SUCCESS +from buildbot.process.results import worst_status if TYPE_CHECKING: from buildbot.process.log import Log @@ -32,8 +38,6 @@ from .github_projects import ( slugify_project_name, ) -SKIPPED_BUILDER_NAME = "skipped-builds" - log = Logger() class LixSystemsOAuth2(OAuth2Auth): @@ -52,82 +56,206 @@ class GerritProject: # `project` field. name: str -class BuildTrigger(Trigger): - """Dynamic trigger that creates a build for every attribute.""" - +class BuildTrigger(steps.BuildStep): def __init__( self, builds_scheduler: str, - skipped_builds_scheduler: str, jobs: list[dict[str, Any]], + all_deps: dict[str, Any], **kwargs: Any, ) -> None: - if "name" not in kwargs: - kwargs["name"] = "trigger" self.jobs = jobs + self.all_deps = all_deps self.config = None self.builds_scheduler = builds_scheduler - self.skipped_builds_scheduler = skipped_builds_scheduler - Trigger.__init__( - self, - waitForFinish=True, - schedulerNames=[builds_scheduler, skipped_builds_scheduler], - haltOnFailure=True, - flunkOnFailure=True, - sourceStamps=[], - alwaysUseLatest=False, - updateSourceStamp=False, - **kwargs, - ) + self._result_list = [] + self.ended = False + self.waitForFinishDeferred = None + self.brids = [] + self.description = f"building {len(jobs)} hydra jobs" + super().__init__(**kwargs) - def createTriggerProperties(self, props: Any) -> Any: # noqa: N802 - return props + def interrupt(self, reason): + # We cancel the buildrequests, as the data api handles + # both cases: + # - build started: stop is sent, + # - build not created yet: related buildrequests are set to CANCELLED. + # Note that there is an identified race condition though (more details + # are available at buildbot.data.buildrequests). + for brid in self.brids: + self.master.data.control( + "cancel", {'reason': 'parent build was interrupted'}, ("buildrequests", brid) + ) + if self.running and not self.ended: + self.ended = True + # if we are interrupted because of a connection lost, we interrupt synchronously + if self.build.conn is None and self.waitForFinishDeferred is not None: + self.waitForFinishDeferred.cancel() - def getSchedulersAndProperties(self) -> list[tuple[str, Properties]]: # noqa: N802 + def getSchedulerByName(self, name): + schedulers = self.master.scheduler_manager.namedServices + if name not in schedulers: + raise ValueError(f"unknown triggered scheduler: {repr(name)}") + sch = schedulers[name] + # todo: check ITriggerableScheduler + return sch + + def schedule_one(self, build_props, job): + source = f"nix-eval-lix" + attr = job.get("attr", "eval-error") + name = attr + name = f"hydraJobs.{name}" + error = job.get("error") + props = Properties() + props.setProperty("virtual_builder_name", name, source) + props.setProperty("status_name", f"nix-build .#hydraJobs.{attr}", source) + props.setProperty("virtual_builder_tags", "", source) + + if error is not None: + props.setProperty("error", error, source) + return (self.builds_scheduler, props) + + drv_path = job.get("drvPath") + system = job.get("system") + out_path = job.get("outputs", {}).get("out") + + build_props.setProperty(f"{attr}-out_path", out_path, source) + build_props.setProperty(f"{attr}-drv_path", drv_path, source) + + props.setProperty("attr", attr, source) + props.setProperty("system", system, source) + props.setProperty("drv_path", drv_path, source) + props.setProperty("out_path", out_path, source) + props.setProperty("isCached", job.get("isCached"), source) + + return (self.builds_scheduler, props) + + @defer.inlineCallbacks + def _add_results(self, brid): + @defer.inlineCallbacks + def _is_buildrequest_complete(brid): + buildrequest = yield self.master.db.buildrequests.getBuildRequest(brid) + return buildrequest['complete'] + + event = ('buildrequests', str(brid), 'complete') + yield self.master.mq.waitUntilEvent(event, lambda: _is_buildrequest_complete(brid)) + builds = yield self.master.db.builds.getBuilds(buildrequestid=brid) + for build in builds: + self._result_list.append(build["results"]) + self.updateSummary() + + def prepareSourcestampListForTrigger(self): + ss_for_trigger = {} + objs_from_build = self.build.getAllSourceStamps() + for ss in objs_from_build: + ss_for_trigger[ss.codebase] = ss.asDict() + + trigger_values = [ss_for_trigger[k] for k in sorted(ss_for_trigger.keys())] + return trigger_values + + @defer.inlineCallbacks + def run(self): + self.running = True build_props = self.build.getProperties() source = f"nix-eval-lix" - triggered_schedulers = [] - for job in self.jobs: - attr = job.get("attr", "eval-error") - name = attr - name = f"hydraJobs.{name}" - error = job.get("error") - props = Properties() - props.setProperty("virtual_builder_name", name, source) - props.setProperty("status_name", f"nix-build .#hydraJobs.{attr}", source) - props.setProperty("virtual_builder_tags", "", source) + builds_to_schedule = list(self.jobs) + build_schedule_order = [] + sorter = graphlib.TopologicalSorter(self.all_deps) + for item in sorter.static_order(): + i = 0 + while i < len(builds_to_schedule): + if item == builds_to_schedule[i].get("drvPath"): + build_schedule_order.append(builds_to_schedule[i]) + del builds_to_schedule[i] + else: + i += 1 - if error is not None: - props.setProperty("error", error, source) - triggered_schedulers.append((self.skipped_builds_scheduler, props)) - continue + done = [] + scheduled = [] + failed = {} + all_results = SUCCESS + ss_for_trigger = self.prepareSourcestampListForTrigger() + while not self.ended and (len(build_schedule_order) > 0 or len(scheduled) > 0): + print('Scheduling..') + schedule_now = [] + for build in list(build_schedule_order): + if self.all_deps.get(build.get("drvPath"), []) == []: + build_schedule_order.remove(build) + schedule_now.append(build) + if len(schedule_now) == 0: + print(' No builds to schedule found.') + for job in schedule_now: + print(f" - {job.get('attr')}") + (scheduler, props) = self.schedule_one(build_props, job) + scheduler = self.getSchedulerByName(scheduler) - if job.get("isCached"): - triggered_schedulers.append((self.skipped_builds_scheduler, props)) - continue + idsDeferred, resultsDeferred = scheduler.trigger( + waited_for = True, + sourcestamps = ss_for_trigger, + set_props = props, + parent_buildid = self.build.buildid, + parent_relationship = "Triggered from", + ) - drv_path = job.get("drvPath") - system = job.get("system") - out_path = job.get("outputs", {}).get("out") + brids = {} + try: + _, brids = yield idsDeferred + except Exception as e: + yield self.addLogWithException(e) + results = EXCEPTION + scheduled.append((job, brids, resultsDeferred)) - build_props.setProperty(f"{attr}-out_path", out_path, source) - build_props.setProperty(f"{attr}-drv_path", drv_path, source) + for brid in brids.values(): + url = getURLForBuildrequest(self.master, brid) + yield self.addURL(f"{scheduler.name} #{brid}", url) + self._add_results(brid) + self.brids.append(brid) + print('Waiting..') + wait_for_next = defer.DeferredList([results for _, _, results in scheduled], fireOnOneCallback = True, fireOnOneErrback=True) + self.waitForFinishDeferred = wait_for_next + results, index = yield wait_for_next + job, brids, _ = scheduled[index] + done.append((job, brids, results)) + del scheduled[index] + result = results[0] + print(f' Found finished build {job.get("attr")}, result {util.Results[result].upper()}') + if result != SUCCESS: + failed_checks = [] + failed_paths = [job.get('drvPath')] + removed = [] + while True: + old_paths = list(failed_paths) + for build in list(build_schedule_order): + deps = self.all_deps.get(build.get("drvPath"), []) + for path in old_paths: + if path in deps: + failed_checks.append(build) + failed_paths.append(build.get("drvPath")) + build_schedule_order.remove(build) + removed.append(build.get("attr")) + failed[build.get("attr")] = (f"dependency {job.get('attr')} failed", []) - props.setProperty("attr", attr, source) - props.setProperty("system", system, source) - props.setProperty("drv_path", drv_path, source) - props.setProperty("out_path", out_path, source) - # we use this to identify builds when running a retry - props.setProperty("build_uuid", str(uuid.uuid4()), source) - - triggered_schedulers.append((self.builds_scheduler, props)) - return triggered_schedulers + break + if old_paths == failed_paths: + break + failed[job.get("attr")] = ( + "failed", + [ getURLForBuildrequest(self.master, brid) for brid in brids.values() ] + ) + print(' Removed jobs: ' + ', '.join(removed)) + all_results = worst_status(result, all_results) + print(f' New result: {util.Results[all_results].upper()}') + for dep in self.all_deps: + if job.get("drvPath") in self.all_deps[dep]: + self.all_deps[dep].remove(job.get("drvPath")) + print('Done!') + build_props.setProperty("failed_builds", failed, "nix-eval-lix") + if self.ended: + return util.CANCELLED + return all_results def getCurrentSummary(self) -> dict[str, str]: # noqa: N802 - """The original build trigger will the generic builder name `nix-build` in this case, which is not helpful""" - if not self.triggeredNames: - return {"step": "running"} summary = [] if self._result_list: for status in ALL_RESULTS: @@ -178,13 +306,42 @@ class NixEvalCommand(buildstep.ShellMixin, steps.BuildStep): if not system or system in self.supported_systems: # report eval errors filtered_jobs.append(job) + drv_show_log: Log = yield self.getLog("stdio") + drv_show_log.addStdout(f"getting derivation infos\n") + cmd = yield self.makeRemoteShellCommand( + stdioLogName=None, + collectStdout=True, + command=( + ["nix", "derivation", "show", "--recursive"] + + [ drv for drv in (job.get("drvPath") for job in filtered_jobs) if drv ] + ), + ) + yield self.runCommand(cmd) + drv_show_log.addStdout(f"done\n") + try: + drv_info = json.loads(cmd.stdout) + except json.JSONDecodeError as e: + msg = f"Failed to parse `nix derivation show` output for {cmd.command}" + raise BuildbotNixError(msg) from e + all_deps = dict() + for drv, info in drv_info.items(): + all_deps[drv] = set(info.get("inputDrvs").keys()) + def closure_of(key, deps): + r, size = set([key]), 0 + while len(r) != size: + size = len(r) + r.update(*[ deps[k] for k in r ]) + return r.difference([key]) + job_set = set(( drv for drv in ( job.get("drvPath") for job in filtered_jobs ) if drv )) + all_deps = { k: list(closure_of(k, all_deps).intersection(job_set)) for k in job_set } + self.build.addStepsAfterCurrentStep( [ BuildTrigger( builds_scheduler=f"lix-nix-build", - skipped_builds_scheduler=f"lix-nix-skipped-build", name="build flake", jobs=filtered_jobs, + all_deps=all_deps, ), ], ) @@ -192,37 +349,6 @@ class NixEvalCommand(buildstep.ShellMixin, steps.BuildStep): return result -# FIXME this leaks memory... but probably not enough that we care -class RetryCounter: - def __init__(self, retries: int) -> None: - self.builds: dict[uuid.UUID, int] = defaultdict(lambda: retries) - - def retry_build(self, build_id: uuid.UUID) -> int: - retries = self.builds[build_id] - if retries > 1: - self.builds[build_id] = retries - 1 - return retries - return 0 - - -# For now we limit this to two. Often this allows us to make the error log -# shorter because we won't see the logs for all previous succeeded builds -RETRY_COUNTER = RetryCounter(retries=2) - - -class EvalErrorStep(steps.BuildStep): - """Shows the error message of a failed evaluation.""" - - @defer.inlineCallbacks - def run(self) -> Generator[Any, object, Any]: - error = self.getProperty("error") - attr = self.getProperty("attr") - # show eval error - error_log: Log = yield self.addLog("nix_error") - error_log.addStderr(f"{attr} failed to evaluate:\n{error}") - return util.FAILURE - - class NixBuildCommand(buildstep.ShellMixin, steps.BuildStep): """Builds a nix derivation.""" @@ -232,16 +358,25 @@ class NixBuildCommand(buildstep.ShellMixin, steps.BuildStep): @defer.inlineCallbacks def run(self) -> Generator[Any, object, Any]: + if error := self.getProperty("error"): + attr = self.getProperty("attr") + # show eval error + error_log: Log = yield self.addLog("nix_error") + error_log.addStderr(f"{attr} failed to evaluate:\n{error}") + return util.FAILURE + + if self.getProperty("isCached"): + yield self.addCompleteLog( + "cached outpath from previous builds", + # buildbot apparently hides the first line in the ui? + f'\n{self.getProperty("out_path")}\n') + return util.SKIPPED + # run `nix build` cmd: remotecommand.RemoteCommand = yield self.makeRemoteShellCommand() yield self.runCommand(cmd) - res = cmd.results() - if res == util.FAILURE: - retries = RETRY_COUNTER.retry_build(self.getProperty("build_uuid")) - if retries > 0: - return util.RETRY - return res + return cmd.results() class UpdateBuildOutput(steps.BuildStep): @@ -269,32 +404,6 @@ class UpdateBuildOutput(steps.BuildStep): return util.SUCCESS -# The builtin retry mechanism doesn't seem to work for github, -# since github is sometimes not delivering the pull request ref fast enough. -class GitWithRetry(steps.Git): - @defer.inlineCallbacks - def run_vc( - self, - branch: str, - revision: str, - patch: str, - ) -> Generator[Any, object, Any]: - retry_counter = 0 - while True: - try: - res = yield super().run_vc(branch, revision, patch) - except Exception as e: # noqa: BLE001 - retry_counter += 1 - if retry_counter == 3: - msg = "Failed to clone" - raise BuildbotNixError(msg) from e - log: Log = yield self.addLog("log") - yield log.addStderr(f"Retrying git clone (error: {e})\n") - yield asyncSleep(2 << retry_counter) # 2, 4, 8 - else: - return res - - def nix_eval_config( project: GerritProject, gerrit_private_key: str, @@ -318,8 +427,12 @@ def nix_eval_config( sshPrivateKey=gerrit_private_key ), ) + # use one gcroots directory per worker. this should be scoped to the largest unique resource + # in charge of builds (ie, buildnumber is too narrow) to not litter the system with permanent + # gcroots in case of worker restarts. + # TODO perhaps we should clean the entire /drvs/ directory up too during startup. drv_gcroots_dir = util.Interpolate( - "/nix/var/nix/gcroots/per-user/buildbot-worker/%(prop:project)s/drvs/", + "/nix/var/nix/gcroots/per-user/buildbot-worker/%(prop:project)s/drvs/%(prop:workername)s/", ) factory.addStep( @@ -356,6 +469,7 @@ def nix_eval_config( "-rf", drv_gcroots_dir, ], + alwaysRun=True, ), ) @@ -368,25 +482,9 @@ def nix_eval_config( ) -@dataclass -class CachixConfig: - name: str - signing_key_secret_name: str | None = None - auth_token_secret_name: str | None = None - - def cachix_env(self) -> dict[str, str]: - env = {} - if self.signing_key_secret_name is not None: - env["CACHIX_SIGNING_KEY"] = util.Secret(self.signing_key_secret_name) - if self.auth_token_secret_name is not None: - env["CACHIX_AUTH_TOKEN"] = util.Secret(self.auth_token_secret_name) - return env - - def nix_build_config( project: GerritProject, worker_names: list[str], - cachix: CachixConfig | None = None, outputs_path: Path | None = None, ) -> util.BuilderConfig: """Builds one nix flake attribute.""" @@ -417,19 +515,6 @@ def nix_build_config( haltOnFailure=True, ), ) - if cachix: - factory.addStep( - steps.ShellCommand( - name="Upload cachix", - env=cachix.cachix_env(), - command=[ - "cachix", - "push", - cachix.name, - util.Interpolate("result-%(prop:attr)s"), - ], - ), - ) factory.addStep( steps.ShellCommand( @@ -471,38 +556,6 @@ def nix_build_config( ) -def nix_skipped_build_config( - project: GerritProject, - worker_names: list[str], -) -> util.BuilderConfig: - """Dummy builder that is triggered when a build is skipped.""" - factory = util.BuildFactory() - factory.addStep( - EvalErrorStep( - name="Nix evaluation", - doStepIf=lambda s: s.getProperty("error"), - hideStepIf=lambda _, s: not s.getProperty("error"), - ), - ) - - # This is just a dummy step showing the cached build - factory.addStep( - steps.BuildStep( - name="Nix build (cached)", - doStepIf=lambda _: False, - hideStepIf=lambda _, s: s.getProperty("error"), - ), - ) - return util.BuilderConfig( - name=f"{project.name}/nix-skipped-build", - project=project.name, - workernames=worker_names, - collapseRequests=False, - env={}, - factory=factory, - ) - - def read_secret_file(secret_name: str) -> str: directory = os.environ.get("CREDENTIALS_DIRECTORY") if directory is None: @@ -520,7 +573,6 @@ def config_for_project( nix_eval_worker_count: int, nix_eval_max_memory_size: int, eval_lock: util.MasterLock, - cachix: CachixConfig | None = None, outputs_path: Path | None = None, ) -> Project: config["projects"].append(Project(project.name)) @@ -540,11 +592,6 @@ def config_for_project( name=f"{project.name}-nix-build", builderNames=[f"{project.name}/nix-build"], ), - # this is triggered from `nix-eval` when the build is skipped - schedulers.Triggerable( - name=f"{project.name}-nix-skipped-build", - builderNames=[f"{project.name}/nix-skipped-build"], - ), # allow to manually trigger a nix-build schedulers.ForceScheduler( name=f"{project.name}-force", @@ -582,10 +629,8 @@ def config_for_project( nix_build_config( project, worker_names, - cachix=cachix, outputs_path=outputs_path, ), - nix_skipped_build_config(project, [SKIPPED_BUILDER_NAME]), ], ) @@ -608,62 +653,22 @@ def gerritReviewCB(builderName, build, result, master, arg): if builderName != 'lix/nix-eval': return dict() - all_checks = {} - for step in build['steps']: - if step['name'] != 'build flake': - continue + failed = build['properties'].get('failed_builds', [{}])[0] - for url in step['urls']: - if url['name'].startswith('success: hydraJobs.'): - path = url['name'].split(' ')[1] - all_checks[path] = (True, url['url']) - elif url['name'].startswith('failure: hydraJobs.'): - path = url['name'].split(' ')[1] - all_checks[path] = (False, url['url']) - - collected_oses = {} - for check in all_checks: - arch = check.split('.')[-1] - if not arch.endswith('-linux') and not arch.endswith('-darwin'): - # Not an architecture-specific job, just a test - os = "test" - else: - os = arch.split('-')[1] - (success, failure) = collected_oses.get(os, (0, 0)) - if all_checks[check][0]: - success += 1 - else: - failure += 1 - - collected_oses[os] = (success, failure) - labels = {} - - if 'linux' in collected_oses: - (success, failure) = collected_oses['linux'] - if success > 0 and failure == 0: - labels['Verified-On-Linux'] = 1 - elif failure > 0: - labels['Verified-On-Linux'] = -1 - - if 'darwin' in collected_oses: - (success, failure) = collected_oses['darwin'] - if success > 0 and failure == 0: - labels['Verified-On-Darwin'] = 1 - elif failure > 0: - labels['Verified-On-Darwin'] = -1 + labels = { + 'Verified': -1 if failed else 1, + } message = "Buildbot finished compiling your patchset!\n" message += "The result is: %s\n" % util.Results[result].upper() if result != util.SUCCESS: - successful_checks = [] - failed_checks = [] - for check in all_checks: - if not all_checks[check][0]: - failed_checks.append(f" - {check} (see {all_checks[check][1]})") - - if len(failed_checks) > 0: - message += "Failed checks:\n" + "\n".join(failed_checks) + "\n" - + message += "\nFailed checks:\n" + for check, context in sorted(failed.items()): + how, urls = context + message += f" - {check}: {how}" + if urls: + message += f" (see {', '.join(urls)})" + message += "\n" if arg: message += "\nFor more details visit:\n" @@ -724,7 +729,6 @@ class GerritNixConfigurator(ConfiguratorBase): nix_eval_worker_count: int | None, nix_eval_max_memory_size: int, nix_workers_secret_name: str = "buildbot-nix-workers", # noqa: S107 - cachix: CachixConfig | None = None, outputs_path: str | None = None, ) -> None: super().__init__() @@ -737,7 +741,6 @@ class GerritNixConfigurator(ConfiguratorBase): self.nix_supported_systems = nix_supported_systems self.gerrit_change_source = GerritChangeSource(gerrit_server, gerrit_user, gerritport=gerrit_port, identity_file=gerrit_sshkey_path) self.url = url - self.cachix = cachix if outputs_path is None: self.outputs_path = None else: @@ -769,12 +772,10 @@ class GerritNixConfigurator(ConfiguratorBase): self.nix_eval_worker_count or multiprocessing.cpu_count(), self.nix_eval_max_memory_size, eval_lock, - self.cachix, self.outputs_path, ) config["change_source"] = self.gerrit_change_source - config["workers"].append(worker.LocalWorker(SKIPPED_BUILDER_NAME)) config["services"].append( reporters.GerritStatusPush(self.gerrit_server, self.gerrit_user, port=2022, diff --git a/examples/default.nix b/examples/default.nix index f9fb42e..f59a01a 100644 --- a/examples/default.nix +++ b/examples/default.nix @@ -46,14 +46,6 @@ in # optional nix-eval-jobs settings # evalWorkerCount = 8; # limit number of concurrent evaluations # evalMaxMemorySize = "2048"; # limit memory usage per evaluation - - # optional cachix - #cachix = { - # name = "my-cachix"; - # # One of the following is required: - # signingKey = "/var/lib/secrets/cachix-key"; - # authToken = "/var/lib/secrets/cachix-token"; - #}; }; }) buildbot-nix.nixosModules.buildbot-master diff --git a/nix/coordinator.nix b/nix/coordinator.nix index 80a21f0..797d339 100644 --- a/nix/coordinator.nix +++ b/nix/coordinator.nix @@ -15,25 +15,6 @@ in default = "postgresql://@/buildbot"; description = "Postgresql database url"; }; - cachix = { - name = lib.mkOption { - type = lib.types.nullOr lib.types.str; - default = null; - description = "Cachix name"; - }; - - signingKeyFile = lib.mkOption { - type = lib.types.nullOr lib.types.path; - default = null; - description = "Cachix signing key"; - }; - - authTokenFile = lib.mkOption { - type = lib.types.nullOr lib.types.str; - default = null; - description = "Cachix auth token"; - }; - }; workersFile = lib.mkOption { type = lib.types.path; description = "File containing a list of nix workers"; @@ -88,13 +69,6 @@ in isSystemUser = true; }; - assertions = [ - { - assertion = cfg.cachix.name != null -> cfg.cachix.signingKeyFile != null || cfg.cachix.authTokenFile != null; - message = "if cachix.name is provided, then cachix.signingKeyFile and cachix.authTokenFile must be set"; - } - ]; - services.buildbot-master = { enable = true; @@ -106,7 +80,7 @@ in home = "/var/lib/buildbot"; extraImports = '' from datetime import timedelta - from buildbot_nix import GerritNixConfigurator, CachixConfig + from buildbot_nix import GerritNixConfigurator ''; configurators = [ '' @@ -150,11 +124,7 @@ in LoadCredential = [ "buildbot-nix-workers:${cfg.workersFile}" "buildbot-oauth2-secret:${cfg.oauth2SecretFile}" - ] - ++ lib.optional (cfg.cachix.signingKeyFile != null) - "cachix-signing-key:${builtins.toString cfg.cachix.signingKeyFile}" - ++ lib.optional (cfg.cachix.authTokenFile != null) - "cachix-auth-token:${builtins.toString cfg.cachix.authTokenFile}"; + ]; }; };