hydra-queue-runner: Improved scheduling

Each jobset now has a "scheduling share" that determines how much of the build farm's time it is entitled to. For instance, if a jobset has 100 shares and the total number of shares of all jobsets is 1000, it's entitled to 10% of the build farm's time. When there is a free build slot for a given system type, the queue runner will select the jobset that is furthest below its scheduling share over a certain time window (currently, the last day). Within that jobset, it will pick the build with the highest priority. So meta.schedulingPriority now only determines the order of builds within a jobset, not between jobsets. This makes it much easier to prioritise one jobset over another (e.g. nixpkgs:trunk over nixpkgs:stdenv).
2013-09-21 14:47:52 +00:00 · 2013-09-21 14:47:52 +00:00 · 4ed877360b
commit 4ed877360b
parent 7efe793ee6
10 changed files with 164 additions and 66 deletions
--- a/src/lib/Hydra/Controller/Jobset.pm
+++ b/src/lib/Hydra/Controller/Jobset.pm
@ -50,7 +50,9 @@ sub jobset_GET {

    $c->stash->{evals} = getEvals($self, $c, scalar $c->stash->{jobset}->jobsetevals, 0, 10);

-    ($c->stash->{latestEval}) = $c->stash->{jobset}->jobsetevals->search({}, { rows => 1, order_by => ["id desc"] });
+    $c->stash->{latestEval} = $c->stash->{jobset}->jobsetevals->search({}, { rows => 1, order_by => ["id desc"] })->single;
+
+    $c->stash->{totalShares} = getTotalShares($c->model('DB')->schema);

    $self->status_ok(
        $c,
@ -161,22 +163,22 @@ sub jobs_tab : Chained('jobsetChain') PathPart('jobs-tab') Args(0) {
        my @builds = $eval->builds->search(
            { job => { ilike => $filter } },
            { columns => ['id', 'job', 'finished', 'buildstatus'] });
-	foreach my $b (@builds) {
-	    my $jobName = $b->get_column('job');
-	    $evals->{$eval->id}->{$jobName} = 
-	        { id => $b->id, finished => $b->finished, buildstatus => $b->buildstatus };
-	    $jobs{$jobName} = 1;
-	    $nrBuilds++;
-	}
-	last if $nrBuilds >= 10000;
+        foreach my $b (@builds) {
+            my $jobName = $b->get_column('job');
+            $evals->{$eval->id}->{$jobName} = 
+                { id => $b->id, finished => $b->finished, buildstatus => $b->buildstatus };
+            $jobs{$jobName} = 1;
+            $nrBuilds++;
+        }
+        last if $nrBuilds >= 10000;
    }

    if ($c->request->params->{showInactive}) {
-	$c->stash->{showInactive} = 1;
-	foreach my $job ($c->stash->{jobset}->jobs->search({ name => { ilike => $filter } })) {
-	    next if defined $jobs{$job->name};
-	    $c->stash->{inactiveJobs}->{$job->name} = $jobs{$job->name} = 1;
-	}
+        $c->stash->{showInactive} = 1;
+        foreach my $job ($c->stash->{jobset}->jobs->search({ name => { ilike => $filter } })) {
+            next if defined $jobs{$job->name};
+            $c->stash->{inactiveJobs}->{$job->name} = $jobs{$job->name} = 1;
+        }
    }

    $c->stash->{evals} = $evals;
@ -209,6 +211,7 @@ sub edit : Chained('jobsetChain') PathPart Args(0) {

    $c->stash->{template} = 'edit-jobset.tt';
    $c->stash->{edit} = 1;
+    $c->stash->{totalShares} = getTotalShares($c->model('DB')->schema);
 }


@ -287,6 +290,7 @@ sub updateJobset {
        , keepnr => int(trim($c->stash->{params}->{keepnr}))
        , checkinterval => int(trim($c->stash->{params}->{checkinterval}))
        , triggertime => $enabled ? $jobset->triggertime // time() : undef
+        , schedulingshares => int($c->stash->{params}->{schedulingshares})
        });

    # Process the inputs of this jobset.
--- a/src/lib/Hydra/Controller/Project.pm
+++ b/src/lib/Hydra/Controller/Project.pm
@ -201,6 +201,7 @@ sub create_jobset : Chained('projectChain') PathPart('create-jobset') Args(0) {
    $c->stash->{template} = 'edit-jobset.tt';
    $c->stash->{create} = 1;
    $c->stash->{edit} = 1;
+    $c->stash->{totalShares} = getTotalShares($c->model('DB')->schema);
 }


--- a/src/lib/Hydra/Helper/Nix.pm
+++ b/src/lib/Hydra/Helper/Nix.pm
@ -20,7 +20,8 @@ our @EXPORT = qw(
    getMainOutput
    getEvals getMachines
    pathIsInsidePrefix
-    captureStdoutStderr run grab);
+    captureStdoutStderr run grab
+    getTotalShares);


 sub getHydraHome {
@ -533,4 +534,12 @@ sub grab {
 }


+sub getTotalShares {
+    my ($db) = @_;
+    return $db->resultset('Jobsets')->search(
+        { 'project.enabled' => 1, 'me.enabled' => 1 },
+        { join => 'project', select => { sum => 'schedulingshares' }, as => 'sum' })->single->get_column('sum');
+}
+
+
 1;
--- a/src/lib/Hydra/Schema/CachedDarcsInputs.pm
+++ b/src/lib/Hydra/Schema/CachedDarcsInputs.pm
@ -15,6 +15,18 @@ use warnings;

 use base 'DBIx::Class::Core';

+=head1 COMPONENTS LOADED
+
+=over 4
+
+=item * L<Hydra::Component::ToJSON>
+
+=back
+
+=cut
+
+__PACKAGE__->load_components("+Hydra::Component::ToJSON");
+
 =head1 TABLE: C<CachedDarcsInputs>

 =cut
@ -28,11 +40,6 @@ __PACKAGE__->table("CachedDarcsInputs");
  data_type: 'text'
  is_nullable: 0

-=head2 branch
-
-  data_type: 'text'
-  is_nullable: 0
-
 =head2 revision

  data_type: 'text'
@ -48,6 +55,11 @@ __PACKAGE__->table("CachedDarcsInputs");
  data_type: 'text'
  is_nullable: 0

+=head2 revcount
+
+  data_type: 'integer'
+  is_nullable: 0
+
 =cut

 __PACKAGE__->add_columns(
@ -55,12 +67,12 @@ __PACKAGE__->add_columns(
  { data_type => "text", is_nullable => 0 },
  "revision",
  { data_type => "text", is_nullable => 0 },
-  "revcount",
-  { data_type => "integer", is_nullable => 0 },
  "sha256hash",
  { data_type => "text", is_nullable => 0 },
  "storepath",
  { data_type => "text", is_nullable => 0 },
+  "revcount",
+  { data_type => "integer", is_nullable => 0 },
 );

 =head1 PRIMARY KEY
@ -69,8 +81,6 @@ __PACKAGE__->add_columns(

 =item * L</uri>

-=item * L</branch>
-
 =item * L</revision>

 =back
@ -80,7 +90,9 @@ __PACKAGE__->add_columns(
 __PACKAGE__->set_primary_key("uri", "revision");


-# Created by DBIx::Class::Schema::Loader v0.07014 @ 2011-12-05 14:15:43
-# DO NOT MODIFY THIS OR ANYTHING ABOVE! md5sum:fx3yosWMmJ+MnvL/dSWtFA
+# Created by DBIx::Class::Schema::Loader v0.07033 @ 2013-09-20 11:08:50
+# DO NOT MODIFY THIS OR ANYTHING ABOVE! md5sum:Yl1slt3SAizijgu0KUTn0A

+
+# You can replace this text with custom code or comments, and it will be preserved on regeneration
 1;
--- a/src/lib/Hydra/Schema/Jobsets.pm
+++ b/src/lib/Hydra/Schema/Jobsets.pm
@ -118,6 +118,12 @@ __PACKAGE__->table("Jobsets");
  default_value: 300
  is_nullable: 0

+=head2 schedulingshares
+
+  data_type: 'integer'
+  default_value: 100
+  is_nullable: 0
+
 =cut

 __PACKAGE__->add_columns(
@ -151,6 +157,8 @@ __PACKAGE__->add_columns(
  { data_type => "integer", default_value => 3, is_nullable => 0 },
  "checkinterval",
  { data_type => "integer", default_value => 300, is_nullable => 0 },
+  "schedulingshares",
+  { data_type => "integer", default_value => 100, is_nullable => 0 },
 );

 =head1 PRIMARY KEY
@ -272,7 +280,7 @@ __PACKAGE__->belongs_to(
 );


-# Created by DBIx::Class::Schema::Loader v0.07033 @ 2013-06-13 01:54:50
-# DO NOT MODIFY THIS OR ANYTHING ABOVE! md5sum:tsGR8MhZRIUeNwpcVczMUw
+# Created by DBIx::Class::Schema::Loader v0.07033 @ 2013-09-20 12:15:23
+# DO NOT MODIFY THIS OR ANYTHING ABOVE! md5sum:pD6tGW0Ob3fuA1p0uQnBWw

 1;
--- a/src/root/edit-jobset.tt
+++ b/src/root/edit-jobset.tt
@ -1,5 +1,6 @@
 [% WRAPPER layout.tt title=(create ? "Create jobset in project $project.name" : "Editing jobset $project.name:$jobset.name") %]
 [% PROCESS common.tt %]
+[% USE format %]

 [% BLOCK renderJobsetInputAlt %]
  <button type="button" class="btn btn-warning" onclick='$(this).parents(".inputalt").remove()'><i class="icon-trash icon-white"></i></button>
@ -94,6 +95,18 @@
      </div>
    </div>

+    <div class="control-group">
+      <label class="control-label">Scheduling shares</label>
+      <div class="controls">
+        <div class="input-append">
+          <input type="number" class="span3" name="schedulingshares" [% HTML.attributes(value => jobset.schedulingshares) %]/>
+        </div>
+        [% IF totalShares %]
+          <span class="help-inline">([% f = format("%.2f"); f(jobset.schedulingshares / totalShares * 100) %]% out of [% totalShares %] shares)</span>
+        [% END %]
+      </div>
+    </div>
+
    <div class="control-group">
      <div class="controls">
        <label class="checkbox">
--- a/src/root/jobset.tt
+++ b/src/root/jobset.tt
@ -1,5 +1,6 @@
 [% WRAPPER layout.tt title="Jobset $project.name:$jobset.name" %]
 [% PROCESS common.tt %]
+[% USE format %]


 [% BLOCK renderJobsetInput %]
@ -121,6 +122,10 @@
        <th>Check interval:</th>
        <td>[% jobset.checkinterval || "<em>disabled</em>"  %]</td>
      </tr>
+      <tr>
+        <th>Scheduling shares:</th>
+        <td>[% jobset.schedulingshares %] [% IF totalShares %] ([% f = format("%.2f"); f(jobset.schedulingshares / totalShares * 100) %]% out of [% totalShares %] shares)[% END %]</td>
+      </tr>
      <tr>
        <th>Enable email notification:</th>
        <td>[% jobset.enableemail ? "Yes" : "No" %]</td>
--- a/src/script/hydra-queue-runner
+++ b/src/script/hydra-queue-runner
@ -28,7 +28,7 @@ sub unlockDeadBuilds {
            my $pid = $build->locker;
            my $unlock = 0;
            if ($pid == $$) {
-                if (!defined $lastTime || $build->starttime < $lastTime - 300) {
+                if (!defined $lastTime || $build->starttime < $lastTime - 600) {
                    $unlock = 1;
                }
            } elsif (kill(0, $pid) != 1) { # see if we can signal the process
@ -70,27 +70,29 @@ sub checkBuilds {
    my %maxConcurrent;

    foreach my $machineName (keys %{$machines}) {
-        foreach my $system (${$machines}{$machineName}{'systemTypes'}) {
+        foreach my $system (@{${$machines}{$machineName}{'systemTypes'}}) {
            $maxConcurrent{$system} = (${$machines}{$machineName}{'maxJobs'} or 0) + ($maxConcurrent{$system} or 0)
        }
    }

    txn_do($db, sub {

-        # Cache scheduled by derivation path to speed up
+        # Cache scheduled builds by derivation path to speed up
        # findBuildDependencyInQueue.
        my $buildsByDrv = {};
        $buildsByDrv->{$_->drvpath} = $_->id
-            foreach $db->resultset('Builds')->search({ finished => 0, enabled => 1 }, { join => ['project'] });
+            foreach $db->resultset('Builds')->search({ finished => 0 }, { join => ['project'] });

        # Get the system types for the runnable builds.
        my @systemTypes = $db->resultset('Builds')->search(
-            { finished => 0, busy => 0, enabled => 1 },
+            { finished => 0, busy => 0 },
            { join => ['project'], select => ['system'], as => ['system'], distinct => 1 });

+        # Get the total number of scheduling shares.
+        my $totalShares = getTotalShares($db);
+
        # For each system type, select up to the maximum number of
-        # concurrent build for that system type.  Choose the highest
-        # priority builds first, then the oldest builds.
+        # concurrent build for that system type.
        foreach my $system (@systemTypes) {
            # How many builds are already currently executing for this
            # system type?
@ -101,42 +103,84 @@ sub checkBuilds {
            my $max = defined $systemTypeInfo ? $systemTypeInfo->maxconcurrent : $maxConcurrent{$system->system} // 2;

            my $extraAllowed = $max - $nrActive;
-            $extraAllowed = 0 if $extraAllowed < 0;
+            next if $extraAllowed <= 0;

-            # Select the highest-priority builds to start.
-            my @builds = $extraAllowed == 0 ? () : $db->resultset('Builds')->search(
-                { finished => 0, busy => 0, system => $system->system, enabled => 1 },
-                { join => ['project'], order_by => ["priority DESC", "id"] });
+            print STDERR "starting at most $extraAllowed builds for system ${\$system->system}\n";

-            my $started = 0;
-            foreach my $build (@builds) {
-                # Find a dependency of $build that has no queued
-                # dependencies itself.  This isn't strictly necessary,
-                # but it ensures that Nix builds are done as part of
-                # their corresponding Hydra builds, rather than as a
-                # dependency of some other Hydra build.
-                while (my $dep = findBuildDependencyInQueue($buildsByDrv, $build)) {
-                    $build = $dep;
+            j: while ($extraAllowed-- > 0) {
+
+                my @runnableJobsets = $db->resultset('Builds')->search(
+                    { finished => 0, busy => 0, system => $system->system },
+                    { select => ['project', 'jobset'], distinct => 1 });
+
+                next if @runnableJobsets == 0;
+
+                my $windowSize = 24 * 3600;
+                my $totalWindowSize = $windowSize * $max;
+
+                my @res;
+
+                foreach my $b (@runnableJobsets) {
+                    my $jobset = $db->resultset('Jobsets')->find($b->get_column('project'), $b->get_column('jobset')) or die;
+
+                    my $duration = $jobset->builds->search(
+                        { },
+                        { where => \ ("(finished = 0 or (me.stoptime >= " . (time() - $windowSize) . "))")
+                        , join => 'buildsteps'
+                        , select => \ "sum(coalesce(buildsteps.stoptime, ${\time}) - buildsteps.starttime)"
+                        , as => "sum" })->single->get_column("sum") // 0;
+
+                    # Add a 30s penalty for each started build.  This
+                    # is to account for jobsets that have running
+                    # builds but no build steps yet.
+                    $duration += $jobset->builds->search({ finished => 0, busy => 1 })->count * 30;
+
+                    my $share = $jobset->schedulingshares;
+                    my $delta = ($share / $totalShares) - ($duration / $totalWindowSize);
+
+                    #printf STDERR "%s:%s: %d s, %.3f%%, allowance = %.3f%%\n", $jobset->get_column('project'), $jobset->name, $duration, $duration / $totalWindowSize, $delta;
+
+                    push @res, { jobset => $jobset, delta => $delta };
                }
-                next if $build->busy;

-                my $logfile = getcwd . "/logs/" . $build->id;
-                mkdir(dirname $logfile);
-                unlink($logfile);
-                $build->update(
-                    { busy => 1
-                    , locker => $$
-                    , logfile => $logfile
-                    , starttime => time()
-                    });
-                push @buildsStarted, $build;
+                foreach my $r (sort { $b->{delta} <=> $a->{delta} } @res) {
+                    my $jobset = $r->{jobset};
+                    #print STDERR "selected ", $jobset->get_column('project'), ':', $jobset->name, "\n";

-                last if ++$started >= $extraAllowed;
-            }
+                    # Select the highest-priority build for this jobset.
+                    my @builds = $jobset->builds->search(
+                        { finished => 0, busy => 0, system => $system->system },
+                        { order_by => ["priority DESC", "id"] });

-            if ($started > 0) {
-                print STDERR "system type `", $system->system,
-                    "': $nrActive active, $max allowed, started $started builds\n";
+                    foreach my $build (@builds) {
+                        # Find a dependency of $build that has no queued
+                        # dependencies itself.  This isn't strictly necessary,
+                        # but it ensures that Nix builds are done as part of
+                        # their corresponding Hydra builds, rather than as a
+                        # dependency of some other Hydra build.
+                        while (my $dep = findBuildDependencyInQueue($buildsByDrv, $build)) {
+                            $build = $dep;
+                        }
+                        next if $build->busy;
+
+                        printf STDERR "starting build %d (%s:%s:%s) on %s (jobset allowance = %.3f%%)\n", 
+                            $build->id, $build->project->name, $build->jobset->name, $build->job->name, $build->system, $r->{delta};
+
+                        my $logfile = getcwd . "/logs/" . $build->id;
+                        mkdir(dirname $logfile);
+                        unlink($logfile);
+                        $build->update(
+                            { busy => 1
+                            , locker => $$
+                            , logfile => $logfile
+                            , starttime => time()
+                            });
+                        push @buildsStarted, $build;
+                        next j;
+                    }
+                }
+
+                last; # nothing found, give up on this system type
            }
        }
    });
@ -145,7 +189,6 @@ sub checkBuilds {
    # outside the transaction in case it aborts or something.
    foreach my $build (@buildsStarted) {
        my $id = $build->id;
-        print "starting build $id (", $build->project->name, ":", $build->jobset->name, ':', $build->job->name, ") on ", $build->system, "\n";
        eval {
            my $logfile = $build->logfile;
            my $child = fork();
--- a/src/sql/hydra.sql
+++ b/src/sql/hydra.sql
@ -61,6 +61,7 @@ create table Jobsets (
    emailOverride text not null,
    keepnr        integer not null default 3,
    checkInterval integer not null default 300, -- minimum time in seconds between polls (0 = disable polling)
+    schedulingShares integer not null default 100,
    primary key   (project, name),
    foreign key   (project) references Projects(name) on delete cascade on update cascade
 #ifdef SQLITE
--- a/src/sql/upgrade-21.sql
+++ b/src/sql/upgrade-21.sql
@ -0,0 +1,2 @@
+alter table Jobsets
+    add column schedulingShares integer not null default 100;