From f07b3b7efc252be7e84dc2f0042bf8a5a45e99b6 Mon Sep 17 00:00:00 2001 From: Stephen Connolly Date: Fri, 2 Oct 2015 09:19:40 +0100 Subject: [PATCH] Merge pull request #1815 from varmenise/OSS-192 [FIXED JENKINS-30084] FlyWeightTasks tied to a label will not cause node provisioning and will be blocked forever. When a flyweighttask is limited to run on a specific label (e.g. matrix project set restrict where this project can run) if there are no nodes with that label available when it enters the queue then it will immediately move to blocked. As it is blocked the Node provisioner will not attempt to create any slaves, so the project will sit in the queue forever (or until some other project allocates a slave with the correct label). (cherry picked from commit 5a5f9c5d10f04cf47e37829979dfb2f4e57972af) --- core/src/main/java/hudson/model/Queue.java | 159 ++++++++++++------ .../src/test/java/hudson/model/QueueTest.java | 121 +++++++++++++ .../java/hudson/slaves/DummyCloudImpl.java | 31 +++- 3 files changed, 260 insertions(+), 51 deletions(-) diff --git a/core/src/main/java/hudson/model/Queue.java b/core/src/main/java/hudson/model/Queue.java index 12d3078927..c265e477df 100644 --- a/core/src/main/java/hudson/model/Queue.java +++ b/core/src/main/java/hudson/model/Queue.java @@ -876,6 +876,13 @@ public class Queue extends ResourceController implements Saveable { return new ArrayList(snapshot.pendings); } + /** + * Gets the snapshot of all {@link BlockedItem}s. + */ + protected List getBlockedItems() { + return new ArrayList(snapshot.blockedProjects); + } + /** * Returns the snapshot of all {@link LeftItem}s. * @@ -1471,43 +1478,54 @@ public class Queue extends ResourceController implements Saveable { continue; } - List candidates = new ArrayList(parked.size()); - for (JobOffer j : parked.values()) - if (j.canTake(p)) - candidates.add(j); - - MappingWorksheet ws = new MappingWorksheet(p, candidates); - Mapping m = loadBalancer.map(p.task, ws); - if (m == null) { - // if we couldn't find the executor that fits, - // just leave it in the buildables list and - // check if we can execute other projects - LOGGER.log(Level.FINER, "Failed to map {0} to executors. candidates={1} parked={2}", - new Object[]{p, candidates, parked.values()}); - continue; - } + if (p.task instanceof FlyweightTask) { + Runnable r = makeFlyWeightTaskBuildable(new BuildableItem(p)); + if (r != null) { + p.leave(this); + r.run(); + updateSnapshot(); + } + } else { + + List candidates = new ArrayList(parked.size()); + for (JobOffer j : parked.values()) + if (j.canTake(p)) + candidates.add(j); + + MappingWorksheet ws = new MappingWorksheet(p, candidates); + Mapping m = loadBalancer.map(p.task, ws); + if (m == null) { + // if we couldn't find the executor that fits, + // just leave it in the buildables list and + // check if we can execute other projects + LOGGER.log(Level.FINER, "Failed to map {0} to executors. candidates={1} parked={2}", + new Object[]{p, candidates, parked.values()}); + continue; + } + + // found a matching executor. use it. + WorkUnitContext wuc = new WorkUnitContext(p); + m.execute(wuc); - // found a matching executor. use it. - WorkUnitContext wuc = new WorkUnitContext(p); - m.execute(wuc); - - p.leave(this); - if (!wuc.getWorkUnits().isEmpty()) - makePending(p); - else - LOGGER.log(Level.FINE, "BuildableItem {0} with empty work units!?", p); - - // Ensure that identification of blocked tasks is using the live state: JENKINS-27708 & JENKINS-27871 - // The creation of a snapshot itself should be relatively cheap given the expected rate of - // job execution. You probably would need 100's of jobs starting execution every iteration - // of maintain() before this could even start to become an issue and likely the calculation - // of isBuildBlocked(p) will become a bottleneck before updateSnapshot() will. Additionally - // since the snapshot itself only ever has at most one reference originating outside of the stack - // it should remain in the eden space and thus be cheap to GC. - // See https://issues.jenkins-ci.org/browse/JENKINS-27708?focusedCommentId=225819&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-225819 - // or https://issues.jenkins-ci.org/browse/JENKINS-27708?focusedCommentId=225906&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-225906 - // for alternative fixes of this issue. - updateSnapshot(); + p.leave(this); + if (!wuc.getWorkUnits().isEmpty()) { + makePending(p); + } + else + LOGGER.log(Level.FINE, "BuildableItem {0} with empty work units!?", p); + + // Ensure that identification of blocked tasks is using the live state: JENKINS-27708 & JENKINS-27871 + // The creation of a snapshot itself should be relatively cheap given the expected rate of + // job execution. You probably would need 100's of jobs starting execution every iteration + // of maintain() before this could even start to become an issue and likely the calculation + // of isBuildBlocked(p) will become a bottleneck before updateSnapshot() will. Additionally + // since the snapshot itself only ever has at most one reference originating outside of the stack + // it should remain in the eden space and thus be cheap to GC. + // See https://issues.jenkins-ci.org/browse/JENKINS-27708?focusedCommentId=225819&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-225819 + // or https://issues.jenkins-ci.org/browse/JENKINS-27708?focusedCommentId=225906&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-225906 + // for alternative fixes of this issue. + updateSnapshot(); + } } } finally { updateSnapshot(); } } finally { lock.unlock(); @@ -1522,8 +1540,38 @@ public class Queue extends ResourceController implements Saveable { private @CheckForNull Runnable makeBuildable(final BuildableItem p) { if (p.task instanceof FlyweightTask) { if (!isBlockedByShutdown(p.task)) { + + Runnable runnable = makeFlyWeightTaskBuildable(p); + + if(runnable != null){ + return runnable; + } + + //this is to solve JENKINS-30084: the task has to be buildable to force the provisioning of nodes. + //if the execution gets here, it means the task could not be scheduled since the node + //the task is supposed to run on is offline or not available. + //Thus, the flyweighttask enters the buildables queue and will ask Jenkins to provision a node + return new BuildableRunnable(p); + } + // if the execution gets here, it means the task is blocked by shutdown and null is returned. + return null; + } else { + // regular heavyweight task + return new BuildableRunnable(p); + } + } + + /** + * This method checks if the flyweight task can be run on any of the available executors + * @param p - the flyweight task to be scheduled + * @return a Runnable if there is an executor that can take the task, null otherwise + */ + @CheckForNull + private Runnable makeFlyWeightTaskBuildable(final BuildableItem p){ + //we double check if this is a flyweight task + if (p.task instanceof FlyweightTask) { Jenkins h = Jenkins.getInstance(); - Map hashSource = new HashMap(h.getNodes().size()); + Map hashSource = new HashMap(h.getNodes().size()); // Even if master is configured with zero executors, we may need to run a flyweight task like MatrixProject on it. hashSource.put(h, Math.max(h.getNumExecutors() * 100, 1)); @@ -1538,9 +1586,15 @@ public class Queue extends ResourceController implements Saveable { Label lbl = p.getAssignedLabel(); for (Node n : hash.list(p.task.getFullDisplayName())) { final Computer c = n.toComputer(); - if (c==null || c.isOffline()) continue; - if (lbl!=null && !lbl.contains(n)) continue; - if (n.canTake(p) != null) continue; + if (c == null || c.isOffline()) { + continue; + } + if (lbl!=null && !lbl.contains(n)) { + continue; + } + if (n.canTake(p) != null) { + continue; + } return new Runnable() { @Override public void run() { c.startFlyWeightTask(new WorkUnitContext(p).createWorkUnit(p.task)); @@ -1548,19 +1602,10 @@ public class Queue extends ResourceController implements Saveable { } }; } - } - // if the execution get here, it means we couldn't schedule it anywhere. - return null; - } else { // regular heavyweight task - return new Runnable() { - @Override public void run() { - p.enter(Queue.this); - } - }; } + return null; } - private static Hash NODE_HASH = new Hash() { public String hash(Node node) { return node.getNodeName(); @@ -2672,6 +2717,20 @@ public class Queue extends ResourceController implements Saveable { } } + private class BuildableRunnable implements Runnable { + private final BuildableItem buildableItem; + + private BuildableRunnable(BuildableItem p) { + this.buildableItem = p; + } + + @Override + public void run() { + //the flyweighttask enters the buildables queue and will ask Jenkins to provision a node + buildableItem.enter(Queue.this); + } + } + private static class LockedJUCCallable implements java.util.concurrent.Callable { private final java.util.concurrent.Callable delegate; diff --git a/test/src/test/java/hudson/model/QueueTest.java b/test/src/test/java/hudson/model/QueueTest.java index 6e566f7129..56a23a7839 100644 --- a/test/src/test/java/hudson/model/QueueTest.java +++ b/test/src/test/java/hudson/model/QueueTest.java @@ -30,6 +30,7 @@ import com.gargoylesoftware.htmlunit.html.HtmlPage; import com.gargoylesoftware.htmlunit.xml.XmlPage; import hudson.Launcher; import hudson.XmlFile; +import hudson.matrix.Axis; import hudson.matrix.AxisList; import hudson.matrix.LabelAxis; import hudson.matrix.MatrixBuild; @@ -41,7 +42,9 @@ import hudson.model.Cause.UserIdCause; import hudson.model.Queue.BlockedItem; import hudson.model.Queue.Executable; import hudson.model.Queue.WaitingItem; +import hudson.model.labels.LabelExpression; import hudson.model.queue.AbstractQueueTask; +import hudson.model.queue.CauseOfBlockage; import hudson.model.queue.QueueTaskFuture; import hudson.model.queue.ScheduleResult; import hudson.model.queue.SubTask; @@ -50,6 +53,8 @@ import hudson.security.GlobalMatrixAuthorizationStrategy; import hudson.security.SparseACL; import hudson.slaves.DumbSlave; import hudson.slaves.DummyCloudImpl; +import hudson.slaves.NodeProperty; +import hudson.slaves.NodePropertyDescriptor; import hudson.slaves.NodeProvisionerRule; import hudson.tasks.BuildTrigger; import hudson.tasks.Shell; @@ -99,6 +104,7 @@ import org.jvnet.hudson.test.MockQueueItemAuthenticator; import org.jvnet.hudson.test.SequenceLock; import org.jvnet.hudson.test.SleepBuilder; import org.jvnet.hudson.test.TestBuilder; +import org.jvnet.hudson.test.TestExtension; import org.jvnet.hudson.test.recipes.LocalData; import org.mortbay.jetty.Server; import org.mortbay.jetty.bio.SocketConnector; @@ -780,4 +786,119 @@ public class QueueTest { "B finished at " + buildBEndTime + ", C started at " + buildC.getStartTimeInMillis(), buildC.getStartTimeInMillis() >= buildBEndTime); } + + @Issue("JENKINS-30084") + @Test + /* + * When a flyweight task is restricted to run on a specific node, the node will be provisioned + * and the flyweight task will be executed. + */ + public void shouldRunFlyweightTaskOnProvisionedNodeWhenNodeRestricted() throws Exception { + MatrixProject matrixProject = r.createMatrixProject(); + matrixProject.setAxes(new AxisList( + new Axis("axis", "a", "b") + )); + Label label = LabelExpression.get("aws-linux-dummy"); + DummyCloudImpl dummyCloud = new DummyCloudImpl(r, 0); + dummyCloud.label = label; + r.jenkins.clouds.add(dummyCloud); + matrixProject.setAssignedLabel(label); + r.assertBuildStatusSuccess(matrixProject.scheduleBuild2(0)); + assertEquals("aws-linux-dummy", matrixProject.getBuilds().getLastBuild().getBuiltOn().getLabelString()); + } + + @Test + public void shouldBeAbleToBlockFlyweightTaskAtTheLastMinute() throws Exception { + MatrixProject matrixProject = r.createMatrixProject("downstream"); + matrixProject.setDisplayName("downstream"); + matrixProject.setAxes(new AxisList( + new Axis("axis", "a", "b") + )); + + Label label = LabelExpression.get("aws-linux-dummy"); + DummyCloudImpl dummyCloud = new DummyCloudImpl(r, 0); + dummyCloud.label = label; + BlockDownstreamProjectExecution property = new BlockDownstreamProjectExecution(); + dummyCloud.getNodeProperties().add(property); + r.jenkins.clouds.add(dummyCloud); + matrixProject.setAssignedLabel(label); + + FreeStyleProject upstreamProject = r.createFreeStyleProject("upstream"); + upstreamProject.getBuildersList().add(new SleepBuilder(10000)); + upstreamProject.setDisplayName("upstream"); + + //let's assume the flyweighttask has an upstream project and that must be blocked + // when the upstream project is running + matrixProject.addTrigger(new ReverseBuildTrigger("upstream", Result.SUCCESS)); + matrixProject.setBlockBuildWhenUpstreamBuilding(true); + + //we schedule the project but we pretend no executors are available thus + //the flyweight task is in the buildable queue without being executed + QueueTaskFuture downstream = matrixProject.scheduleBuild2(0); + if (downstream == null) { + throw new Exception("the flyweight task could not be scheduled, thus the test will be interrupted"); + } + //let s wait for the Queue instance to be updated + while (Queue.getInstance().getBuildableItems().size() != 1) { + Thread.sleep(10); + } + //in this state the build is not blocked, it's just waiting for an available executor + assertFalse(Queue.getInstance().getItems()[0].isBlocked()); + + //we start the upstream project that should block the downstream one + QueueTaskFuture upstream = upstreamProject.scheduleBuild2(0); + if (upstream == null) { + throw new Exception("the upstream task could not be scheduled, thus the test will be interrupted"); + } + //let s wait for the Upstream to enter the buildable Queue + boolean enteredTheQueue = false; + while (!enteredTheQueue) { + for (Queue.BuildableItem item : Queue.getInstance().getBuildableItems()) { + if (item.task.getDisplayName() != null && item.task.getDisplayName().equals(upstreamProject.getDisplayName())) { + enteredTheQueue = true; + } + } + } + //let's wait for the upstream project to actually start so that we're sure the Queue has been updated + //when the upstream starts the downstream has already left the buildable queue and the queue is empty + while (!Queue.getInstance().getBuildableItems().isEmpty()) { + Thread.sleep(10); + } + assertTrue(Queue.getInstance().getItems()[0].isBlocked()); + assertTrue(Queue.getInstance().getBlockedItems().get(0).task.getDisplayName().equals(matrixProject.displayName)); + + //once the upstream is completed, the downstream can join the buildable queue again. + r.assertBuildStatusSuccess(upstream); + while (Queue.getInstance().getBuildableItems().isEmpty()) { + Thread.sleep(10); + } + assertFalse(Queue.getInstance().getItems()[0].isBlocked()); + assertTrue(Queue.getInstance().getBlockedItems().isEmpty()); + assertTrue(Queue.getInstance().getBuildableItems().get(0).task.getDisplayName().equals(matrixProject.displayName)); + } + + //let's make sure that the downstram project is not started before the upstream --> we want to simulate + // the case: buildable-->blocked-->buildable + public static class BlockDownstreamProjectExecution extends NodeProperty { + @Override + public CauseOfBlockage canTake(Queue.BuildableItem item) { + if (item.task.getName().equals("downstream")) { + return new CauseOfBlockage() { + @Override + public String getShortDescription() { + return "slave not provisioned"; + } + }; + } + return null; + } + + @TestExtension("shouldBeAbleToBlockFlyWeightTaskOnLastMinute") + public static class DescriptorImpl extends NodePropertyDescriptor { + @Override + public String getDisplayName() { + return "Some Property"; + } + } + } } diff --git a/test/src/test/java/hudson/slaves/DummyCloudImpl.java b/test/src/test/java/hudson/slaves/DummyCloudImpl.java index 5134414922..132c22e9eb 100644 --- a/test/src/test/java/hudson/slaves/DummyCloudImpl.java +++ b/test/src/test/java/hudson/slaves/DummyCloudImpl.java @@ -34,8 +34,13 @@ import java.util.Collection; import java.util.List; import java.util.concurrent.Callable; import java.util.concurrent.Future; + +import hudson.util.DescribableList; +import jenkins.model.Jenkins; import org.jvnet.hudson.test.JenkinsRule; +import javax.xml.ws.Provider; + /** * {@link Cloud} implementation useful for testing. * @@ -64,12 +69,26 @@ public class DummyCloudImpl extends Cloud { */ public Label label; + public List> getNodeProperties() { + return this.nodeProperties; + } + + List> nodeProperties = + new ArrayList>(); + public DummyCloudImpl(JenkinsRule rule, int delay) { super("test"); this.rule = rule; this.delay = delay; } + public DummyCloudImpl(JenkinsRule rule, int delay, List> nodeProperties) { + super("test"); + this.rule = rule; + this.delay = delay; + this.nodeProperties = nodeProperties; + } + public Collection provision(Label label, int excessWorkload) { List r = new ArrayList(); if(label!=this.label) return r; // provisioning impossible @@ -105,7 +124,17 @@ public class DummyCloudImpl extends Cloud { Thread.sleep(time); System.out.println("launching slave"); - DumbSlave slave = rule.createSlave(label); + final DumbSlave slave = rule.createSlave(label); + Provider nodePropertyProvider = new Provider() { + @Override + public NodeProperty invoke(NodeProperty request) { + request.setNode(slave); + return request; + } + }; + for (NodeProperty nodeProperty : nodeProperties) { + slave.getNodeProperties().add(nodePropertyProvider.invoke(nodeProperty)); + } computer = slave.toComputer(); computer.connect(false).get(); synchronized (DummyCloudImpl.this) { -- GitLab