Commit 06b0cd63, authored by Oliver Gondža

[JENKINS-46680] Disconnect computer on ping timeout (#3005)

* [JENKINS-46680] Reproduce in unittest

* [FIX JENKINS-46680] Reset SlaveComputer channel before closing it on ping timeout

* [JENKINS-46680] Attach channel termination offline cause on ping timeouts

(cherry picked from commit dbb5e443)
Parent commit: 8ec2510b
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
*/ */
package hudson.slaves; package hudson.slaves;
import com.google.common.annotations.VisibleForTesting;
import hudson.Extension; import hudson.Extension;
import hudson.FilePath; import hudson.FilePath;
import jenkins.util.SystemProperties; import jenkins.util.SystemProperties;
...@@ -34,6 +35,7 @@ import hudson.remoting.PingThread; ...@@ -34,6 +35,7 @@ import hudson.remoting.PingThread;
import jenkins.security.MasterToSlaveCallable; import jenkins.security.MasterToSlaveCallable;
import jenkins.slaves.PingFailureAnalyzer; import jenkins.slaves.PingFailureAnalyzer;
import javax.annotation.CheckForNull;
import java.io.IOException; import java.io.IOException;
import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicBoolean;
import java.util.logging.Level; import java.util.logging.Level;
...@@ -86,28 +88,38 @@ public class ChannelPinger extends ComputerListener { ...@@ -86,28 +88,38 @@ public class ChannelPinger extends ComputerListener {
@Override @Override
public void preOnline(Computer c, Channel channel, FilePath root, TaskListener listener) { public void preOnline(Computer c, Channel channel, FilePath root, TaskListener listener) {
install(channel); SlaveComputer slaveComputer = null;
if (c instanceof SlaveComputer) {
slaveComputer = (SlaveComputer) c;
}
install(channel, slaveComputer);
} }
public void install(Channel channel) { public void install(Channel channel) {
install(channel, null);
}
@VisibleForTesting
/*package*/ void install(Channel channel, @CheckForNull SlaveComputer c) {
if (pingTimeoutSeconds < 1 || pingIntervalSeconds < 1) { if (pingTimeoutSeconds < 1 || pingIntervalSeconds < 1) {
LOGGER.warning("Agent ping is disabled"); LOGGER.warning("Agent ping is disabled");
return; return;
} }
// set up ping from both directions, so that in case of a router dropping a connection,
// both sides can notice it and take compensation actions.
try { try {
channel.call(new SetUpRemotePing(pingTimeoutSeconds, pingIntervalSeconds)); channel.call(new SetUpRemotePing(pingTimeoutSeconds, pingIntervalSeconds));
LOGGER.fine("Set up a remote ping for " + channel.getName()); LOGGER.fine("Set up a remote ping for " + channel.getName());
} catch (Exception e) { } catch (Exception e) {
LOGGER.severe("Failed to set up a ping for " + channel.getName()); LOGGER.log(Level.SEVERE, "Failed to set up a ping for " + channel.getName(), e);
} }
// set up ping from both directions, so that in case of a router dropping a connection, setUpPingForChannel(channel, c, pingTimeoutSeconds, pingIntervalSeconds, true);
// both sides can notice it and take compensation actions.
setUpPingForChannel(channel, pingTimeoutSeconds, pingIntervalSeconds, true);
} }
static class SetUpRemotePing extends MasterToSlaveCallable<Void, IOException> { @VisibleForTesting
/*package*/ static class SetUpRemotePing extends MasterToSlaveCallable<Void, IOException> {
private static final long serialVersionUID = -2702219700841759872L; private static final long serialVersionUID = -2702219700841759872L;
@Deprecated @Deprecated
private transient int pingInterval; private transient int pingInterval;
...@@ -121,7 +133,7 @@ public class ChannelPinger extends ComputerListener { ...@@ -121,7 +133,7 @@ public class ChannelPinger extends ComputerListener {
@Override @Override
public Void call() throws IOException { public Void call() throws IOException {
setUpPingForChannel(Channel.current(), pingTimeoutSeconds, pingIntervalSeconds, false); setUpPingForChannel(Channel.current(), null, pingTimeoutSeconds, pingIntervalSeconds, false);
return null; return null;
} }
...@@ -163,30 +175,36 @@ public class ChannelPinger extends ComputerListener { ...@@ -163,30 +175,36 @@ public class ChannelPinger extends ComputerListener {
} }
} }
static void setUpPingForChannel(final Channel channel, int timeoutSeconds, int intervalSeconds, final boolean analysis) { @VisibleForTesting
/*package*/ static void setUpPingForChannel(final Channel channel, final SlaveComputer computer, int timeoutSeconds, int intervalSeconds, final boolean analysis) {
LOGGER.log(Level.FINE, "setting up ping on {0} with a {1} seconds interval and {2} seconds timeout", new Object[] {channel.getName(), intervalSeconds, timeoutSeconds}); LOGGER.log(Level.FINE, "setting up ping on {0} with a {1} seconds interval and {2} seconds timeout", new Object[] {channel.getName(), intervalSeconds, timeoutSeconds});
final AtomicBoolean isInClosed = new AtomicBoolean(false); final AtomicBoolean isInClosed = new AtomicBoolean(false);
final PingThread t = new PingThread(channel, timeoutSeconds * 1000L, intervalSeconds * 1000L) { final PingThread t = new PingThread(channel, timeoutSeconds * 1000L, intervalSeconds * 1000L) {
@Override @Override
protected void onDead(Throwable cause) { protected void onDead(Throwable cause) {
try {
if (analysis) { if (analysis) {
analyze(cause); analyze(cause);
} }
if (isInClosed.get()) { boolean inClosed = isInClosed.get();
// Disassociate computer channel before closing it
if (computer != null) {
Exception exception = cause instanceof Exception ? (Exception) cause: new IOException(cause);
computer.disconnect(new OfflineCause.ChannelTermination(exception));
}
if (inClosed) {
LOGGER.log(Level.FINE,"Ping failed after the channel "+channel.getName()+" is already partially closed.",cause); LOGGER.log(Level.FINE,"Ping failed after the channel "+channel.getName()+" is already partially closed.",cause);
} else { } else {
LOGGER.log(Level.INFO,"Ping failed. Terminating the channel "+channel.getName()+".",cause); LOGGER.log(Level.INFO,"Ping failed. Terminating the channel "+channel.getName()+".",cause);
channel.close(cause);
} }
} catch (IOException e) {
LOGGER.log(Level.SEVERE,"Failed to terminate the channel "+channel.getName(),e);
}
} }
/** Keep in a separate method so we do not even try to do class loading on {@link PingFailureAnalyzer} from an agent JVM. */ /** Keep in a separate method so we do not even try to do class loading on {@link PingFailureAnalyzer} from an agent JVM. */
private void analyze(Throwable cause) throws IOException { private void analyze(Throwable cause) {
for (PingFailureAnalyzer pfa : PingFailureAnalyzer.all()) { for (PingFailureAnalyzer pfa : PingFailureAnalyzer.all()) {
pfa.onPingFailure(channel,cause); try {
pfa.onPingFailure(channel, cause);
} catch (IOException ex) {
LOGGER.log(Level.WARNING, "Ping failure analyzer " + pfa.getClass().getName() + " failed for " + channel.getName(), ex);
}
} }
} }
@Deprecated @Deprecated
......
...@@ -59,12 +59,12 @@ public class ChannelPingerTest { ...@@ -59,12 +59,12 @@ public class ChannelPingerTest {
@Test @Test
public void testDefaults() throws Exception { public void testDefaults() throws Exception {
ChannelPinger channelPinger = new ChannelPinger(); ChannelPinger channelPinger = new ChannelPinger();
channelPinger.install(mockChannel); channelPinger.install(mockChannel, null);
verify(mockChannel).call(eq(new ChannelPinger.SetUpRemotePing(ChannelPinger.PING_TIMEOUT_SECONDS_DEFAULT, verify(mockChannel).call(eq(new ChannelPinger.SetUpRemotePing(ChannelPinger.PING_TIMEOUT_SECONDS_DEFAULT,
ChannelPinger.PING_INTERVAL_SECONDS_DEFAULT))); ChannelPinger.PING_INTERVAL_SECONDS_DEFAULT)));
verifyStatic(); verifyStatic();
ChannelPinger.setUpPingForChannel(mockChannel, ChannelPinger.PING_TIMEOUT_SECONDS_DEFAULT, ChannelPinger.setUpPingForChannel(mockChannel, null, ChannelPinger.PING_TIMEOUT_SECONDS_DEFAULT,
ChannelPinger.PING_INTERVAL_SECONDS_DEFAULT, true); ChannelPinger.PING_INTERVAL_SECONDS_DEFAULT, true);
} }
...@@ -74,11 +74,11 @@ public class ChannelPingerTest { ...@@ -74,11 +74,11 @@ public class ChannelPingerTest {
System.setProperty("hudson.slaves.ChannelPinger.pingIntervalSeconds", "73"); System.setProperty("hudson.slaves.ChannelPinger.pingIntervalSeconds", "73");
ChannelPinger channelPinger = new ChannelPinger(); ChannelPinger channelPinger = new ChannelPinger();
channelPinger.install(mockChannel); channelPinger.install(mockChannel, null);
verify(mockChannel).call(new ChannelPinger.SetUpRemotePing(42, 73)); verify(mockChannel).call(new ChannelPinger.SetUpRemotePing(42, 73));
verifyStatic(); verifyStatic();
ChannelPinger.setUpPingForChannel(mockChannel, 42, 73, true); ChannelPinger.setUpPingForChannel(mockChannel, null, 42, 73, true);
} }
@Test @Test
...@@ -86,11 +86,11 @@ public class ChannelPingerTest { ...@@ -86,11 +86,11 @@ public class ChannelPingerTest {
System.setProperty("hudson.slaves.ChannelPinger.pingInterval", "7"); System.setProperty("hudson.slaves.ChannelPinger.pingInterval", "7");
ChannelPinger channelPinger = new ChannelPinger(); ChannelPinger channelPinger = new ChannelPinger();
channelPinger.install(mockChannel); channelPinger.install(mockChannel, null);
verify(mockChannel).call(eq(new ChannelPinger.SetUpRemotePing(ChannelPinger.PING_TIMEOUT_SECONDS_DEFAULT, 420))); verify(mockChannel).call(eq(new ChannelPinger.SetUpRemotePing(ChannelPinger.PING_TIMEOUT_SECONDS_DEFAULT, 420)));
verifyStatic(); verifyStatic();
ChannelPinger.setUpPingForChannel(mockChannel, ChannelPinger.PING_TIMEOUT_SECONDS_DEFAULT, 420, true); ChannelPinger.setUpPingForChannel(mockChannel, null, ChannelPinger.PING_TIMEOUT_SECONDS_DEFAULT, 420, true);
} }
@Test @Test
...@@ -99,11 +99,11 @@ public class ChannelPingerTest { ...@@ -99,11 +99,11 @@ public class ChannelPingerTest {
System.setProperty("hudson.slaves.ChannelPinger.pingInterval", "7"); System.setProperty("hudson.slaves.ChannelPinger.pingInterval", "7");
ChannelPinger channelPinger = new ChannelPinger(); ChannelPinger channelPinger = new ChannelPinger();
channelPinger.install(mockChannel); channelPinger.install(mockChannel, null);
verify(mockChannel).call(eq(new ChannelPinger.SetUpRemotePing(ChannelPinger.PING_TIMEOUT_SECONDS_DEFAULT, 73))); verify(mockChannel).call(eq(new ChannelPinger.SetUpRemotePing(ChannelPinger.PING_TIMEOUT_SECONDS_DEFAULT, 73)));
verifyStatic(); verifyStatic();
ChannelPinger.setUpPingForChannel(mockChannel, ChannelPinger.PING_TIMEOUT_SECONDS_DEFAULT, 73, true); ChannelPinger.setUpPingForChannel(mockChannel, null, ChannelPinger.PING_TIMEOUT_SECONDS_DEFAULT, 73, true);
} }
@Test @Test
......
/*
* The MIT License
*
* Copyright (c) Red Hat, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package hudson.slaves;
import hudson.Functions;
import hudson.model.Computer;
import hudson.remoting.Channel;
import hudson.remoting.ChannelClosedException;
import hudson.remoting.PingThread;
import jenkins.security.MasterToSlaveCallable;
import org.junit.Rule;
import org.junit.Test;
import org.jvnet.hudson.test.JenkinsRule;
import java.io.IOException;
import java.lang.management.ManagementFactory;
import java.lang.reflect.Method;
import java.util.Date;
import java.util.concurrent.TimeoutException;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.fail;
import static org.junit.Assume.assumeFalse;
/**
 * Regression test for JENKINS-46680: when the ping thread of an agent channel reports the
 * connection as dead, the channel must be closed and detached from its computer.
 *
 * @author ogondza.
 */
public class PingThreadTest {

    @Rule
    public JenkinsRule j = new JenkinsRule();

    /**
     * Suspends the agent process, reports a ping failure directly to the channel's
     * {@link PingThread} and checks that the channel is closed and no longer exposed
     * by the computer.
     */
    @Test
    public void failedPingThreadResetsComputerChannel() throws Exception {
        assumeFalse("We simulate hung agent by sending the SIGTSTP signal", Functions.isWindows());

        DumbSlave slave = j.createOnlineSlave();
        Computer computer = slave.toComputer();
        Channel channel = (Channel) slave.getChannel();
        String pid = channel.call(new GetPid());

        // Locate the ping thread serving this channel; it is matched by thread name suffix.
        PingThread pingThread = null;
        for (Thread it : Thread.getAllStackTraces().keySet()) {
            if (it instanceof PingThread && it.getName().endsWith(channel.toString())) {
                pingThread = (PingThread) it;
            }
        }
        assertNotNull(pingThread);

        // Simulate lost connection
        sendSignal("TSTP", pid);
        try {
            // ... do not wait for Ping Thread to notice
            Method onDead = PingThread.class.getDeclaredMethod("onDead", Throwable.class);
            onDead.setAccessible(true);
            onDead.invoke(pingThread, new TimeoutException("No ping"));

            try {
                channel.call(new GetPid());
                fail("Call over the channel was expected to fail after the ping failure");
            } catch (ChannelClosedException ex) {
                // Expected
            }

            assertNull(slave.getComputer().getChannel());
            assertNull(computer.getChannel());
        } finally {
            // Always resume the agent process so it is not left suspended after the test.
            sendSignal("CONT", pid);
        }
    }

    /**
     * Sends a signal to a process through the {@code kill} command and fails the test when
     * the command exits with a non-zero status.
     *
     * <p>The command is deliberately not wrapped in an {@code assert} statement: such
     * statements are skipped entirely when the JVM runs with assertions disabled, which
     * would silently skip sending the signal.
     *
     * @param signal signal name without the leading dash, e.g. {@code TSTP}
     * @param pid identifier of the target process
     */
    private static void sendSignal(String signal, String pid) throws IOException, InterruptedException {
        int exitCode = new ProcessBuilder("kill", "-" + signal, pid).start().waitFor();
        if (exitCode != 0) {
            fail("kill -" + signal + " " + pid + " exited with " + exitCode);
        }
    }

    /**
     * Callable returning the runtime MXBean name of the JVM it runs in, with everything from
     * the first {@code @} removed. The test treats the result as the process id
     * (presumably the name has the form {@code pid@hostname} - this is JVM specific).
     */
    private static final class GetPid extends MasterToSlaveCallable<String, IOException> {
        @Override
        public String call() throws IOException {
            return ManagementFactory.getRuntimeMXBean().getName().replaceAll("@.*", "");
        }
    }
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册