SlaveComputer.java 19.9 KB
Newer Older
K
kohsuke 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
/*
 * The MIT License
 * 
 * Copyright (c) 2004-2009, Sun Microsystems, Inc., Kohsuke Kawaguchi, Stephen Connolly
 * 
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
24 25 26 27 28 29 30 31 32
package hudson.slaves;

import hudson.model.*;
import hudson.remoting.Channel;
import hudson.remoting.VirtualChannel;
import hudson.remoting.Callable;
import hudson.util.StreamTaskListener;
import hudson.util.NullStream;
import hudson.util.RingBufferLogHandler;
K
kohsuke 已提交
33
import hudson.util.Futures;
34 35
import hudson.FilePath;
import hudson.lifecycle.WindowsSlaveInstaller;
K
kohsuke 已提交
36
import hudson.Util;
K
kohsuke 已提交
37
import hudson.AbortException;
38
import hudson.remoting.Launcher;
39
import static hudson.slaves.SlaveComputer.LogHolder.SLAVE_LOG_HANDLER;
40
import hudson.slaves.OfflineCause.ChannelTermination;
41 42 43 44 45 46 47

import java.io.File;
import java.io.OutputStream;
import java.io.FileOutputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.io.IOException;
48
import java.io.PrintStream;
49 50 51 52 53 54 55
import java.util.logging.Level;
import java.util.logging.LogRecord;
import java.util.logging.Logger;
import java.util.List;
import java.util.Collections;
import java.util.ArrayList;
import java.nio.charset.Charset;
K
kohsuke 已提交
56
import java.util.concurrent.Future;
57
import java.security.Security;
58 59 60

import org.kohsuke.stapler.StaplerRequest;
import org.kohsuke.stapler.StaplerResponse;
61 62 63
import org.kohsuke.stapler.QueryParameter;
import org.kohsuke.stapler.HttpResponse;
import org.kohsuke.stapler.HttpRedirect;
64 65 66 67 68 69 70 71 72

import javax.servlet.ServletException;
import javax.servlet.http.HttpServletResponse;

/**
 * {@link Computer} for {@link Slave}s.
 *
 * @author Kohsuke Kawaguchi
 */
K
kohsuke 已提交
73
public class SlaveComputer extends Computer {
74 75 76 77
    private volatile Channel channel;
    private volatile transient boolean acceptingTasks = true;
    private Charset defaultCharset;
    private Boolean isUnix;
K
kohsuke 已提交
78 79 80 81 82 83 84 85
    /**
     * Effective {@link ComputerLauncher} that hides the details of
     * how we launch a slave agent on this computer.
     *
     * <p>
     * This is normally the same as {@link Slave#getLauncher()} but
     * can be different. See {@link #grabLauncher(Node)}. 
     */
86 87 88 89 90 91 92 93 94
    private ComputerLauncher launcher;

    /**
     * Number of failed attempts to reconnect to this node
     * (so that if we keep failing to reconnect, we can stop
     * trying.)
     */
    private transient int numRetryAttempt;

K
kohsuke 已提交
95 96 97 98 99 100 101 102 103 104 105
    /**
     * Tracks the status of the last launch operation, which is always asynchronous.
     * This can be used to wait for the completion, or cancel the launch activity.
     */
    private volatile Future<?> lastConnectActivity = null;

    private Object constructed = new Object();

    /**
     * Creates the {@link Computer} that represents the given slave.
     *
     * @param slave
     *      The slave node; handed straight to the {@link Computer} constructor.
     */
    public SlaveComputer(Slave slave) {
        super(slave);
    }
106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134

    /**
     * {@inheritDoc}
     *
     * <p>
     * Reports the flag last stored by {@link #setAcceptingTasks(boolean)};
     * the flag starts out as {@code true}.
     *
     * @return the current value of the {@code acceptingTasks} flag.
     */
    @Override
    public boolean isAcceptingTasks() {
        return acceptingTasks;
    }

    /**
     * Allows a {@linkplain hudson.slaves.ComputerLauncher} or a {@linkplain hudson.slaves.RetentionStrategy} to
     * suspend tasks being accepted by the slave computer.
     *
     * <p>
     * The value is stored in a volatile field and is what {@link #isAcceptingTasks()} returns afterwards.
     *
     * @param acceptingTasks {@code true} if the slave can accept tasks.
     */
    public void setAcceptingTasks(boolean acceptingTasks) {
        this.acceptingTasks = acceptingTasks;
    }

    /**
     * True if this computer is a Unix machine (as opposed to Windows machine).
     *
     * <p>
     * The value is recorded when a channel is established in {@code setChannel}
     * and reset to null when the channel is closed.
     *
     * @return
     *      null if the computer is disconnected and therefore we don't know whether it is Unix or not.
     */
    public Boolean isUnix() {
        return isUnix;
    }

135
    @Override
136 137 138 139
    public Slave getNode() {
        return (Slave)super.getNode();
    }

K
kohsuke 已提交
140 141 142 143 144 145 146 147
    /**
     * Shows the "flashing" icon while a launch is still in progress, and the
     * regular icon of the superclass otherwise.
     */
    @Override
    public String getIcon() {
        // take a snapshot of the volatile field so that both checks see the same future
        Future<?> pending = lastConnectActivity;
        boolean launchInProgress = pending != null && !pending.isDone();
        if (launchInProgress) {
            return "computer-flash.gif";
        }
        return super.getIcon();
    }

M
mindless 已提交
148 149 150 151
    /**
     * @deprecated since 2008-05-20.
     */
    @Deprecated @Override
152 153 154 155 156 157 158 159 160 161 162 163 164
    public boolean isJnlpAgent() {
        return launcher instanceof JNLPLauncher;
    }

    /**
     * Answers by delegating to the current {@link ComputerLauncher}.
     *
     * @return whatever {@link ComputerLauncher#isLaunchSupported()} reports for the launcher.
     */
    @Override
    public boolean isLaunchSupported() {
        return launcher.isLaunchSupported();
    }

    /**
     * Returns the effective {@link ComputerLauncher} of this computer, i.e. the one
     * obtained from {@link #grabLauncher(Node)} the last time the node was set.
     */
    public ComputerLauncher getLauncher() {
        return launcher;
    }

165
    protected Future<?> _connect(boolean forceReconnect) {
K
kohsuke 已提交
166
        if(channel!=null)   return Futures.precomputed(null);
167
        if(!forceReconnect && isConnecting())
K
kohsuke 已提交
168
            return lastConnectActivity;
169
        if(forceReconnect && isConnecting())
K
kohsuke 已提交
170
            logger.fine("Forcing a reconnect");
171 172

        closeChannel();
K
kohsuke 已提交
173 174
        return lastConnectActivity = Computer.threadPoolForRemoting.submit(new java.util.concurrent.Callable<Object>() {
            public Object call() throws Exception {
175 176
                // do this on another thread so that the lengthy launch operation
                // (which is typical) won't block UI thread.
177
                TaskListener listener = new StreamTaskListener(openLogFile());
K
kohsuke 已提交
178 179 180
                try {
                    launcher.launch(SlaveComputer.this, listener);
                    return null;
K
kohsuke 已提交
181 182 183
                } catch (AbortException e) {
                    listener.error(e.getMessage());
                    throw e;
K
kohsuke 已提交
184 185 186 187 188 189 190
                } catch (IOException e) {
                    Util.displayIOException(e,listener);
                    e.printStackTrace(listener.error(Messages.ComputerLauncher_unexpectedError()));
                    throw e;
                } catch (InterruptedException e) {
                    e.printStackTrace(listener.error(Messages.ComputerLauncher_abortedLaunch()));
                    throw e;
191 192 193
                } finally {
                    if (channel==null)
                        offlineCause = new OfflineCause.LaunchFailed();
K
kohsuke 已提交
194
                }
195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221
            }
        });
    }

    /**
     * {@inheritDoc}
     *
     * <p>
     * After the superclass has processed the event, it is forwarded to the launcher
     * and to the retention strategy of the node, for whichever of the two implements
     * {@link ExecutorListener}.
     */
    @Override
    public void taskAccepted(Executor executor, Queue.Task task) {
        super.taskAccepted(executor, task);
        if (launcher instanceof ExecutorListener) {
            ((ExecutorListener)launcher).taskAccepted(executor, task);
        }
        // Look the strategy up through getRetentionStrategy(), which tolerates getNode()
        // returning null, instead of dereferencing getNode() directly. This is the same
        // lookup that taskCompleted() and taskCompletedWithProblems() perform.
        RetentionStrategy r = getRetentionStrategy();
        if (r instanceof ExecutorListener) {
            ((ExecutorListener) r).taskAccepted(executor, task);
        }
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public void taskCompleted(Executor executor, Queue.Task task, long durationMS) {
        super.taskCompleted(executor, task, durationMS);
        if (launcher instanceof ExecutorListener) {
            ((ExecutorListener)launcher).taskCompleted(executor, task, durationMS);
        }
222
        RetentionStrategy r = getRetentionStrategy();
K
kohsuke 已提交
223 224
        if (r instanceof ExecutorListener) {
            ((ExecutorListener) r).taskCompleted(executor, task, durationMS);
225 226 227 228 229 230 231 232 233 234 235 236
        }
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public void taskCompletedWithProblems(Executor executor, Queue.Task task, long durationMS, Throwable problems) {
        super.taskCompletedWithProblems(executor, task, durationMS, problems);
        if (launcher instanceof ExecutorListener) {
            ((ExecutorListener)launcher).taskCompletedWithProblems(executor, task, durationMS, problems);
        }
237 238 239
        RetentionStrategy r = getRetentionStrategy();
        if (r instanceof ExecutorListener) {
            ((ExecutorListener) r).taskCompletedWithProblems(executor, task, durationMS, problems);
240 241 242
        }
    }

K
kohsuke 已提交
243 244 245 246 247 248
    /**
     * A computer counts as "connecting" when it is offline and the last launch
     * activity exists and has not finished yet.
     */
    @Override
    public boolean isConnecting() {
        // snapshot the volatile field first so that the checks below look at one future
        Future<?> pending = lastConnectActivity;
        if (!isOffline()) {
            return false;
        }
        if (pending == null) {
            return false;
        }
        return !pending.isDone();
    }

249 250 251 252 253 254 255 256 257 258 259 260 261
    /**
     * Opens a fresh stream to the log file of this computer.
     *
     * @return
     *      A {@link FileOutputStream} on the log file, or a {@link NullStream} if the
     *      file cannot be opened (the failure is logged at SEVERE level).
     */
    public OutputStream openLogFile() {
        try {
            return new FileOutputStream(getLogFile());
        } catch (FileNotFoundException e) {
            logger.log(Level.SEVERE, "Failed to create log file "+getLogFile(),e);
            // fall back to a sink so that callers always get a usable stream
            return new NullStream();
        }
    }

    private final Object channelLock = new Object();

K
kohsuke 已提交
262 263 264 265
    /**
     * Convenience overload that takes a {@link TaskListener} and uses its logger
     * as the launch log of the stream-based {@code setChannel} method.
     *
     * @param in
     *      Passed through unchanged.
     * @param out
     *      Passed through unchanged.
     * @param taskListener
     *      Its {@link TaskListener#getLogger() logger} is used as the launch log.
     * @param listener
     *      Passed through unchanged.
     */
    public void setChannel(InputStream in, OutputStream out, TaskListener taskListener, Channel.Listener listener) throws IOException, InterruptedException {
        setChannel(in,out,taskListener.getLogger(),listener);
    }

266 267
    /**
     * Creates a {@link Channel} from the given stream and sets that to this slave.
K
kohsuke 已提交
268 269 270 271 272 273 274 275 276 277 278 279 280
     *
     * @param in
     *      Stream connected to the remote "slave.jar". It's the caller's responsibility to do
     *      buffering on this stream, if that's necessary.
     * @param out
     *      Stream connected to the remote peer. It's the caller's responsibility to do
     *      buffering on this stream, if that's necessary.
     * @param launchLog
     *      If non-null, receive the portion of data in <tt>is</tt> before
     *      the data goes into the "binary mode". This is useful
     *      when the established communication channel might include some data that might
     *      be useful for debugging/trouble-shooting.
     * @param listener
281
     *      Gets a notification when the channel closes, to perform clean up. Can be null.
282 283 284 285 286
     */
    public void setChannel(InputStream in, OutputStream out, OutputStream launchLog, Channel.Listener listener) throws IOException, InterruptedException {
        if(this.channel!=null)
            throw new IllegalStateException("Already connected");

287 288
        final TaskListener taskListener = new StreamTaskListener(launchLog);
        PrintStream log = taskListener.getLogger();
289

290 291 292
        Channel channel = new Channel(nodeName,threadPoolForRemoting, Channel.Mode.NEGOTIATE,
            in,out, launchLog);
        channel.addListener(new Channel.Listener() {
293
            @Override
294
            public void onClosed(Channel c, IOException cause) {
295
                SlaveComputer.this.channel = null;
296 297
                // Orderly shutdown will have null exception
                if (cause!=null) offlineCause = new ChannelTermination(cause);
298
                launcher.afterDisconnect(SlaveComputer.this, taskListener);
299 300
            }
        });
301 302
        if(listener!=null)
            channel.addListener(listener);
303

304 305 306
        String slaveVersion = channel.call(new SlaveVersion());
        log.println("Slave.jar version: " + slaveVersion);

307 308 309 310 311 312 313 314
        boolean _isUnix = channel.call(new DetectOS());
        log.println(_isUnix? hudson.model.Messages.Slave_UnixSlave():hudson.model.Messages.Slave_WindowsSlave());

        String defaultCharsetName = channel.call(new DetectDefaultCharset());

        String remoteFs = getNode().getRemoteFS();
        if(_isUnix && !remoteFs.contains("/") && remoteFs.contains("\\"))
            log.println("WARNING: "+remoteFs+" looks suspiciously like Windows path. Maybe you meant "+remoteFs.replace('\\','/')+"?");
315
        FilePath root = new FilePath(channel,getNode().getRemoteFS());
316

317
        channel.call(new SlaveInitializer());
318
        channel.call(new WindowsSlaveInstaller(remoteFs));
319 320
        for (ComputerListener cl : ComputerListener.all())
            cl.preOnline(this,channel,root,taskListener);
321

322 323
        offlineCause = null;

324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340
        // update the data structure atomically to prevent others from seeing a channel that's not properly initialized yet
        synchronized(channelLock) {
            if(this.channel!=null) {
                // check again. we used to have this entire method in a big sycnhronization block,
                // but Channel constructor blocks for an external process to do the connection
                // if CommandLauncher is used, and that cannot be interrupted because it blocks at InputStream.
                // so if the process hangs, it hangs the thread in a lock, and since Hudson will try to relaunch,
                // we'll end up queuing the lot of threads in a pseudo deadlock.
                // This implementation prevents that by avoiding a lock. HUDSON-1705 is likely a manifestation of this.
                channel.close();
                throw new IllegalStateException("Already connected");
            }
            isUnix = _isUnix;
            numRetryAttempt = 0;
            this.channel = channel;
            defaultCharset = Charset.forName(defaultCharsetName);
        }
341 342
        for (ComputerListener cl : ComputerListener.all())
            cl.onOnline(this,taskListener);
343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365
        Hudson.getInstance().getQueue().scheduleMaintenance();
    }

    /**
     * Returns the channel to the slave.
     *
     * @return
     *      The current channel, or null if none is set (for instance after the channel was closed).
     */
    @Override
    public VirtualChannel getChannel() {
        return channel;
    }

    /**
     * Returns the default charset of the slave, as detected on the slave when the
     * channel was established.
     *
     * @return
     *      The charset recorded by the last successful {@code setChannel} call,
     *      or null if no channel has been established yet.
     */
    public Charset getDefaultCharset() {
        return defaultCharset;
    }

    /**
     * Retrieves the log records buffered on the slave side.
     *
     * @return
     *      A copy of the records held by the slave's log handler,
     *      or an empty list if this computer has no channel.
     * @throws IOException
     *      If the remote call fails.
     * @throws InterruptedException
     *      If the remote call is interrupted.
     */
    public List<LogRecord> getLogRecords() throws IOException, InterruptedException {
        // Read the volatile field only once: the channel can be closed by another thread
        // between a null check and the call, which would otherwise result in an NPE.
        Channel c = channel;
        if (c == null)
            return Collections.emptyList();

        return c.call(new Callable<List<LogRecord>,RuntimeException>() {
            public List<LogRecord> call() {
                // executed on the slave; copy the view so that a plain list travels back
                return new ArrayList<LogRecord>(SLAVE_LOG_HANDLER.getView());
            }
        });
    }

366 367 368 369 370 371 372 373 374 375 376
    public HttpResponse doDoDisconnect(@QueryParameter String offlineMessage) throws IOException, ServletException {
        if (channel!=null) {
            //does nothing in case computer is already disconnected
            checkPermission(Hudson.ADMINISTER);
            offlineMessage = Util.fixEmptyAndTrim(offlineMessage);
            disconnect(OfflineCause.create(Messages._SlaveComputer_DisconnectedBy(
                    Hudson.getAuthentication().getName(),
                    offlineMessage!=null ? " : " + offlineMessage : "")
            ));
        }
        return new HttpRedirect(".");
377 378 379
    }

    @Override
380 381
    public Future<?> disconnect(OfflineCause cause) {
        super.disconnect(cause);
K
kohsuke 已提交
382
        return Computer.threadPoolForRemoting.submit(new Runnable() {
383 384 385
            public void run() {
                // do this on another thread so that any lengthy disconnect operation
                // (which could be typical) won't block UI thread.
386
                TaskListener listener = new StreamTaskListener(openLogFile());
387 388 389 390 391 392 393 394 395 396 397 398 399
                launcher.beforeDisconnect(SlaveComputer.this, listener);
                closeChannel();
                launcher.afterDisconnect(SlaveComputer.this, listener);
            }
        });
    }

    public void doLaunchSlaveAgent(StaplerRequest req, StaplerResponse rsp) throws IOException, ServletException {
        if(channel!=null) {
            rsp.sendError(HttpServletResponse.SC_NOT_FOUND);
            return;
        }

K
kohsuke 已提交
400
        connect(true);
401 402 403 404 405 406 407 408 409 410 411

        // TODO: would be nice to redirect the user to "launching..." wait page,
        // then spend a few seconds there and poll for the completion periodically.
        rsp.sendRedirect("log");
    }

    public void tryReconnect() {
        numRetryAttempt++;
        if(numRetryAttempt<6 || (numRetryAttempt%12)==0) {
            // initially retry several times quickly, and after that, do it infrequently.
            logger.info("Attempting to reconnect "+nodeName);
K
kohsuke 已提交
412
            connect(true);
413 414 415 416 417 418
        }
    }

    /**
     * Serves jar files for JNLP slave agents.
     *
     * @param fileName
     *      Name of the requested jar file.
     * @return
     *      A new {@link Slave.JnlpJar} for the given file name.
     * @deprecated since 2008-08-18.
     *      This URL binding is no longer used and has been moved directly under {@link Hudson},
     *      but it's left here for now just in case some old JNLP slave agents request it.
     */
    @Deprecated
    public Slave.JnlpJar getJnlpJars(String fileName) {
        return new Slave.JnlpJar(fileName);
    }

    /**
     * Runs the superclass kill logic and then closes the channel to the slave, if any.
     */
    @Override
    protected void kill() {
        super.kill();
        closeChannel();
    }

    public RetentionStrategy getRetentionStrategy() {
434 435
        Slave n = getNode();
        return n==null ? null : n.getRetentionStrategy();
436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452
    }

    /**
     * If still connected, disconnect.
     */
    private void closeChannel() {
        // TODO: race condition between this and the setChannel method.
        Channel c = channel;
        channel = null;
        isUnix = null;
        if (c != null) {
            try {
                c.close();
            } catch (IOException e) {
                logger.log(Level.SEVERE, "Failed to terminate channel to " + getDisplayName(), e);
            }
        }
453
        for (ComputerListener cl : ComputerListener.all())
454 455 456 457 458 459
            cl.onOffline(this);
    }

    @Override
    protected void setNode(Node node) {
        super.setNode(node);
K
kohsuke 已提交
460
        launcher = grabLauncher(node);
461 462

        // maybe the configuration was changed to relaunch the slave, so try to re-launch now.
K
kohsuke 已提交
463 464
        // "constructed==null" test is an ugly hack to avoid launching before the object is fully
        // constructed.
465 466 467 468 469 470
        if(constructed!=null) {
            if (node instanceof Slave)
                ((Slave)node).getRetentionStrategy().check(this);
            else
                connect(false);
        }
K
kohsuke 已提交
471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486
    }

    /**
     * Grabs a {@link ComputerLauncher} out of {@link Node} to keep it in this {@link Computer}.
     * The returned launcher will be set to {@link #launcher} and used to carry out the actual launch operation.
     *
     * <p>
     * Subtypes that needs to decorate {@link ComputerLauncher} can do so by overriding this method.
     * This is useful for {@link SlaveComputer}s for clouds for example, where one normally needs
     * additional pre-launch step (such as waiting for the provisioned node to become available)
     * before the user specified launch step (like SSH connection) kicks in.
     *
     * @see ComputerLauncherFilter
     */
    protected ComputerLauncher grabLauncher(Node node) {
        return ((Slave)node).getLauncher();
487 488 489 490
    }

    private static final Logger logger = Logger.getLogger(SlaveComputer.class.getName());

491 492 493 494 495 496
    /**
     * Remote task that reports the version of the slave.jar, as exposed by
     * {@link Launcher#VERSION}. If reading that constant fails for any reason,
     * the placeholder "&lt; 1.335" is returned instead.
     */
    private static final class SlaveVersion implements Callable<String,IOException> {
        public String call() throws IOException {
            // catching Throwable on purpose: a missing field surfaces as an Error, not an Exception
            try { return Launcher.VERSION; }
            catch (Throwable ex) { return "< 1.335"; } // Older slave.jar won't have VERSION
        }
    }
497 498 499 500 501 502 503 504 505 506 507 508 509
    /**
     * Remote task that tells whether the machine it runs on is Unix.
     * A ':' path separator is taken as the mark of a Unix machine.
     */
    private static final class DetectOS implements Callable<Boolean,IOException> {
        public Boolean call() throws IOException {
            final boolean colonSeparated = File.pathSeparatorChar == ':';
            return Boolean.valueOf(colonSeparated);
        }
    }

    /**
     * Remote task that reports the name of the default charset of the JVM it runs in.
     */
    private static final class DetectDefaultCharset implements Callable<String,IOException> {
        public String call() throws IOException {
            Charset platformCharset = Charset.defaultCharset();
            return platformCharset.name();
        }
    }

    /**
     * Puts the {@link #SLAVE_LOG_HANDLER} into a separate class so that loading this class
     * in JVM doesn't end up loading tons of additional classes.
     */
513 514 515 516 517 518
    static final class LogHolder {
        /**
         * This field is used on each slave node to record log records on the slave.
         *
         * <p>
         * The handler is attached to the "hudson" logger by {@code SlaveInitializer},
         * and its contents are what {@code getLogRecords()} retrieves.
         */
        static final RingBufferLogHandler SLAVE_LOG_HANDLER = new RingBufferLogHandler();
    }
519

520
    private static class SlaveInitializer implements Callable<Void,RuntimeException> {
521 522 523 524 525
        public Void call() {
            // avoid double installation of the handler
            Logger logger = Logger.getLogger("hudson");
            logger.removeHandler(SLAVE_LOG_HANDLER);
            logger.addHandler(SLAVE_LOG_HANDLER);
526 527 528 529 530 531 532 533

            // remove Sun PKCS11 provider if present. See http://hudson.gotdns.com/wiki/display/HUDSON/Solaris+Issue+6276483
            try {
                Security.removeProvider("SunPKCS11-Solaris");
            } catch (SecurityException e) {
                // ignore this error.
            }

534 535 536 537 538
            return null;
        }
        private static final long serialVersionUID = 1L;
    }
}