提交 58cb75fd 编写于 作者: K Kohsuke Kawaguchi

Improved the robustness of node monitoring

- Improved logging and recovery for a Record thread that dies partway through a run.
- Recover from failing to monitor one node and move on to the next.
上级 5c95be34
......@@ -110,11 +110,11 @@ public abstract class AbstractNodeMonitorDescriptor<T> extends Descriptor<NodeMo
* If no data is available, a background task to collect data will be started.
*/
public T get(Computer c) {
if(record==null) {
// if this is the first time, schedule the check now
if(inProgress==null) {
if(record==null || !record.data.containsKey(c)) {
// if we don't have the data, schedule the check now
if(!isInProgress()) {
synchronized(this) {
if(inProgress==null)
if(!isInProgress())
new Record().start();
}
}
......@@ -123,6 +123,11 @@ public abstract class AbstractNodeMonitorDescriptor<T> extends Descriptor<NodeMo
return record.data.get(c);
}
/**
 * Reports whether a data-collection thread is currently registered in
 * {@code inProgress} and is still alive.
 *
 * @return {@code true} only if {@code inProgress} is non-null and that thread
 *         reports {@code isAlive()}; {@code false} otherwise.
 */
private boolean isInProgress() {
    // Read the field exactly once so the null check and the isAlive() call
    // are guaranteed to operate on the same reference.
    final Record current = inProgress;
    if (current == null) {
        return false;
    }
    return current.isAlive();
}
/**
* Is this monitor currently ignored?
*/
......@@ -204,34 +209,40 @@ public abstract class AbstractNodeMonitorDescriptor<T> extends Descriptor<NodeMo
@Override
public void run() {
long startTime = System.currentTimeMillis();
String oldName = getName();
for( Computer c : Jenkins.getInstance().getComputers() ) {
try {
setName("Monitoring "+c.getDisplayName()+" for "+getDisplayName());
if(c.getChannel()==null)
data.put(c,null);
else
data.put(c,monitor(c));
} catch (RuntimeException e) {
LOGGER.log(Level.WARNING, "Failed to monitor "+c.getDisplayName()+" for "+getDisplayName(), e);
} catch (IOException e) {
LOGGER.log(Level.WARNING, "Failed to monitor "+c.getDisplayName()+" for "+getDisplayName(), e);
} catch (InterruptedException e) {
LOGGER.log(Level.WARNING,"Node monitoring "+c.getDisplayName()+" for "+getDisplayName()+" aborted.",e);
try {
long startTime = System.currentTimeMillis();
String oldName = getName();
for( Computer c : Jenkins.getInstance().getComputers() ) {
try {
setName("Monitoring "+c.getDisplayName()+" for "+getDisplayName());
if(c.getChannel()==null)
data.put(c,null);
else
data.put(c,monitor(c));
} catch (RuntimeException e) {
LOGGER.log(Level.WARNING, "Failed to monitor "+c.getDisplayName()+" for "+getDisplayName(), e);
} catch (IOException e) {
LOGGER.log(Level.WARNING, "Failed to monitor "+c.getDisplayName()+" for "+getDisplayName(), e);
} catch (InterruptedException e) {
LOGGER.log(Level.WARNING,"Node monitoring "+c.getDisplayName()+" for "+getDisplayName()+" aborted.",e);
return; // we are told to die
}
}
}
setName(oldName);
setName(oldName);
synchronized(AbstractNodeMonitorDescriptor.this) {
assert inProgress==this;
inProgress = null;
record = this;
}
LOGGER.fine("Node monitoring "+getDisplayName()+" completed in "+(System.currentTimeMillis()-startTime)+"ms");
LOGGER.fine("Node monitoring "+getDisplayName()+" completed in "+(System.currentTimeMillis()-startTime)+"ms");
} catch (Throwable t) {
LOGGER.log(Level.WARNING, "Unexpected node monitoring termination: "+getDisplayName(),t);
} finally {
synchronized(AbstractNodeMonitorDescriptor.this) {
if (inProgress==this)
inProgress = null;
}
}
}
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册