Commit 16fb4e91 authored by Maximilian Michels, committed by Fabian Hueske

[FLINK-2645] [jobmanager] Fail job execution if final accumulators cannot be merged and forward exceptions.

This closes #1112
Parent 9c2791b0
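Before this change, a job could reach FINISHED even when its final accumulators could not be deserialized or merged; the failure was only logged on the JobManager. A minimal, self-contained sketch of what the fix means for a caller, assuming the 0.10-era DataSet API (the class and accumulator below are illustrative, not part of this commit):

import org.apache.flink.api.common.accumulators.LongCounter;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.io.DiscardingOutputFormat;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.client.JobExecutionException;

// Illustrative only: a job whose accumulator cannot be cloned for the
// final merge, showing the failure now reaching the caller.
public class FailingAccumulatorExample {

	public static void main(String[] args) throws Exception {
		ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

		env.generateSequence(0, 1000)
			.map(new RichMapFunction<Long, Long>() {
				@Override
				public void open(Configuration parameters) {
					// clone() of this accumulator throws, so merging the final
					// results on the JobManager fails.
					getRuntimeContext().addAccumulator("broken", new LongCounter() {
						@Override
						public LongCounter clone() {
							throw new RuntimeException("cannot clone");
						}
					});
				}

				@Override
				public Long map(Long value) {
					return value;
				}
			})
			.output(new DiscardingOutputFormat<Long>());

		try {
			env.execute();
		} catch (JobExecutionException e) {
			// Previously the job could still FINISH and the problem was only
			// logged; with this commit the exception is forwarded here.
			System.err.println("Job failed as expected: " + e.getMessage());
		}
	}
}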
@@ -827,7 +827,7 @@ public class ExecutionGraph implements Serializable {
 		}
 	}
 
-	void jobVertexInFinalState(ExecutionJobVertex ev) {
+	void jobVertexInFinalState() {
 		synchronized (progressLock) {
 			if (numFinishedJobVertices >= verticesInCreationOrder.size()) {
 				throw new IllegalStateException("All vertices are already finished, cannot transition vertex to finished.");
@@ -927,19 +927,18 @@ public class ExecutionGraph implements Serializable {
 			case RUNNING:
 				return attempt.switchToRunning();
 			case FINISHED:
-				Map<AccumulatorRegistry.Metric, Accumulator<?, ?>> flinkAccumulators = null;
-				Map<String, Accumulator<?, ?>> userAccumulators = null;
-
 				try {
 					AccumulatorSnapshot accumulators = state.getAccumulators();
-					flinkAccumulators = accumulators.deserializeFlinkAccumulators();
-					userAccumulators = accumulators.deserializeUserAccumulators(userClassLoader);
+					Map<AccumulatorRegistry.Metric, Accumulator<?, ?>> flinkAccumulators =
+							accumulators.deserializeFlinkAccumulators();
+					Map<String, Accumulator<?, ?>> userAccumulators =
+							accumulators.deserializeUserAccumulators(userClassLoader);
+					attempt.markFinished(flinkAccumulators, userAccumulators);
 				}
 				catch (Exception e) {
-					// we do not fail the job on deserialization problems of accumulators, but only log
 					LOG.error("Failed to deserialize final accumulator results.", e);
+					attempt.markFailed(e);
 				}
-				attempt.markFinished(flinkAccumulators, userAccumulators);
 				return true;
 			case CANCELED:
 				attempt.cancelingComplete();
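The core of this hunk: attempt.markFinished(...) moves inside the try block, and the catch branch now calls attempt.markFailed(e) instead of merely logging. A self-contained distillation of that control-flow change (plain Java; none of these names are Flink API):

public class FailFastSketch {

	// Stand-in for accumulator deserialization; may throw on bad input.
	static int deserialize(String payload) throws Exception {
		return Integer.parseInt(payload);
	}

	// Before: the failure is swallowed and the caller "finishes" with null.
	static Integer before(String payload) {
		Integer result = null;
		try {
			result = deserialize(payload);
		} catch (Exception e) {
			System.err.println("only logged: " + e);
		}
		return result; // finishes even on failure, with a null result
	}

	// After: success and failure paths are disjoint; the error propagates,
	// analogous to attempt.markFailed(e).
	static Integer after(String payload) throws Exception {
		try {
			return deserialize(payload); // only reached on success
		} catch (Exception e) {
			System.err.println("logged: " + e);
			throw e;
		}
	}

	public static void main(String[] args) throws Exception {
		System.out.println(before("not-a-number")); // prints "null"
		System.out.println(after("not-a-number"));  // now fails loudly
	}
}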
@@ -482,7 +482,7 @@ public class ExecutionJobVertex implements Serializable {
 				stateMonitor.notifyAll();
 
 				// tell the graph
-				graph.jobVertexInFinalState(this);
+				graph.jobVertexInFinalState();
 			} else {
 				numSubtasksInFinalState++;
 			}
@@ -384,10 +384,14 @@ class JobManager(
           newJobStatus match {
             case JobStatus.FINISHED =>
               val accumulatorResults: java.util.Map[String, SerializedValue[AnyRef]] = try {
-                executionGraph.getAccumulatorsSerialized()
+                executionGraph.getAccumulatorsSerialized()
               } catch {
                 case e: Exception =>
-                  log.error(s"Cannot fetch serialized accumulators for job $jobID", e)
+                  log.error(s"Cannot fetch final accumulators for job $jobID", e)
+                  val exception = new JobExecutionException(jobID,
+                    "Failed to retrieve accumulator results.", e)
+                  jobInfo.client ! decorateMessage(JobResultFailure(
+                    new SerializedThrowable(exception)))
                   Collections.emptyMap()
               }
               val result = new SerializedJobExecutionResult(
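With this hunk, the JobManager no longer just logs the problem: it answers the client with a JobResultFailure carrying a SerializedThrowable, so the failure chain reaches the submitting program. A self-contained sketch of the chain the client can unwrap (the exception classes here are local stand-ins, NOT the Flink classes; they mirror the assertions in the new test below):

public class ExceptionChainSketch {

	static class CustomException extends RuntimeException {}

	static class JobExecutionException extends Exception {
		JobExecutionException(String message, Throwable cause) { super(message, cause); }
	}

	static class ProgramInvocationException extends Exception {
		ProgramInvocationException(Throwable cause) { super(cause); }
	}

	public static void main(String[] args) {
		try {
			// The JobManager wraps the accumulator failure and forwards it to
			// the client, which surfaces it from env.execute().
			throw new ProgramInvocationException(
					new JobExecutionException("Failed to retrieve accumulator results.",
							new CustomException()));
		} catch (ProgramInvocationException e) {
			// The chain the new test asserts: invocation -> job -> root cause.
			System.out.println(e.getCause() instanceof JobExecutionException);        // true
			System.out.println(e.getCause().getCause() instanceof CustomException);   // true
		}
	}
}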
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.flink.test.accumulators;

import org.apache.flink.api.common.accumulators.DoubleCounter;
import org.apache.flink.api.common.accumulators.LongCounter;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.io.DiscardingOutputFormat;
import org.apache.flink.client.program.ProgramInvocationException;
import org.apache.flink.configuration.ConfigConstants;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.client.JobExecutionException;
import org.apache.flink.test.util.ForkableFlinkMiniCluster;

import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;

import static org.junit.Assert.fail;
/**
 * Test cases for accumulators that:
 * a) throw an error at runtime, or
 * b) are not compatible with an existing accumulator of the same name.
 */
public class AccumulatorErrorITCase {

	private static ForkableFlinkMiniCluster cluster;

	@BeforeClass
	public static void startCluster() {
		try {
			Configuration config = new Configuration();
			config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, 2);
			config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, 3);
			config.setInteger(ConfigConstants.TASK_MANAGER_MEMORY_SIZE_KEY, 12);
			cluster = new ForkableFlinkMiniCluster(config, false);
			cluster.start();
		}
		catch (Exception e) {
			e.printStackTrace();
			fail("Failed to start test cluster: " + e.getMessage());
		}
	}

	@AfterClass
	public static void shutdownCluster() {
		try {
			cluster.shutdown();
			cluster = null;
		}
		catch (Exception e) {
			e.printStackTrace();
			fail("Failed to stop test cluster: " + e.getMessage());
		}
	}
	@Test
	public void testFaultyAccumulator() throws Exception {
		ExecutionEnvironment env = ExecutionEnvironment.createRemoteEnvironment("localhost", cluster.getLeaderRPCPort());
		env.getConfig().disableSysoutLogging();

		// Test exception forwarding with a faulty accumulator implementation
		DataSet<Long> input = env.generateSequence(0, 10000);
		DataSet<Long> map = input.map(new FaultyAccumulatorUsingMapper());
		map.output(new DiscardingOutputFormat<Long>());

		try {
			env.execute();
			fail("Should have failed.");
		} catch (ProgramInvocationException e) {
			Assert.assertTrue("Exception should be passed:",
					e.getCause() instanceof JobExecutionException);
			Assert.assertTrue("Root cause should be:",
					e.getCause().getCause() instanceof CustomException);
		}
	}
	@Test
	public void testInvalidTypeAccumulator() throws Exception {
		ExecutionEnvironment env = ExecutionEnvironment.createRemoteEnvironment("localhost", cluster.getLeaderRPCPort());
		env.getConfig().disableSysoutLogging();

		// Test exception forwarding with incompatible accumulator types registered under the same name
		DataSet<Long> input = env.generateSequence(0, 10000);
		DataSet<Long> mappers = input.map(new IncompatibleAccumulatorTypesMapper())
				.map(new IncompatibleAccumulatorTypesMapper2());
		mappers.output(new DiscardingOutputFormat<Long>());

		try {
			env.execute();
			fail("Should have failed.");
		} catch (ProgramInvocationException e) {
			Assert.assertTrue("Exception should be passed:",
					e.getCause() instanceof JobExecutionException);
			Assert.assertTrue("Cause should be an Exception:",
					e.getCause().getCause() instanceof Exception);
			Assert.assertTrue("Root cause should be an UnsupportedOperationException:",
					e.getCause().getCause().getCause() instanceof UnsupportedOperationException);
		}
	}
	/* testFaultyAccumulator */

	private static class FaultyAccumulatorUsingMapper extends RichMapFunction<Long, Long> {
		private static final long serialVersionUID = 42;

		@Override
		public void open(Configuration parameters) throws Exception {
			getRuntimeContext().addAccumulator("test", new FaultyAccumulator());
		}

		@Override
		public Long map(Long value) throws Exception {
			return -1L;
		}
	}

	private static class FaultyAccumulator extends LongCounter {
		private static final long serialVersionUID = 42;

		@Override
		public LongCounter clone() {
			throw new CustomException();
		}
	}

	private static class CustomException extends RuntimeException {
		private static final long serialVersionUID = 42;
	}
	/* testInvalidTypeAccumulator */

	private static class IncompatibleAccumulatorTypesMapper extends RichMapFunction<Long, Long> {
		private static final long serialVersionUID = 42;

		@Override
		public void open(Configuration parameters) throws Exception {
			getRuntimeContext().addAccumulator("test", new LongCounter());
		}

		@Override
		public Long map(Long value) throws Exception {
			return -1L;
		}
	}

	private static class IncompatibleAccumulatorTypesMapper2 extends RichMapFunction<Long, Long> {
		private static final long serialVersionUID = 42;

		@Override
		public void open(Configuration parameters) throws Exception {
			getRuntimeContext().addAccumulator("test", new DoubleCounter());
		}

		@Override
		public Long map(Long value) throws Exception {
			return -1L;
		}
	}
}
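For contrast with FaultyAccumulator above: a custom accumulator participates in the final merge via clone(), so a correct implementation copies its state instead of throwing. A minimal sketch, assuming LongCounter's add/getLocalValue as used in the test (the class name is hypothetical):

import org.apache.flink.api.common.accumulators.LongCounter;

// A well-behaved counterpart to FaultyAccumulator: clone() returns a
// state-copying instance, so merging final results on the JobManager works.
public class WellBehavedCounter extends LongCounter {
	private static final long serialVersionUID = 1L;

	@Override
	public LongCounter clone() {
		WellBehavedCounter copy = new WellBehavedCounter();
		copy.add(getLocalValue()); // copy the local count into the clone
		return copy;
	}
}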