提交 a277543c 编写于 作者: S Stephan Ewen

[FLINK-3363] [jobmanager] Properly shut down executor thread pool when JobManager shuts down

上级 af3e6890
......@@ -21,6 +21,7 @@ package org.apache.flink.runtime.jobmanager
import java.io.{File, IOException}
import java.net.{BindException, ServerSocket, UnknownHostException, InetAddress, InetSocketAddress}
import java.util.UUID
import java.util.concurrent.ExecutorService
import akka.actor.Status.Failure
import akka.actor._
......@@ -90,7 +91,7 @@ import scala.language.postfixOps
* is indicated by [[CancellationSuccess]] and a failure by [[CancellationFailure]]
*
* - [[UpdateTaskExecutionState]] is sent by a TaskManager to update the state of an
ExecutionVertex contained in the [[ExecutionGraph]].
* ExecutionVertex contained in the [[ExecutionGraph]].
* A successful update is acknowledged by true and otherwise false.
*
* - [[RequestNextInputSplit]] requests the next input split for a running task on a
......@@ -102,7 +103,7 @@ import scala.language.postfixOps
*/
class JobManager(
protected val flinkConfiguration: Configuration,
protected val executionContext: ExecutionContext,
protected val executorService: ExecutorService,
protected val instanceManager: InstanceManager,
protected val scheduler: FlinkScheduler,
protected val libraryCacheManager: BlobLibraryCacheManager,
......@@ -121,6 +122,15 @@ class JobManager(
override val log = Logger(getClass)
/** The extra execution context, for futures, with a custom logging reporter */
protected val executionContext: ExecutionContext = ExecutionContext.fromExecutor(
executorService,
(t: Throwable) => {
if (!context.system.isTerminated) {
log.error("Executor could not execute task", t)
}
})
/** Either running or not yet archived jobs (session hasn't been ended). */
protected val currentJobs = scala.collection.mutable.HashMap[JobID, (ExecutionGraph, JobInfo)]()
......@@ -246,6 +256,9 @@ class JobManager(
case e: IOException => log.error("Could not properly shutdown the library cache manager.", e)
}
// shut down the extra thread pool for futures
executorService.shutdown()
log.debug(s"Job manager ${self.path} is completely stopped.")
}
......@@ -1503,7 +1516,8 @@ class JobManager(
/**
* Updates the accumulators reported from a task manager via the Heartbeat message.
* @param accumulators list of accumulator snapshots
*
* @param accumulators list of accumulator snapshots
*/
private def updateAccumulators(accumulators : Seq[AccumulatorSnapshot]) = {
accumulators foreach {
......@@ -2016,7 +2030,7 @@ object JobManager {
def createJobManagerComponents(
configuration: Configuration,
leaderElectionServiceOption: Option[LeaderElectionService]) :
(ExecutionContext,
(ExecutorService,
InstanceManager,
FlinkScheduler,
BlobLibraryCacheManager,
......@@ -2064,17 +2078,19 @@ object JobManager {
}
}
val executionContext = ExecutionContext.fromExecutor(new ForkJoinPool())
var blobServer: BlobServer = null
var instanceManager: InstanceManager = null
var scheduler: FlinkScheduler = null
var libraryCacheManager: BlobLibraryCacheManager = null
val executorService: ExecutorService = new ForkJoinPool()
try {
blobServer = new BlobServer(configuration)
instanceManager = new InstanceManager()
scheduler = new FlinkScheduler(executionContext)
scheduler = new FlinkScheduler(ExecutionContext.fromExecutor(executorService))
libraryCacheManager = new BlobLibraryCacheManager(blobServer, cleanupInterval)
instanceManager.addInstanceListener(scheduler)
......@@ -2093,6 +2109,8 @@ object JobManager {
if (blobServer != null) {
blobServer.shutdown()
}
executorService.shutdownNow()
throw t
}
......@@ -2122,7 +2140,7 @@ object JobManager {
new ZooKeeperCheckpointRecoveryFactory(client, configuration))
}
(executionContext,
(executorService,
instanceManager,
scheduler,
libraryCacheManager,
......@@ -2143,8 +2161,7 @@ object JobManager {
* @param actorSystem The actor system running the JobManager
* @param jobManagerClass The class of the JobManager to be started
* @param archiveClass The class of the MemoryArchivist to be started
*
* @return A tuple of references (JobManager Ref, Archiver Ref)
* @return A tuple of references (JobManager Ref, Archiver Ref)
*/
def startJobManagerActors(
configuration: Configuration,
......@@ -2174,8 +2191,7 @@ object JobManager {
* the actor will have the name generated by the actor system.
* @param jobManagerClass The class of the JobManager to be started
* @param archiveClass The class of the MemoryArchivist to be started
*
* @return A tuple of references (JobManager Ref, Archiver Ref)
* @return A tuple of references (JobManager Ref, Archiver Ref)
*/
def startJobManagerActors(
configuration: Configuration,
......@@ -2186,7 +2202,7 @@ object JobManager {
archiveClass: Class[_ <: MemoryArchivist])
: (ActorRef, ActorRef) = {
val (executionContext,
val (executorService: ExecutorService,
instanceManager,
scheduler,
libraryCacheManager,
......@@ -2211,7 +2227,7 @@ object JobManager {
val jobManagerProps = Props(
jobManagerClass,
configuration,
executionContext,
executorService,
instanceManager,
scheduler,
libraryCacheManager,
......
......@@ -44,6 +44,7 @@ import org.slf4j.LoggerFactory
import scala.concurrent.duration.FiniteDuration
import scala.concurrent._
import scala.concurrent.forkjoin.ForkJoinPool
/**
* Abstract base class for Flink's mini cluster. The mini cluster starts a
......@@ -82,7 +83,7 @@ abstract class FlinkMiniCluster(
/** Future lock */
val futureLock = new Object()
implicit val executionContext = ExecutionContext.global
implicit val timeout = AkkaUtils.getTimeout(userConfiguration)
......@@ -320,8 +321,6 @@ abstract class FlinkMiniCluster(
_.map(gracefulStop(_, timeout))
} getOrElse(Seq())
implicit val executionContext = ExecutionContext.global
Await.ready(Future.sequence(jmFutures ++ tmFutures), timeout)
if (!useSingleActorSystem) {
......
......@@ -52,7 +52,9 @@ import org.junit.rules.TemporaryFolder;
import scala.concurrent.Await;
import scala.concurrent.Future;
import scala.concurrent.duration.FiniteDuration;
import scala.concurrent.forkjoin.ForkJoinPool;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
public class JobManagerLeaderElectionTest extends TestLogger {
......@@ -62,14 +64,16 @@ public class JobManagerLeaderElectionTest extends TestLogger {
private static ActorSystem actorSystem;
private static TestingServer testingServer;
private static ExecutorService executor;
private static Timeout timeout = new Timeout(TestingUtils.TESTING_DURATION());
private static FiniteDuration duration = new FiniteDuration(5, TimeUnit.MINUTES);
@BeforeClass
public static void setup() throws Exception {
actorSystem = ActorSystem.create("TestingActorSystem");
testingServer = new TestingServer();
executor = new ForkJoinPool();
}
@AfterClass
......@@ -78,9 +82,13 @@ public class JobManagerLeaderElectionTest extends TestLogger {
JavaTestKit.shutdownActorSystem(actorSystem);
}
if(testingServer != null) {
if (testingServer != null) {
testingServer.stop();
}
if (executor != null) {
executor.shutdownNow();
}
}
/**
......@@ -175,10 +183,10 @@ public class JobManagerLeaderElectionTest extends TestLogger {
return Props.create(
TestingJobManager.class,
configuration,
TestingUtils.defaultExecutionContext(),
executor,
new InstanceManager(),
new Scheduler(TestingUtils.defaultExecutionContext()),
new BlobLibraryCacheManager(new BlobServer(configuration), 10l),
new BlobLibraryCacheManager(new BlobServer(configuration), 10L),
ActorRef.noSender(),
1,
1L,
......
......@@ -19,6 +19,7 @@
package org.apache.flink.runtime.testingUtils
import akka.actor.ActorRef
import org.apache.flink.configuration.Configuration
import org.apache.flink.runtime.checkpoint.CheckpointRecoveryFactory
import org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager
......@@ -27,25 +28,17 @@ import org.apache.flink.runtime.jobmanager.scheduler.Scheduler
import org.apache.flink.runtime.jobmanager.{JobManager, SubmittedJobGraphStore}
import org.apache.flink.runtime.leaderelection.LeaderElectionService
import scala.concurrent.ExecutionContext
import scala.concurrent.duration._
import scala.language.postfixOps
import java.util.concurrent.ExecutorService
/** JobManager implementation extended by testing messages
*
* @param flinkConfiguration
* @param executionContext
* @param instanceManager
* @param scheduler
* @param libraryCacheManager
* @param archive
* @param defaultExecutionRetries
* @param delayBetweenRetries
* @param timeout
*/
class TestingJobManager(
flinkConfiguration: Configuration,
executionContext: ExecutionContext,
executorService: ExecutorService,
instanceManager: InstanceManager,
scheduler: Scheduler,
libraryCacheManager: BlobLibraryCacheManager,
......@@ -58,7 +51,7 @@ class TestingJobManager(
checkpointRecoveryFactory : CheckpointRecoveryFactory)
extends JobManager(
flinkConfiguration,
executionContext,
executorService,
instanceManager,
scheduler,
libraryCacheManager,
......
......@@ -20,6 +20,7 @@ package org.apache.flink.test.runtime.minicluster;
import akka.actor.ActorSystem;
import akka.testkit.JavaTestKit;
import org.apache.flink.configuration.ConfigConstants;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.akka.AkkaUtils;
......@@ -28,33 +29,42 @@ import org.apache.flink.runtime.instance.ActorGateway;
import org.apache.flink.runtime.messages.JobManagerMessages;
import org.apache.flink.runtime.minicluster.LocalFlinkMiniCluster;
import org.apache.flink.runtime.testingUtils.TestingUtils;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import scala.concurrent.ExecutionContext$;
import scala.concurrent.forkjoin.ForkJoinPool;
import scala.concurrent.impl.ExecutionContextImpl;
public class LocalFlinkMiniClusterITCase {
import java.lang.reflect.Field;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
static ActorSystem system;
import static org.junit.Assert.fail;
@BeforeClass
public static void setup() {
system = ActorSystem.create("Testkit", AkkaUtils.getDefaultAkkaConfig());
}
public class LocalFlinkMiniClusterITCase {
@AfterClass
public static void teardown() {
JavaTestKit.shutdownActorSystem(system);
system = null;
}
private static String[] ALLOWED_THREAD_PREFIXES = { };
@Test
public void testLocalFlinkMiniClusterWithMultipleTaskManagers() {
final ActorSystem system = ActorSystem.create("Testkit", AkkaUtils.getDefaultAkkaConfig());
LocalFlinkMiniCluster miniCluster = null;
final int numTMs = 3;
final int numSlots = 14;
try{
// gather the threads that already exist
final Set<Thread> threadsBefore = new HashSet<>();
{
final Thread[] allThreads = new Thread[Thread.activeCount()];
Thread.enumerate(allThreads);
threadsBefore.addAll(Arrays.asList(allThreads));
}
try {
Configuration config = new Configuration();
config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, numTMs);
config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, numSlots);
......@@ -90,7 +100,70 @@ public class LocalFlinkMiniClusterITCase {
} finally {
if (miniCluster != null) {
miniCluster.stop();
miniCluster.awaitTermination();
}
JavaTestKit.shutdownActorSystem(system);
system.awaitTermination();
}
// shut down the global execution context, to make sure it does not affect this testing
try {
Field f = ExecutionContextImpl.class.getDeclaredField("executor");
f.setAccessible(true);
Object exec = ExecutionContext$.MODULE$.global();
ForkJoinPool executor = (ForkJoinPool) f.get(exec);
executor.shutdownNow();
}
catch (Exception e) {
System.err.println("Cannot test proper thread shutdown for local execution.");
return;
}
// check for remaining threads
// we need to check repeatedly for a while, because some threads shut down slowly
long deadline = System.currentTimeMillis() + 30000;
boolean foundThreads = true;
String threadName = "";
while (System.currentTimeMillis() < deadline) {
// check that no additional threads remain
final Thread[] threadsAfter = new Thread[Thread.activeCount()];
Thread.enumerate(threadsAfter);
foundThreads = false;
for (Thread t : threadsAfter) {
if (t.isAlive() && !threadsBefore.contains(t)) {
// this thread was not there before. check if it is allowed
boolean allowed = false;
for (String prefix : ALLOWED_THREAD_PREFIXES) {
if (t.getName().startsWith(prefix)) {
allowed = true;
break;
}
}
if (!allowed) {
foundThreads = true;
threadName = t.toString();
break;
}
}
}
if (foundThreads) {
try {
Thread.sleep(500);
} catch (InterruptedException ignored) {}
} else {
break;
}
}
if (foundThreads) {
fail("Thread " + threadName + " was started by the mini cluster, but not shut down");
}
}
}
......@@ -18,6 +18,8 @@
package org.apache.flink.yarn
import java.util.concurrent.ExecutorService
import akka.actor.ActorRef
import org.apache.flink.configuration.Configuration
import org.apache.flink.runtime.checkpoint.CheckpointRecoveryFactory
......@@ -28,7 +30,6 @@ import org.apache.flink.runtime.jobmanager.scheduler.Scheduler
import org.apache.flink.runtime.leaderelection.LeaderElectionService
import org.apache.flink.runtime.testingUtils.TestingJobManagerLike
import scala.concurrent.ExecutionContext
import scala.concurrent.duration.FiniteDuration
/** [[YarnJobManager]] implementation which mixes in the [[TestingJobManagerLike]] mixin.
......@@ -37,7 +38,7 @@ import scala.concurrent.duration.FiniteDuration
* instead of an anonymous class with the respective mixin to obtain a more readable logger name.
*
* @param flinkConfiguration Configuration object for the actor
* @param executionContext Execution context which is used to execute concurrent tasks in the
* @param executorService Execution context which is used to execute concurrent tasks in the
* [[org.apache.flink.runtime.executiongraph.ExecutionGraph]]
* @param instanceManager Instance manager to manage the registered
* [[org.apache.flink.runtime.taskmanager.TaskManager]]
......@@ -51,7 +52,7 @@ import scala.concurrent.duration.FiniteDuration
*/
class TestingYarnJobManager(
flinkConfiguration: Configuration,
executionContext: ExecutionContext,
executorService: ExecutorService,
instanceManager: InstanceManager,
scheduler: Scheduler,
libraryCacheManager: BlobLibraryCacheManager,
......@@ -64,7 +65,7 @@ class TestingYarnJobManager(
checkpointRecoveryFactory : CheckpointRecoveryFactory)
extends YarnJobManager(
flinkConfiguration,
executionContext,
executorService,
instanceManager,
scheduler,
libraryCacheManager,
......
......@@ -22,10 +22,13 @@ import java.io.File
import java.lang.reflect.Method
import java.nio.ByteBuffer
import java.util.Collections
import java.util.concurrent.ExecutorService
import java.util.{List => JavaList}
import akka.actor.ActorRef
import grizzled.slf4j.Logger
import org.apache.flink.api.common.JobID
import org.apache.flink.configuration.{Configuration => FlinkConfiguration, ConfigConstants}
import org.apache.flink.runtime.akka.AkkaUtils
......@@ -40,6 +43,7 @@ import org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager
import org.apache.flink.runtime.instance.InstanceManager
import org.apache.flink.runtime.jobmanager.scheduler.{Scheduler => FlinkScheduler}
import org.apache.flink.yarn.YarnMessages._
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.DataOutputBuffer
......@@ -55,7 +59,6 @@ import org.apache.hadoop.yarn.conf.YarnConfiguration
import org.apache.hadoop.yarn.exceptions.YarnException
import org.apache.hadoop.yarn.util.Records
import scala.concurrent.ExecutionContext
import scala.concurrent.duration._
import scala.language.postfixOps
import scala.util.Try
......@@ -64,7 +67,7 @@ import scala.util.Try
* to start/administer/stop the Yarn session.
*
* @param flinkConfiguration Configuration object for the actor
* @param executionContext Execution context which is used to execute concurrent tasks in the
* @param executorService Execution context which is used to execute concurrent tasks in the
* [[org.apache.flink.runtime.executiongraph.ExecutionGraph]]
* @param instanceManager Instance manager to manage the registered
* [[org.apache.flink.runtime.taskmanager.TaskManager]]
......@@ -78,7 +81,7 @@ import scala.util.Try
*/
class YarnJobManager(
flinkConfiguration: FlinkConfiguration,
executionContext: ExecutionContext,
executorService: ExecutorService,
instanceManager: InstanceManager,
scheduler: FlinkScheduler,
libraryCacheManager: BlobLibraryCacheManager,
......@@ -91,7 +94,7 @@ class YarnJobManager(
checkpointRecoveryFactory : CheckpointRecoveryFactory)
extends JobManager(
flinkConfiguration,
executionContext,
executorService,
instanceManager,
scheduler,
libraryCacheManager,
......@@ -587,7 +590,8 @@ class YarnJobManager(
/**
* Calculate the correct JVM heap memory limit.
* @param memoryLimit The maximum memory in megabytes.
*
* @param memoryLimit The maximum memory in megabytes.
* @return A Tuple2 containing the heap and the offHeap limit in megabytes.
*/
private def calculateMemoryLimits(memoryLimit: Long): Long = {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册