From d67436ffad7afe4985d843bc63cab260667ed7d7 Mon Sep 17 00:00:00 2001 From: bao liang <29528966+lenboo@users.noreply.github.com> Date: Thu, 28 May 2020 12:31:12 +0800 Subject: [PATCH] [bug fix] fix: The workflow is fault-tolerant and 2 task instances are generated (#2833) * feature: add number configuration for master dispatch tasks * fix bug(#2762) the master would be blocked when worker group not exists * fix bug(#2762) the master would be blocked when worker group not exists * fix ut * fix ut * fix bug(2781): cannot pause work flow when task state is "submit success" * fix code smell * add mysql other param blank judge * test * update comments * update comments * add ut * fix bug: Restart the worker service again, the previously submitted successful tasks are not executed * update comments * add sleep * add null point check * fix bug:After the master is fault-tolerant, it cannot resume operation * fix bug: do not failover the host is 'NULL' process * fix bug:worker failover error. Co-authored-by: baoliang --- .../server/worker/processor/TaskCallbackService.java | 10 +++++++--- .../dolphinscheduler/server/zk/ZKMasterClient.java | 2 +- .../dolphinscheduler/service/zk/AbstractZKClient.java | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/dolphinscheduler-server/src/main/java/org/apache/dolphinscheduler/server/worker/processor/TaskCallbackService.java b/dolphinscheduler-server/src/main/java/org/apache/dolphinscheduler/server/worker/processor/TaskCallbackService.java index 1e8bf9d0e..173140792 100644 --- a/dolphinscheduler-server/src/main/java/org/apache/dolphinscheduler/server/worker/processor/TaskCallbackService.java +++ b/dolphinscheduler-server/src/main/java/org/apache/dolphinscheduler/server/worker/processor/TaskCallbackService.java @@ -95,14 +95,18 @@ public class TaskCallbackService { if(newChannel != null){ return getRemoteChannel(newChannel, nettyRemoteChannel.getOpaque(), taskInstanceId); } - logger.warn("original master : {} is not reachable, random select master", nettyRemoteChannel.getHost()); + logger.warn("original master : {} for task : {} is not reachable, random select master", + nettyRemoteChannel.getHost(), + taskInstanceId); Set masterNodes = null; while (Stopper.isRunning()) { masterNodes = zookeeperRegistryCenter.getMasterNodesDirectly(); if (CollectionUtils.isEmpty(masterNodes)) { - logger.error("no available master node"); ThreadUtils.sleep(SLEEP_TIME_MILLIS); }else { + logger.error("find {} masters for task : {}.", + masterNodes.size(), + taskInstanceId); break; } } @@ -112,7 +116,7 @@ public class TaskCallbackService { return getRemoteChannel(newChannel, nettyRemoteChannel.getOpaque(), taskInstanceId); } } - throw new IllegalStateException(String.format("all available master nodes : %s are not reachable", masterNodes)); + throw new IllegalStateException(String.format("all available master nodes : %s are not reachable for task: {}", masterNodes, taskInstanceId)); } private NettyRemoteChannel getRemoteChannel(Channel newChannel, long opaque, int taskInstanceId){ diff --git a/dolphinscheduler-server/src/main/java/org/apache/dolphinscheduler/server/zk/ZKMasterClient.java b/dolphinscheduler-server/src/main/java/org/apache/dolphinscheduler/server/zk/ZKMasterClient.java index 1b807a727..686d73d8a 100644 --- a/dolphinscheduler-server/src/main/java/org/apache/dolphinscheduler/server/zk/ZKMasterClient.java +++ b/dolphinscheduler-server/src/main/java/org/apache/dolphinscheduler/server/zk/ZKMasterClient.java @@ -262,7 +262,7 @@ public class ZKMasterClient extends AbstractZKClient { Date workerServerStartDate = null; List workerServers = getServersList(ZKNodeType.WORKER); for(Server workerServer : workerServers){ - if(workerServer.getHost().equals(taskInstance.getHost())){ + if(taskInstance.getHost().equals(workerServer.getHost() + Constants.COLON + workerServer.getPort())){ workerServerStartDate = workerServer.getCreateTime(); break; } diff --git a/dolphinscheduler-service/src/main/java/org/apache/dolphinscheduler/service/zk/AbstractZKClient.java b/dolphinscheduler-service/src/main/java/org/apache/dolphinscheduler/service/zk/AbstractZKClient.java index 2960969d8..1cc4db6fe 100644 --- a/dolphinscheduler-service/src/main/java/org/apache/dolphinscheduler/service/zk/AbstractZKClient.java +++ b/dolphinscheduler-service/src/main/java/org/apache/dolphinscheduler/service/zk/AbstractZKClient.java @@ -187,7 +187,7 @@ public abstract class AbstractZKClient extends ZookeeperCachedOperator { } Map serverMaps = getServerMaps(zkNodeType); for(String hostKey : serverMaps.keySet()){ - if(hostKey.startsWith(host)){ + if(hostKey.contains(host)){ return true; } } -- GitLab