提交 6dd765f1 编写于 作者: L leiline

ha & savepoint

上级 6158d2ea
......@@ -2,7 +2,8 @@
<project version="4">
<component name="ChangeListManager">
<list default="true" id="9a04a044-0f91-4df8-acd4-c93019ecf348" name="Default" comment="">
<change beforePath="$PROJECT_DIR$/docs/1.7/116.md" beforeDir="false" afterPath="$PROJECT_DIR$/docs/1.7/116.md" afterDir="false" />
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/docs/1.7/114.md" beforeDir="false" afterPath="$PROJECT_DIR$/docs/1.7/114.md" afterDir="false" />
</list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="TRACKING_ENABLED" value="true" />
......@@ -25,17 +26,19 @@
</provider>
</entry>
</file>
<file leaf-file-name="114.md" pinned="false" current-in-tab="false">
<file leaf-file-name="114.md" pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/docs/1.7/114.md">
<provider selected="true" editor-type-id="split-provider[text-editor;markdown-preview-editor]">
<state split_layout="SPLIT">
<first_editor />
<first_editor relative-caret-position="320">
<caret line="323" column="68" selection-start-line="323" selection-start-column="68" selection-end-line="323" selection-end-column="68" />
</first_editor>
<second_editor />
</state>
</provider>
</entry>
</file>
<file leaf-file-name="116.md" pinned="false" current-in-tab="true">
<file leaf-file-name="116.md" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/docs/1.7/116.md">
<provider selected="true" editor-type-id="split-provider[text-editor;markdown-preview-editor]">
<state split_layout="SPLIT">
......@@ -76,6 +79,7 @@
<option value="$PROJECT_DIR$/docs/1.7/115.md" />
<option value="$PROJECT_DIR$/README.md" />
<option value="$PROJECT_DIR$/docs/1.7/116.md" />
<option value="$PROJECT_DIR$/docs/1.7/114.md" />
</list>
</option>
</component>
......@@ -90,6 +94,9 @@
<foldersAlwaysOnTop value="true" />
</navigator>
<panes>
<pane id="Scope" />
<pane id="AndroidView" />
<pane id="PackagesPane" />
<pane id="ProjectPane">
<subPane>
<expand>
......@@ -112,9 +119,6 @@
<select />
</subPane>
</pane>
<pane id="PackagesPane" />
<pane id="AndroidView" />
<pane id="Scope" />
</panes>
</component>
<component name="PropertiesComponent">
......@@ -189,7 +193,8 @@
<servers />
</component>
<component name="ToolWindowManager">
<frame x="64" y="-11" width="1297" height="780" extended-state="6" />
<frame x="65" y="-4" width="1295" height="772" extended-state="6" />
<editor active="true" />
<layout>
<window_info anchor="right" id="Palette" order="3" />
<window_info anchor="bottom" id="TODO" order="6" />
......@@ -198,8 +203,8 @@
<window_info anchor="right" id="Capture Analysis" order="3" />
<window_info anchor="bottom" id="Event Log" order="7" side_tool="true" />
<window_info anchor="right" id="Maven Projects" order="3" />
<window_info anchor="bottom" id="Version Control" order="7" />
<window_info anchor="bottom" id="Run" order="2" />
<window_info anchor="bottom" id="Version Control" order="7" />
<window_info anchor="bottom" id="Terminal" order="7" />
<window_info id="Capture Tool" order="2" />
<window_info id="Designer" order="2" />
......@@ -208,8 +213,8 @@
<window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
<window_info id="UI Designer" order="2" />
<window_info anchor="right" id="Theme Preview" order="3" />
<window_info id="Favorites" order="2" side_tool="true" />
<window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
<window_info id="Favorites" order="2" side_tool="true" />
<window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
<window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
......@@ -222,6 +227,44 @@
<option name="myLimit" value="2678400000" />
</component>
<component name="editorHistoryManager">
<entry file="file://$PROJECT_DIR$/docs/1.7/115.md">
<provider selected="true" editor-type-id="split-provider[text-editor;markdown-preview-editor]">
<state split_layout="SPLIT">
<first_editor relative-caret-position="893">
<caret line="47" column="4" selection-start-line="47" selection-start-column="2" selection-end-line="47" selection-end-column="4" />
</first_editor>
<second_editor />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/docs/1.7/114.md">
<provider selected="true" editor-type-id="split-provider[text-editor;markdown-preview-editor]">
<state split_layout="SPLIT">
<first_editor />
<second_editor />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/README.md">
<provider selected="true" editor-type-id="split-provider[text-editor;markdown-preview-editor]">
<state split_layout="SPLIT">
<first_editor relative-caret-position="114">
<caret line="6" column="9" selection-start-line="6" selection-start-column="9" selection-end-line="6" selection-end-column="9" />
</first_editor>
<second_editor />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/docs/1.7/116.md">
<provider selected="true" editor-type-id="split-provider[text-editor;markdown-preview-editor]">
<state split_layout="SPLIT">
<first_editor relative-caret-position="342">
<caret line="18" selection-start-line="18" selection-end-line="18" />
</first_editor>
<second_editor />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/docs/1.7/115.md">
<provider selected="true" editor-type-id="split-provider[text-editor;markdown-preview-editor]">
<state split_layout="SPLIT">
......@@ -328,14 +371,6 @@
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/docs/1.7/114.md">
<provider selected="true" editor-type-id="split-provider[text-editor;markdown-preview-editor]">
<state split_layout="SPLIT">
<first_editor />
<second_editor />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/docs/1.7/115.md">
<provider selected="true" editor-type-id="split-provider[text-editor;markdown-preview-editor]">
<state split_layout="SPLIT">
......@@ -356,6 +391,16 @@
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/docs/1.7/114.md">
<provider selected="true" editor-type-id="split-provider[text-editor;markdown-preview-editor]">
<state split_layout="SPLIT">
<first_editor relative-caret-position="320">
<caret line="323" column="68" selection-start-line="323" selection-start-column="68" selection-end-line="323" selection-end-column="68" />
</first_editor>
<second_editor />
</state>
</provider>
</entry>
</component>
<component name="masterDetails">
<states>
......
......@@ -152,13 +152,29 @@ In order to start an HA-cluster add the following configuration keys to `conf/fl
high-availability.storageDir: hdfs:///flink/recovery
```
存储路径保存了恢复失败的JobManager需要的所有元数据。
The `storageDir` stores all metadata needed to recover a JobManager failure.
配置好主服务器和ZooKeeper仲裁机制后,你可以像往常一样使用提供的集群启动脚本。它们将启动一个高可用集群。记住当你执行脚本的时候,**ZooKeeper仲裁机制必须处于运行状态**,并确保为你正在启动的每个HA集群配置**单独的ZooKeeper根路径**。
After configuring the masters and the ZooKeeper quorum, you can use the provided cluster startup scripts as usual. They will start an HA-cluster. Keep in mind that the **ZooKeeper quorum has to be running** when you call the scripts and make sure to **configure a separate ZooKeeper root path** for each HA cluster you are starting.
#### 示例:有两个JobManager的standalone模式集群
#### Example: Standalone Cluster with 2 JobManagers
1.`conf/flink-conf.yaml`**配置高可用模式和zookeeper仲裁机制**
```
high-availability: zookeeper
high-availability.zookeeper.quorum: localhost:2181
high-availability.zookeeper.path.root: /flink
high-availability.cluster-id: /cluster_one # important: customize per cluster
high-availability.storageDir: hdfs:///flink/recovery
```
1. **Configure high availability mode and ZooKeeper quorum** in `conf/flink-conf.yaml`:
```
......@@ -168,6 +184,12 @@ After configuring the masters and the ZooKeeper quorum, you can use the provided
high-availability.cluster-id: /cluster_one # important: customize per cluster
high-availability.storageDir: hdfs:///flink/recovery
```
2.`conf/masters`**配置主服务**
```
localhost:8081
localhost:8082
```
2. **Configure masters** in `conf/masters`:
......@@ -175,6 +197,11 @@ After configuring the masters and the ZooKeeper quorum, you can use the provided
localhost:8081
localhost:8082
```
3.`conf/zoo.cfg`**中配置zookeeper服务** (目前它只可能在每一台机器上运行一个zookeeper服务):
```
server.0=localhost:2888:3888
```
3. **Configure ZooKeeper server** in `conf/zoo.cfg` (currently it’s only possible to run a single ZooKeeper server per machine):
......@@ -182,6 +209,13 @@ After configuring the masters and the ZooKeeper quorum, you can use the provided
server.0=localhost:2888:3888
```
4. **启动zookeeper仲裁机制**
```
$ bin/start-zookeeper-quorum.sh
Starting zookeeper daemon on host localhost.
```
4. **Start ZooKeeper quorum**:
```
......@@ -189,6 +223,16 @@ After configuring the masters and the ZooKeeper quorum, you can use the provided
Starting zookeeper daemon on host localhost.
```
5. **启动高可用集群**:
```
$ bin/start-cluster.sh
Starting HA cluster with 2 masters and 1 peers in ZooKeeper quorum.
Starting jobmanager daemon on host localhost.
Starting jobmanager daemon on host localhost.
Starting taskmanager daemon on host localhost.
```
5. **Start an HA-cluster**:
```
......@@ -199,6 +243,17 @@ After configuring the masters and the ZooKeeper quorum, you can use the provided
Starting taskmanager daemon on host localhost.
```
6. **停止zookeeper仲裁和集群**
```
$ bin/stop-cluster.sh
Stopping taskmanager daemon (pid: 7647) on localhost.
Stopping jobmanager daemon (pid: 7495) on host localhost.
Stopping jobmanager daemon (pid: 7349) on host localhost.
$ bin/stop-zookeeper-quorum.sh
Stopping zookeeper daemon (pid: 7101) on host localhost.
```
6. **Stop ZooKeeper quorum and cluster**:
```
......@@ -209,17 +264,35 @@ After configuring the masters and the ZooKeeper quorum, you can use the provided
$ bin/stop-zookeeper-quorum.sh
Stopping zookeeper daemon (pid: 7101) on host localhost.
```
## 高可用YARN集群
当运行一个高可用YARN集群,**我们不必运行多个JobManager实例**,只需要一个,由yarn在失败时重新启动。确切的行为取决于你使用的yarn版本。
## YARN Cluster High Availability
When running a highly available YARN cluster, **we don’t run multiple JobManager (ApplicationMaster) instances**, but only one, which is restarted by YARN on failures. The exact behaviour depends on the specific YARN version you are using.
### 配置
### Configuration
#### 最大主应用尝试次数(yarn-site.xml)
#### Maximum Application Master Attempts (yarn-site.xml)
You have to configure the maximum number of attempts for the application masters for **your** YARN setup in `yarn-site.xml`:
你必须在 `yarn-site.xml` 中为**你的**YARN设置配置Application Master的最大尝试次数:
```
<property>
<name>yarn.resourcemanager.am.max-attempts</name>
<value>4</value>
<description>
The maximum number of application master execution attempts.
</description>
</property>
```
You have to configure the maximum number of attempts for the application masters for **your** YARN setup in `yarn-site.xml`:
```
......@@ -233,38 +306,75 @@ You have to configure the maximum number of attempts for the application masters
```
当前YARN版本的默认值是2(表示可以容忍单个JobManager故障)。
The default for current YARN versions is 2 (meaning a single JobManager failure is tolerated).
#### 应用尝试(flink-conf.yaml)
#### Application Attempts (flink-conf.yaml)
为了配置集群高可用([见上](#configuration)),你必须在`conf/flink-conf.yaml`设置最大尝试次数。
```
yarn.application-attempts: 10
```
In addition to the HA configuration ([see above](#configuration)), you have to configure the maximum attempts in `conf/flink-conf.yaml`:
```
yarn.application-attempts: 10
```
这意味着应用在失败后可以被重启9次,之后YARN才会认定应用失败(9次重试+1次初始尝试)。当YARN操作需要时:抢占,节点硬件故障或重启,或NodeManager重新同步,YARN可以执行额外的重启。这些重启将不会记录在 `yarn.application-attempts` 中,查阅 [Jian Fang’s blog post](http://johnjianfang.blogspot.de/2015/04/the-number-of-maximum-attempts-of-yarn.html)。值得注意的是, `yarn.resourcemanager.am.max-attempts` 是应用程序重启次数的上限。因此Flink中设置的应用程序尝试次数不能超过启动YARN时的集群设置。
This means that the application can be restarted 9 times for failed attempts before YARN fails the application (9 retries + 1 initial attempt). Additional restarts can be performed by YARN if required by YARN operations: Preemption, node hardware failures or reboots, or NodeManager resyncs. These restarts are not counted against `yarn.application-attempts`, see [Jian Fang’s blog post](http://johnjianfang.blogspot.de/2015/04/the-number-of-maximum-attempts-of-yarn.html). It’s important to note that `yarn.resourcemanager.am.max-attempts` is an upper bound for the application restarts. Therefore, the number of application attempts set within Flink cannot exceed the YARN cluster setting with which YARN was started.
#### 容器关闭行为
#### Container Shutdown Behaviour
* **YARN 2.3.0 &lt; version &lt; 2.4.0**. 当主应用失败,所有的容器都被重启。
* **YARN 2.4.0 &lt; version &lt; 2.6.0**. TaskManager容器在主应用程序故障期间保持活跃,这具有以下优点:启动时间更快并且用户不必等待再次获得容器资源。
* **YARN 2.6.0 &lt;= version**. 将尝试失败有效间隔设置为Flink的Akka超时值。尝试失败有效间隔表示只有在系统一个间隔期间达到最大应用程序尝试次数才会终止应用程序。这避免了长久的工作会耗尽它的应用程序尝试次数。
* **YARN 2.3.0 &lt; version &lt; 2.4.0**. All containers are restarted if the application master fails.
* **YARN 2.4.0 &lt; version &lt; 2.6.0**. TaskManager containers are kept alive across application master failures. This has the advantage that the startup time is faster and that the user does not have to wait for obtaining the container resources again.
* **YARN 2.6.0 &lt;= version**: Sets the attempt failure validity interval to the Flinks’ Akka timeout value. The attempt failure validity interval says that an application is only killed after the system has seen the maximum number of application attempts during one interval. This avoids that a long lasting job will deplete it’s application attempts.
**注意**: Hadoop YARN 2.4.0存在一个重大问题(在2.5.0版本中修复),它会阻止已重启的Application Master/Job Manager容器重新启动容器。详情查阅[FLINK-4142](https://issues.apache.org/jira/browse/FLINK-4142) 。我们推荐在YARN上进行高可用设置时至少使用Hadoop 2.5.0。
**Note**: Hadoop YARN 2.4.0 has a major bug (fixed in 2.5.0) preventing container restarts from a restarted Application Master/Job Manager container. See [FLINK-4142](https://issues.apache.org/jira/browse/FLINK-4142) for details. We recommend using at least Hadoop 2.5.0 for high availability setups on YARN.
#### 示例: 高可用的YARN会话
#### Example: Highly Available YARN Session
1. **在 `conf/flink-conf.yaml` 中设置高可用模式和ZooKeeper仲裁机制**:
```
high-availability: zookeeper
high-availability.zookeeper.quorum: localhost:2181
high-availability.storageDir: hdfs:///flink/recovery
high-availability.zookeeper.path.root: /flink
yarn.application-attempts: 10
```
1. **Configure HA mode and ZooKeeper quorum** in `conf/flink-conf.yaml`:
```
high-availability: zookeeper
high-availability.zookeeper.quorum: localhost:2181
high-availability.storageDir: hdfs:///flink/recovery
high-availability.zookeeper.path.root: /flink
yarn.application-attempts: 10
```
2. **在 `conf/zoo.cfg` 中设置Zookeeper服务** (目前每台机器只能运行一个Zookeeper服务)
```
server.0=localhost:2888:3888
```
2. **Configure ZooKeeper server** in `conf/zoo.cfg` (currently it’s only possible to run a single ZooKeeper server per machine):
......@@ -272,6 +382,13 @@ This means that the application can be restarted 9 times for failed attempts bef
server.0=localhost:2888:3888
```
3. **启动ZooKeeper仲裁机制**:
```
$ bin/start-zookeeper-quorum.sh
Starting zookeeper daemon on host localhost.
```
3. **Start ZooKeeper quorum**:
```
......@@ -279,14 +396,34 @@ This means that the application can be restarted 9 times for failed attempts bef
Starting zookeeper daemon on host localhost.
```
4. **启动一个高可用集群**
```
$ bin/yarn-session.sh -n 2
```
4. **Start an HA-cluster**:
```
$ bin/yarn-session.sh -n 2
```
## 配置Zookeeper安全性
## Configuring for Zookeeper Security
如果ZooKeeper以Kerberos安全模式运行,你可以根据需要覆盖 `flink-conf.yaml` 中的以下配置:
```
zookeeper.sasl.service-name: zookeeper # default is "zookeeper". If the ZooKeeper quorum is configured
# with a different service name then it can be supplied here.
zookeeper.sasl.login-context-name: Client # default is "Client". The value needs to match one of the values
# configured in "security.kerberos.login.contexts".
```
在Flink上配置Kerberos安全模式的更多信息,请查阅[here](//ci.apache.org/projects/flink/flink-docs-release-1.7/ops/config.html). 你也可以查阅[here](//ci.apache.org/projects/flink/flink-docs-release-1.7/ops/security-kerberos.html) 更多关于在Flink内部设置基于kerberos安全性的细节。
If ZooKeeper is running in secure mode with Kerberos, you can override the following configurations in `flink-conf.yaml` as necessary:
```
......@@ -299,8 +436,22 @@ zookeeper.sasl.login-context-name: Client # default is "Client". The value need
For more information on Flink configuration for Kerberos security, please see [here](//ci.apache.org/projects/flink/flink-docs-release-1.7/ops/config.html). You can also find [here](//ci.apache.org/projects/flink/flink-docs-release-1.7/ops/security-kerberos.html) further details on how Flink internally setups Kerberos-based security.
## Bootstrap Zookeeper
## Bootstrap ZooKeeper
如果你没有正在运行的ZooKeeper环境,你可以使用Flink附带的帮助程序脚本。
`conf/zoo.cfg` 中有一个Zookeeper配置模板。你可以使用 `server.X` 条目配置主机以运行Zookeeper,其中X是每个服务器的唯一ID:
```
server.X=addressX:peerPort:leaderPort
[...]
server.Y=addressY:peerPort:leaderPort
```
If you don’t have a running ZooKeeper installation, you can use the helper scripts, which ship with Flink.
There is a ZooKeeper configuration template in `conf/zoo.cfg`. You can configure the hosts to run ZooKeeper on with the `server.X` entries, where X is a unique ID of each server:
......@@ -312,5 +463,8 @@ server.Y=addressY:peerPort:leaderPort
```
`bin/start-zookeeper-quorum.sh` 脚本将会在每一个配置的域名下启动一个Zookeeper服务。启动的进程通过Flink包装器启动Zookeeper服务器,该包装器从 `conf/zoo.cfg` 读取配置,并确保为方便设置一些必需的配置项。在生产配置中,建议你安装自己的Zookeeper集群。
The script `bin/start-zookeeper-quorum.sh` will start a ZooKeeper server on each of the configured hosts. The started processes start ZooKeeper servers via a Flink wrapper, which reads the configuration from `conf/zoo.cfg` and makes sure to set some required configuration values for convenience. In production setups, it is recommended to manage your own ZooKeeper installation.
# 状态后端
# State Backends
[Data Stream API](//ci.apache.org/projects/flink/flink-docs-release-1.7/dev/datastream_api.html)写的程序一般会以不同的方式保存状态:
* 窗口(Windows)会在被触发之前持续收集元素或聚合结果
* 转换方法可能会使用键值对接口来存储值
* 转换方法可能继承 `CheckpointedFunction` 接口来使其局部变量具有容错能力
也可以参考API介绍中的[状态部分](//ci.apache.org/projects/flink/flink-docs-release-1.7/dev/stream/state/index.html)
Programs written in the [Data Stream API](//ci.apache.org/projects/flink/flink-docs-release-1.7/dev/datastream_api.html) often hold state in various forms:
* Windows gather elements or aggregates until they are triggered
......@@ -10,10 +20,22 @@ Programs written in the [Data Stream API](//ci.apache.org/projects/flink/flink-d
See also [state section](//ci.apache.org/projects/flink/flink-docs-release-1.7/dev/stream/state/index.html) in the streaming API guide.
当checkpointing被激活,这种状态会在checkpoint时被持久化,以防止数据丢失并实现一致性恢复。状态如何在内部表示,以及在检查点上如何及在何处持久化,取决于所选择的**状态后端(State Backend)**。
When checkpointing is activated, such state is persisted upon checkpoints to guard against data loss and recover consistently. How the state is represented internally, and how and where it is persisted upon checkpoints depends on the chosen **State Backend**.
## 可用的状态后端
## Available State Backends
开箱即用,Flink捆绑了这些状态后端:
* _MemoryStateBackend_
* _FsStateBackend_
* _RocksDBStateBackend_
如果没有配置其他任何内容,系统将使用MemoryStateBackend(内存状态后端)。
Out of the box, Flink bundles these state backends:
* _MemoryStateBackend_
......@@ -22,6 +44,10 @@ Out of the box, Flink bundles these state backends:
If nothing else is configured, the system will use the MemoryStateBackend.
### MemoryStateBackend(内存状态后端)
### The MemoryStateBackend
The _MemoryStateBackend_ holds data internally as objects on the Java heap. Key/value state and window operators hold hash tables that store the values, triggers, etc.
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册