提交 dc81c6ac 编写于 作者: W wizardforcel

Revert "21"

This reverts commit 4ea4e3d4.
上级 4ea4e3d4
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
## MatPlotLib 中的双向条形图 ## MatPlotLib 中的双向条形图
```py ```
%matplotlib inline %matplotlib inline
import pandas as pd import pandas as pd
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
...@@ -21,7 +21,7 @@ raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], ...@@ -21,7 +21,7 @@ raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'post_score': [5, 43, 23, 23, 51]} 'post_score': [5, 43, 23, 23, 51]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'pre_score', 'mid_score', 'post_score']) df = pd.DataFrame(raw_data, columns = ['first_name', 'pre_score', 'mid_score', 'post_score'])
df df
```
| | first_name | pre_score | mid_score | post_score | | | first_name | pre_score | mid_score | post_score |
| --- | --- | --- | --- | --- | | --- | --- | --- | --- | --- |
...@@ -31,7 +31,7 @@ df ...@@ -31,7 +31,7 @@ df
| 3 | Jake | 2 | 62 | 23 | | 3 | Jake | 2 | 62 | 23 |
| 4 | Amy | 3 | 70 | 51 | | 4 | Amy | 3 | 70 | 51 |
```py ```
# 输入数据,特别是第二和 # 输入数据,特别是第二和
# 第三行,跳过第一列 # 第三行,跳过第一列
x1 = df.ix[1, 1:] x1 = df.ix[1, 1:]
...@@ -78,13 +78,13 @@ plt.xlim([-max(x2)-10, max(x1)+10]) ...@@ -78,13 +78,13 @@ plt.xlim([-max(x2)-10, max(x1)+10])
plt.grid() plt.grid()
plt.show() plt.show()
```
![png](https://chrisalbon.com/python/data_visualization/matplotlib_back_to_back_bar_plot_6_0.png) ![png](https://chrisalbon.com/python/data_visualization/matplotlib_back_to_back_bar_plot_6_0.png)
## MatPlotLib 中的条形图 ## MatPlotLib 中的条形图
```py ```
%matplotlib inline %matplotlib inline
import pandas as pd import pandas as pd
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
...@@ -97,7 +97,7 @@ raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], ...@@ -97,7 +97,7 @@ raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'post_score': [5, 43, 23, 23, 51]} 'post_score': [5, 43, 23, 23, 51]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'pre_score', 'mid_score', 'post_score']) df = pd.DataFrame(raw_data, columns = ['first_name', 'pre_score', 'mid_score', 'post_score'])
df df
```
| | first_name | pre_score | mid_score | post_score | | | first_name | pre_score | mid_score | post_score |
| --- | --- | --- | --- | --- | | --- | --- | --- | --- | --- |
...@@ -107,7 +107,7 @@ df ...@@ -107,7 +107,7 @@ df
| 3 | Jake | 2 | 62 | 23 | | 3 | Jake | 2 | 62 | 23 |
| 4 | Amy | 3 | 70 | 51 | | 4 | Amy | 3 | 70 | 51 |
```py ```
# 为每个变量创建得分均值的列表 # 为每个变量创建得分均值的列表
mean_values = [df['pre_score'].mean(), df['mid_score'].mean(), df['post_score'].mean()] mean_values = [df['pre_score'].mean(), df['mid_score'].mean(), df['post_score'].mean()]
...@@ -146,13 +146,13 @@ plt.xticks(x_pos, bar_labels) ...@@ -146,13 +146,13 @@ plt.xticks(x_pos, bar_labels)
plt.title('Mean Scores For Each Test') plt.title('Mean Scores For Each Test')
plt.show() plt.show()
```
![png](https://chrisalbon.com/python/data_visualization/matplotlib_bar_plot_6_0.png) ![png](https://chrisalbon.com/python/data_visualization/matplotlib_bar_plot_6_0.png)
## Seaborn 中的调色板 ## Seaborn 中的调色板
```py ```
import pandas as pd import pandas as pd
%matplotlib inline %matplotlib inline
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
...@@ -173,164 +173,164 @@ df = pd.DataFrame(data, columns = ['date', 'battle_deaths', 'deaths_regiment_1', ...@@ -173,164 +173,164 @@ df = pd.DataFrame(data, columns = ['date', 'battle_deaths', 'deaths_regiment_1',
df = df.set_index(df.date) df = df.set_index(df.date)
sns.palplot(sns.color_palette("deep", 10)) sns.palplot(sns.color_palette("deep", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_5_0.png) ![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_5_0.png)
```py ```
sns.palplot(sns.color_palette("muted", 10)) sns.palplot(sns.color_palette("muted", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_6_0.png) ![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_6_0.png)
```py ```
sns.palplot(sns.color_palette("bright", 10)) sns.palplot(sns.color_palette("bright", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_7_0.png) ![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_7_0.png)
```py ```
sns.palplot(sns.color_palette("dark", 10)) sns.palplot(sns.color_palette("dark", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_8_0.png) ![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_8_0.png)
```py ```
sns.palplot(sns.color_palette("colorblind", 10)) sns.palplot(sns.color_palette("colorblind", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_9_0.png) ![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_9_0.png)
```py ```
sns.palplot(sns.color_palette("Paired", 10)) sns.palplot(sns.color_palette("Paired", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_10_0.png) ![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_10_0.png)
```py ```
sns.palplot(sns.color_palette("BuGn", 10)) sns.palplot(sns.color_palette("BuGn", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_11_0.png) ![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_11_0.png)
```py ```
sns.palplot(sns.color_palette("GnBu", 10)) sns.palplot(sns.color_palette("GnBu", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_12_0.png) ![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_12_0.png)
```py ```
sns.palplot(sns.color_palette("OrRd", 10)) sns.palplot(sns.color_palette("OrRd", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_13_0.png) ![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_13_0.png)
```py ```
sns.palplot(sns.color_palette("PuBu", 10)) sns.palplot(sns.color_palette("PuBu", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_14_0.png) ![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_14_0.png)
```py ```
sns.palplot(sns.color_palette("YlGn", 10)) sns.palplot(sns.color_palette("YlGn", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_15_0.png) ![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_15_0.png)
```py ```
sns.palplot(sns.color_palette("YlGnBu", 10)) sns.palplot(sns.color_palette("YlGnBu", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_16_0.png) ![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_16_0.png)
```py ```
sns.palplot(sns.color_palette("YlOrBr", 10)) sns.palplot(sns.color_palette("YlOrBr", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_17_0.png) ![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_17_0.png)
```py ```
sns.palplot(sns.color_palette("YlOrRd", 10)) sns.palplot(sns.color_palette("YlOrRd", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_18_0.png) ![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_18_0.png)
```py ```
sns.palplot(sns.color_palette("BrBG", 10)) sns.palplot(sns.color_palette("BrBG", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_19_0.png) ![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_19_0.png)
```py ```
sns.palplot(sns.color_palette("PiYG", 10)) sns.palplot(sns.color_palette("PiYG", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_20_0.png) ![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_20_0.png)
```py ```
sns.palplot(sns.color_palette("PRGn", 10)) sns.palplot(sns.color_palette("PRGn", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_21_0.png) ![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_21_0.png)
```py ```
sns.palplot(sns.color_palette("PuOr", 10)) sns.palplot(sns.color_palette("PuOr", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_22_0.png) ![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_22_0.png)
```py ```
sns.palplot(sns.color_palette("RdBu", 10)) sns.palplot(sns.color_palette("RdBu", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_23_0.png) ![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_23_0.png)
```py ```
sns.palplot(sns.color_palette("RdGy", 10)) sns.palplot(sns.color_palette("RdGy", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_24_0.png) ![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_24_0.png)
```py ```
sns.palplot(sns.color_palette("RdYlBu", 10)) sns.palplot(sns.color_palette("RdYlBu", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_25_0.png) ![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_25_0.png)
```py ```
sns.palplot(sns.color_palette("RdYlGn", 10)) sns.palplot(sns.color_palette("RdYlGn", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_26_0.png) ![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_26_0.png)
```py ```
sns.palplot(sns.color_palette("Spectral", 10)) sns.palplot(sns.color_palette("Spectral", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_27_0.png) ![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_27_0.png)
```py ```
# 创建调色板并将其设为当前调色板 # 创建调色板并将其设为当前调色板
flatui = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"] flatui = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"]
sns.set_palette(flatui) sns.set_palette(flatui)
sns.palplot(sns.color_palette()) sns.palplot(sns.color_palette())
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_29_0.png) ![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_29_0.png)
```py ```
# 设置绘图颜色 # 设置绘图颜色
sns.tsplot([df.deaths_regiment_1, df.deaths_regiment_2, df.deaths_regiment_3, df.deaths_regiment_4, sns.tsplot([df.deaths_regiment_1, df.deaths_regiment_2, df.deaths_regiment_3, df.deaths_regiment_4,
df.deaths_regiment_5, df.deaths_regiment_6, df.deaths_regiment_7], color="#34495e") df.deaths_regiment_5, df.deaths_regiment_6, df.deaths_regiment_7], color="#34495e")
# <matplotlib.axes._subplots.AxesSubplot at 0x116f5db70> # <matplotlib.axes._subplots.AxesSubplot at 0x116f5db70>
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_31_1.png) ![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_31_1.png)
## 使用 Seaborn 和 pandas 创建时间序列绘图 ## 使用 Seaborn 和 pandas 创建时间序列绘图
```py ```
import pandas as pd import pandas as pd
%matplotlib inline %matplotlib inline
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
...@@ -353,23 +353,23 @@ sns.tsplot([df.deaths_regiment_1, df.deaths_regiment_2, df.deaths_regiment_3, df ...@@ -353,23 +353,23 @@ sns.tsplot([df.deaths_regiment_1, df.deaths_regiment_2, df.deaths_regiment_3, df
df.deaths_regiment_5, df.deaths_regiment_6, df.deaths_regiment_7], color="indianred") df.deaths_regiment_5, df.deaths_regiment_6, df.deaths_regiment_7], color="indianred")
# <matplotlib.axes._subplots.AxesSubplot at 0x1140be780> # <matplotlib.axes._subplots.AxesSubplot at 0x1140be780>
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_pandas_timeseries_plot_5_1.png) ![png](https://chrisalbon.com/python/data_visualization/seaborn_pandas_timeseries_plot_5_1.png)
```py ```
# 带有置信区间误差条、但不绘制连线的时间序列绘图 # 带有置信区间误差条、但不绘制连线的时间序列绘图
sns.tsplot([df.deaths_regiment_1, df.deaths_regiment_2, df.deaths_regiment_3, df.deaths_regiment_4, sns.tsplot([df.deaths_regiment_1, df.deaths_regiment_2, df.deaths_regiment_3, df.deaths_regiment_4,
df.deaths_regiment_5, df.deaths_regiment_6, df.deaths_regiment_7], err_style="ci_bars", interpolate=False) df.deaths_regiment_5, df.deaths_regiment_6, df.deaths_regiment_7], err_style="ci_bars", interpolate=False)
# <matplotlib.axes._subplots.AxesSubplot at 0x116400668> # <matplotlib.axes._subplots.AxesSubplot at 0x116400668>
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_pandas_timeseries_plot_7_1.png) ![png](https://chrisalbon.com/python/data_visualization/seaborn_pandas_timeseries_plot_7_1.png)
## 使用 Seaborn 创建散点图 ## 使用 Seaborn 创建散点图
```py ```
import pandas as pd import pandas as pd
%matplotlib inline %matplotlib inline
import random import random
...@@ -387,7 +387,7 @@ df['k'] = ['male','male','male','female','female'] ...@@ -387,7 +387,7 @@ df['k'] = ['male','male','male','female','female']
# 查看前几行数据 # 查看前几行数据
df.head() df.head()
```
| | x | y | z | k | | | x | y | z | k |
| --- | --- | --- | --- | --- | | --- | --- | --- | --- | --- |
...@@ -397,7 +397,7 @@ df.head() ...@@ -397,7 +397,7 @@ df.head()
| 3 | 510 | 206 | 1 | female | | 3 | 510 | 206 | 1 | female |
| 4 | 848 | 357 | 0 | female | | 4 | 848 | 357 | 0 | female |
```py ```
# 设置散点图样式 # 设置散点图样式
sns.set_context("notebook", font_scale=1.1) sns.set_context("notebook", font_scale=1.1)
sns.set_style("ticks") sns.set_style("ticks")
...@@ -421,13 +421,13 @@ plt.xlabel('Time') ...@@ -421,13 +421,13 @@ plt.xlabel('Time')
plt.ylabel('Deaths') plt.ylabel('Deaths')
# <matplotlib.text.Text at 0x112b7bb70> # <matplotlib.text.Text at 0x112b7bb70>
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_scatterplot_7_1.png) ![png](https://chrisalbon.com/python/data_visualization/seaborn_scatterplot_7_1.png)
## MatPlotLib 中的分组条形图 ## MatPlotLib 中的分组条形图
```py ```
%matplotlib inline %matplotlib inline
import pandas as pd import pandas as pd
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
...@@ -439,7 +439,7 @@ raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], ...@@ -439,7 +439,7 @@ raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'post_score': [5, 43, 23, 23, 51]} 'post_score': [5, 43, 23, 23, 51]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'pre_score', 'mid_score', 'post_score']) df = pd.DataFrame(raw_data, columns = ['first_name', 'pre_score', 'mid_score', 'post_score'])
df df
```
| | first_name | pre_score | mid_score | post_score | | | first_name | pre_score | mid_score | post_score |
| --- | --- | --- | --- | --- | | --- | --- | --- | --- | --- |
...@@ -449,7 +449,7 @@ df ...@@ -449,7 +449,7 @@ df
| 3 | Jake | 2 | 62 | 23 | | 3 | Jake | 2 | 62 | 23 |
| 4 | Amy | 3 | 70 | 51 | | 4 | Amy | 3 | 70 | 51 |
```py ```
# 设置条形的位置和宽度 # 设置条形的位置和宽度
pos = list(range(len(df['pre_score']))) pos = list(range(len(df['pre_score'])))
width = 0.25 width = 0.25
...@@ -519,13 +519,13 @@ plt.ylim([0, max(df['pre_score'] + df['mid_score'] + df['post_score'])] ) ...@@ -519,13 +519,13 @@ plt.ylim([0, max(df['pre_score'] + df['mid_score'] + df['post_score'])] )
plt.legend(['Pre Score', 'Mid Score', 'Post Score'], loc='upper left') plt.legend(['Pre Score', 'Mid Score', 'Post Score'], loc='upper left')
plt.grid() plt.grid()
plt.show() plt.show()
```
![png](https://chrisalbon.com/python/data_visualization/matplotlib_grouped_bar_plot_6_0.png) ![png](https://chrisalbon.com/python/data_visualization/matplotlib_grouped_bar_plot_6_0.png)
## MatPlotLib 中的直方图 ## MatPlotLib 中的直方图
```py ```
%matplotlib inline %matplotlib inline
import pandas as pd import pandas as pd
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
...@@ -540,7 +540,7 @@ pd.set_option('display.max_columns', 50) ...@@ -540,7 +540,7 @@ pd.set_option('display.max_columns', 50)
df = pd.read_csv('https://www.dropbox.com/s/52cb7kcflr8qm2u/5kings_battles_v1.csv?dl=1') df = pd.read_csv('https://www.dropbox.com/s/52cb7kcflr8qm2u/5kings_battles_v1.csv?dl=1')
df.head() df.head()
```
| | name | year | battle_number | attacker_king | defender_king | attacker_1 | attacker_2 | attacker_3 | attacker_4 | defender_1 | defender_2 | defender_3 | defender_4 | attacker_outcome | battle_type | major_death | major_capture | attacker_size | defender_size | attacker_commander | defender_commander | summer | location | region | note | | | name | year | battle_number | attacker_king | defender_king | attacker_1 | attacker_2 | attacker_3 | attacker_4 | defender_1 | defender_2 | defender_3 | defender_4 | attacker_outcome | battle_type | major_death | major_capture | attacker_size | defender_size | attacker_commander | defender_commander | summer | location | region | note |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
...@@ -551,7 +551,7 @@ df.head() ...@@ -551,7 +551,7 @@ df.head()
| 3 | Battle of the Green Fork | 298 | 4 | Robb Stark | Joffrey/Tommen Baratheon | Stark | NaN | NaN | NaN | Lannister | NaN | NaN | NaN | loss | pitched battle | 1 | 1 | 18000 | 20000 | Roose Bolton, Wylis Manderly, Medger Cerwyn, H... | Tywin Lannister, Gregor Clegane, Kevan Lannist... | 1 | Green Fork | The Riverlands | NaN | | 3 | Battle of the Green Fork | 298 | 4 | Robb Stark | Joffrey/Tommen Baratheon | Stark | NaN | NaN | NaN | Lannister | NaN | NaN | NaN | loss | pitched battle | 1 | 1 | 18000 | 20000 | Roose Bolton, Wylis Manderly, Medger Cerwyn, H... | Tywin Lannister, Gregor Clegane, Kevan Lannist... | 1 | Green Fork | The Riverlands | NaN |
| 4 | Battle of the Whispering Wood | 298 | 5 | Robb Stark | Joffrey/Tommen Baratheon | Stark | Tully | NaN | NaN | Lannister | NaN | NaN | NaN | win | ambush | 1 | 1 | 1875 | 6000 | Robb Stark, Brynden Tully | Jaime Lannister | 1 | Whispering Wood | The Riverlands | NaN | | 4 | Battle of the Whispering Wood | 298 | 5 | Robb Stark | Joffrey/Tommen Baratheon | Stark | Tully | NaN | NaN | Lannister | NaN | NaN | NaN | win | ambush | 1 | 1 | 1875 | 6000 | Robb Stark, Brynden Tully | Jaime Lannister | 1 | Whispering Wood | The Riverlands | NaN |
```py ```
# 制作攻击方和防守方大小的两个变量 # 制作攻击方和防守方大小的两个变量
# 但排除攻击方规模达到或超过 90000 的战斗 # 但排除攻击方规模达到或超过 90000 的战斗
data1 = df['attacker_size'][df['attacker_size'] < 90000] data1 = df['attacker_size'][df['attacker_size'] < 90000]
...@@ -584,11 +584,11 @@ plt.ylabel('Number of battles') ...@@ -584,11 +584,11 @@ plt.ylabel('Number of battles')
plt.legend(loc='upper right') plt.legend(loc='upper right')
plt.show() plt.show()
```
![png](https://chrisalbon.com/python/data_visualization/matplotlib_histogram_6_0.png) ![png](https://chrisalbon.com/python/data_visualization/matplotlib_histogram_6_0.png)
```py ```
# 制作攻击方和防守方大小的两个变量 # 制作攻击方和防守方大小的两个变量
# 但排除攻击方规模达到或超过 90000 的战斗 # 但排除攻击方规模达到或超过 90000 的战斗
data1 = df['attacker_size'][df['attacker_size'] < 90000] data1 = df['attacker_size'][df['attacker_size'] < 90000]
...@@ -634,13 +634,13 @@ plt.ylabel('Number of battles') ...@@ -634,13 +634,13 @@ plt.ylabel('Number of battles')
plt.legend(loc='upper right') plt.legend(loc='upper right')
plt.show() plt.show()
```
![png](https://chrisalbon.com/python/data_visualization/matplotlib_histogram_8_0.png) ![png](https://chrisalbon.com/python/data_visualization/matplotlib_histogram_8_0.png)
## 从 Pandas 数据帧生成 MatPlotLib 散点图 ## 从 Pandas 数据帧生成 MatPlotLib 散点图
```py ```
%matplotlib inline %matplotlib inline
import pandas as pd import pandas as pd
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
...@@ -654,7 +654,7 @@ raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], ...@@ -654,7 +654,7 @@ raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'postTestScore': [25, 94, 57, 62, 70]} 'postTestScore': [25, 94, 57, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'female', 'preTestScore', 'postTestScore']) df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'female', 'preTestScore', 'postTestScore'])
df df
```
| | first_name | last_name | age | female | preTestScore | postTestScore | | | first_name | last_name | age | female | preTestScore | postTestScore |
| --- | --- | --- | --- | --- | --- | --- | | --- | --- | --- | --- | --- | --- | --- |
...@@ -664,30 +664,30 @@ df ...@@ -664,30 +664,30 @@ df
| 3 | Jake | Milner | 24 | 0 | 2 | 62 | | 3 | Jake | Milner | 24 | 0 | 2 | 62 |
| 4 | Amy | Cooze | 73 | 1 | 3 | 70 | | 4 | Amy | Cooze | 73 | 1 | 3 | 70 |
```py ```
# preTestScore 和 postTestScore 的散点图 # preTestScore 和 postTestScore 的散点图
# 每个点的大小取决于年龄 # 每个点的大小取决于年龄
plt.scatter(df.preTestScore, df.postTestScore plt.scatter(df.preTestScore, df.postTestScore
, s=df.age) , s=df.age)
# <matplotlib.collections.PathCollection at 0x10ca42b00> # <matplotlib.collections.PathCollection at 0x10ca42b00>
```
![png](https://chrisalbon.com/python/data_visualization/matplotlib_scatterplot_from_pandas_6_1.png) ![png](https://chrisalbon.com/python/data_visualization/matplotlib_scatterplot_from_pandas_6_1.png)
```py ```
# preTestScore 和 postTestScore 的散点图 # preTestScore 和 postTestScore 的散点图
# 大小为 300,颜色取决于性别 # 大小为 300,颜色取决于性别
plt.scatter(df.preTestScore, df.postTestScore, s=300, c=df.female) plt.scatter(df.preTestScore, df.postTestScore, s=300, c=df.female)
# <matplotlib.collections.PathCollection at 0x10cb90a90> # <matplotlib.collections.PathCollection at 0x10cb90a90>
```
![png](https://chrisalbon.com/python/data_visualization/matplotlib_scatterplot_from_pandas_8_1.png) ![png](https://chrisalbon.com/python/data_visualization/matplotlib_scatterplot_from_pandas_8_1.png)
## Matplotlib 的简单示例 ## Matplotlib 的简单示例
```py ```
# 让 Jupyter 加载 matplotlib # 让 Jupyter 加载 matplotlib
# 并内联创建所有绘图(也就是在页面上) # 并内联创建所有绘图(也就是在页面上)
%matplotlib inline %matplotlib inline
...@@ -697,13 +697,13 @@ import matplotlib.pyplot as pyplot ...@@ -697,13 +697,13 @@ import matplotlib.pyplot as pyplot
pyplot.plot([1.6, 2.7]) pyplot.plot([1.6, 2.7])
# [<matplotlib.lines.Line2D at 0x10c4e7978>] # [<matplotlib.lines.Line2D at 0x10c4e7978>]
```
![png](https://chrisalbon.com/python/data_visualization/matplotlib_simple_example_6_1.png) ![png](https://chrisalbon.com/python/data_visualization/matplotlib_simple_example_6_1.png)
## MatPlotLib 中的饼图 ## MatPlotLib 中的饼图
```py ```
%matplotlib inline %matplotlib inline
import pandas as pd import pandas as pd
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
...@@ -714,7 +714,7 @@ raw_data = {'officer_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], ...@@ -714,7 +714,7 @@ raw_data = {'officer_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'march_arrests': [5, 43, 23, 23, 51]} 'march_arrests': [5, 43, 23, 23, 51]}
df = pd.DataFrame(raw_data, columns = ['officer_name', 'jan_arrests', 'feb_arrests', 'march_arrests']) df = pd.DataFrame(raw_data, columns = ['officer_name', 'jan_arrests', 'feb_arrests', 'march_arrests'])
df df
```
| | officer_name | jan_arrests | feb_arrests | march_arrests | | | officer_name | jan_arrests | feb_arrests | march_arrests |
| --- | --- | --- | --- | --- | | --- | --- | --- | --- | --- |
...@@ -724,11 +724,11 @@ df ...@@ -724,11 +724,11 @@ df
| 3 | Jake | 2 | 62 | 23 | | 3 | Jake | 2 | 62 | 23 |
| 4 | Amy | 3 | 70 | 51 | | 4 | Amy | 3 | 70 | 51 |
```py ```
# 创建一列,其中包含每个官员的总逮捕数 # 创建一列,其中包含每个官员的总逮捕数
df['total_arrests'] = df['jan_arrests'] + df['feb_arrests'] + df['march_arrests'] df['total_arrests'] = df['jan_arrests'] + df['feb_arrests'] + df['march_arrests']
df df
```
| | officer_name | jan_arrests | feb_arrests | march_arrests | total_arrests | | | officer_name | jan_arrests | feb_arrests | march_arrests | total_arrests |
| --- | --- | --- | --- | --- | --- | | --- | --- | --- | --- | --- | --- |
...@@ -738,7 +738,7 @@ df ...@@ -738,7 +738,7 @@ df
| 3 | Jake | 2 | 62 | 23 | 87 | | 3 | Jake | 2 | 62 | 23 | 87 |
| 4 | Amy | 3 | 70 | 51 | 124 | | 4 | Amy | 3 | 70 | 51 | 124 |
```py ```
# (从 iWantHue)创建一列颜色 # (从 iWantHue)创建一列颜色
colors = ["#E13F29", "#D69A80", "#D63B59", "#AE5552", "#CB5C3B", "#EB8076", "#96624E"] colors = ["#E13F29", "#D69A80", "#D63B59", "#AE5552", "#CB5C3B", "#EB8076", "#96624E"]
...@@ -766,13 +766,13 @@ plt.axis('equal') ...@@ -766,13 +766,13 @@ plt.axis('equal')
# 查看绘图 # 查看绘图
plt.tight_layout() plt.tight_layout()
plt.show() plt.show()
```
![png](https://chrisalbon.com/python/data_visualization/matplotlib_pie_chart_7_0.png) ![png](https://chrisalbon.com/python/data_visualization/matplotlib_pie_chart_7_0.png)
## MatPlotLib 中的散点图 ## MatPlotLib 中的散点图
```py ```
%matplotlib inline %matplotlib inline
import pandas as pd import pandas as pd
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
...@@ -786,7 +786,7 @@ pd.set_option('display.max_columns', 50) ...@@ -786,7 +786,7 @@ pd.set_option('display.max_columns', 50)
df = pd.read_csv('https://raw.githubusercontent.com/chrisalbon/war_of_the_five_kings_dataset/master/5kings_battles_v1.csv') df = pd.read_csv('https://raw.githubusercontent.com/chrisalbon/war_of_the_five_kings_dataset/master/5kings_battles_v1.csv')
df.head() df.head()
```
| | name | year | battle_number | attacker_king | defender_king | attacker_1 | attacker_2 | attacker_3 | attacker_4 | defender_1 | defender_2 | defender_3 | defender_4 | attacker_outcome | battle_type | major_death | major_capture | attacker_size | defender_size | attacker_commander | defender_commander | summer | location | region | note | | | name | year | battle_number | attacker_king | defender_king | attacker_1 | attacker_2 | attacker_3 | attacker_4 | defender_1 | defender_2 | defender_3 | defender_4 | attacker_outcome | battle_type | major_death | major_capture | attacker_size | defender_size | attacker_commander | defender_commander | summer | location | region | note |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
...@@ -796,7 +796,7 @@ df.head() ...@@ -796,7 +796,7 @@ df.head()
| 3 | Battle of the Green Fork | 298 | 4 | Robb Stark | Joffrey/Tommen Baratheon | Stark | NaN | NaN | NaN | Lannister | NaN | NaN | NaN | loss | pitched battle | 1.0 | 1.0 | 18000.0 | 20000.0 | Roose Bolton, Wylis Manderly, Medger Cerwyn, H... | Tywin Lannister, Gregor Clegane, Kevan Lannist... | 1.0 | Green Fork | The Riverlands | NaN | | 3 | Battle of the Green Fork | 298 | 4 | Robb Stark | Joffrey/Tommen Baratheon | Stark | NaN | NaN | NaN | Lannister | NaN | NaN | NaN | loss | pitched battle | 1.0 | 1.0 | 18000.0 | 20000.0 | Roose Bolton, Wylis Manderly, Medger Cerwyn, H... | Tywin Lannister, Gregor Clegane, Kevan Lannist... | 1.0 | Green Fork | The Riverlands | NaN |
| 4 | Battle of the Whispering Wood | 298 | 5 | Robb Stark | Joffrey/Tommen Baratheon | Stark | Tully | NaN | NaN | Lannister | NaN | NaN | NaN | win | ambush | 1.0 | 1.0 | 1875.0 | 6000.0 | Robb Stark, Brynden Tully | Jaime Lannister | 1.0 | Whispering Wood | The Riverlands | NaN | | 4 | Battle of the Whispering Wood | 298 | 5 | Robb Stark | Joffrey/Tommen Baratheon | Stark | Tully | NaN | NaN | Lannister | NaN | NaN | NaN | win | ambush | 1.0 | 1.0 | 1875.0 | 6000.0 | Robb Stark, Brynden Tully | Jaime Lannister | 1.0 | Whispering Wood | The Riverlands | NaN |
```py ```
# 创建图形 # 创建图形
plt.figure(figsize=(10,8)) plt.figure(figsize=(10,8))
...@@ -863,13 +863,13 @@ plt.xlim([min(df['attacker_size'])-1000, max(df['attacker_size'])+1000]) ...@@ -863,13 +863,13 @@ plt.xlim([min(df['attacker_size'])-1000, max(df['attacker_size'])+1000])
plt.ylim([min(df['defender_size'])-1000, max(df['defender_size'])+1000]) plt.ylim([min(df['defender_size'])-1000, max(df['defender_size'])+1000])
plt.show() plt.show()
```
![png](https://chrisalbon.com/python/data_visualization/matplotlib_simple_scatterplot_6_0.png) ![png](https://chrisalbon.com/python/data_visualization/matplotlib_simple_scatterplot_6_0.png)
## MatPlotLib 中的栈式百分比条形图 ## MatPlotLib 中的栈式百分比条形图
```py ```
%matplotlib inline %matplotlib inline
import pandas as pd import pandas as pd
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
...@@ -880,7 +880,7 @@ raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], ...@@ -880,7 +880,7 @@ raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'post_score': [5, 43, 23, 23, 51]} 'post_score': [5, 43, 23, 23, 51]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'pre_score', 'mid_score', 'post_score']) df = pd.DataFrame(raw_data, columns = ['first_name', 'pre_score', 'mid_score', 'post_score'])
df df
```
| | first_name | pre_score | mid_score | post_score | | | first_name | pre_score | mid_score | post_score |
| --- | --- | --- | --- | --- | | --- | --- | --- | --- | --- |
...@@ -890,7 +890,7 @@ df ...@@ -890,7 +890,7 @@ df
| 3 | Jake | 2 | 62 | 23 | | 3 | Jake | 2 | 62 | 23 |
| 4 | Amy | 3 | 70 | 51 | | 4 | Amy | 3 | 70 | 51 |
```py ```
# 创建带有一个子图的图形 # 创建带有一个子图的图形
f, ax = plt.subplots(1, figsize=(10,5)) f, ax = plt.subplots(1, figsize=(10,5))
...@@ -981,6 +981,6 @@ plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right') ...@@ -981,6 +981,6 @@ plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right')
# 展示绘图 # 展示绘图
plt.show() plt.show()
```
![png](https://chrisalbon.com/python/data_visualization/matplotlib_percentage_stacked_bar_plot_6_0.png) ![png](https://chrisalbon.com/python/data_visualization/matplotlib_percentage_stacked_bar_plot_6_0.png)
...@@ -20,7 +20,7 @@ $ s^2 = \frac {1}{n-1} \sum_{i=1}^n \left(x_i - \overline{x} \right)^ 2 $ ...@@ -20,7 +20,7 @@ $ s^2 = \frac {1}{n-1} \sum_{i=1}^n \left(x_i - \overline{x} \right)^ 2 $
## 演示中心极限定理 ## 演示中心极限定理
```py ```
# 导入包 # 导入包
import pandas as pd import pandas as pd
import numpy as np import numpy as np
...@@ -39,11 +39,11 @@ population['numbers'] = np.random.uniform(0,10000,size=10000) ...@@ -39,11 +39,11 @@ population['numbers'] = np.random.uniform(0,10000,size=10000)
population['numbers'].hist(bins=100) population['numbers'].hist(bins=100)
# <matplotlib.axes._subplots.AxesSubplot at 0x112c72710> # <matplotlib.axes._subplots.AxesSubplot at 0x112c72710>
```
![png](https://chrisalbon.com/statistics/frequentist/demonstrate_the_central_limit_theorem_5_1.png) ![png](https://chrisalbon.com/statistics/frequentist/demonstrate_the_central_limit_theorem_5_1.png)
```py ```
# 查看数值的均值 # 查看数值的均值
population['numbers'].mean() population['numbers'].mean()
...@@ -63,13 +63,13 @@ for i in range(0,1000): ...@@ -63,13 +63,13 @@ for i in range(0,1000):
pd.Series(sampled_means).hist(bins=100) pd.Series(sampled_means).hist(bins=100)
# <matplotlib.axes._subplots.AxesSubplot at 0x11516e668> # <matplotlib.axes._subplots.AxesSubplot at 0x11516e668>
```
![png](https://chrisalbon.com/statistics/frequentist/demonstrate_the_central_limit_theorem_11_1.png) ![png](https://chrisalbon.com/statistics/frequentist/demonstrate_the_central_limit_theorem_11_1.png)
这是关键的图表:请记住总体分布是均匀的,然而样本均值的分布却接近正态。这正是中心极限定理的关键点,也是我们可以假设样本均值是无偏的原因。 这是关键的图表:请记住总体分布是均匀的,然而样本均值的分布却接近正态。这正是中心极限定理的关键点,也是我们可以假设样本均值是无偏的原因。
```py ```
# 查看 sampled_means 的均值 # 查看 sampled_means 的均值
pd.Series(sampled_means).mean() pd.Series(sampled_means).mean()
...@@ -82,25 +82,25 @@ error = population['numbers'].mean() - pd.Series(sampled_means).mean() ...@@ -82,25 +82,25 @@ error = population['numbers'].mean() - pd.Series(sampled_means).mean()
print('The Mean Sample Mean is only %f different the True Population mean!' % error) print('The Mean Sample Mean is only %f different the True Population mean!' % error)
# The Mean Sample Mean is only 2.359302 different the True Population mean! # The Mean Sample Mean is only 2.359302 different the True Population mean!
```
## 皮尔逊相关系数 ## 皮尔逊相关系数
基于 [cbare](http://stackoverflow.com/users/199166/cbare) 的[这个](http://stackoverflow.com/a/17389980/2935984) StackOverflow 答案。 基于 [cbare](http://stackoverflow.com/users/199166/cbare) 的[这个](http://stackoverflow.com/a/17389980/2935984) StackOverflow 答案。
```py ```
import statistics as stats import statistics as stats
x = [1,2,3,4,5,6,7,8,9] x = [1,2,3,4,5,6,7,8,9]
y = [2,1,2,4.5,7,6.5,6,9,9.5] y = [2,1,2,4.5,7,6.5,6,9,9.5]
```
有许多等价的表达方式来计算皮尔逊相关系数(也称为皮尔逊的 r)。这是一个。 有许多等价的表达方式来计算皮尔逊相关系数(也称为皮尔逊的 r)。这是一个。
$r={\frac {1}{n-1}}\sum_{i=1}^{n}\left({\frac {x_{i}-{\bar {x}}}{s_{x}}}\right)\left({\frac {y_{i}-{\bar {y}}}{s_{y}}}\right)$ $r={\frac {1}{n-1}}\sum_{i=1}^{n}\left({\frac {x_{i}-{\bar {x}}}{s_{x}}}\right)\left({\frac {y_{i}-{\bar {y}}}{s_{y}}}\right)$
其中 $s_{x}$ 和 $s_{y}$ 分别是 $x$ 和 $y$ 的标准差,$\left({\frac {x_{i}-{\bar {x}}}{s_{x}}}\right)$ 和 $\left({\frac {y_{i}-{\bar {y}}}{s_{y}}}\right)$ 分别是 $x$ 和 $y$ 的[标准得分](https://en.wikipedia.org/wiki/Standard_score)。
```py ```
# 创建函数 # 创建函数
def pearson(x,y): def pearson(x,y):
...@@ -142,11 +142,11 @@ def pearson(x,y): ...@@ -142,11 +142,11 @@ def pearson(x,y):
pearson(x,y) pearson(x,y)
# 0.9412443251336238 # 0.9412443251336238
```
## 概率质量函数(PMF) ## 概率质量函数(PMF)
```py ```
# 加载库 # 加载库
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
...@@ -175,13 +175,13 @@ for unique_value, count in count.items(): ...@@ -175,13 +175,13 @@ for unique_value, count in count.items():
# 绘制概率质量函数 # 绘制概率质量函数
plt.bar(list(probability_mass_function.keys()), probability_mass_function.values(), color='g') plt.bar(list(probability_mass_function.keys()), probability_mass_function.values(), color='g')
plt.show() plt.show()
```
![png](https://chrisalbon.com/statistics/frequentist/probability_mass_functions_10_0.png) ![png](https://chrisalbon.com/statistics/frequentist/probability_mass_functions_10_0.png)
## Spearman 排名相关度 ## Spearman 排名相关度
```py ```
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import scipy.stats import scipy.stats
...@@ -189,11 +189,11 @@ import scipy.stats ...@@ -189,11 +189,11 @@ import scipy.stats
# 创建两列随机变量 # 创建两列随机变量
x = [1,2,3,4,5,6,7,8,9] x = [1,2,3,4,5,6,7,8,9]
y = [2,1,2,4.5,7,6.5,6,9,9.5] y = [2,1,2,4.5,7,6.5,6,9,9.5]
```
Spearman 的排名相关度,是变量的排名版本的皮尔逊相关系数。 Spearman 的排名相关度,是变量的排名版本的皮尔逊相关系数。
```py ```
# 创建接受 x 和 y 的函数 # 创建接受 x 和 y 的函数
def spearmans_rank_correlation(xs, ys): def spearmans_rank_correlation(xs, ys):
...@@ -216,11 +216,11 @@ spearmans_rank_correlation(x, y)[0] ...@@ -216,11 +216,11 @@ spearmans_rank_correlation(x, y)[0]
scipy.stats.spearmanr(x, y)[0] scipy.stats.spearmanr(x, y)[0]
# 0.90377360145618102 # 0.90377360145618102
```
## T 检验 ## T 检验
```py ```
from scipy import stats from scipy import stats
import numpy as np import numpy as np
...@@ -231,13 +231,13 @@ x = np.random.normal(1, 1.5, 20) ...@@ -231,13 +231,13 @@ x = np.random.normal(1, 1.5, 20)
# 创建 20 个观测的列表,从均值为 0, # 创建 20 个观测的列表,从均值为 0,
# 标准差为 1.5 的正态分布中随机抽取 # 标准差为 1.5 的正态分布中随机抽取
y = np.random.normal(0, 1.5, 20) y = np.random.normal(0, 1.5, 20)
```
### 单样本双边 T 检验 ### 单样本双边 T 检验
想象一下单样本 T 检验:绘制一个“正态形状的”山丘,以`1`为中心,并以`1.5`为标准差而“展开”,然后在`0`处放置一个标志,并查看标志在山丘上的位置。它靠近顶部吗?还是远离山丘?如果标志靠近山丘的底部或更远,则 t 检验的 p 值将低于`0.05`。
```py ```
# 运行 T 检验来检验 x 的均值和 0 相比,是否有统计学显著的差异 # 运行 T 检验来检验 x 的均值和 0 相比,是否有统计学显著的差异
pvalue = stats.ttest_1samp(x, 0)[1] pvalue = stats.ttest_1samp(x, 0)[1]
...@@ -245,13 +245,13 @@ pvalue = stats.ttest_1samp(x, 0)[1] ...@@ -245,13 +245,13 @@ pvalue = stats.ttest_1samp(x, 0)[1]
pvalue pvalue
# 0.00010976647757800537 # 0.00010976647757800537
```
### 双样本非配对等方差双边 T 检验 ### 双样本非配对等方差双边 T 检验
类比单样本 T 检验,绘制两个(正态形状的)山丘,以它们各自的均值为中心,并根据它们的标准差决定各自的“平坦度”(个体延展度)。T 检验考察两座山丘重叠的程度。它们基本上是彼此覆盖的吗?还是只有山脚勉强相接?如果山丘的尾部只是勉强重叠或根本不重叠,则 t 检验的 p 值将低于 0.05。
```py ```
stats.ttest_ind(x, y)[1] stats.ttest_ind(x, y)[1]
# 0.00035082056802728071 # 0.00035082056802728071
...@@ -259,27 +259,27 @@ stats.ttest_ind(x, y)[1] ...@@ -259,27 +259,27 @@ stats.ttest_ind(x, y)[1]
stats.ttest_ind(x, y, equal_var=False)[1] stats.ttest_ind(x, y, equal_var=False)[1]
# 0.00035089238660076095 # 0.00035089238660076095
```
### 双样本配对双边 T 检验 ### 双样本配对双边 T 检验
当我们采集重复样本,并且想要考虑我们正在测试的两个分布是成对的这一事实时,使用配对 T 检验。 当我们采集重复样本,并且想要考虑我们正在测试的两个分布是成对的这一事实时,使用配对 T 检验。
```py ```
stats.ttest_rel(x, y)[1] stats.ttest_rel(x, y)[1]
# 0.00034222792790150386 # 0.00034222792790150386
```
## 方差和标准差 ## 方差和标准差
```py ```
# 导入包 # 导入包
import math import math
# 创建值的列表 # 创建值的列表
data = [3,2,3,4,2,3,5,2,2,33,3,5,2,2,5,6,62,2,2,3,6,6,2,23,3,2,3] data = [3,2,3,4,2,3,5,2,2,33,3,5,2,2,5,6,62,2,2,3,6,6,2,23,3,2,3]
```
方差是衡量数据分布延展度的指标。方差越大,数据点越“分散”。方差通常表示为 $S^{2}$,计算方式如下:
...@@ -289,7 +289,7 @@ $ \text{Sample Variance} = S_{n-1}^{2} = \frac{1}{n-1}\sum_{i=1}^{n}(x_i-\bar{x} ...@@ -289,7 +289,7 @@ $ \text{Sample Variance} = S_{n-1}^{2} = \frac{1}{n-1}\sum_{i=1}^{n}(x_i-\bar{x}
其中 $n$ 是观测数,$\bar{x}$ 是观察值的平均值,$x_i-\bar{x}$ 是单个观察值减去数据均值。 请注意,如果我们根据来自该总体的样本估计总体的方差,我们应该使用第二个等式,将 $n$ 替换为 $n-1$。 其中 $n$ 是观测数,$\bar{x}$ 是观察值的平均值,$x_i-\bar{x}$ 是单个观察值减去数据均值。 请注意,如果我们根据来自该总体的样本估计总体的方差,我们应该使用第二个等式,将 $n$ 替换为 $n-1$。
```py ```
# 计算 n # 计算 n
n = len(data) n = len(data)
...@@ -321,11 +321,11 @@ population_variance = sum_of_deviations_from_mean_squared/n ...@@ -321,11 +321,11 @@ population_variance = sum_of_deviations_from_mean_squared/n
population_variance population_variance
# 160.78463648834017 # 160.78463648834017
```
标准差就是方差的平方根。 标准差就是方差的平方根。
```py ```
# 计算总体方差的平方根 # 计算总体方差的平方根
population_standard_deviation = math.sqrt(population_variance) population_standard_deviation = math.sqrt(population_variance)
...@@ -333,3 +333,4 @@ population_standard_deviation = math.sqrt(population_variance) ...@@ -333,3 +333,4 @@ population_standard_deviation = math.sqrt(population_variance)
population_standard_deviation population_standard_deviation
# 12.68008818929664 # 12.68008818929664
```
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册