提交 e81332b9 编写于 作者: W wizardforcel

21

上级 dc81c6ac
......@@ -8,7 +8,7 @@
## MatPlotLib 中的双向条形图
```
```py
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
......@@ -31,7 +31,7 @@ df
| 3 | Jake | 2 | 62 | 23 |
| 4 | Amy | 3 | 70 | 51 |
```
```py
# 输入数据,特别是第二和
# 第三行,跳过第一列
x1 = df.ix[1, 1:]
......@@ -84,7 +84,7 @@ plt.show()
## MatPlotLib 中的条形图
```
```py
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
......@@ -107,7 +107,7 @@ df
| 3 | Jake | 2 | 62 | 23 |
| 4 | Amy | 3 | 70 | 51 |
```
```py
# 为每个变量创建得分均值的列表
mean_values = [df['pre_score'].mean(), df['mid_score'].mean(), df['post_score'].mean()]
......@@ -152,7 +152,7 @@ plt.show()
## Seaborn 中的调色板
```
```py
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
......@@ -177,139 +177,139 @@ sns.palplot(sns.color_palette("deep", 10))
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_5_0.png)
```
```py
sns.palplot(sns.color_palette("muted", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_6_0.png)
```
```py
sns.palplot(sns.color_palette("bright", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_7_0.png)
```
```py
sns.palplot(sns.color_palette("dark", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_8_0.png)
```
```py
sns.palplot(sns.color_palette("colorblind", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_9_0.png)
```
```py
sns.palplot(sns.color_palette("Paired", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_10_0.png)
```
```py
sns.palplot(sns.color_palette("BuGn", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_11_0.png)
```
```py
sns.palplot(sns.color_palette("GnBu", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_12_0.png)
```
```py
sns.palplot(sns.color_palette("OrRd", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_13_0.png)
```
```py
sns.palplot(sns.color_palette("PuBu", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_14_0.png)
```
```py
sns.palplot(sns.color_palette("YlGn", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_15_0.png)
```
```py
sns.palplot(sns.color_palette("YlGnBu", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_16_0.png)
```
```py
sns.palplot(sns.color_palette("YlOrBr", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_17_0.png)
```
```py
sns.palplot(sns.color_palette("YlOrRd", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_18_0.png)
```
```py
sns.palplot(sns.color_palette("BrBG", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_19_0.png)
```
```py
sns.palplot(sns.color_palette("PiYG", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_20_0.png)
```
```py
sns.palplot(sns.color_palette("PRGn", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_21_0.png)
```
```py
sns.palplot(sns.color_palette("PuOr", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_22_0.png)
```
```py
sns.palplot(sns.color_palette("RdBu", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_23_0.png)
```
```py
sns.palplot(sns.color_palette("RdGy", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_24_0.png)
```
```py
sns.palplot(sns.color_palette("RdYlBu", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_25_0.png)
```
```py
sns.palplot(sns.color_palette("RdYlGn", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_26_0.png)
```
```py
sns.palplot(sns.color_palette("Spectral", 10))
```
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_27_0.png)
```
```py
# 创建调色板并将其设为当前调色板
flatui = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"]
sns.set_palette(flatui)
......@@ -318,7 +318,7 @@ sns.palplot(sns.color_palette())
![png](https://chrisalbon.com/python/data_visualization/seaborn_color_palettes_29_0.png)
```
```py
# 设置绘图颜色
sns.tsplot([df.deaths_regiment_1, df.deaths_regiment_2, df.deaths_regiment_3, df.deaths_regiment_4,
df.deaths_regiment_5, df.deaths_regiment_6, df.deaths_regiment_7], color="#34495e")
......@@ -330,7 +330,7 @@ sns.tsplot([df.deaths_regiment_1, df.deaths_regiment_2, df.deaths_regiment_3, df
## 使用 Seaborn 和 pandas 创建时间序列绘图
```
```py
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
......@@ -357,7 +357,7 @@ sns.tsplot([df.deaths_regiment_1, df.deaths_regiment_2, df.deaths_regiment_3, df
![png](https://chrisalbon.com/python/data_visualization/seaborn_pandas_timeseries_plot_5_1.png)
```
```py
# 带有置信区间误差条、但没有连线的时间序列绘图
sns.tsplot([df.deaths_regiment_1, df.deaths_regiment_2, df.deaths_regiment_3, df.deaths_regiment_4,
df.deaths_regiment_5, df.deaths_regiment_6, df.deaths_regiment_7], err_style="ci_bars", interpolate=False)
......@@ -369,7 +369,7 @@ sns.tsplot([df.deaths_regiment_1, df.deaths_regiment_2, df.deaths_regiment_3, df
## 使用 Seaborn 创建散点图
```
```py
import pandas as pd
%matplotlib inline
import random
......@@ -397,7 +397,7 @@ df.head()
| 3 | 510 | 206 | 1 | female |
| 4 | 848 | 357 | 0 | female |
```
```py
# 设置散点图样式
sns.set_context("notebook", font_scale=1.1)
sns.set_style("ticks")
......@@ -427,7 +427,7 @@ plt.ylabel('Deaths')
## MatPlotLib 中的分组条形图
```
```py
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
......@@ -449,7 +449,7 @@ df
| 3 | Jake | 2 | 62 | 23 |
| 4 | Amy | 3 | 70 | 51 |
```
```py
# 设置条形的位置和宽度
pos = list(range(len(df['pre_score'])))
width = 0.25
......@@ -525,7 +525,7 @@ plt.show()
## MatPlotLib 中的直方图
```
```py
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
......@@ -551,7 +551,7 @@ df.head()
| 3 | Battle of the Green Fork | 298 | 4 | Robb Stark | Joffrey/Tommen Baratheon | Stark | NaN | NaN | NaN | Lannister | NaN | NaN | NaN | loss | pitched battle | 1 | 1 | 18000 | 20000 | Roose Bolton, Wylis Manderly, Medger Cerwyn, H... | Tywin Lannister, Gregor Clegane, Kevan Lannist... | 1 | Green Fork | The Riverlands | NaN |
| 4 | Battle of the Whispering Wood | 298 | 5 | Robb Stark | Joffrey/Tommen Baratheon | Stark | Tully | NaN | NaN | Lannister | NaN | NaN | NaN | win | ambush | 1 | 1 | 1875 | 6000 | Robb Stark, Brynden Tully | Jaime Lannister | 1 | Whispering Wood | The Riverlands | NaN |
```
```py
# 制作攻击方和防守方大小的两个变量
# 但是当攻击方超过 90000 个时将其排除在外
data1 = df['attacker_size'][df['attacker_size'] < 90000]
......@@ -588,7 +588,7 @@ plt.show()
![png](https://chrisalbon.com/python/data_visualization/matplotlib_histogram_6_0.png)
```
```py
# 制作攻击方和防守方大小的两个变量
# 但是当攻击方超过 90000 个时将其排除在外
data1 = df['attacker_size'][df['attacker_size'] < 90000]
......@@ -640,7 +640,7 @@ plt.show()
## 从 Pandas 数据帧生成 MatPlotLib 散点图
```
```py
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
......@@ -664,7 +664,7 @@ df
| 3 | Jake | Milner | 24 | 0 | 2 | 62 |
| 4 | Amy | Cooze | 73 | 1 | 3 | 70 |
```
```py
# preTestScore 和 postTestScore 的散点图
# 每个点的大小取决于年龄
plt.scatter(df.preTestScore, df.postTestScore
......@@ -675,7 +675,7 @@ plt.scatter(df.preTestScore, df.postTestScore
![png](https://chrisalbon.com/python/data_visualization/matplotlib_scatterplot_from_pandas_6_1.png)
```
```py
# preTestScore 和 postTestScore 的散点图
# 大小为 300,颜色取决于性别
plt.scatter(df.preTestScore, df.postTestScore, s=300, c=df.female)
......@@ -687,7 +687,7 @@ plt.scatter(df.preTestScore, df.postTestScore, s=300, c=df.female)
## Matplotlib 的简单示例
```
```py
# 让 Jupyter 加载 matplotlib
# 并内联创建所有绘图(也就是在页面上)
%matplotlib inline
......@@ -703,7 +703,7 @@ pyplot.plot([1.6, 2.7])
## MatPlotLib 中的饼图
```
```py
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
......@@ -724,7 +724,7 @@ df
| 3 | Jake | 2 | 62 | 23 |
| 4 | Amy | 3 | 70 | 51 |
```
```py
# 创建一列,其中包含每个官员的总逮捕数
df['total_arrests'] = df['jan_arrests'] + df['feb_arrests'] + df['march_arrests']
df
......@@ -738,7 +738,7 @@ df
| 3 | Jake | 2 | 62 | 23 | 87 |
| 4 | Amy | 3 | 70 | 51 | 124 |
```
```py
# 创建颜色列表(来自 iWantHue)
colors = ["#E13F29", "#D69A80", "#D63B59", "#AE5552", "#CB5C3B", "#EB8076", "#96624E"]
......@@ -772,7 +772,7 @@ plt.show()
## MatPlotLib 中的散点图
```
```py
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
......@@ -796,7 +796,7 @@ df.head()
| 3 | Battle of the Green Fork | 298 | 4 | Robb Stark | Joffrey/Tommen Baratheon | Stark | NaN | NaN | NaN | Lannister | NaN | NaN | NaN | loss | pitched battle | 1.0 | 1.0 | 18000.0 | 20000.0 | Roose Bolton, Wylis Manderly, Medger Cerwyn, H... | Tywin Lannister, Gregor Clegane, Kevan Lannist... | 1.0 | Green Fork | The Riverlands | NaN |
| 4 | Battle of the Whispering Wood | 298 | 5 | Robb Stark | Joffrey/Tommen Baratheon | Stark | Tully | NaN | NaN | Lannister | NaN | NaN | NaN | win | ambush | 1.0 | 1.0 | 1875.0 | 6000.0 | Robb Stark, Brynden Tully | Jaime Lannister | 1.0 | Whispering Wood | The Riverlands | NaN |
```
```py
# 创建图形
plt.figure(figsize=(10,8))
......@@ -869,7 +869,7 @@ plt.show()
## MatPlotLib 中的栈式百分比条形图
```
```py
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
......@@ -890,7 +890,7 @@ df
| 3 | Jake | 2 | 62 | 23 |
| 4 | Amy | 3 | 70 | 51 |
```
```py
# 创建带有一个子图的图形
f, ax = plt.subplots(1, figsize=(10,5))
......
......@@ -20,7 +20,7 @@ $ s^2 = \frac {1}{n-1} \sum_{i=1}^n \left(x_i - \overline{x} \right)^ 2 $
## 演示中心极限定理
```
```py
# 导入包
import pandas as pd
import numpy as np
......@@ -43,7 +43,7 @@ population['numbers'].hist(bins=100)
![png](https://chrisalbon.com/statistics/frequentist/demonstrate_the_central_limit_theorem_5_1.png)
```
```py
# 查看数值的均值
population['numbers'].mean()
......@@ -69,7 +69,7 @@ pd.Series(sampled_means).hist(bins=100)
这是关键的图表:请记住,总体分布是均匀的,然而样本均值的分布却接近正态。 这是中心极限定理的关键点,也是我们可以假设样本均值无偏的原因。
```
```py
# 查看 sampled_means 的均值
pd.Series(sampled_means).mean()
......@@ -88,7 +88,7 @@ print('The Mean Sample Mean is only %f different the True Population mean!' % er
基于 [cbare](http://stackoverflow.com/users/199166/cbare) 的[这个](http://stackoverflow.com/a/17389980/2935984) StackOverflow 答案。
```
```py
import statistics as stats
x = [1,2,3,4,5,6,7,8,9]
......@@ -100,7 +100,7 @@ $r={\frac {1}{n-1}}\sum_{i=1}^{n}\left({\frac {x_{i}-{\bar {x}}}{s_{x}}}\right)\
其中 $s_{x}$ 和 $s_{y}$ 分别是 $x$ 和 $y$ 的样本标准差,$\left({\frac {x_{i}-{\bar {x}}}{s_{x}}}\right)$ 是 $x$ 的[标准得分](https://en.wikipedia.org/wiki/Standard_score)($y$ 的标准得分同理)。
```
```py
# 创建函数
def pearson(x,y):
......@@ -146,7 +146,7 @@ pearson(x,y)
## 概率质量函数(PMF)
```
```py
# 加载库
import matplotlib.pyplot as plt
......@@ -181,7 +181,7 @@ plt.show()
## Spearman 排名相关度
```
```py
import numpy as np
import pandas as pd
import scipy.stats
......@@ -193,7 +193,7 @@ y = [2,1,2,4.5,7,6.5,6,9,9.5]
Spearman 的排名相关度,是变量的排名版本的皮尔逊相关系数。
```
```py
# 创建接受 x 和 y 的函数
def spearmans_rank_correlation(xs, ys):
......@@ -220,7 +220,7 @@ scipy.stats.spearmanr(x, y)[0]
## T 检验
```
```py
from scipy import stats
import numpy as np
......@@ -237,7 +237,7 @@ y = np.random.normal(0, 1.5, 20)
想象一下单样本 T 检验,并绘制一个“正态形状的”山丘,以`1`为中心,并以`1.5`为标准差而“展开”,然后在`0`处放置一个标志并查看标志在山丘上的位置。它靠近顶部吗? 或者远离山丘? 如果标志靠近山丘的底部或更远,则 t 检验的 p 值将低于`0.05`。
```
```py
# 运行 T 检验来检验 x 的均值和 0 相比,是否有统计学显著的差异
pvalue = stats.ttest_1samp(x, 0)[1]
......@@ -251,7 +251,7 @@ pvalue
想象一下双样本 T 检验:绘制两个(正态形状的)山丘,各自以其均值为中心,其“平坦度”(个体延展度)由各自的标准差决定。 T 检验考察了两座山丘重叠的程度。 它们基本上是彼此覆盖的吗? 山丘的底部几乎没有碰到吗? 如果山丘的尾部刚刚重叠或根本不重叠,则 t 检验的 p 值将低于 0.05。
```
```py
stats.ttest_ind(x, y)[1]
# 0.00035082056802728071
......@@ -265,7 +265,7 @@ stats.ttest_ind(x, y, equal_var=False)[1]
当我们采集重复样本,并且想要考虑我们正在测试的两个分布是成对的这一事实时,使用配对 T 检验。
```
```py
stats.ttest_rel(x, y)[1]
# 0.00034222792790150386
......@@ -273,7 +273,7 @@ stats.ttest_rel(x, y)[1]
## 方差和标准差
```
```py
# 导入包
import math
......@@ -289,7 +289,7 @@ $ \text{Sample Variance} = S_{n-1}^{2} = \frac{1}{n-1}\sum_{i=1}^{n}(x_i-\bar{x}
其中 $n$ 是观测值的数量,$\bar{x}$ 是观测值的均值,$x_i-\bar{x}$ 是单个观测值减去数据均值。 请注意,如果我们根据来自总体的样本来估计总体方差,则应使用第二个等式,即将 $n$ 替换为 $n-1$。
```
```py
# 计算 n
n = len(data)
......@@ -325,7 +325,7 @@ population_variance
标准差就是方差的平方根。
```
```py
# 计算总体方差的平方根
population_standard_deviation = math.sqrt(population_variance)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册