{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"import pandas as pd\n",
"from pandas import Series,DataFrame"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 上一讲,append,concat数据集成方法\n",
"# merge融合,根据某一共同属性进行级联,高级用法"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" sex | \n",
" id | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" A | \n",
" 男 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" B | \n",
" 女 | \n",
" 2 | \n",
"
\n",
" \n",
" 2 | \n",
" C | \n",
" 女 | \n",
" 3 | \n",
"
\n",
" \n",
" 3 | \n",
" D | \n",
" 女 | \n",
" 4 | \n",
"
\n",
" \n",
" 4 | \n",
" E | \n",
" 男 | \n",
" 5 | \n",
"
\n",
" \n",
" 5 | \n",
" F | \n",
" 男 | \n",
" 6 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name sex id\n",
"0 A 男 1\n",
"1 B 女 2\n",
"2 C 女 3\n",
"3 D 女 4\n",
"4 E 男 5\n",
"5 F 男 6"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1 = DataFrame({'name':['A','B','C','D','E','F'],\n",
" 'sex':['男','女','女','女','男','男'],\n",
" 'id':[1,2,3,4,5,6]})\n",
"df1"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" age | \n",
" salary | \n",
" id | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 22 | \n",
" 12000 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 25 | \n",
" 15000 | \n",
" 2 | \n",
"
\n",
" \n",
" 2 | \n",
" 27 | \n",
" 20000 | \n",
" 3 | \n",
"
\n",
" \n",
" 3 | \n",
" 21 | \n",
" 30000 | \n",
" 4 | \n",
"
\n",
" \n",
" 4 | \n",
" 18 | \n",
" 10000 | \n",
" 5 | \n",
"
\n",
" \n",
" 5 | \n",
" 29 | \n",
" 8000 | \n",
" 7 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" age salary id\n",
"0 22 12000 1\n",
"1 25 15000 2\n",
"2 27 20000 3\n",
"3 21 30000 4\n",
"4 18 10000 5\n",
"5 29 8000 7"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2 = DataFrame({'age':[22,25,27,21,18,29],'salary':[12000,15000,20000,30000,10000,8000],'id':[1,2,3,4,5,7]})\n",
"df2"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\python36\\lib\\site-packages\\pandas\\core\\frame.py:6692: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
"of pandas will change to not sort by default.\n",
"\n",
"To accept the future behavior, pass 'sort=False'.\n",
"\n",
"To retain the current behavior and silence the warning, pass 'sort=True'.\n",
"\n",
" sort=sort)\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" age | \n",
" id | \n",
" name | \n",
" salary | \n",
" sex | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" NaN | \n",
" 1 | \n",
" A | \n",
" NaN | \n",
" 男 | \n",
"
\n",
" \n",
" 1 | \n",
" NaN | \n",
" 2 | \n",
" B | \n",
" NaN | \n",
" 女 | \n",
"
\n",
" \n",
" 2 | \n",
" NaN | \n",
" 3 | \n",
" C | \n",
" NaN | \n",
" 女 | \n",
"
\n",
" \n",
" 3 | \n",
" NaN | \n",
" 4 | \n",
" D | \n",
" NaN | \n",
" 女 | \n",
"
\n",
" \n",
" 4 | \n",
" NaN | \n",
" 5 | \n",
" E | \n",
" NaN | \n",
" 男 | \n",
"
\n",
" \n",
" 5 | \n",
" NaN | \n",
" 6 | \n",
" F | \n",
" NaN | \n",
" 男 | \n",
"
\n",
" \n",
" 0 | \n",
" 22.0 | \n",
" 1 | \n",
" NaN | \n",
" 12000.0 | \n",
" NaN | \n",
"
\n",
" \n",
" 1 | \n",
" 25.0 | \n",
" 2 | \n",
" NaN | \n",
" 15000.0 | \n",
" NaN | \n",
"
\n",
" \n",
" 2 | \n",
" 27.0 | \n",
" 3 | \n",
" NaN | \n",
" 20000.0 | \n",
" NaN | \n",
"
\n",
" \n",
" 3 | \n",
" 21.0 | \n",
" 4 | \n",
" NaN | \n",
" 30000.0 | \n",
" NaN | \n",
"
\n",
" \n",
" 4 | \n",
" 18.0 | \n",
" 5 | \n",
" NaN | \n",
" 10000.0 | \n",
" NaN | \n",
"
\n",
" \n",
" 5 | \n",
" 29.0 | \n",
" 7 | \n",
" NaN | \n",
" 8000.0 | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" age id name salary sex\n",
"0 NaN 1 A NaN 男\n",
"1 NaN 2 B NaN 女\n",
"2 NaN 3 C NaN 女\n",
"3 NaN 4 D NaN 女\n",
"4 NaN 5 E NaN 男\n",
"5 NaN 6 F NaN 男\n",
"0 22.0 1 NaN 12000.0 NaN\n",
"1 25.0 2 NaN 15000.0 NaN\n",
"2 27.0 3 NaN 20000.0 NaN\n",
"3 21.0 4 NaN 30000.0 NaN\n",
"4 18.0 5 NaN 10000.0 NaN\n",
"5 29.0 7 NaN 8000.0 NaN"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1.append(df2)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" sex | \n",
" id | \n",
" age | \n",
" salary | \n",
" id | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" A | \n",
" 男 | \n",
" 1 | \n",
" 22 | \n",
" 12000 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" B | \n",
" 女 | \n",
" 2 | \n",
" 25 | \n",
" 15000 | \n",
" 2 | \n",
"
\n",
" \n",
" 2 | \n",
" C | \n",
" 女 | \n",
" 3 | \n",
" 27 | \n",
" 20000 | \n",
" 3 | \n",
"
\n",
" \n",
" 3 | \n",
" D | \n",
" 女 | \n",
" 4 | \n",
" 21 | \n",
" 30000 | \n",
" 4 | \n",
"
\n",
" \n",
" 4 | \n",
" E | \n",
" 男 | \n",
" 5 | \n",
" 18 | \n",
" 10000 | \n",
" 5 | \n",
"
\n",
" \n",
" 5 | \n",
" F | \n",
" 男 | \n",
" 6 | \n",
" 29 | \n",
" 8000 | \n",
" 7 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name sex id age salary id\n",
"0 A 男 1 22 12000 1\n",
"1 B 女 2 25 15000 2\n",
"2 C 女 3 27 20000 3\n",
"3 D 女 4 21 30000 4\n",
"4 E 男 5 18 10000 5\n",
"5 F 男 6 29 8000 7"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.concat([df1,df2],axis = 1)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" sex | \n",
" id | \n",
" age | \n",
" salary | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" A | \n",
" 男 | \n",
" 1 | \n",
" 22 | \n",
" 12000 | \n",
"
\n",
" \n",
" 1 | \n",
" B | \n",
" 女 | \n",
" 2 | \n",
" 25 | \n",
" 15000 | \n",
"
\n",
" \n",
" 2 | \n",
" C | \n",
" 女 | \n",
" 3 | \n",
" 27 | \n",
" 20000 | \n",
"
\n",
" \n",
" 3 | \n",
" D | \n",
" 女 | \n",
" 4 | \n",
" 21 | \n",
" 30000 | \n",
"
\n",
" \n",
" 4 | \n",
" E | \n",
" 男 | \n",
" 5 | \n",
" 18 | \n",
" 10000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name sex id age salary\n",
"0 A 男 1 22 12000\n",
"1 B 女 2 25 15000\n",
"2 C 女 3 27 20000\n",
"3 D 女 4 21 30000\n",
"4 E 男 5 18 10000"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1.merge(df2)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" sex | \n",
" id | \n",
" age | \n",
" salary | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" A | \n",
" 男 | \n",
" 1 | \n",
" 22.0 | \n",
" 12000.0 | \n",
"
\n",
" \n",
" 1 | \n",
" B | \n",
" 女 | \n",
" 2 | \n",
" 25.0 | \n",
" 15000.0 | \n",
"
\n",
" \n",
" 2 | \n",
" C | \n",
" 女 | \n",
" 3 | \n",
" 27.0 | \n",
" 20000.0 | \n",
"
\n",
" \n",
" 3 | \n",
" D | \n",
" 女 | \n",
" 4 | \n",
" 21.0 | \n",
" 30000.0 | \n",
"
\n",
" \n",
" 4 | \n",
" E | \n",
" 男 | \n",
" 5 | \n",
" 18.0 | \n",
" 10000.0 | \n",
"
\n",
" \n",
" 5 | \n",
" F | \n",
" 男 | \n",
" 6 | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 6 | \n",
" NaN | \n",
" NaN | \n",
" 7 | \n",
" 29.0 | \n",
" 8000.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name sex id age salary\n",
"0 A 男 1 22.0 12000.0\n",
"1 B 女 2 25.0 15000.0\n",
"2 C 女 3 27.0 20000.0\n",
"3 D 女 4 21.0 30000.0\n",
"4 E 男 5 18.0 10000.0\n",
"5 F 男 6 NaN NaN\n",
"6 NaN NaN 7 29.0 8000.0"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1.merge(df2,how = 'outer')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Python | \n",
" Math | \n",
" En | \n",
"
\n",
" \n",
" \n",
" \n",
" A | \n",
" 40 | \n",
" 15 | \n",
" 90 | \n",
"
\n",
" \n",
" B | \n",
" 59 | \n",
" 52 | \n",
" 83 | \n",
"
\n",
" \n",
" C | \n",
" 14 | \n",
" 138 | \n",
" 137 | \n",
"
\n",
" \n",
" D | \n",
" 89 | \n",
" 78 | \n",
" 53 | \n",
"
\n",
" \n",
" E | \n",
" 81 | \n",
" 101 | \n",
" 3 | \n",
"
\n",
" \n",
" F | \n",
" 75 | \n",
" 79 | \n",
" 86 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Python Math En\n",
"A 40 15 90\n",
"B 59 52 83\n",
"C 14 138 137\n",
"D 89 78 53\n",
"E 81 101 3\n",
"F 75 79 86"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = DataFrame(np.random.randint(0,150,size = (6,3)),index = list('ABCDEF'),columns=['Python','Math','En'])\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/plain": [
"Python 59.7\n",
"Math 77.2\n",
"En 75.3\n",
"dtype: float64"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"s = df.mean().round(1)\n",
"s"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" score_mean | \n",
"
\n",
" \n",
" \n",
" \n",
" Python | \n",
" 59.7 | \n",
"
\n",
" \n",
" Math | \n",
" 77.2 | \n",
"
\n",
" \n",
" En | \n",
" 75.3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" score_mean\n",
"Python 59.7\n",
"Math 77.2\n",
"En 75.3"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2 = DataFrame(s)\n",
"df2.columns = ['score_mean']\n",
"df2"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Python | \n",
" Math | \n",
" En | \n",
"
\n",
" \n",
" \n",
" \n",
" score_mean | \n",
" 59.7 | \n",
" 77.2 | \n",
" 75.3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Python Math En\n",
"score_mean 59.7 77.2 75.3"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df3 = df2.T\n",
"df3"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Python | \n",
" Math | \n",
" En | \n",
"
\n",
" \n",
" \n",
" \n",
" A | \n",
" 40.0 | \n",
" 15.0 | \n",
" 90.0 | \n",
"
\n",
" \n",
" B | \n",
" 59.0 | \n",
" 52.0 | \n",
" 83.0 | \n",
"
\n",
" \n",
" C | \n",
" 14.0 | \n",
" 138.0 | \n",
" 137.0 | \n",
"
\n",
" \n",
" D | \n",
" 89.0 | \n",
" 78.0 | \n",
" 53.0 | \n",
"
\n",
" \n",
" E | \n",
" 81.0 | \n",
" 101.0 | \n",
" 3.0 | \n",
"
\n",
" \n",
" F | \n",
" 75.0 | \n",
" 79.0 | \n",
" 86.0 | \n",
"
\n",
" \n",
" score_mean | \n",
" 59.7 | \n",
" 77.2 | \n",
" 75.3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Python Math En\n",
"A 40.0 15.0 90.0\n",
"B 59.0 52.0 83.0\n",
"C 14.0 138.0 137.0\n",
"D 89.0 78.0 53.0\n",
"E 81.0 101.0 3.0\n",
"F 75.0 79.0 86.0\n",
"score_mean 59.7 77.2 75.3"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df4 = df.append(df3)\n",
"df4"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" score_mean | \n",
"
\n",
" \n",
" \n",
" \n",
" A | \n",
" 48.3 | \n",
"
\n",
" \n",
" B | \n",
" 64.7 | \n",
"
\n",
" \n",
" C | \n",
" 96.3 | \n",
"
\n",
" \n",
" D | \n",
" 73.3 | \n",
"
\n",
" \n",
" E | \n",
" 61.7 | \n",
"
\n",
" \n",
" F | \n",
" 80.0 | \n",
"
\n",
" \n",
" score_mean | \n",
" 70.7 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" score_mean\n",
"A 48.3\n",
"B 64.7\n",
"C 96.3\n",
"D 73.3\n",
"E 61.7\n",
"F 80.0\n",
"score_mean 70.7"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df5 = DataFrame(df4.mean(axis = 1).round(1))\n",
"df5.columns = ['score_mean']\n",
"df5"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Python | \n",
" Math | \n",
" En | \n",
" score_mean | \n",
"
\n",
" \n",
" \n",
" \n",
" A | \n",
" 40.0 | \n",
" 15.0 | \n",
" 90.0 | \n",
" 48.3 | \n",
"
\n",
" \n",
" B | \n",
" 59.0 | \n",
" 52.0 | \n",
" 83.0 | \n",
" 64.7 | \n",
"
\n",
" \n",
" C | \n",
" 14.0 | \n",
" 138.0 | \n",
" 137.0 | \n",
" 96.3 | \n",
"
\n",
" \n",
" D | \n",
" 89.0 | \n",
" 78.0 | \n",
" 53.0 | \n",
" 73.3 | \n",
"
\n",
" \n",
" E | \n",
" 81.0 | \n",
" 101.0 | \n",
" 3.0 | \n",
" 61.7 | \n",
"
\n",
" \n",
" F | \n",
" 75.0 | \n",
" 79.0 | \n",
" 86.0 | \n",
" 80.0 | \n",
"
\n",
" \n",
" score_mean | \n",
" 59.7 | \n",
" 77.2 | \n",
" 75.3 | \n",
" 70.7 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Python Math En score_mean\n",
"A 40.0 15.0 90.0 48.3\n",
"B 59.0 52.0 83.0 64.7\n",
"C 14.0 138.0 137.0 96.3\n",
"D 89.0 78.0 53.0 73.3\n",
"E 81.0 101.0 3.0 61.7\n",
"F 75.0 79.0 86.0 80.0\n",
"score_mean 59.7 77.2 75.3 70.7"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df4.merge(df5,left_index=True,right_index=True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}