Tue May 2 03:57:00 UTC 2023 inscode

上级 df352508
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 异常值检测"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Looking in indexes: http://mirrors.csdn.net.cn/repository/csdn-pypi-mirrors/simple\n",
"Collecting pandas\n",
" Downloading http://mirrors.csdn.net.cn/repository/csdn-pypi-mirrors/packages/pandas/2.0.1/pandas-2.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.3/12.3 MB\u001b[0m \u001b[31m284.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hCollecting openpyxl\n",
" Downloading http://mirrors.csdn.net.cn/repository/csdn-pypi-mirrors/packages/openpyxl/3.1.2/openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m250.0/250.0 kB\u001b[0m \u001b[31m189.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting xlrd==1.2.0\n",
" Downloading http://mirrors.csdn.net.cn/repository/csdn-pypi-mirrors/packages/xlrd/1.2.0/xlrd-1.2.0-py2.py3-none-any.whl (103 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m103.3/103.3 kB\u001b[0m \u001b[31m137.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting tzdata>=2022.1\n",
" Downloading http://mirrors.csdn.net.cn/repository/csdn-pypi-mirrors/packages/tzdata/2023.3/tzdata-2023.3-py2.py3-none-any.whl (341 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m341.8/341.8 kB\u001b[0m \u001b[31m250.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting numpy>=1.20.3\n",
" Downloading http://mirrors.csdn.net.cn/repository/csdn-pypi-mirrors/packages/numpy/1.24.3/numpy-1.24.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m17.3/17.3 MB\u001b[0m \u001b[31m236.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hCollecting pytz>=2020.1\n",
" Downloading http://mirrors.csdn.net.cn/repository/csdn-pypi-mirrors/packages/pytz/2023.3/pytz-2023.3-py2.py3-none-any.whl (502 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m502.3/502.3 kB\u001b[0m \u001b[31m231.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.8/dist-packages (from pandas) (2.8.2)\n",
"Collecting et-xmlfile\n",
" Downloading http://mirrors.csdn.net.cn/repository/csdn-pypi-mirrors/packages/et-xmlfile/1.1.0/et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.8/dist-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n",
"Installing collected packages: pytz, xlrd, tzdata, numpy, et-xmlfile, pandas, openpyxl\n",
"Successfully installed et-xmlfile-1.1.0 numpy-1.24.3 openpyxl-3.1.2 pandas-2.0.1 pytz-2023.3 tzdata-2023.3 xlrd-1.2.0\n",
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
"\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.2.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"# Install the notebook's dependencies into the running kernel's environment.\n",
"# Versions are pinned to the ones recorded in this cell's output so a re-run is reproducible.\n",
"%pip install pandas==2.0.1 openpyxl==3.1.2 xlrd==1.2.0"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "only integer scalar arrays can be converted to a scalar index",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[4], line 24\u001b[0m\n\u001b[1;32m 18\u001b[0m result \u001b[39m=\u001b[39m {\n\u001b[1;32m 19\u001b[0m \u001b[39m'\u001b[39m\u001b[39moutlier_values\u001b[39m\u001b[39m'\u001b[39m: outliers,\n\u001b[1;32m 20\u001b[0m \u001b[39m'\u001b[39m\u001b[39moutlier_index\u001b[39m\u001b[39m'\u001b[39m: outlier_index,\n\u001b[1;32m 21\u001b[0m \u001b[39m'\u001b[39m\u001b[39moutlier_ratio\u001b[39m\u001b[39m'\u001b[39m: outlier_ratio\n\u001b[1;32m 22\u001b[0m }\n\u001b[1;32m 23\u001b[0m \u001b[39mreturn\u001b[39;00m result\n\u001b[0;32m---> 24\u001b[0m detect_outliers([\u001b[39m51\u001b[39;49m, \u001b[39m2618.2\u001b[39;49m, \u001b[39m2608.4\u001b[39;49m, \u001b[39m2651.9\u001b[39;49m, \u001b[39m3442.1\u001b[39;49m, \u001b[39m3393.1\u001b[39;49m, \u001b[39m3136.1\u001b[39;49m, \u001b[39m3744.1\u001b[39;49m,\n\u001b[1;32m 25\u001b[0m \u001b[39m6607.4\u001b[39;49m, \u001b[39m4060.3\u001b[39;49m, \u001b[39m3614.7\u001b[39;49m, \u001b[39m3295.5\u001b[39;49m, \u001b[39m2332.1\u001b[39;49m, \u001b[39m2699.3\u001b[39;49m, \u001b[39m3036.8\u001b[39;49m,\n\u001b[1;32m 26\u001b[0m \u001b[39m865\u001b[39;49m, \u001b[39m3014.3\u001b[39;49m, \u001b[39m2742.8\u001b[39;49m, \u001b[39m2173.5\u001b[39;49m])\n",
"Cell \u001b[0;32mIn[4], line 11\u001b[0m, in \u001b[0;36mdetect_outliers\u001b[0;34m(data)\u001b[0m\n\u001b[1;32m 8\u001b[0m lower_bound \u001b[39m=\u001b[39m q1 \u001b[39m-\u001b[39m \u001b[39m1.5\u001b[39m \u001b[39m*\u001b[39m iqr \u001b[39m# 计算下界\u001b[39;00m\n\u001b[1;32m 10\u001b[0m \u001b[39m# 判断异常值,大于上界或小于下界的值即为异常值\u001b[39;00m\n\u001b[0;32m---> 11\u001b[0m outliers \u001b[39m=\u001b[39m data[(data \u001b[39m>\u001b[39;49m upper_bound) \u001b[39m|\u001b[39;49m (data \u001b[39m<\u001b[39;49m lower_bound)]\n\u001b[1;32m 12\u001b[0m outlier_index \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39mwhere((data \u001b[39m>\u001b[39m upper_bound) \u001b[39m|\u001b[39m (data \u001b[39m<\u001b[39m lower_bound))[\u001b[39m0\u001b[39m]\n\u001b[1;32m 14\u001b[0m \u001b[39m# 计算异常值比例\u001b[39;00m\n",
"\u001b[0;31mTypeError\u001b[0m: only integer scalar arrays can be converted to a scalar index"
]
}
],
"source": [
"import numpy as np\n",
"\n",
"def detect_outliers(data, iqr_multiplier=1.5):\n",
"    \"\"\"Flag outliers with the interquartile-range (IQR) rule.\n",
"\n",
"    A value is an outlier when it is greater than ``q3 + iqr_multiplier * iqr``\n",
"    or less than ``q1 - iqr_multiplier * iqr``, where ``q1``/``q3`` are the\n",
"    25th/75th percentiles of ``data`` and ``iqr = q3 - q1``.\n",
"\n",
"    Args:\n",
"        data: Numeric values. Objects without a ``dtype`` attribute (e.g. list,\n",
"            tuple) are converted with ``np.asarray``; anything else is used as-is.\n",
"        iqr_multiplier: Fence width in units of the IQR. Defaults to 1.5.\n",
"\n",
"    Returns:\n",
"        dict with the keys\n",
"        'outlier_values' (the values outside the fences),\n",
"        'outlier_index' (their positions along the first axis) and\n",
"        'outlier_ratio' (``len(outlier_values) / len(data)``, 0.0 for empty input).\n",
"    \"\"\"\n",
"    # A plain Python sequence cannot be indexed with a boolean mask (that is the\n",
"    # TypeError this cell used to raise), so convert it to an array first.\n",
"    if not hasattr(data, 'dtype'):\n",
"        data = np.asarray(data)\n",
"\n",
"    # Empty input: return an empty result instead of failing on the\n",
"    # percentile computation or dividing by zero.\n",
"    if len(data) == 0:\n",
"        return {\n",
"            'outlier_values': data,\n",
"            'outlier_index': np.array([], dtype=np.intp),\n",
"            'outlier_ratio': 0.0\n",
"        }\n",
"\n",
"    # Quartiles and interquartile range.\n",
"    q1, q3 = np.percentile(data, [25, 75])\n",
"    iqr = q3 - q1\n",
"    upper_bound = q3 + iqr_multiplier * iqr  # upper fence\n",
"    lower_bound = q1 - iqr_multiplier * iqr  # lower fence\n",
"\n",
"    # Values above the upper fence or below the lower fence are outliers.\n",
"    # The mask is built once and reused for both the values and their positions.\n",
"    is_outlier = (data > upper_bound) | (data < lower_bound)\n",
"    outliers = data[is_outlier]\n",
"    outlier_index = np.where(is_outlier)[0]\n",
"\n",
"    # Share of observations flagged as outliers.\n",
"    outlier_ratio = len(outliers) / len(data)\n",
"\n",
"    return {\n",
"        'outlier_values': outliers,\n",
"        'outlier_index': outlier_index,\n",
"        'outlier_ratio': outlier_ratio\n",
"    }\n",
"\n",
"# Sample values passed to detect_outliers; the returned dict is the cell's displayed result.\n",
"sample_values = [\n",
"    51, 2618.2, 2608.4, 2651.9, 3442.1, 3393.1, 3136.1, 3744.1,\n",
"    6607.4, 4060.3, 3614.7, 3295.5, 2332.1, 2699.3, 3036.8,\n",
"    865, 3014.3, 2742.8, 2173.5,\n",
"]\n",
"detect_outliers(sample_values)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册