Tue May 2 03:57:00 UTC 2023 inscode

b9eea59a · 63c50e17543b320f79aa675e · df352508 · b9eea59a
隐藏空白更改
内联并排

Showing with 131 addition and 0 deletion

a.ipynb a.ipynb +131 -0

未找到文件。
--- a/a.ipynb
+++ b/a.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 异常值检测"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Looking in indexes: http://mirrors.csdn.net.cn/repository/csdn-pypi-mirrors/simple\n",
+      "Collecting pandas\n",
+      "  Downloading http://mirrors.csdn.net.cn/repository/csdn-pypi-mirrors/packages/pandas/2.0.1/pandas-2.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.3/12.3 MB\u001b[0m \u001b[31m284.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hCollecting openpyxl\n",
+      "  Downloading http://mirrors.csdn.net.cn/repository/csdn-pypi-mirrors/packages/openpyxl/3.1.2/openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m250.0/250.0 kB\u001b[0m \u001b[31m189.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting xlrd==1.2.0\n",
+      "  Downloading http://mirrors.csdn.net.cn/repository/csdn-pypi-mirrors/packages/xlrd/1.2.0/xlrd-1.2.0-py2.py3-none-any.whl (103 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m103.3/103.3 kB\u001b[0m \u001b[31m137.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting tzdata>=2022.1\n",
+      "  Downloading http://mirrors.csdn.net.cn/repository/csdn-pypi-mirrors/packages/tzdata/2023.3/tzdata-2023.3-py2.py3-none-any.whl (341 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m341.8/341.8 kB\u001b[0m \u001b[31m250.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting numpy>=1.20.3\n",
+      "  Downloading http://mirrors.csdn.net.cn/repository/csdn-pypi-mirrors/packages/numpy/1.24.3/numpy-1.24.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m17.3/17.3 MB\u001b[0m \u001b[31m236.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hCollecting pytz>=2020.1\n",
+      "  Downloading http://mirrors.csdn.net.cn/repository/csdn-pypi-mirrors/packages/pytz/2023.3/pytz-2023.3-py2.py3-none-any.whl (502 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m502.3/502.3 kB\u001b[0m \u001b[31m231.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hRequirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.8/dist-packages (from pandas) (2.8.2)\n",
+      "Collecting et-xmlfile\n",
+      "  Downloading http://mirrors.csdn.net.cn/repository/csdn-pypi-mirrors/packages/et-xmlfile/1.1.0/et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)\n",
+      "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.8/dist-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n",
+      "Installing collected packages: pytz, xlrd, tzdata, numpy, et-xmlfile, pandas, openpyxl\n",
+      "Successfully installed et-xmlfile-1.1.0 numpy-1.24.3 openpyxl-3.1.2 pandas-2.0.1 pytz-2023.3 tzdata-2023.3 xlrd-1.2.0\n",
+      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
+      "\u001b[0m\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.2.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
+      "Note: you may need to restart the kernel to use updated packages.\n"
+     ]
+    }
+   ],
+   "source": [
+    "%pip install pandas==2.0.1 openpyxl==3.1.2 xlrd==1.2.0"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "\n",
+    "def detect_outliers(data):\n",
+    "    \"\"\"Flag outliers in a 1-D numeric sequence with the 1.5 * IQR rule.\n",
+    "\n",
+    "    A value is an outlier when it lies above Q3 + 1.5 * IQR or below\n",
+    "    Q1 - 1.5 * IQR, where Q1 and Q3 are the 25th and 75th percentiles\n",
+    "    computed by np.percentile.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    data : array-like\n",
+    "        Numeric values. Objects that already expose a dtype (for example\n",
+    "        NumPy arrays) are used as-is; anything else, such as a list or\n",
+    "        tuple, is converted with np.asarray.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    dict\n",
+    "        'outlier_values': the values outside the bounds,\n",
+    "        'outlier_index': their integer positions (np.where(mask)[0]),\n",
+    "        'outlier_ratio': number of outliers divided by len(data).\n",
+    "\n",
+    "    Raises\n",
+    "    ------\n",
+    "    ValueError\n",
+    "        If data is empty.\n",
+    "    \"\"\"\n",
+    "    # A plain list cannot be indexed with a boolean mask; doing so raised\n",
+    "    # TypeError: only integer scalar arrays can be converted to a scalar index.\n",
+    "    values = data if hasattr(data, 'dtype') else np.asarray(data)\n",
+    "    if len(values) == 0:\n",
+    "        raise ValueError('data must contain at least one value')\n",
+    "\n",
+    "    q1, q3 = np.percentile(values, [25, 75])\n",
+    "    iqr = q3 - q1  # interquartile range\n",
+    "    upper_bound = q3 + 1.5 * iqr\n",
+    "    lower_bound = q1 - 1.5 * iqr\n",
+    "\n",
+    "    # Build the mask once and reuse it for both the values and their positions.\n",
+    "    is_outlier = (values > upper_bound) | (values < lower_bound)\n",
+    "    outliers = values[is_outlier]\n",
+    "    outlier_index = np.where(is_outlier)[0]\n",
+    "\n",
+    "    # Share of observations that fall outside the bounds.\n",
+    "    outlier_ratio = len(outliers) / len(values)\n",
+    "\n",
+    "    return {\n",
+    "        'outlier_values': outliers,\n",
+    "        'outlier_index': outlier_index,\n",
+    "        'outlier_ratio': outlier_ratio\n",
+    "    }\n",
+    "\n",
+    "\n",
+    "detect_outliers([51, 2618.2, 2608.4, 2651.9, 3442.1, 3393.1, 3136.1, 3744.1,\n",
+    "                  6607.4, 4060.3, 3614.7, 3295.5, 2332.1, 2699.3, 3036.8,\n",
+    "                  865, 3014.3, 2742.8, 2173.5])"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}