This commit is contained in:
2022-11-23 01:46:32 +05:30
4 changed files with 728 additions and 99 deletions

View File

@@ -3464,7 +3464,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8.11 ('base')",
"display_name": "Python 3.8.13 ('StrokePredictionModel')",
"language": "python",
"name": "python3"
},
@@ -3483,7 +3483,7 @@
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "5819c1eaf6d552792a1bbc5e8998e6c2149ab26a1973a0d78107c0d9954e5ba0"
"hash": "6d6bab66b583e7661b89cead2220317a23c391a40fb8c52f2c1bcd3c04f3fbda"
}
}
},

BIN
datasetCleaned.xlsx Normal file

Binary file not shown.

View File

@@ -43,7 +43,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -74,7 +74,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"metadata": {},
"outputs": [
{
@@ -305,7 +305,7 @@
"9 0 "
]
},
"execution_count": 2,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@@ -324,7 +324,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [
{
@@ -359,7 +359,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -499,7 +499,7 @@
"max 291.050000 97.600000 1.000000 "
]
},
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -510,7 +510,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"outputs": [
{
@@ -542,7 +542,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"metadata": {},
"outputs": [
{
@@ -683,7 +683,7 @@
"4 0 "
]
},
"execution_count": 6,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -696,7 +696,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 8,
"metadata": {},
"outputs": [
{
@@ -743,59 +743,59 @@
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>39696</th>\n",
" <td>24427</td>\n",
" <th>25469</th>\n",
" <td>40932</td>\n",
" <td>1</td>\n",
" <td>63.0</td>\n",
" <td>0</td>\n",
" <td>20.0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>73.20</td>\n",
" <td>26.4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>91.23</td>\n",
" <td>24.5</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9062</th>\n",
" <td>23897</td>\n",
" <td>1</td>\n",
" <td>4.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>86.33</td>\n",
" <td>28.7</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38440</th>\n",
" <td>43933</td>\n",
" <td>0</td>\n",
" <td>57.0</td>\n",
" <th>23973</th>\n",
" <td>16201</td>\n",
" <td>1</td>\n",
" <td>48.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>59.41</td>\n",
" <td>34.9</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>37515</th>\n",
" <td>29824</td>\n",
" <td>1</td>\n",
" <td>34.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>104.07</td>\n",
" <td>45.9</td>\n",
" <td>80.87</td>\n",
" <td>19.8</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
@@ -809,37 +809,40 @@
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22940</th>\n",
" <td>16030</td>\n",
" <th>37135</th>\n",
" <td>57514</td>\n",
" <td>1</td>\n",
" <td>71.0</td>\n",
" <td>55.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>96.03</td>\n",
" <td>NaN</td>\n",
" <td>132.16</td>\n",
" <td>29.1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12098</th>\n",
" <td>72294</td>\n",
" <td>1</td>\n",
" <td>59.0</td>\n",
" <th>20314</th>\n",
" <td>12476</td>\n",
" <td>0</td>\n",
" <td>62.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>90.06</td>\n",
" <td>27.0</td>\n",
" <td>110.97</td>\n",
" <td>34.2</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
@@ -848,9 +851,6 @@
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
@@ -858,49 +858,49 @@
],
"text/plain": [
" id gender age hypertension heart_disease ever_married \\\n",
"39696 24427 0 20.0 0 0 0 \n",
"38440 43933 0 57.0 0 0 1 \n",
"37515 29824 1 34.0 0 0 1 \n",
"22940 16030 1 71.0 0 0 1 \n",
"12098 72294 1 59.0 0 0 1 \n",
"25469 40932 1 63.0 0 0 1 \n",
"9062 23897 1 4.0 0 0 0 \n",
"23973 16201 1 48.0 0 0 1 \n",
"37135 57514 1 55.0 0 0 1 \n",
"20314 12476 0 62.0 0 0 1 \n",
"\n",
" avg_glucose_level bmi stroke work_type_Govt_job \\\n",
"39696 91.23 24.5 0 1 \n",
"38440 59.41 34.9 0 1 \n",
"37515 104.07 45.9 0 0 \n",
"22940 96.03 NaN 0 0 \n",
"12098 90.06 27.0 0 0 \n",
"25469 73.20 26.4 0 0 \n",
"9062 86.33 28.7 0 0 \n",
"23973 80.87 19.8 0 0 \n",
"37135 132.16 29.1 0 1 \n",
"20314 110.97 34.2 0 1 \n",
"\n",
" work_type_Never_worked work_type_Private work_type_Self-employed \\\n",
"39696 0 0 0 \n",
"38440 0 0 0 \n",
"37515 0 1 0 \n",
"22940 0 1 0 \n",
"12098 0 0 1 \n",
"25469 0 0 1 \n",
"9062 0 0 0 \n",
"23973 0 1 0 \n",
"37135 0 0 0 \n",
"20314 0 0 0 \n",
"\n",
" work_type_children Residence_type_Rural Residence_type_Urban \\\n",
"39696 0 0 1 \n",
"38440 0 1 0 \n",
"37515 0 0 1 \n",
"22940 0 0 1 \n",
"12098 0 1 0 \n",
"25469 0 1 0 \n",
"9062 1 0 1 \n",
"23973 0 0 1 \n",
"37135 0 1 0 \n",
"20314 0 0 1 \n",
"\n",
" smoking_status_formerly smoked smoking_status_never smoked \\\n",
"39696 0 0 \n",
"38440 1 0 \n",
"37515 0 0 \n",
"22940 1 0 \n",
"12098 0 0 \n",
"25469 0 1 \n",
"9062 0 0 \n",
"23973 0 0 \n",
"37135 0 1 \n",
"20314 0 1 \n",
"\n",
" smoking_status_smokes \n",
"39696 0 \n",
"38440 0 \n",
"37515 1 \n",
"22940 0 \n",
"12098 0 "
"25469 0 \n",
"9062 0 \n",
"23973 1 \n",
"37135 0 \n",
"20314 0 "
]
},
"execution_count": 7,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -912,7 +912,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 9,
"metadata": {},
"outputs": [
{
@@ -940,7 +940,7 @@
"dtype: int64"
]
},
"execution_count": 8,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -951,13 +951,22 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"df = df.dropna(how = 'any', axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"df.to_excel('datasetCleaned.xlsx')"
]
},
{
"cell_type": "code",
"execution_count": 10,
@@ -4981,7 +4990,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.13 ('BrainStrokePredictionMLEnv')",
"display_name": "Python 3.8.13 ('StrokePredictionModel')",
"language": "python",
"name": "python3"
},
@@ -4995,12 +5004,12 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
"version": "3.8.13"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "54c59028773620d1ec7cf564885279046a3969c7da2b497982c8156e7da39d8c"
"hash": "6d6bab66b583e7661b89cead2220317a23c391a40fb8c52f2c1bcd3c04f3fbda"
}
}
},

620
visualization_of_data.ipynb Normal file
View File

@@ -0,0 +1,620 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np \n",
"import pandas as pd \n",
"import os\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>gender</th>\n",
" <th>age</th>\n",
" <th>hypertension</th>\n",
" <th>heart_disease</th>\n",
" <th>ever_married</th>\n",
" <th>work_type</th>\n",
" <th>Residence_type</th>\n",
" <th>avg_glucose_level</th>\n",
" <th>bmi</th>\n",
" <th>smoking_status</th>\n",
" <th>stroke</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>30669</td>\n",
" <td>Male</td>\n",
" <td>3.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>No</td>\n",
" <td>children</td>\n",
" <td>Rural</td>\n",
" <td>95.12</td>\n",
" <td>18.0</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>30468</td>\n",
" <td>Male</td>\n",
" <td>58.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>Yes</td>\n",
" <td>Private</td>\n",
" <td>Urban</td>\n",
" <td>87.96</td>\n",
" <td>39.2</td>\n",
" <td>never smoked</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>16523</td>\n",
" <td>Female</td>\n",
" <td>8.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>No</td>\n",
" <td>Private</td>\n",
" <td>Urban</td>\n",
" <td>110.89</td>\n",
" <td>17.6</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>56543</td>\n",
" <td>Female</td>\n",
" <td>70.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Yes</td>\n",
" <td>Private</td>\n",
" <td>Rural</td>\n",
" <td>69.04</td>\n",
" <td>35.9</td>\n",
" <td>formerly smoked</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>46136</td>\n",
" <td>Male</td>\n",
" <td>14.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>No</td>\n",
" <td>Never_worked</td>\n",
" <td>Rural</td>\n",
" <td>161.28</td>\n",
" <td>19.1</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>32257</td>\n",
" <td>Female</td>\n",
" <td>47.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Yes</td>\n",
" <td>Private</td>\n",
" <td>Urban</td>\n",
" <td>210.95</td>\n",
" <td>50.1</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>52800</td>\n",
" <td>Female</td>\n",
" <td>52.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Yes</td>\n",
" <td>Private</td>\n",
" <td>Urban</td>\n",
" <td>77.59</td>\n",
" <td>17.7</td>\n",
" <td>formerly smoked</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>41413</td>\n",
" <td>Female</td>\n",
" <td>75.0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>Yes</td>\n",
" <td>Self-employed</td>\n",
" <td>Rural</td>\n",
" <td>243.53</td>\n",
" <td>27.0</td>\n",
" <td>never smoked</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>15266</td>\n",
" <td>Female</td>\n",
" <td>32.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Yes</td>\n",
" <td>Private</td>\n",
" <td>Rural</td>\n",
" <td>77.67</td>\n",
" <td>32.3</td>\n",
" <td>smokes</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>28674</td>\n",
" <td>Female</td>\n",
" <td>74.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>Yes</td>\n",
" <td>Self-employed</td>\n",
" <td>Urban</td>\n",
" <td>205.84</td>\n",
" <td>54.6</td>\n",
" <td>never smoked</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id gender age hypertension heart_disease ever_married \\\n",
"0 30669 Male 3.0 0 0 No \n",
"1 30468 Male 58.0 1 0 Yes \n",
"2 16523 Female 8.0 0 0 No \n",
"3 56543 Female 70.0 0 0 Yes \n",
"4 46136 Male 14.0 0 0 No \n",
"5 32257 Female 47.0 0 0 Yes \n",
"6 52800 Female 52.0 0 0 Yes \n",
"7 41413 Female 75.0 0 1 Yes \n",
"8 15266 Female 32.0 0 0 Yes \n",
"9 28674 Female 74.0 1 0 Yes \n",
"\n",
" work_type Residence_type avg_glucose_level bmi smoking_status \\\n",
"0 children Rural 95.12 18.0 NaN \n",
"1 Private Urban 87.96 39.2 never smoked \n",
"2 Private Urban 110.89 17.6 NaN \n",
"3 Private Rural 69.04 35.9 formerly smoked \n",
"4 Never_worked Rural 161.28 19.1 NaN \n",
"5 Private Urban 210.95 50.1 NaN \n",
"6 Private Urban 77.59 17.7 formerly smoked \n",
"7 Self-employed Rural 243.53 27.0 never smoked \n",
"8 Private Rural 77.67 32.3 smokes \n",
"9 Self-employed Urban 205.84 54.6 never smoked \n",
"\n",
" stroke \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"5 0 \n",
"6 0 \n",
"7 0 \n",
"8 0 \n",
"9 0 "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv('trainFile.csv')\n",
"df.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 43400 entries, 0 to 43399\n",
"Data columns (total 12 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 43400 non-null int64 \n",
" 1 gender 43400 non-null object \n",
" 2 age 43400 non-null float64\n",
" 3 hypertension 43400 non-null int64 \n",
" 4 heart_disease 43400 non-null int64 \n",
" 5 ever_married 43400 non-null object \n",
" 6 work_type 43400 non-null object \n",
" 7 Residence_type 43400 non-null object \n",
" 8 avg_glucose_level 43400 non-null float64\n",
" 9 bmi 41938 non-null float64\n",
" 10 smoking_status 30108 non-null object \n",
" 11 stroke 43400 non-null int64 \n",
"dtypes: float64(3), int64(4), object(5)\n",
"memory usage: 4.0+ MB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>age</th>\n",
" <th>hypertension</th>\n",
" <th>heart_disease</th>\n",
" <th>avg_glucose_level</th>\n",
" <th>bmi</th>\n",
" <th>stroke</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>43400.000000</td>\n",
" <td>43400.000000</td>\n",
" <td>43400.000000</td>\n",
" <td>43400.000000</td>\n",
" <td>43400.000000</td>\n",
" <td>41938.000000</td>\n",
" <td>43400.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>36326.142350</td>\n",
" <td>42.217894</td>\n",
" <td>0.093571</td>\n",
" <td>0.047512</td>\n",
" <td>104.482750</td>\n",
" <td>28.605038</td>\n",
" <td>0.018041</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>21072.134879</td>\n",
" <td>22.519649</td>\n",
" <td>0.291235</td>\n",
" <td>0.212733</td>\n",
" <td>43.111751</td>\n",
" <td>7.770020</td>\n",
" <td>0.133103</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>1.000000</td>\n",
" <td>0.080000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>55.000000</td>\n",
" <td>10.100000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>18038.500000</td>\n",
" <td>24.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>77.540000</td>\n",
" <td>23.200000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>36351.500000</td>\n",
" <td>44.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>91.580000</td>\n",
" <td>27.700000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>54514.250000</td>\n",
" <td>60.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>112.070000</td>\n",
" <td>32.900000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>72943.000000</td>\n",
" <td>82.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>291.050000</td>\n",
" <td>97.600000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id age hypertension heart_disease \\\n",
"count 43400.000000 43400.000000 43400.000000 43400.000000 \n",
"mean 36326.142350 42.217894 0.093571 0.047512 \n",
"std 21072.134879 22.519649 0.291235 0.212733 \n",
"min 1.000000 0.080000 0.000000 0.000000 \n",
"25% 18038.500000 24.000000 0.000000 0.000000 \n",
"50% 36351.500000 44.000000 0.000000 0.000000 \n",
"75% 54514.250000 60.000000 0.000000 0.000000 \n",
"max 72943.000000 82.000000 1.000000 1.000000 \n",
"\n",
" avg_glucose_level bmi stroke \n",
"count 43400.000000 41938.000000 43400.000000 \n",
"mean 104.482750 28.605038 0.018041 \n",
"std 43.111751 7.770020 0.133103 \n",
"min 55.000000 10.100000 0.000000 \n",
"25% 77.540000 23.200000 0.000000 \n",
"50% 91.580000 27.700000 0.000000 \n",
"75% 112.070000 32.900000 0.000000 \n",
"max 291.050000 97.600000 1.000000 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Male' 'Female' 'Other']\n",
"['children' 'Private' 'Never_worked' 'Self-employed' 'Govt_job']\n",
"['Rural' 'Urban']\n",
"[nan 'never smoked' 'formerly smoked' 'smokes']\n",
"['No' 'Yes']\n"
]
}
],
"source": [
"# List the distinct values of each categorical column, one array per line.\n",
"categorical_columns = ['gender', 'work_type', 'Residence_type', 'smoking_status', 'ever_married']\n",
"for column_name in categorical_columns:\n",
"    print(df[column_name].unique())"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"id 0\n",
"gender 0\n",
"age 0\n",
"hypertension 0\n",
"heart_disease 0\n",
"ever_married 0\n",
"work_type 0\n",
"Residence_type 0\n",
"avg_glucose_level 0\n",
"bmi 1462\n",
"smoking_status 13292\n",
"stroke 0\n",
"dtype: int64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"df = df.dropna(how = 'any', axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 29072 entries, 1 to 43399\n",
"Data columns (total 12 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 29072 non-null int64 \n",
" 1 gender 29072 non-null object \n",
" 2 age 29072 non-null float64\n",
" 3 hypertension 29072 non-null int64 \n",
" 4 heart_disease 29072 non-null int64 \n",
" 5 ever_married 29072 non-null object \n",
" 6 work_type 29072 non-null object \n",
" 7 Residence_type 29072 non-null object \n",
" 8 avg_glucose_level 29072 non-null float64\n",
" 9 bmi 29072 non-null float64\n",
" 10 smoking_status 29072 non-null object \n",
" 11 stroke 29072 non-null int64 \n",
"dtypes: float64(3), int64(4), object(5)\n",
"memory usage: 2.9+ MB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"df.to_excel('datasetCleaned.xlsx')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" <script type=\"text/javascript\">\n",
" window.PlotlyConfig = {MathJaxConfig: 'local'};\n",
" if (window.MathJax && window.MathJax.Hub && window.MathJax.Hub.Config) {window.MathJax.Hub.Config({SVG: {font: \"STIX-Web\"}});}\n",
" if (typeof require !== 'undefined') {\n",
" require.undef(\"plotly\");\n",
" requirejs.config({\n",
" paths: {\n",
" 'plotly': ['https://cdn.plot.ly/plotly-2.14.0.min']\n",
" }\n",
" });\n",
" require(['plotly'], function(Plotly) {\n",
" window._Plotly = Plotly;\n",
" });\n",
" }\n",
" </script>\n",
" "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import cufflinks as cf\n",
"\n",
"# Render charts locally in the notebook and keep the saved cufflinks config\n",
"# offline and private: the rows are per-person health records, so nothing\n",
"# should default to online, publicly readable charts.\n",
"cf.go_offline()\n",
"cf.set_config_file(offline=True, world_readable=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8.13 ('StrokePredictionModel')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.13"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "6d6bab66b583e7661b89cead2220317a23c391a40fb8c52f2c1bcd3c04f3fbda"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}