diff --git a/Extra/main.ipynb b/Extra/main.ipynb
index b53a95b..cd1c5bb 100644
--- a/Extra/main.ipynb
+++ b/Extra/main.ipynb
@@ -3464,7 +3464,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3.8.11 ('base')",
+ "display_name": "Python 3.8.13 ('StrokePredictionModel')",
"language": "python",
"name": "python3"
},
@@ -3483,7 +3483,7 @@
"orig_nbformat": 4,
"vscode": {
"interpreter": {
- "hash": "5819c1eaf6d552792a1bbc5e8998e6c2149ab26a1973a0d78107c0d9954e5ba0"
+ "hash": "6d6bab66b583e7661b89cead2220317a23c391a40fb8c52f2c1bcd3c04f3fbda"
}
}
},
diff --git a/datasetCleaned.xlsx b/datasetCleaned.xlsx
new file mode 100644
index 0000000..86304b5
Binary files /dev/null and b/datasetCleaned.xlsx differ
diff --git a/main.ipynb b/main.ipynb
index 78035b3..97cc677 100644
--- a/main.ipynb
+++ b/main.ipynb
@@ -43,7 +43,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -74,7 +74,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 3,
"metadata": {},
"outputs": [
{
@@ -305,7 +305,7 @@
"9 0 "
]
},
- "execution_count": 2,
+ "execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@@ -324,7 +324,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 4,
"metadata": {},
"outputs": [
{
@@ -359,7 +359,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -499,7 +499,7 @@
"max 291.050000 97.600000 1.000000 "
]
},
- "execution_count": 4,
+ "execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -510,7 +510,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
@@ -542,7 +542,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
@@ -683,7 +683,7 @@
"4 0 "
]
},
- "execution_count": 6,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -696,7 +696,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 8,
"metadata": {},
"outputs": [
{
@@ -743,59 +743,59 @@
" \n",
"
\n",
" \n",
- " | 39696 | \n",
- " 24427 | \n",
+ " 25469 | \n",
+ " 40932 | \n",
+ " 1 | \n",
+ " 63.0 | \n",
" 0 | \n",
- " 20.0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 73.20 | \n",
+ " 26.4 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
- " 91.23 | \n",
- " 24.5 | \n",
+ " 0 | \n",
+ " 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 9062 | \n",
+ " 23897 | \n",
+ " 1 | \n",
+ " 4.0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
+ " 86.33 | \n",
+ " 28.7 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
- " | 38440 | \n",
- " 43933 | \n",
- " 0 | \n",
- " 57.0 | \n",
+ " 23973 | \n",
+ " 16201 | \n",
+ " 1 | \n",
+ " 48.0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
- " 59.41 | \n",
- " 34.9 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 37515 | \n",
- " 29824 | \n",
- " 1 | \n",
- " 34.0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 104.07 | \n",
- " 45.9 | \n",
+ " 80.87 | \n",
+ " 19.8 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
@@ -809,37 +809,40 @@
" 1 | \n",
"
\n",
" \n",
- " | 22940 | \n",
- " 16030 | \n",
+ " 37135 | \n",
+ " 57514 | \n",
" 1 | \n",
- " 71.0 | \n",
+ " 55.0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
- " 96.03 | \n",
- " NaN | \n",
+ " 132.16 | \n",
+ " 29.1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
- " 0 | \n",
" 1 | \n",
- " 1 | \n",
- " 0 | \n",
" 0 | \n",
"
\n",
" \n",
- " | 12098 | \n",
- " 72294 | \n",
- " 1 | \n",
- " 59.0 | \n",
+ " 20314 | \n",
+ " 12476 | \n",
+ " 0 | \n",
+ " 62.0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
- " 90.06 | \n",
- " 27.0 | \n",
+ " 110.97 | \n",
+ " 34.2 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
@@ -848,9 +851,6 @@
" 0 | \n",
" 1 | \n",
" 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
"
\n",
" \n",
"\n",
@@ -858,49 +858,49 @@
],
"text/plain": [
" id gender age hypertension heart_disease ever_married \\\n",
- "39696 24427 0 20.0 0 0 0 \n",
- "38440 43933 0 57.0 0 0 1 \n",
- "37515 29824 1 34.0 0 0 1 \n",
- "22940 16030 1 71.0 0 0 1 \n",
- "12098 72294 1 59.0 0 0 1 \n",
+ "25469 40932 1 63.0 0 0 1 \n",
+ "9062 23897 1 4.0 0 0 0 \n",
+ "23973 16201 1 48.0 0 0 1 \n",
+ "37135 57514 1 55.0 0 0 1 \n",
+ "20314 12476 0 62.0 0 0 1 \n",
"\n",
" avg_glucose_level bmi stroke work_type_Govt_job \\\n",
- "39696 91.23 24.5 0 1 \n",
- "38440 59.41 34.9 0 1 \n",
- "37515 104.07 45.9 0 0 \n",
- "22940 96.03 NaN 0 0 \n",
- "12098 90.06 27.0 0 0 \n",
+ "25469 73.20 26.4 0 0 \n",
+ "9062 86.33 28.7 0 0 \n",
+ "23973 80.87 19.8 0 0 \n",
+ "37135 132.16 29.1 0 1 \n",
+ "20314 110.97 34.2 0 1 \n",
"\n",
" work_type_Never_worked work_type_Private work_type_Self-employed \\\n",
- "39696 0 0 0 \n",
- "38440 0 0 0 \n",
- "37515 0 1 0 \n",
- "22940 0 1 0 \n",
- "12098 0 0 1 \n",
+ "25469 0 0 1 \n",
+ "9062 0 0 0 \n",
+ "23973 0 1 0 \n",
+ "37135 0 0 0 \n",
+ "20314 0 0 0 \n",
"\n",
" work_type_children Residence_type_Rural Residence_type_Urban \\\n",
- "39696 0 0 1 \n",
- "38440 0 1 0 \n",
- "37515 0 0 1 \n",
- "22940 0 0 1 \n",
- "12098 0 1 0 \n",
+ "25469 0 1 0 \n",
+ "9062 1 0 1 \n",
+ "23973 0 0 1 \n",
+ "37135 0 1 0 \n",
+ "20314 0 0 1 \n",
"\n",
" smoking_status_formerly smoked smoking_status_never smoked \\\n",
- "39696 0 0 \n",
- "38440 1 0 \n",
- "37515 0 0 \n",
- "22940 1 0 \n",
- "12098 0 0 \n",
+ "25469 0 1 \n",
+ "9062 0 0 \n",
+ "23973 0 0 \n",
+ "37135 0 1 \n",
+ "20314 0 1 \n",
"\n",
" smoking_status_smokes \n",
- "39696 0 \n",
- "38440 0 \n",
- "37515 1 \n",
- "22940 0 \n",
- "12098 0 "
+ "25469 0 \n",
+ "9062 0 \n",
+ "23973 1 \n",
+ "37135 0 \n",
+ "20314 0 "
]
},
- "execution_count": 7,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -912,7 +912,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 9,
"metadata": {},
"outputs": [
{
@@ -940,7 +940,7 @@
"dtype: int64"
]
},
- "execution_count": 8,
+ "execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -951,13 +951,22 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"df = df.dropna(how = 'any', axis=0)"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.to_excel(excel_writer='datasetCleaned.xlsx')  # save the frame left after dropna (row index is written too)"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 10,
@@ -4981,7 +4990,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3.9.13 ('BrainStrokePredictionMLEnv')",
+ "display_name": "Python 3.8.13 ('StrokePredictionModel')",
"language": "python",
"name": "python3"
},
@@ -4995,12 +5004,12 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.13"
+ "version": "3.8.13"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
- "hash": "54c59028773620d1ec7cf564885279046a3969c7da2b497982c8156e7da39d8c"
+ "hash": "6d6bab66b583e7661b89cead2220317a23c391a40fb8c52f2c1bcd3c04f3fbda"
}
}
},
diff --git a/visualization_of_data.ipynb b/visualization_of_data.ipynb
new file mode 100644
index 0000000..610adc0
--- /dev/null
+++ b/visualization_of_data.ipynb
@@ -0,0 +1,620 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np \n",
+ "import pandas as pd \n",
+ "import os\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " gender | \n",
+ " age | \n",
+ " hypertension | \n",
+ " heart_disease | \n",
+ " ever_married | \n",
+ " work_type | \n",
+ " Residence_type | \n",
+ " avg_glucose_level | \n",
+ " bmi | \n",
+ " smoking_status | \n",
+ " stroke | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 30669 | \n",
+ " Male | \n",
+ " 3.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " No | \n",
+ " children | \n",
+ " Rural | \n",
+ " 95.12 | \n",
+ " 18.0 | \n",
+ " NaN | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 30468 | \n",
+ " Male | \n",
+ " 58.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " Yes | \n",
+ " Private | \n",
+ " Urban | \n",
+ " 87.96 | \n",
+ " 39.2 | \n",
+ " never smoked | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 16523 | \n",
+ " Female | \n",
+ " 8.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " No | \n",
+ " Private | \n",
+ " Urban | \n",
+ " 110.89 | \n",
+ " 17.6 | \n",
+ " NaN | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 56543 | \n",
+ " Female | \n",
+ " 70.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " Yes | \n",
+ " Private | \n",
+ " Rural | \n",
+ " 69.04 | \n",
+ " 35.9 | \n",
+ " formerly smoked | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 46136 | \n",
+ " Male | \n",
+ " 14.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " No | \n",
+ " Never_worked | \n",
+ " Rural | \n",
+ " 161.28 | \n",
+ " 19.1 | \n",
+ " NaN | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 32257 | \n",
+ " Female | \n",
+ " 47.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " Yes | \n",
+ " Private | \n",
+ " Urban | \n",
+ " 210.95 | \n",
+ " 50.1 | \n",
+ " NaN | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 52800 | \n",
+ " Female | \n",
+ " 52.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " Yes | \n",
+ " Private | \n",
+ " Urban | \n",
+ " 77.59 | \n",
+ " 17.7 | \n",
+ " formerly smoked | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 41413 | \n",
+ " Female | \n",
+ " 75.0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " Yes | \n",
+ " Self-employed | \n",
+ " Rural | \n",
+ " 243.53 | \n",
+ " 27.0 | \n",
+ " never smoked | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 15266 | \n",
+ " Female | \n",
+ " 32.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " Yes | \n",
+ " Private | \n",
+ " Rural | \n",
+ " 77.67 | \n",
+ " 32.3 | \n",
+ " smokes | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 28674 | \n",
+ " Female | \n",
+ " 74.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " Yes | \n",
+ " Self-employed | \n",
+ " Urban | \n",
+ " 205.84 | \n",
+ " 54.6 | \n",
+ " never smoked | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id gender age hypertension heart_disease ever_married \\\n",
+ "0 30669 Male 3.0 0 0 No \n",
+ "1 30468 Male 58.0 1 0 Yes \n",
+ "2 16523 Female 8.0 0 0 No \n",
+ "3 56543 Female 70.0 0 0 Yes \n",
+ "4 46136 Male 14.0 0 0 No \n",
+ "5 32257 Female 47.0 0 0 Yes \n",
+ "6 52800 Female 52.0 0 0 Yes \n",
+ "7 41413 Female 75.0 0 1 Yes \n",
+ "8 15266 Female 32.0 0 0 Yes \n",
+ "9 28674 Female 74.0 1 0 Yes \n",
+ "\n",
+ " work_type Residence_type avg_glucose_level bmi smoking_status \\\n",
+ "0 children Rural 95.12 18.0 NaN \n",
+ "1 Private Urban 87.96 39.2 never smoked \n",
+ "2 Private Urban 110.89 17.6 NaN \n",
+ "3 Private Rural 69.04 35.9 formerly smoked \n",
+ "4 Never_worked Rural 161.28 19.1 NaN \n",
+ "5 Private Urban 210.95 50.1 NaN \n",
+ "6 Private Urban 77.59 17.7 formerly smoked \n",
+ "7 Self-employed Rural 243.53 27.0 never smoked \n",
+ "8 Private Rural 77.67 32.3 smokes \n",
+ "9 Self-employed Urban 205.84 54.6 never smoked \n",
+ "\n",
+ " stroke \n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "5 0 \n",
+ "6 0 \n",
+ "7 0 \n",
+ "8 0 \n",
+ "9 0 "
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = pd.read_csv('trainFile.csv')\n",
+ "df.head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 43400 entries, 0 to 43399\n",
+ "Data columns (total 12 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 id 43400 non-null int64 \n",
+ " 1 gender 43400 non-null object \n",
+ " 2 age 43400 non-null float64\n",
+ " 3 hypertension 43400 non-null int64 \n",
+ " 4 heart_disease 43400 non-null int64 \n",
+ " 5 ever_married 43400 non-null object \n",
+ " 6 work_type 43400 non-null object \n",
+ " 7 Residence_type 43400 non-null object \n",
+ " 8 avg_glucose_level 43400 non-null float64\n",
+ " 9 bmi 41938 non-null float64\n",
+ " 10 smoking_status 30108 non-null object \n",
+ " 11 stroke 43400 non-null int64 \n",
+ "dtypes: float64(3), int64(4), object(5)\n",
+ "memory usage: 4.0+ MB\n"
+ ]
+ }
+ ],
+ "source": [
+ "df.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " age | \n",
+ " hypertension | \n",
+ " heart_disease | \n",
+ " avg_glucose_level | \n",
+ " bmi | \n",
+ " stroke | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | count | \n",
+ " 43400.000000 | \n",
+ " 43400.000000 | \n",
+ " 43400.000000 | \n",
+ " 43400.000000 | \n",
+ " 43400.000000 | \n",
+ " 41938.000000 | \n",
+ " 43400.000000 | \n",
+ "
\n",
+ " \n",
+ " | mean | \n",
+ " 36326.142350 | \n",
+ " 42.217894 | \n",
+ " 0.093571 | \n",
+ " 0.047512 | \n",
+ " 104.482750 | \n",
+ " 28.605038 | \n",
+ " 0.018041 | \n",
+ "
\n",
+ " \n",
+ " | std | \n",
+ " 21072.134879 | \n",
+ " 22.519649 | \n",
+ " 0.291235 | \n",
+ " 0.212733 | \n",
+ " 43.111751 | \n",
+ " 7.770020 | \n",
+ " 0.133103 | \n",
+ "
\n",
+ " \n",
+ " | min | \n",
+ " 1.000000 | \n",
+ " 0.080000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 55.000000 | \n",
+ " 10.100000 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 25% | \n",
+ " 18038.500000 | \n",
+ " 24.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 77.540000 | \n",
+ " 23.200000 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 50% | \n",
+ " 36351.500000 | \n",
+ " 44.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 91.580000 | \n",
+ " 27.700000 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 75% | \n",
+ " 54514.250000 | \n",
+ " 60.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 112.070000 | \n",
+ " 32.900000 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | max | \n",
+ " 72943.000000 | \n",
+ " 82.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 291.050000 | \n",
+ " 97.600000 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id age hypertension heart_disease \\\n",
+ "count 43400.000000 43400.000000 43400.000000 43400.000000 \n",
+ "mean 36326.142350 42.217894 0.093571 0.047512 \n",
+ "std 21072.134879 22.519649 0.291235 0.212733 \n",
+ "min 1.000000 0.080000 0.000000 0.000000 \n",
+ "25% 18038.500000 24.000000 0.000000 0.000000 \n",
+ "50% 36351.500000 44.000000 0.000000 0.000000 \n",
+ "75% 54514.250000 60.000000 0.000000 0.000000 \n",
+ "max 72943.000000 82.000000 1.000000 1.000000 \n",
+ "\n",
+ " avg_glucose_level bmi stroke \n",
+ "count 43400.000000 41938.000000 43400.000000 \n",
+ "mean 104.482750 28.605038 0.018041 \n",
+ "std 43.111751 7.770020 0.133103 \n",
+ "min 55.000000 10.100000 0.000000 \n",
+ "25% 77.540000 23.200000 0.000000 \n",
+ "50% 91.580000 27.700000 0.000000 \n",
+ "75% 112.070000 32.900000 0.000000 \n",
+ "max 291.050000 97.600000 1.000000 "
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['Male' 'Female' 'Other']\n",
+ "['children' 'Private' 'Never_worked' 'Self-employed' 'Govt_job']\n",
+ "['Rural' 'Urban']\n",
+ "[nan 'never smoked' 'formerly smoked' 'smokes']\n",
+ "['No' 'Yes']\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Show the distinct values of each categorical column, one array per line.\n",
+ "categorical_columns = ['gender', 'work_type', 'Residence_type',\n",
+ "                       'smoking_status', 'ever_married']\n",
+ "for column_name in categorical_columns:\n",
+ "    print(df[column_name].unique())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "id 0\n",
+ "gender 0\n",
+ "age 0\n",
+ "hypertension 0\n",
+ "heart_disease 0\n",
+ "ever_married 0\n",
+ "work_type 0\n",
+ "Residence_type 0\n",
+ "avg_glucose_level 0\n",
+ "bmi 1462\n",
+ "smoking_status 13292\n",
+ "stroke 0\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.isna().sum()  # number of missing entries in each column"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = df.dropna(axis='index', how='any')  # discard every row that has at least one missing value"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Int64Index: 29072 entries, 1 to 43399\n",
+ "Data columns (total 12 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 id 29072 non-null int64 \n",
+ " 1 gender 29072 non-null object \n",
+ " 2 age 29072 non-null float64\n",
+ " 3 hypertension 29072 non-null int64 \n",
+ " 4 heart_disease 29072 non-null int64 \n",
+ " 5 ever_married 29072 non-null object \n",
+ " 6 work_type 29072 non-null object \n",
+ " 7 Residence_type 29072 non-null object \n",
+ " 8 avg_glucose_level 29072 non-null float64\n",
+ " 9 bmi 29072 non-null float64\n",
+ " 10 smoking_status 29072 non-null object \n",
+ " 11 stroke 29072 non-null int64 \n",
+ "dtypes: float64(3), int64(4), object(5)\n",
+ "memory usage: 2.9+ MB\n"
+ ]
+ }
+ ],
+ "source": [
+ "df.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.to_excel(excel_writer='datasetCleaned.xlsx')  # save the NaN-free frame (row index is written too)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ " \n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import cufflinks as cf\n",
+ "# Keep plotting local and private: df holds per-person health records, so the saved config must not default to online/public.\n",
+ "cf.go_offline()\n",
+ "cf.set_config_file(offline=True, world_readable=False)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3.8.13 ('StrokePredictionModel')",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.13"
+ },
+ "orig_nbformat": 4,
+ "vscode": {
+ "interpreter": {
+ "hash": "6d6bab66b583e7661b89cead2220317a23c391a40fb8c52f2c1bcd3c04f3fbda"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}