diff --git a/Extra/main.ipynb b/Extra/main.ipynb index b53a95b..cd1c5bb 100644 --- a/Extra/main.ipynb +++ b/Extra/main.ipynb @@ -3464,7 +3464,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.8.11 ('base')", + "display_name": "Python 3.8.13 ('StrokePredictionModel')", "language": "python", "name": "python3" }, @@ -3483,7 +3483,7 @@ "orig_nbformat": 4, "vscode": { "interpreter": { - "hash": "5819c1eaf6d552792a1bbc5e8998e6c2149ab26a1973a0d78107c0d9954e5ba0" + "hash": "6d6bab66b583e7661b89cead2220317a23c391a40fb8c52f2c1bcd3c04f3fbda" } } }, diff --git a/datasetCleaned.xlsx b/datasetCleaned.xlsx new file mode 100644 index 0000000..86304b5 Binary files /dev/null and b/datasetCleaned.xlsx differ diff --git a/main.ipynb b/main.ipynb index 78035b3..97cc677 100644 --- a/main.ipynb +++ b/main.ipynb @@ -43,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -74,7 +74,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -305,7 +305,7 @@ "9 0 " ] }, - "execution_count": 2, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -324,7 +324,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -359,7 +359,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -499,7 +499,7 @@ "max 291.050000 97.600000 1.000000 " ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -510,7 +510,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -542,7 +542,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -683,7 +683,7 @@ "4 0 " ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -696,7 +696,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -743,59 +743,59 @@ " \n", " \n", " \n", - " 39696\n", - " 24427\n", + " 25469\n", + " 40932\n", + " 1\n", + " 63.0\n", " 0\n", - " 20.0\n", + " 0\n", + " 1\n", + " 73.20\n", + " 26.4\n", " 0\n", " 0\n", " 0\n", - " 91.23\n", - " 24.5\n", + " 0\n", + " 1\n", " 0\n", " 1\n", " 0\n", " 0\n", + " 1\n", + " 0\n", + " \n", + " \n", + " 9062\n", + " 23897\n", + " 1\n", + " 4.0\n", " 0\n", " 0\n", " 0\n", + " 86.33\n", + " 28.7\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 1\n", + " 0\n", " 1\n", " 0\n", " 0\n", " 0\n", " \n", " \n", - " 38440\n", - " 43933\n", - " 0\n", - " 57.0\n", + " 23973\n", + " 16201\n", + " 1\n", + " 48.0\n", " 0\n", " 0\n", " 1\n", - " 59.41\n", - " 34.9\n", - " 0\n", - " 1\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 1\n", - " 0\n", - " 1\n", - " 0\n", - " 0\n", - " \n", - " \n", - " 37515\n", - " 29824\n", - " 1\n", - " 34.0\n", - " 0\n", - " 0\n", - " 1\n", - " 104.07\n", - " 45.9\n", + " 80.87\n", + " 19.8\n", " 0\n", " 0\n", " 0\n", @@ -809,37 +809,40 @@ " 1\n", " \n", " \n", - " 22940\n", - " 16030\n", + " 37135\n", + " 57514\n", " 1\n", - " 71.0\n", + " 55.0\n", " 0\n", " 0\n", " 1\n", - " 96.03\n", - " NaN\n", + " 132.16\n", + " 29.1\n", + " 0\n", + " 1\n", + " 0\n", " 0\n", " 0\n", " 0\n", " 1\n", " 0\n", " 0\n", - " 0\n", " 1\n", - " 1\n", - " 0\n", " 0\n", " \n", " \n", - " 12098\n", - " 72294\n", - " 1\n", - " 59.0\n", + " 20314\n", + " 12476\n", + " 0\n", + " 62.0\n", " 0\n", " 0\n", " 1\n", - " 90.06\n", - " 27.0\n", + " 110.97\n", + " 34.2\n", + " 0\n", + " 1\n", + " 0\n", " 0\n", " 0\n", " 0\n", @@ -848,9 +851,6 @@ " 0\n", " 1\n", " 0\n", - " 0\n", - " 0\n", - " 0\n", " \n", " \n", "\n", @@ -858,49 +858,49 @@ ], "text/plain": [ " id gender age hypertension heart_disease ever_married \\\n", - "39696 24427 0 20.0 0 0 0 \n", - "38440 43933 0 57.0 0 0 1 \n", - "37515 29824 1 34.0 0 0 1 \n", - "22940 16030 1 71.0 0 0 1 \n", - "12098 72294 1 59.0 0 0 1 \n", + "25469 40932 1 63.0 0 0 1 \n", + "9062 23897 1 4.0 0 0 0 \n", + "23973 16201 1 48.0 0 0 1 \n", + "37135 57514 1 55.0 0 0 1 \n", + "20314 12476 0 62.0 0 0 1 \n", "\n", " avg_glucose_level bmi stroke work_type_Govt_job \\\n", - "39696 91.23 24.5 0 1 \n", - "38440 59.41 34.9 0 1 \n", - "37515 104.07 45.9 0 0 \n", - "22940 96.03 NaN 0 0 \n", - "12098 90.06 27.0 0 0 \n", + "25469 73.20 26.4 0 0 \n", + "9062 86.33 28.7 0 0 \n", + "23973 80.87 19.8 0 0 \n", + "37135 132.16 29.1 0 1 \n", + "20314 110.97 34.2 0 1 \n", "\n", " work_type_Never_worked work_type_Private work_type_Self-employed \\\n", - "39696 0 0 0 \n", - "38440 0 0 0 \n", - "37515 0 1 0 \n", - "22940 0 1 0 \n", - "12098 0 0 1 \n", + "25469 0 0 1 \n", + "9062 0 0 0 \n", + "23973 0 1 0 \n", + "37135 0 0 0 \n", + "20314 0 0 0 \n", "\n", " work_type_children Residence_type_Rural Residence_type_Urban \\\n", - "39696 0 0 1 \n", - "38440 0 1 0 \n", - "37515 0 0 1 \n", - "22940 0 0 1 \n", - "12098 0 1 0 \n", + "25469 0 1 0 \n", + "9062 1 0 1 \n", + "23973 0 0 1 \n", + "37135 0 1 0 \n", + "20314 0 0 1 \n", "\n", " smoking_status_formerly smoked smoking_status_never smoked \\\n", - "39696 0 0 \n", - "38440 1 0 \n", - "37515 0 0 \n", - "22940 1 0 \n", - "12098 0 0 \n", + "25469 0 1 \n", + "9062 0 0 \n", + "23973 0 0 \n", + "37135 0 1 \n", + "20314 0 1 \n", "\n", " smoking_status_smokes \n", - "39696 0 \n", - "38440 0 \n", - "37515 1 \n", - "22940 0 \n", - "12098 0 " + "25469 0 \n", + "9062 0 \n", + "23973 1 \n", + "37135 0 \n", + "20314 0 " ] }, - "execution_count": 7, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -912,7 +912,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -940,7 +940,7 @@ "dtype: int64" ] }, - "execution_count": 8, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -951,13 +951,22 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "df = df.dropna(how = 'any', axis=0)" ] }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "df.to_excel('datasetCleaned.xlsx')" + ] + }, { "cell_type": "code", "execution_count": 10, @@ -4981,7 +4990,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.9.13 ('BrainStrokePredictionMLEnv')", + "display_name": "Python 3.8.13 ('StrokePredictionModel')", "language": "python", "name": "python3" }, @@ -4995,12 +5004,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.8.13" }, "orig_nbformat": 4, "vscode": { "interpreter": { - "hash": "54c59028773620d1ec7cf564885279046a3969c7da2b497982c8156e7da39d8c" + "hash": "6d6bab66b583e7661b89cead2220317a23c391a40fb8c52f2c1bcd3c04f3fbda" } } }, diff --git a/visualization_of_data.ipynb b/visualization_of_data.ipynb new file mode 100644 index 0000000..610adc0 --- /dev/null +++ b/visualization_of_data.ipynb @@ -0,0 +1,620 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np \n", + "import pandas as pd \n", + "import os\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idgenderagehypertensionheart_diseaseever_marriedwork_typeResidence_typeavg_glucose_levelbmismoking_statusstroke
030669Male3.000NochildrenRural95.1218.0NaN0
130468Male58.010YesPrivateUrban87.9639.2never smoked0
216523Female8.000NoPrivateUrban110.8917.6NaN0
356543Female70.000YesPrivateRural69.0435.9formerly smoked0
446136Male14.000NoNever_workedRural161.2819.1NaN0
532257Female47.000YesPrivateUrban210.9550.1NaN0
652800Female52.000YesPrivateUrban77.5917.7formerly smoked0
741413Female75.001YesSelf-employedRural243.5327.0never smoked0
815266Female32.000YesPrivateRural77.6732.3smokes0
928674Female74.010YesSelf-employedUrban205.8454.6never smoked0
\n", + "
" + ], + "text/plain": [ + " id gender age hypertension heart_disease ever_married \\\n", + "0 30669 Male 3.0 0 0 No \n", + "1 30468 Male 58.0 1 0 Yes \n", + "2 16523 Female 8.0 0 0 No \n", + "3 56543 Female 70.0 0 0 Yes \n", + "4 46136 Male 14.0 0 0 No \n", + "5 32257 Female 47.0 0 0 Yes \n", + "6 52800 Female 52.0 0 0 Yes \n", + "7 41413 Female 75.0 0 1 Yes \n", + "8 15266 Female 32.0 0 0 Yes \n", + "9 28674 Female 74.0 1 0 Yes \n", + "\n", + " work_type Residence_type avg_glucose_level bmi smoking_status \\\n", + "0 children Rural 95.12 18.0 NaN \n", + "1 Private Urban 87.96 39.2 never smoked \n", + "2 Private Urban 110.89 17.6 NaN \n", + "3 Private Rural 69.04 35.9 formerly smoked \n", + "4 Never_worked Rural 161.28 19.1 NaN \n", + "5 Private Urban 210.95 50.1 NaN \n", + "6 Private Urban 77.59 17.7 formerly smoked \n", + "7 Self-employed Rural 243.53 27.0 never smoked \n", + "8 Private Rural 77.67 32.3 smokes \n", + "9 Self-employed Urban 205.84 54.6 never smoked \n", + "\n", + " stroke \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "5 0 \n", + "6 0 \n", + "7 0 \n", + "8 0 \n", + "9 0 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv('trainFile.csv')\n", + "df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 43400 entries, 0 to 43399\n", + "Data columns (total 12 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 43400 non-null int64 \n", + " 1 gender 43400 non-null object \n", + " 2 age 43400 non-null float64\n", + " 3 hypertension 43400 non-null int64 \n", + " 4 heart_disease 43400 non-null int64 \n", + " 5 ever_married 43400 non-null object \n", + " 6 work_type 43400 non-null object \n", + " 7 Residence_type 43400 non-null object \n", + " 8 avg_glucose_level 43400 non-null float64\n", + " 9 bmi 41938 non-null float64\n", + " 10 smoking_status 30108 non-null object \n", + " 11 stroke 43400 non-null int64 \n", + "dtypes: float64(3), int64(4), object(5)\n", + "memory usage: 4.0+ MB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idagehypertensionheart_diseaseavg_glucose_levelbmistroke
count43400.00000043400.00000043400.00000043400.00000043400.00000041938.00000043400.000000
mean36326.14235042.2178940.0935710.047512104.48275028.6050380.018041
std21072.13487922.5196490.2912350.21273343.1117517.7700200.133103
min1.0000000.0800000.0000000.00000055.00000010.1000000.000000
25%18038.50000024.0000000.0000000.00000077.54000023.2000000.000000
50%36351.50000044.0000000.0000000.00000091.58000027.7000000.000000
75%54514.25000060.0000000.0000000.000000112.07000032.9000000.000000
max72943.00000082.0000001.0000001.000000291.05000097.6000001.000000
\n", + "
" + ], + "text/plain": [ + " id age hypertension heart_disease \\\n", + "count 43400.000000 43400.000000 43400.000000 43400.000000 \n", + "mean 36326.142350 42.217894 0.093571 0.047512 \n", + "std 21072.134879 22.519649 0.291235 0.212733 \n", + "min 1.000000 0.080000 0.000000 0.000000 \n", + "25% 18038.500000 24.000000 0.000000 0.000000 \n", + "50% 36351.500000 44.000000 0.000000 0.000000 \n", + "75% 54514.250000 60.000000 0.000000 0.000000 \n", + "max 72943.000000 82.000000 1.000000 1.000000 \n", + "\n", + " avg_glucose_level bmi stroke \n", + "count 43400.000000 41938.000000 43400.000000 \n", + "mean 104.482750 28.605038 0.018041 \n", + "std 43.111751 7.770020 0.133103 \n", + "min 55.000000 10.100000 0.000000 \n", + "25% 77.540000 23.200000 0.000000 \n", + "50% 91.580000 27.700000 0.000000 \n", + "75% 112.070000 32.900000 0.000000 \n", + "max 291.050000 97.600000 1.000000 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Male' 'Female' 'Other']\n", + "['children' 'Private' 'Never_worked' 'Self-employed' 'Govt_job']\n", + "['Rural' 'Urban']\n", + "[nan 'never smoked' 'formerly smoked' 'smokes']\n", + "['No' 'Yes']\n" + ] + } + ], + "source": [ + "print(df['gender'].unique())\n", + "print(df['work_type'].unique())\n", + "print(df['Residence_type'].unique())\n", + "print(df['smoking_status'].unique())\n", + "print(df['ever_married'].unique())" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0\n", + "gender 0\n", + "age 0\n", + "hypertension 0\n", + "heart_disease 0\n", + "ever_married 0\n", + "work_type 0\n", + "Residence_type 0\n", + "avg_glucose_level 0\n", + "bmi 1462\n", + "smoking_status 13292\n", + "stroke 0\n", + "dtype: int64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "df = df.dropna(how = 'any', axis=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 29072 entries, 1 to 43399\n", + "Data columns (total 12 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 29072 non-null int64 \n", + " 1 gender 29072 non-null object \n", + " 2 age 29072 non-null float64\n", + " 3 hypertension 29072 non-null int64 \n", + " 4 heart_disease 29072 non-null int64 \n", + " 5 ever_married 29072 non-null object \n", + " 6 work_type 29072 non-null object \n", + " 7 Residence_type 29072 non-null object \n", + " 8 avg_glucose_level 29072 non-null float64\n", + " 9 bmi 29072 non-null float64\n", + " 10 smoking_status 29072 non-null object \n", + " 11 stroke 29072 non-null int64 \n", + "dtypes: float64(3), int64(4), object(5)\n", + "memory usage: 2.9+ MB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "df.to_excel('datasetCleaned.xlsx')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + " \n", + " " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import cufflinks as cf\n", + "\n", + "cf.go_offline()\n", + "cf.set_config_file(offline=False, world_readable=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.8.13 ('StrokePredictionModel')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "6d6bab66b583e7661b89cead2220317a23c391a40fb8c52f2c1bcd3c04f3fbda" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}