{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np \n",
"import pandas as pd \n",
"import os\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" gender | \n",
" age | \n",
" hypertension | \n",
" heart_disease | \n",
" ever_married | \n",
" work_type | \n",
" Residence_type | \n",
" avg_glucose_level | \n",
" bmi | \n",
" smoking_status | \n",
" stroke | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 30669 | \n",
" Male | \n",
" 3.0 | \n",
" 0 | \n",
" 0 | \n",
" No | \n",
" children | \n",
" Rural | \n",
" 95.12 | \n",
" 18.0 | \n",
" NaN | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 30468 | \n",
" Male | \n",
" 58.0 | \n",
" 1 | \n",
" 0 | \n",
" Yes | \n",
" Private | \n",
" Urban | \n",
" 87.96 | \n",
" 39.2 | \n",
" never smoked | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 16523 | \n",
" Female | \n",
" 8.0 | \n",
" 0 | \n",
" 0 | \n",
" No | \n",
" Private | \n",
" Urban | \n",
" 110.89 | \n",
" 17.6 | \n",
" NaN | \n",
" 0 | \n",
"
\n",
" \n",
" | 3 | \n",
" 56543 | \n",
" Female | \n",
" 70.0 | \n",
" 0 | \n",
" 0 | \n",
" Yes | \n",
" Private | \n",
" Rural | \n",
" 69.04 | \n",
" 35.9 | \n",
" formerly smoked | \n",
" 0 | \n",
"
\n",
" \n",
" | 4 | \n",
" 46136 | \n",
" Male | \n",
" 14.0 | \n",
" 0 | \n",
" 0 | \n",
" No | \n",
" Never_worked | \n",
" Rural | \n",
" 161.28 | \n",
" 19.1 | \n",
" NaN | \n",
" 0 | \n",
"
\n",
" \n",
" | 5 | \n",
" 32257 | \n",
" Female | \n",
" 47.0 | \n",
" 0 | \n",
" 0 | \n",
" Yes | \n",
" Private | \n",
" Urban | \n",
" 210.95 | \n",
" 50.1 | \n",
" NaN | \n",
" 0 | \n",
"
\n",
" \n",
" | 6 | \n",
" 52800 | \n",
" Female | \n",
" 52.0 | \n",
" 0 | \n",
" 0 | \n",
" Yes | \n",
" Private | \n",
" Urban | \n",
" 77.59 | \n",
" 17.7 | \n",
" formerly smoked | \n",
" 0 | \n",
"
\n",
" \n",
" | 7 | \n",
" 41413 | \n",
" Female | \n",
" 75.0 | \n",
" 0 | \n",
" 1 | \n",
" Yes | \n",
" Self-employed | \n",
" Rural | \n",
" 243.53 | \n",
" 27.0 | \n",
" never smoked | \n",
" 0 | \n",
"
\n",
" \n",
" | 8 | \n",
" 15266 | \n",
" Female | \n",
" 32.0 | \n",
" 0 | \n",
" 0 | \n",
" Yes | \n",
" Private | \n",
" Rural | \n",
" 77.67 | \n",
" 32.3 | \n",
" smokes | \n",
" 0 | \n",
"
\n",
" \n",
" | 9 | \n",
" 28674 | \n",
" Female | \n",
" 74.0 | \n",
" 1 | \n",
" 0 | \n",
" Yes | \n",
" Self-employed | \n",
" Urban | \n",
" 205.84 | \n",
" 54.6 | \n",
" never smoked | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id gender age hypertension heart_disease ever_married \\\n",
"0 30669 Male 3.0 0 0 No \n",
"1 30468 Male 58.0 1 0 Yes \n",
"2 16523 Female 8.0 0 0 No \n",
"3 56543 Female 70.0 0 0 Yes \n",
"4 46136 Male 14.0 0 0 No \n",
"5 32257 Female 47.0 0 0 Yes \n",
"6 52800 Female 52.0 0 0 Yes \n",
"7 41413 Female 75.0 0 1 Yes \n",
"8 15266 Female 32.0 0 0 Yes \n",
"9 28674 Female 74.0 1 0 Yes \n",
"\n",
" work_type Residence_type avg_glucose_level bmi smoking_status \\\n",
"0 children Rural 95.12 18.0 NaN \n",
"1 Private Urban 87.96 39.2 never smoked \n",
"2 Private Urban 110.89 17.6 NaN \n",
"3 Private Rural 69.04 35.9 formerly smoked \n",
"4 Never_worked Rural 161.28 19.1 NaN \n",
"5 Private Urban 210.95 50.1 NaN \n",
"6 Private Urban 77.59 17.7 formerly smoked \n",
"7 Self-employed Rural 243.53 27.0 never smoked \n",
"8 Private Rural 77.67 32.3 smokes \n",
"9 Self-employed Urban 205.84 54.6 never smoked \n",
"\n",
" stroke \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"5 0 \n",
"6 0 \n",
"7 0 \n",
"8 0 \n",
"9 0 "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the training CSV (the path is relative to the current working directory)\n",
"# and preview its first ten rows.\n",
"df = pd.read_csv(filepath_or_buffer='trainFile.csv')\n",
"df.head(n=10)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 43400 entries, 0 to 43399\n",
"Data columns (total 12 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 43400 non-null int64 \n",
" 1 gender 43400 non-null object \n",
" 2 age 43400 non-null float64\n",
" 3 hypertension 43400 non-null int64 \n",
" 4 heart_disease 43400 non-null int64 \n",
" 5 ever_married 43400 non-null object \n",
" 6 work_type 43400 non-null object \n",
" 7 Residence_type 43400 non-null object \n",
" 8 avg_glucose_level 43400 non-null float64\n",
" 9 bmi 41938 non-null float64\n",
" 10 smoking_status 30108 non-null object \n",
" 11 stroke 43400 non-null int64 \n",
"dtypes: float64(3), int64(4), object(5)\n",
"memory usage: 4.0+ MB\n"
]
}
],
"source": [
"# Column dtypes and non-null counts of the frame as loaded.\n",
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" age | \n",
" hypertension | \n",
" heart_disease | \n",
" avg_glucose_level | \n",
" bmi | \n",
" stroke | \n",
"
\n",
" \n",
" \n",
" \n",
" | count | \n",
" 43400.000000 | \n",
" 43400.000000 | \n",
" 43400.000000 | \n",
" 43400.000000 | \n",
" 43400.000000 | \n",
" 41938.000000 | \n",
" 43400.000000 | \n",
"
\n",
" \n",
" | mean | \n",
" 36326.142350 | \n",
" 42.217894 | \n",
" 0.093571 | \n",
" 0.047512 | \n",
" 104.482750 | \n",
" 28.605038 | \n",
" 0.018041 | \n",
"
\n",
" \n",
" | std | \n",
" 21072.134879 | \n",
" 22.519649 | \n",
" 0.291235 | \n",
" 0.212733 | \n",
" 43.111751 | \n",
" 7.770020 | \n",
" 0.133103 | \n",
"
\n",
" \n",
" | min | \n",
" 1.000000 | \n",
" 0.080000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 55.000000 | \n",
" 10.100000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" | 25% | \n",
" 18038.500000 | \n",
" 24.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 77.540000 | \n",
" 23.200000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" | 50% | \n",
" 36351.500000 | \n",
" 44.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 91.580000 | \n",
" 27.700000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" | 75% | \n",
" 54514.250000 | \n",
" 60.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 112.070000 | \n",
" 32.900000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" | max | \n",
" 72943.000000 | \n",
" 82.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 291.050000 | \n",
" 97.600000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id age hypertension heart_disease \\\n",
"count 43400.000000 43400.000000 43400.000000 43400.000000 \n",
"mean 36326.142350 42.217894 0.093571 0.047512 \n",
"std 21072.134879 22.519649 0.291235 0.212733 \n",
"min 1.000000 0.080000 0.000000 0.000000 \n",
"25% 18038.500000 24.000000 0.000000 0.000000 \n",
"50% 36351.500000 44.000000 0.000000 0.000000 \n",
"75% 54514.250000 60.000000 0.000000 0.000000 \n",
"max 72943.000000 82.000000 1.000000 1.000000 \n",
"\n",
" avg_glucose_level bmi stroke \n",
"count 43400.000000 41938.000000 43400.000000 \n",
"mean 104.482750 28.605038 0.018041 \n",
"std 43.111751 7.770020 0.133103 \n",
"min 55.000000 10.100000 0.000000 \n",
"25% 77.540000 23.200000 0.000000 \n",
"50% 91.580000 27.700000 0.000000 \n",
"75% 112.070000 32.900000 0.000000 \n",
"max 291.050000 97.600000 1.000000 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics; with default arguments describe() reports only the numeric columns.\n",
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Male' 'Female' 'Other']\n",
"['children' 'Private' 'Never_worked' 'Self-employed' 'Govt_job']\n",
"['Rural' 'Urban']\n",
"[nan 'never smoked' 'formerly smoked' 'smokes']\n",
"['No' 'Yes']\n"
]
}
],
"source": [
"# Print the distinct values (NaN included) of each of the five object-dtype columns.\n",
"categorical_columns = ['gender', 'work_type', 'Residence_type', 'smoking_status', 'ever_married']\n",
"for column_name in categorical_columns:\n",
"    print(df[column_name].unique())"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"id 0\n",
"gender 0\n",
"age 0\n",
"hypertension 0\n",
"heart_disease 0\n",
"ever_married 0\n",
"work_type 0\n",
"Residence_type 0\n",
"avg_glucose_level 0\n",
"bmi 1462\n",
"smoking_status 13292\n",
"stroke 0\n",
"dtype: int64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Count the missing values in each column.\n",
"df.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Keep only fully populated rows: a row is dropped if any of its columns is missing.\n",
"# Note that this rebinds `df`, so the unfiltered frame is no longer reachable afterwards.\n",
"df = df.dropna(axis='index', how='any')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Int64Index: 29072 entries, 1 to 43399\n",
"Data columns (total 12 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 29072 non-null int64 \n",
" 1 gender 29072 non-null object \n",
" 2 age 29072 non-null float64\n",
" 3 hypertension 29072 non-null int64 \n",
" 4 heart_disease 29072 non-null int64 \n",
" 5 ever_married 29072 non-null object \n",
" 6 work_type 29072 non-null object \n",
" 7 Residence_type 29072 non-null object \n",
" 8 avg_glucose_level 29072 non-null float64\n",
" 9 bmi 29072 non-null float64\n",
" 10 smoking_status 29072 non-null object \n",
" 11 stroke 29072 non-null int64 \n",
"dtypes: float64(3), int64(4), object(5)\n",
"memory usage: 2.9+ MB\n"
]
}
],
"source": [
"# Re-inspect dtypes and non-null counts now that rows with missing values have been dropped.\n",
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Save the cleaned frame as an Excel workbook in the current working directory.\n",
"# The row index is written as the first column because to_excel defaults to index=True.\n",
"df.to_excel(excel_writer='datasetCleaned.xlsx')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import cufflinks as cf\n",
"\n",
"# Switch cufflinks to Plotly's offline (in-notebook) rendering mode for this session.\n",
"cf.go_offline()\n",
"# Persist defaults that agree with go_offline(): offline rendering, and private sharing\n",
"# because the frame holds per-person health records that must not be published by default.\n",
"cf.set_config_file(offline=True, world_readable=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8.13 ('StrokePredictionModel')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.13"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "6d6bab66b583e7661b89cead2220317a23c391a40fb8c52f2c1bcd3c04f3fbda"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}