{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np \n", "import pandas as pd \n", "import os\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idgenderagehypertensionheart_diseaseever_marriedwork_typeResidence_typeavg_glucose_levelbmismoking_statusstroke
030669Male3.000NochildrenRural95.1218.0NaN0
130468Male58.010YesPrivateUrban87.9639.2never smoked0
216523Female8.000NoPrivateUrban110.8917.6NaN0
356543Female70.000YesPrivateRural69.0435.9formerly smoked0
446136Male14.000NoNever_workedRural161.2819.1NaN0
532257Female47.000YesPrivateUrban210.9550.1NaN0
652800Female52.000YesPrivateUrban77.5917.7formerly smoked0
741413Female75.001YesSelf-employedRural243.5327.0never smoked0
815266Female32.000YesPrivateRural77.6732.3smokes0
928674Female74.010YesSelf-employedUrban205.8454.6never smoked0
\n", "
" ], "text/plain": [ " id gender age hypertension heart_disease ever_married \\\n", "0 30669 Male 3.0 0 0 No \n", "1 30468 Male 58.0 1 0 Yes \n", "2 16523 Female 8.0 0 0 No \n", "3 56543 Female 70.0 0 0 Yes \n", "4 46136 Male 14.0 0 0 No \n", "5 32257 Female 47.0 0 0 Yes \n", "6 52800 Female 52.0 0 0 Yes \n", "7 41413 Female 75.0 0 1 Yes \n", "8 15266 Female 32.0 0 0 Yes \n", "9 28674 Female 74.0 1 0 Yes \n", "\n", " work_type Residence_type avg_glucose_level bmi smoking_status \\\n", "0 children Rural 95.12 18.0 NaN \n", "1 Private Urban 87.96 39.2 never smoked \n", "2 Private Urban 110.89 17.6 NaN \n", "3 Private Rural 69.04 35.9 formerly smoked \n", "4 Never_worked Rural 161.28 19.1 NaN \n", "5 Private Urban 210.95 50.1 NaN \n", "6 Private Urban 77.59 17.7 formerly smoked \n", "7 Self-employed Rural 243.53 27.0 never smoked \n", "8 Private Rural 77.67 32.3 smokes \n", "9 Self-employed Urban 205.84 54.6 never smoked \n", "\n", " stroke \n", "0 0 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 0 \n", "5 0 \n", "6 0 \n", "7 0 \n", "8 0 \n", "9 0 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Read the training CSV from the working directory and preview ten rows.\n", "df = pd.read_csv(filepath_or_buffer='trainFile.csv')\n", "df.head(n=10)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 43400 entries, 0 to 43399\n", "Data columns (total 12 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 id 43400 non-null int64 \n", " 1 gender 43400 non-null object \n", " 2 age 43400 non-null float64\n", " 3 hypertension 43400 non-null int64 \n", " 4 heart_disease 43400 non-null int64 \n", " 5 ever_married 43400 non-null object \n", " 6 work_type 43400 non-null object \n", " 7 Residence_type 43400 non-null object \n", " 8 avg_glucose_level 43400 non-null float64\n", " 9 bmi 41938 non-null float64\n", " 10 smoking_status 30108 non-null object \n", " 11 stroke 43400 non-null int64 \n", "dtypes: 
float64(3), int64(4), object(5)\n", "memory usage: 4.0+ MB\n" ] } ], "source": [ "# Column dtypes and non-null counts; gaps here reveal missing values.\n", "df.info()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idagehypertensionheart_diseaseavg_glucose_levelbmistroke
count43400.00000043400.00000043400.00000043400.00000043400.00000041938.00000043400.000000
mean36326.14235042.2178940.0935710.047512104.48275028.6050380.018041
std21072.13487922.5196490.2912350.21273343.1117517.7700200.133103
min1.0000000.0800000.0000000.00000055.00000010.1000000.000000
25%18038.50000024.0000000.0000000.00000077.54000023.2000000.000000
50%36351.50000044.0000000.0000000.00000091.58000027.7000000.000000
75%54514.25000060.0000000.0000000.000000112.07000032.9000000.000000
max72943.00000082.0000001.0000001.000000291.05000097.6000001.000000
\n", "
" ], "text/plain": [ " id age hypertension heart_disease \\\n", "count 43400.000000 43400.000000 43400.000000 43400.000000 \n", "mean 36326.142350 42.217894 0.093571 0.047512 \n", "std 21072.134879 22.519649 0.291235 0.212733 \n", "min 1.000000 0.080000 0.000000 0.000000 \n", "25% 18038.500000 24.000000 0.000000 0.000000 \n", "50% 36351.500000 44.000000 0.000000 0.000000 \n", "75% 54514.250000 60.000000 0.000000 0.000000 \n", "max 72943.000000 82.000000 1.000000 1.000000 \n", "\n", " avg_glucose_level bmi stroke \n", "count 43400.000000 41938.000000 43400.000000 \n", "mean 104.482750 28.605038 0.018041 \n", "std 43.111751 7.770020 0.133103 \n", "min 55.000000 10.100000 0.000000 \n", "25% 77.540000 23.200000 0.000000 \n", "50% 91.580000 27.700000 0.000000 \n", "75% 112.070000 32.900000 0.000000 \n", "max 291.050000 97.600000 1.000000 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Descriptive statistics for the numeric columns.\n", "df.describe()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Male' 'Female' 'Other']\n", "['children' 'Private' 'Never_worked' 'Self-employed' 'Govt_job']\n", "['Rural' 'Urban']\n", "[nan 'never smoked' 'formerly smoked' 'smokes']\n", "['No' 'Yes']\n" ] } ], "source": [ "# Print the distinct values of each categorical column, one line per column.\n", "categorical_columns = ['gender', 'work_type', 'Residence_type', 'smoking_status', 'ever_married']\n", "for column_name in categorical_columns:\n", "    print(df[column_name].unique())" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "id 0\n", "gender 0\n", "age 0\n", "hypertension 0\n", "heart_disease 0\n", "ever_married 0\n", "work_type 0\n", "Residence_type 0\n", "avg_glucose_level 0\n", "bmi 1462\n", "smoking_status 13292\n", "stroke 0\n", "dtype: int64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Count missing values per column (isna is the canonical name of isnull).\n", "df.isna().sum()" ] }, { "cell_type": "code", "execution_count": 7, 
"metadata": {}, "outputs": [], "source": [ "# Drop every row that has at least one missing value.\n", "# The result gets its own name so the raw `df` shown in the cells above\n", "# stays intact and this cell can be re-run without changing its input.\n", "df_clean = df.dropna(how='any', axis=0)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 29072 entries, 1 to 43399\n", "Data columns (total 12 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 id 29072 non-null int64 \n", " 1 gender 29072 non-null object \n", " 2 age 29072 non-null float64\n", " 3 hypertension 29072 non-null int64 \n", " 4 heart_disease 29072 non-null int64 \n", " 5 ever_married 29072 non-null object \n", " 6 work_type 29072 non-null object \n", " 7 Residence_type 29072 non-null object \n", " 8 avg_glucose_level 29072 non-null float64\n", " 9 bmi 29072 non-null float64\n", " 10 smoking_status 29072 non-null object \n", " 11 stroke 29072 non-null int64 \n", "dtypes: float64(3), int64(4), object(5)\n", "memory usage: 2.9+ MB\n" ] } ], "source": [ "# Confirm that no column of the cleaned frame has missing values left.\n", "df_clean.info()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# Persist the cleaned rows; the original row labels are written as the first column.\n", "df_clean.to_excel('datasetCleaned.xlsx')" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ " \n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import cufflinks as cf\n", "\n", "# Render plotly figures locally inside the notebook.\n", "cf.go_offline()\n", "# Persist a configuration that agrees with the offline mode selected above.\n", "# The previous call passed offline=False and world_readable=True, which\n", "# contradicts go_offline() and asks for publicly readable figures; this\n", "# dataset holds patient health records, so keep it offline and private.\n", "cf.set_config_file(offline=True, world_readable=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.8.13 ('StrokePredictionModel')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.13" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "6d6bab66b583e7661b89cead2220317a23c391a40fb8c52f2c1bcd3c04f3fbda" } } }, "nbformat": 4, "nbformat_minor": 2 }