{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np \n",
    "import pandas as pd \n",
    "import os\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>gender</th>\n",
       "      <th>age</th>\n",
       "      <th>hypertension</th>\n",
       "      <th>heart_disease</th>\n",
       "      <th>ever_married</th>\n",
       "      <th>work_type</th>\n",
       "      <th>Residence_type</th>\n",
       "      <th>avg_glucose_level</th>\n",
       "      <th>bmi</th>\n",
       "      <th>smoking_status</th>\n",
       "      <th>stroke</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>30669</td>\n",
       "      <td>Male</td>\n",
       "      <td>3.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>No</td>\n",
       "      <td>children</td>\n",
       "      <td>Rural</td>\n",
       "      <td>95.12</td>\n",
       "      <td>18.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>30468</td>\n",
       "      <td>Male</td>\n",
       "      <td>58.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Private</td>\n",
       "      <td>Urban</td>\n",
       "      <td>87.96</td>\n",
       "      <td>39.2</td>\n",
       "      <td>never smoked</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>16523</td>\n",
       "      <td>Female</td>\n",
       "      <td>8.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>No</td>\n",
       "      <td>Private</td>\n",
       "      <td>Urban</td>\n",
       "      <td>110.89</td>\n",
       "      <td>17.6</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>56543</td>\n",
       "      <td>Female</td>\n",
       "      <td>70.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Private</td>\n",
       "      <td>Rural</td>\n",
       "      <td>69.04</td>\n",
       "      <td>35.9</td>\n",
       "      <td>formerly smoked</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>46136</td>\n",
       "      <td>Male</td>\n",
       "      <td>14.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>No</td>\n",
       "      <td>Never_worked</td>\n",
       "      <td>Rural</td>\n",
       "      <td>161.28</td>\n",
       "      <td>19.1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>32257</td>\n",
       "      <td>Female</td>\n",
       "      <td>47.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Private</td>\n",
       "      <td>Urban</td>\n",
       "      <td>210.95</td>\n",
       "      <td>50.1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>52800</td>\n",
       "      <td>Female</td>\n",
       "      <td>52.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Private</td>\n",
       "      <td>Urban</td>\n",
       "      <td>77.59</td>\n",
       "      <td>17.7</td>\n",
       "      <td>formerly smoked</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>41413</td>\n",
       "      <td>Female</td>\n",
       "      <td>75.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Self-employed</td>\n",
       "      <td>Rural</td>\n",
       "      <td>243.53</td>\n",
       "      <td>27.0</td>\n",
       "      <td>never smoked</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>15266</td>\n",
       "      <td>Female</td>\n",
       "      <td>32.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Private</td>\n",
       "      <td>Rural</td>\n",
       "      <td>77.67</td>\n",
       "      <td>32.3</td>\n",
       "      <td>smokes</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>28674</td>\n",
       "      <td>Female</td>\n",
       "      <td>74.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Self-employed</td>\n",
       "      <td>Urban</td>\n",
       "      <td>205.84</td>\n",
       "      <td>54.6</td>\n",
       "      <td>never smoked</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      id  gender   age  hypertension  heart_disease ever_married  \\\n",
       "0  30669    Male   3.0             0              0           No   \n",
       "1  30468    Male  58.0             1              0          Yes   \n",
       "2  16523  Female   8.0             0              0           No   \n",
       "3  56543  Female  70.0             0              0          Yes   \n",
       "4  46136    Male  14.0             0              0           No   \n",
       "5  32257  Female  47.0             0              0          Yes   \n",
       "6  52800  Female  52.0             0              0          Yes   \n",
       "7  41413  Female  75.0             0              1          Yes   \n",
       "8  15266  Female  32.0             0              0          Yes   \n",
       "9  28674  Female  74.0             1              0          Yes   \n",
       "\n",
       "       work_type Residence_type  avg_glucose_level   bmi   smoking_status  \\\n",
       "0       children          Rural              95.12  18.0              NaN   \n",
       "1        Private          Urban              87.96  39.2     never smoked   \n",
       "2        Private          Urban             110.89  17.6              NaN   \n",
       "3        Private          Rural              69.04  35.9  formerly smoked   \n",
       "4   Never_worked          Rural             161.28  19.1              NaN   \n",
       "5        Private          Urban             210.95  50.1              NaN   \n",
       "6        Private          Urban              77.59  17.7  formerly smoked   \n",
       "7  Self-employed          Rural             243.53  27.0     never smoked   \n",
       "8        Private          Rural              77.67  32.3           smokes   \n",
       "9  Self-employed          Urban             205.84  54.6     never smoked   \n",
       "\n",
       "   stroke  \n",
       "0       0  \n",
       "1       0  \n",
       "2       0  \n",
       "3       0  \n",
       "4       0  \n",
       "5       0  \n",
       "6       0  \n",
       "7       0  \n",
       "8       0  \n",
       "9       0  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('trainFile.csv')\n",
    "df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 43400 entries, 0 to 43399\n",
      "Data columns (total 12 columns):\n",
      " #   Column             Non-Null Count  Dtype  \n",
      "---  ------             --------------  -----  \n",
      " 0   id                 43400 non-null  int64  \n",
      " 1   gender             43400 non-null  object \n",
      " 2   age                43400 non-null  float64\n",
      " 3   hypertension       43400 non-null  int64  \n",
      " 4   heart_disease      43400 non-null  int64  \n",
      " 5   ever_married       43400 non-null  object \n",
      " 6   work_type          43400 non-null  object \n",
      " 7   Residence_type     43400 non-null  object \n",
      " 8   avg_glucose_level  43400 non-null  float64\n",
      " 9   bmi                41938 non-null  float64\n",
      " 10  smoking_status     30108 non-null  object \n",
      " 11  stroke             43400 non-null  int64  \n",
      "dtypes: float64(3), int64(4), object(5)\n",
      "memory usage: 4.0+ MB\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>age</th>\n",
       "      <th>hypertension</th>\n",
       "      <th>heart_disease</th>\n",
       "      <th>avg_glucose_level</th>\n",
       "      <th>bmi</th>\n",
       "      <th>stroke</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>43400.000000</td>\n",
       "      <td>43400.000000</td>\n",
       "      <td>43400.000000</td>\n",
       "      <td>43400.000000</td>\n",
       "      <td>43400.000000</td>\n",
       "      <td>41938.000000</td>\n",
       "      <td>43400.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>36326.142350</td>\n",
       "      <td>42.217894</td>\n",
       "      <td>0.093571</td>\n",
       "      <td>0.047512</td>\n",
       "      <td>104.482750</td>\n",
       "      <td>28.605038</td>\n",
       "      <td>0.018041</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>21072.134879</td>\n",
       "      <td>22.519649</td>\n",
       "      <td>0.291235</td>\n",
       "      <td>0.212733</td>\n",
       "      <td>43.111751</td>\n",
       "      <td>7.770020</td>\n",
       "      <td>0.133103</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.080000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>55.000000</td>\n",
       "      <td>10.100000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>18038.500000</td>\n",
       "      <td>24.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>77.540000</td>\n",
       "      <td>23.200000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>36351.500000</td>\n",
       "      <td>44.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>91.580000</td>\n",
       "      <td>27.700000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>54514.250000</td>\n",
       "      <td>60.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>112.070000</td>\n",
       "      <td>32.900000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>72943.000000</td>\n",
       "      <td>82.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>291.050000</td>\n",
       "      <td>97.600000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 id           age  hypertension  heart_disease  \\\n",
       "count  43400.000000  43400.000000  43400.000000   43400.000000   \n",
       "mean   36326.142350     42.217894      0.093571       0.047512   \n",
       "std    21072.134879     22.519649      0.291235       0.212733   \n",
       "min        1.000000      0.080000      0.000000       0.000000   \n",
       "25%    18038.500000     24.000000      0.000000       0.000000   \n",
       "50%    36351.500000     44.000000      0.000000       0.000000   \n",
       "75%    54514.250000     60.000000      0.000000       0.000000   \n",
       "max    72943.000000     82.000000      1.000000       1.000000   \n",
       "\n",
       "       avg_glucose_level           bmi        stroke  \n",
       "count       43400.000000  41938.000000  43400.000000  \n",
       "mean          104.482750     28.605038      0.018041  \n",
       "std            43.111751      7.770020      0.133103  \n",
       "min            55.000000     10.100000      0.000000  \n",
       "25%            77.540000     23.200000      0.000000  \n",
       "50%            91.580000     27.700000      0.000000  \n",
       "75%           112.070000     32.900000      0.000000  \n",
       "max           291.050000     97.600000      1.000000  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Male' 'Female' 'Other']\n",
      "['children' 'Private' 'Never_worked' 'Self-employed' 'Govt_job']\n",
      "['Rural' 'Urban']\n",
      "[nan 'never smoked' 'formerly smoked' 'smokes']\n",
      "['No' 'Yes']\n"
     ]
    }
   ],
   "source": [
    "print(df['gender'].unique())\n",
    "print(df['work_type'].unique())\n",
    "print(df['Residence_type'].unique())\n",
    "print(df['smoking_status'].unique())\n",
    "print(df['ever_married'].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "id                       0\n",
       "gender                   0\n",
       "age                      0\n",
       "hypertension             0\n",
       "heart_disease            0\n",
       "ever_married             0\n",
       "work_type                0\n",
       "Residence_type           0\n",
       "avg_glucose_level        0\n",
       "bmi                   1462\n",
       "smoking_status       13292\n",
       "stroke                   0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.dropna(how = 'any', axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 29072 entries, 1 to 43399\n",
      "Data columns (total 12 columns):\n",
      " #   Column             Non-Null Count  Dtype  \n",
      "---  ------             --------------  -----  \n",
      " 0   id                 29072 non-null  int64  \n",
      " 1   gender             29072 non-null  object \n",
      " 2   age                29072 non-null  float64\n",
      " 3   hypertension       29072 non-null  int64  \n",
      " 4   heart_disease      29072 non-null  int64  \n",
      " 5   ever_married       29072 non-null  object \n",
      " 6   work_type          29072 non-null  object \n",
      " 7   Residence_type     29072 non-null  object \n",
      " 8   avg_glucose_level  29072 non-null  float64\n",
      " 9   bmi                29072 non-null  float64\n",
      " 10  smoking_status     29072 non-null  object \n",
      " 11  stroke             29072 non-null  int64  \n",
      "dtypes: float64(3), int64(4), object(5)\n",
      "memory usage: 2.9+ MB\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_excel('datasetCleaned.xlsx')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "        <script type=\"text/javascript\">\n",
       "        window.PlotlyConfig = {MathJaxConfig: 'local'};\n",
       "        if (window.MathJax && window.MathJax.Hub && window.MathJax.Hub.Config) {window.MathJax.Hub.Config({SVG: {font: \"STIX-Web\"}});}\n",
       "        if (typeof require !== 'undefined') {\n",
       "        require.undef(\"plotly\");\n",
       "        requirejs.config({\n",
       "            paths: {\n",
       "                'plotly': ['https://cdn.plot.ly/plotly-2.14.0.min']\n",
       "            }\n",
       "        });\n",
       "        require(['plotly'], function(Plotly) {\n",
       "            window._Plotly = Plotly;\n",
       "        });\n",
       "        }\n",
       "        </script>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import cufflinks as cf\n",
    "\n",
    "cf.go_offline()\n",
    "cf.set_config_file(offline=False, world_readable=True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.8.13 ('StrokePredictionModel')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "6d6bab66b583e7661b89cead2220317a23c391a40fb8c52f2c1bcd3c04f3fbda"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}