diff --git a/PneumoniaClassificationModel/main.ipynb b/PneumoniaClassificationModel/main.ipynb new file mode 100644 index 0000000..64165f7 --- /dev/null +++ b/PneumoniaClassificationModel/main.ipynb @@ -0,0 +1,144 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Pneumonia Classification Model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introduction + Set-up" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Machine learning has a phenomenal range of applications, including in health and diagnostics. This tutorial will explain the complete pipeline from loading data to predicting results, and it will explain how to build an X-ray image classification model from scratch to predict whether an X-ray scan shows presence of pneumonia. This is especially useful during these current times as COVID-19 is known to cause pneumonia.\n", + "\n", + "This tutorial will explain how to utilize TPUs efficiently, load in image data, build and train a convolutional neural network, finetune and regularize the model, and predict results. Data augmentation is not included in the model because X-ray scans are only taken in a specific orientation, and variations such as flips and rotations will not exist in real X-ray images.\n", + "\n", + "Run the following cell to load the necessary packages. Make sure to change the Accelerator on the right to TPU."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of replicas: 1\n", + "2.9.1\n" + ] + } + ], + "source": [ + "import re\n", + "import os\n", + "import numpy as np\n", + "import pandas as pd\n", + "import tensorflow as tf\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "try:\n", + " tpu = tf.distribute.cluster_resolver.TPUClusterResolver()\n", + " print('Device:', tpu.master())\n", + " tf.config.experimental_connect_to_cluster(tpu)\n", + " tf.tpu.experimental.initialize_tpu_system(tpu)\n", + " strategy = tf.distribute.experimental.TPUStrategy(tpu)\n", + "except:\n", + " strategy = tf.distribute.get_strategy()\n", + "print('Number of replicas:', strategy.num_replicas_in_sync)\n", + " \n", + "print(tf.__version__)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "AUTOTUNE = tf.data.experimental.AUTOTUNE\n", + "BATCH_SIZE = 16 * strategy.num_replicas_in_sync\n", + "IMAGE_SIZE = [180, 180]\n", + "EPOCHS = 25" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Chest X-ray data we are using from Cell divides the data into train, val, and test files. There are only 16 files in the validation folder, and we would prefer to have a less extreme division between the training and the validation set. We will append the validation files and create a new split that resembles the standard 80:20 division instead." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. 
Adjust any of the aforementioned parameters.", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32mx:\\Maneesha\\GitHub\\ML Project\\PneumoniaClassificationModel\\main.ipynb Cell 8\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m filenames \u001b[39m=\u001b[39m tf\u001b[39m.\u001b[39mio\u001b[39m.\u001b[39mgfile\u001b[39m.\u001b[39mglob(\u001b[39mstr\u001b[39m(\u001b[39m'\u001b[39m\u001b[39mPneumoniaClassificationModel/DataFrames/train/\u001b[39m\u001b[39m'\u001b[39m))\n\u001b[0;32m 2\u001b[0m filenames\u001b[39m.\u001b[39mextend(tf\u001b[39m.\u001b[39mio\u001b[39m.\u001b[39mgfile\u001b[39m.\u001b[39mglob(\u001b[39mstr\u001b[39m(\u001b[39m'\u001b[39m\u001b[39mPneumoniaClassificationModel/DataFrames/val/\u001b[39m\u001b[39m'\u001b[39m)))\n\u001b[1;32m----> 4\u001b[0m train_filenames, val_filenames \u001b[39m=\u001b[39m train_test_split(filenames, test_size\u001b[39m=\u001b[39;49m\u001b[39m0.2\u001b[39;49m)\n", + "File \u001b[1;32my:\\Anaconda\\lib\\site-packages\\sklearn\\model_selection\\_split.py:2433\u001b[0m, in \u001b[0;36mtrain_test_split\u001b[1;34m(test_size, train_size, random_state, shuffle, stratify, *arrays)\u001b[0m\n\u001b[0;32m 2430\u001b[0m arrays \u001b[39m=\u001b[39m indexable(\u001b[39m*\u001b[39marrays)\n\u001b[0;32m 2432\u001b[0m n_samples \u001b[39m=\u001b[39m _num_samples(arrays[\u001b[39m0\u001b[39m])\n\u001b[1;32m-> 2433\u001b[0m n_train, n_test \u001b[39m=\u001b[39m _validate_shuffle_split(\n\u001b[0;32m 2434\u001b[0m n_samples, test_size, train_size, default_test_size\u001b[39m=\u001b[39;49m\u001b[39m0.25\u001b[39;49m\n\u001b[0;32m 2435\u001b[0m )\n\u001b[0;32m 2437\u001b[0m \u001b[39mif\u001b[39;00m shuffle \u001b[39mis\u001b[39;00m \u001b[39mFalse\u001b[39;00m:\n\u001b[0;32m 2438\u001b[0m \u001b[39mif\u001b[39;00m stratify 
\u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n", + "File \u001b[1;32my:\\Anaconda\\lib\\site-packages\\sklearn\\model_selection\\_split.py:2111\u001b[0m, in \u001b[0;36m_validate_shuffle_split\u001b[1;34m(n_samples, test_size, train_size, default_test_size)\u001b[0m\n\u001b[0;32m 2108\u001b[0m n_train, n_test \u001b[39m=\u001b[39m \u001b[39mint\u001b[39m(n_train), \u001b[39mint\u001b[39m(n_test)\n\u001b[0;32m 2110\u001b[0m \u001b[39mif\u001b[39;00m n_train \u001b[39m==\u001b[39m \u001b[39m0\u001b[39m:\n\u001b[1;32m-> 2111\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[0;32m 2112\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mWith n_samples=\u001b[39m\u001b[39m{}\u001b[39;00m\u001b[39m, test_size=\u001b[39m\u001b[39m{}\u001b[39;00m\u001b[39m and train_size=\u001b[39m\u001b[39m{}\u001b[39;00m\u001b[39m, the \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 2113\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mresulting train set will be empty. Adjust any of the \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 2114\u001b[0m \u001b[39m\"\u001b[39m\u001b[39maforementioned parameters.\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m.\u001b[39mformat(n_samples, test_size, train_size)\n\u001b[0;32m 2115\u001b[0m )\n\u001b[0;32m 2117\u001b[0m \u001b[39mreturn\u001b[39;00m n_train, n_test\n", + "\u001b[1;31mValueError\u001b[0m: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters." 
+ ] + } + ], + "source": [ + "filenames = tf.io.gfile.glob(str('PneumoniaClassificationModel/DataFrames/train/*/*'))\n", + "filenames.extend(tf.io.gfile.glob(str('PneumoniaClassificationModel/DataFrames/val/*/*')))\n", + "\n", + "train_filenames, val_filenames = train_test_split(filenames, test_size=0.2)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.8.13 ('base')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "5819c1eaf6d552792a1bbc5e8998e6c2149ab26a1973a0d78107c0d9954e5ba0" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}