{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# ANOMALY DETECTION ALGORITHM " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## From unsupervised to supervised learning" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let df denote a dataframe with two features; number of bets per minute and 200's. We first fit that data into a probability distribution. After that we detect the outliers and finally we replace the extreme values (like load tests, big events etc.) with an average value. This mitigates the noise of our data. " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import itertools\n", "import logging\n", "import warnings\n", "import scipy.stats as st\n", "import statsmodels as sm\n", "import math\n", "# import urllib3\n", "from statistics import mean\n", "from scipy.stats import *\n", "from splunk_hec_handler import SplunkHecHandler\n", "from datetime import datetime, timedelta\n", "\n", "%matplotlib inline\n", "\n", "df = pd.read_csv(\"sportsbook_NN_data_spare.csv\")\n", "\n", "# urllib3.disable_warnings()\n", "\n", "# Create models from data\n", "\n", "def best_fit_distribution(data, bins=200, ax=None):\n", " \"\"\"Model data by finding best fit distribution to data\"\"\"\n", " # Get histogram of original data\n", " y, x = np.histogram(data, bins=bins, density=True)\n", " x = (x + np.roll(x, -1))[:-1] / 2.0\n", "\n", " # Distributions to check\n", " DISTRIBUTIONS = [ \n", " st.alpha,st.anglit,st.arcsine,st.beta,st.betaprime,st.bradford,st.burr,st.cauchy,st.chi,st.chi2,st.cosine,\n", " st.dgamma,st.dweibull,st.erlang,st.expon,st.exponnorm,st.exponweib,st.exponpow,st.f,st.fatiguelife,st.fisk,\n", " st.foldcauchy,st.foldnorm,st.genlogistic,st.genpareto,st.gennorm,st.genexpon,\n", " st.genextreme,st.gausshyper,st.gamma,st.gengamma,st.genhalflogistic,st.gilbrat,st.gompertz,st.gumbel_r,\n", " st.gumbel_l,st.halfcauchy,st.halflogistic\n", " ]\n", "\n", "\n", " # Best holders\n", " best_distribution = st.norm\n", " best_params = (0.0, 1.0)\n", " best_sse = np.inf\n", "\n", " \n", " list_s = []\n", " # Estimate distribution parameters from data\n", " for distribution in DISTRIBUTIONS:\n", "\n", " # Try to fit the distribution\n", " try:\n", " # Ignore warnings from data that can't be fit\n", " with warnings.catch_warnings():\n", " warnings.filterwarnings('ignore')\n", "\n", " # fit dist to data\n", " params = distribution.fit(data)\n", "\n", " # Separate parts of parameters\n", " arg = params[:-2]\n", " loc = params[-2]\n", " scale = params[-1]\n", "\n", " # Calculate fitted PDF and error with fit in distribution\n", " pdf = distribution.pdf(x, loc=loc, scale=scale, *arg)\n", " sse = np.sum(np.power(y - pdf, 2.0))\n", "\n", " # if axis pass in add to plot\n", " try:\n", " if ax:\n", " pd.Series(pdf, x).plot(ax=ax)\n", " end\n", " except Exception:\n", " pass\n", "\n", " # identify if this distribution is better\n", " if best_sse > sse > 0:\n", " best_distribution = distribution\n", " best_params = params\n", " best_sse = sse\n", "\n", " except Exception:\n", " pass\n", " \n", " list_s.append((distribution.name,sse))\n", "\n", " z, w = zip(*list_s)\n", " \n", " data_frame = pd.DataFrame({'Dist': z, 'Error': w})\n", " print(data_frame.to_string())\n", " print(\"\\n\")\n", " print((list(z)[list(w).index(min(list(w)))],min(list(w))))\n", " print(\"\\n\")\n", " return (best_distribution.name, best_params)\n", "\n", "def make_pdf(dist, params, 
size=10000):\n", " \"\"\"Generate distributions's Probability Distribution Function \"\"\"\n", "\n", " # Separate parts of parameters\n", " arg = params[:-2]\n", " loc = params[-2]\n", " scale = params[-1]\n", "\n", " # Get sane start and end points of distribution\n", " start = dist.ppf(0.01, *arg, loc=loc, scale=scale) if arg else dist.ppf(0.01, loc=loc, scale=scale)\n", " end = dist.ppf(0.99, *arg, loc=loc, scale=scale) if arg else dist.ppf(0.99, loc=loc, scale=scale)\n", "\n", " # Build PDF and turn into pandas Series\n", " x = np.linspace(start, end, size)\n", " y = dist.pdf(x, loc=loc, scale=scale, *arg)\n", " pdf = pd.Series(y, x)\n", "\n", " return pdf\n", "\n", "# Load data from statsmodels datasets\n", "data = pd.Series(df[\"NoB\"]).iloc[-1440:]\n", "\n", "# Find best fit distribution\n", "best_fit_name, best_fit_params = best_fit_distribution(data, 200) \n", "best_dist = getattr(st, best_fit_name)\n", "\n", "# Make PDF with best params \n", "pdf = make_pdf(best_dist, best_fit_params)\n", "\n", "######################################################### Second Part ########################################################## \n", "\n", "# the code below disregards the ouliers in 200's; there are discrete cases where huge peaks appear (no-obvious reason) \n", "q = df[\"count_200\"].quantile(0.90)\n", "\n", "index_list_quantile = []\n", "\n", "for j in range(len(df[\"count_200\"])):\n", " if df[\"count_200\"][j] >= int(q):\n", " index_list_quantile.append(j)\n", "\n", "df[\"count_200\"].replace(df[\"count_200\"][index_list_quantile], df[\"count_200\"].mean(), inplace=True) \n", "\n", "# we define our main frame with Sportsbook Web number of bets per minute over some period of Time (here is 4 months approx.)\n", "main_df = df[[\"_time\", \"NoB\"]]\n", "\n", "Time_list = []\n", "dist = best_dist\n", "\n", "# takes the last day of the sample and creates a list with all minutes. 
\n", "# In principle we must be able to use epoch Time for constant data flow.\n", "for i in main_df[\"_time\"]: \n", " Time_list.append(i)\n", " \n", "# Time_ = map(lambda x : x.split(\" \")[1], Time_list)\n", "Time = Time_list \n", "\n", "# we enumerate the Time-list so that we can exploit previous elements\n", "\n", "listTime =[]\n", "\n", "for i in range(len(Time)):\n", " listTime.append((Time[i],i))\n", " \n", "# we split the listTime in to distinct lists for later use; j_1 contains \"%H:%M\", j_2 contains range(0,1441)\n", "J_1, J_2 = zip(*listTime)\n", "j_1 = list(J_1)\n", "j_2 = list(J_2)\n", "\n", "for i in range(len(j_1)):\n", " j_1[i] = j_1[i].split()[3]\n", " \n", "j_1 = j_1[:1440] \n", "\n", "def series(j):\n", " return list(main_df[main_df[\"_time\"].str.contains(j)][\"NoB\"])\n", "\n", "def fun_200(j):\n", " return list(df[df[\"_time\"].str.contains(j)][\"count_200\"].apply(lambda x : int(x)))\n", "\n", "def time(j): \n", " return list(main_df[main_df[\"_time\"].str.contains(j)][\"_time\"])\n", "\n", "def perc(mean,y): \n", " return (100*(y-mean))/mean\n", "\n", "list_character=[]\n", "list_char=[]\n", "list_1=[]\n", "list_2=[]\n", "list_3=[]\n", "\n", "for j in range(len(j_1)):\n", " \n", " mean_j = mean(series(j_1[j])[:-1])\n", " list_1.append(series(j_1[j])[-1])\n", " list_2.append(fun_200(j_1[j])[-1])\n", " list_3.append(time(j_1[j])[-1])\n", " \n", " if (perc(mean_j, series(j_1[j])[-1]) <= -20): \n", " \n", " if (max(series(j_1[j])) < best_dist.interval(0.95, *best_fit_params)[1]): \n", " \n", " if (perc(mean_j, series(j_1[j])[-1]) <= -70) & (perc(mean(series(j_1[j-1])), series(j_1[j-1])[-1]) <= -70) & (perc(mean(series(j_1[j-2])), series(j_1[j-2])[-1]) <= -70):\n", " x1 = \"High risk. The percentage drop is lower than {}% for more than 3 minutes.\".format(j_1[j])\n", " list_char.append(x1)\n", " print(x_1)\n", " \n", " \n", " elif (perc(mean(fun_200(j_1[j])[:-1]),fun_200(j_1[j])[-1]) < -20): \n", " x2 = \"There is a {} % drop at {} o'clock and {} drop at 200's\".format(math.ceil(perc(mean_j,series(j_1[j])[-1])),j_1[j],math.ceil(perc(mean(fun_200(j_1[j])[:-1]),fun_200(j_1[j])[-1])))\n", " list_char.append(x2)\n", " print(x2)\n", " \n", " else:\n", " print(\"{} We have a normal decrease.\".format(j_1[j])) \n", " else:\n", " if ((max(series(j_1[j-1])) > best_dist.interval(0.95, *best_fit_params)[1]) |\n", " (max(series(j_1[j-2])) > best_dist.interval(0.95, *best_fit_params)[1])):\n", " x3 = \"{} Low Risk\".format(j_1[j])\n", " list_char.append(x3)\n", " print(x3)\n", " \n", " else: \n", " x4 = \"{} No problem with the channel\".format(j_1[j])\n", " list_char.append(x4)\n", " print(x4)\n", " \n", " if 'drop' in list_char[-1]:\n", " list_character.append(1)\n", " else:\n", " list_character.append(0)\n", " \n", "df_ = pd.DataFrame({'_time':list_3, 'NoB':list_1, '200':list_2})\n", "df_train = pd.concat([df_.reset_index(drop=True),pd.DataFrame({'Characters':list_character}).reset_index(drop=True)], axis=1)\n", "\n", "df_train.to_csv('NN_training_data.csv', sep=',') " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Anomaly detection - Neural Network." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The first attempt is to use a simple logistic regression model;\n", "although it performs (unexpectedly) well, regression cannot self-learn. \n", "Therefore we move to more complex solutions." 
] }, { "cell_type": "code", "execution_count": 2, "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using TensorFlow backend.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "0.7993055555555556\n", "\n", "\n", " precision recall f1-score support\n", "\n", " 0 0.99 0.80 0.88 654\n", " 1 0.31 0.92 0.47 66\n", "\n", " micro avg 0.81 0.81 0.81 720\n", " macro avg 0.65 0.86 0.67 720\n", "weighted avg 0.93 0.81 0.84 720\n", "\n", "\n", "\n", "[[520 134]\n", " [ 5 61]]\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\dtomlinson\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", " FutureWarning)\n" ] } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import classification_report, confusion_matrix\n", "from imblearn.over_sampling import SMOTE\n", "\n", "df = pd.read_csv(\"NN_training_data.csv\")\n", "del df[\"Unnamed: 0\"]\n", "del df[\"_time\"]\n", "\n", "X = df[[\"NoB\",\"200\"]]\n", "y = df[\"Target\"]\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)\n", "\n", "smt = SMOTE()\n", "X_train, y_train = smt.fit_sample(X_train, y_train)\n", "\n", "log_reg = LogisticRegression()\n", "log_reg.fit(X_train, y_train)\n", "predictions=log_reg.predict(X_test)\n", "print(log_reg.score(X,y))\n", "print(\"\\n\")\n", "print(classification_report(y_test,predictions))\n", "print(\"\\n\")\n", "print(confusion_matrix(y_test,predictions))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We implement the MLPClassifier from scikit-learn. \n", "This is the neural network analogue of tensorflow in sklearn.\n", "It doesn't have the same functionality, thus if we don't hit \n", "a high score we won't use it further alone. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We implement other libraries from the sklearn deep learning toolkit. \n", "In the cell below we use also Stohastic Gradient Descent and Gradient \n", "Boosting classifier." 
] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.6763888888888889\n", "\n", "\n", "[[223 22]\n", " [ 10 105]]\n", "\n", "\n", " precision recall f1-score support\n", "\n", " 0 0.96 0.91 0.93 245\n", " 1 0.83 0.91 0.87 115\n", "\n", " micro avg 0.91 0.91 0.91 360\n", " macro avg 0.89 0.91 0.90 360\n", "weighted avg 0.92 0.91 0.91 360\n", "\n", "0.6763888888888889\n", "\n", "\n", "0.6763888888888889\n" ] } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "from sklearn.ensemble import GradientBoostingClassifier\n", "from sklearn.linear_model import SGDClassifier\n", "\n", "df = pd.read_csv(\"NN_training_data.csv\")\n", "del df[\"Unnamed: 0\"]\n", "del df[\"_time\"]\n", "\n", "X = df[[\"NoB\",\"200\"]]\n", "y = df[\"Target\"]\n", "\n", "from sklearn.preprocessing import StandardScaler\n", "scaler = StandardScaler()\n", "# Fit only to the training data\n", "scaler.fit(X_train)\n", "\n", "# Now apply the transformations to the data:\n", "X_train = scaler.transform(X_train)\n", "X_test = scaler.transform(X_test)\n", "\n", "from sklearn.neural_network import MLPClassifier\n", "from imblearn.over_sampling import SMOTE\n", "\n", "mlp = MLPClassifier(hidden_layer_sizes=(5,5), activation=\"relu\")\n", "\n", "mlp.fit(X_train,y_train)\n", "\n", "predictions = mlp.predict(X_test)\n", "\n", "print(mlp.score(X,y))\n", "print(\"\\n\")\n", "print(confusion_matrix(y_test,predictions))\n", "print(\"\\n\")\n", "print(classification_report(y_test,predictions))\n", "\n", "# mlp.coefs_\n", "# mlp.intercepts_\n", "\n", "model = GradientBoostingClassifier()\n", "model.fit(X_train, y_train)\n", "model.predict(X_test)\n", "\n", "print(model.score(X,y))\n", "print(\"\\n\")\n", "\n", "\n", "stohastic_model = SGDClassifier(loss=\"hinge\", penalty=\"l2\", max_iter=5)\n", "stohastic_model.fit(X_train, y_train)\n", "stohastic_model.predict(X_test)\n", "\n", "print(stohastic_model.score(X,y))\n", "\n", "# print(confusion_matrix(y_test,predictions))\n", "# print(\"\\n\")\n", "# print(classification_report(y_test,predictions))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Neural Network - Keras/Pandas/Numpy" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The first cell contains the baseline of the neural network. We \n", "only feed a neural network of sixty-neurons in the hidden leayer \n", "to check the accuracy generically. The second cell consists a \n", "Standardized version of the first. There is no other difference. \n", "Nevertheless with that small change we hit a better result (thus \n", "we keep that change). \n", "\n", "The third cell computationally is more efficient than the other \n", "two; we use a thirty-layer neural network (reduce it by half) and \n", "the width of the neural network smaller." 
] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Results: 69.36% (21.95%)\n" ] } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "import warnings\n", "from keras.models import Sequential\n", "from keras.layers import Dense\n", "from keras.wrappers.scikit_learn import KerasClassifier\n", "from sklearn.model_selection import cross_val_score\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.model_selection import StratifiedKFold\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.pipeline import Pipeline\n", "\n", "warnings.filterwarnings(\"ignore\")\n", "\n", "seed = 7\n", "numpy.random.seed(seed)\n", "\n", "df = pd.read_csv(\"NN_training_data.csv\")\n", "\n", "del df[\"Unnamed: 0\"]\n", "del df[\"_time\"]\n", "\n", "dataset = df.values\n", "\n", "X = df[[\"NoB\",\"200\"]]\n", "y = df[\"Target\"]\n", "\n", "# baseline model\n", "def create_baseline():\n", "# create model\n", " model = Sequential()\n", " model.add(Dense(60, input_dim=2, kernel_initializer='normal', activation='relu'))\n", " model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))\n", "# Compile model\n", " model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", " return model\n", "\n", "# evaluate model with standardized dataset\n", "estimator = KerasClassifier(build_fn=create_baseline, epochs=100, batch_size=5, verbose=0)\n", "kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)\n", "results = cross_val_score(estimator, X, y, cv=kfold)\n", "print(\"Results: %.2f%% (%.2f%%)\" % (results.mean()*100, results.std()*100))" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Standardized: 91.11% (3.06%)\n" ] } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "import warnings\n", "from keras.models import Sequential\n", "from keras.layers import Dense\n", "from keras.wrappers.scikit_learn import KerasClassifier\n", "from sklearn.model_selection import cross_val_score\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.model_selection import StratifiedKFold\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.pipeline import Pipeline\n", "\n", "warnings.filterwarnings(\"ignore\")\n", "\n", "seed = 7\n", "numpy.random.seed(seed)\n", "\n", "df = pd.read_csv(\"NN_training_data.csv\")\n", "\n", "del df[\"Unnamed: 0\"]\n", "del df[\"_time\"]\n", "\n", "dataset = df.values\n", "\n", "X = df[[\"NoB\",\"200\"]]\n", "y = df[\"Target\"]\n", "\n", "estimators = []\n", "estimators.append(('standardize', StandardScaler()))\n", "estimators.append(('mlp', KerasClassifier(build_fn=create_baseline, epochs=100, batch_size=5, verbose=0)))\n", "pipeline = Pipeline(estimators)\n", "kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)\n", "results = cross_val_score(pipeline, X, y, cv=kfold)\n", "print(\"Standardized: %.2f%% (%.2f%%)\" % (results.mean()*100, results.std()*100))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This is the most efficient NN we constructed so far. It's not \"very\" deep; fact that reflects the low complexity of our data. Nevertheless it takes ~5 minutes (when n_splits of cross_val_score has been set to 10) to evaluate the mean precision of the model." 
] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using TensorFlow backend.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "WARNING:tensorflow:From C:\\Users\\dtomlinson\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\tensorflow\\python\\framework\\op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Colocations handled automatically by placer.\n", "WARNING:tensorflow:From C:\\Users\\dtomlinson\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\tensorflow\\python\\ops\\math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Use tf.cast instead.\n", "Smaller Standardized NN: 91.11% (0.27%)\n", "[[2597 0]\n", " [ 245 38]]\n" ] } ], "source": [ "import numpy\n", "import pandas as pd\n", "import warnings\n", "from keras.models import Sequential\n", "from keras.layers import Dense, Dropout\n", "from keras.optimizers import SGD\n", "from keras.wrappers.scikit_learn import KerasClassifier\n", "from sklearn.model_selection import cross_val_score, cross_val_predict\n", "from sklearn.preprocessing import LabelEncoder, StandardScaler\n", "from sklearn.model_selection import StratifiedKFold\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.metrics import confusion_matrix\n", "from keras import regularizers\n", "\n", "warnings.filterwarnings(\"ignore\")\n", "\n", "seed = 7\n", "numpy.random.seed(seed)\n", "\n", "df = pd.read_csv(\"NN_training_data.csv\")\n", "\n", "del df[\"Unnamed: 0\"]\n", "del df[\"_time\"]\n", "\n", "dataset = df.values\n", "\n", "X = df[[\"NoB\",\"200\"]]\n", "y = df[\"Target\"]\n", "\n", "# smaller model\n", "def optimal_nn():\n", "# create model\n", " model = Sequential()\n", " model.add(Dense(5, input_dim=2, kernel_initializer='normal', activation='relu', kernel_regularizer=regularizers.l2(0.001)))\n", " model.add(Dense(5, activation='relu', kernel_regularizer=regularizers.l2(0.001)))\n", " model.add(Dense(5, activation='relu', kernel_regularizer=regularizers.l2(0.001)))\n", " model.add(Dense(1, activation='sigmoid', kernel_regularizer=regularizers.l2(0.001)))\n", "# Compile model\n", " model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", " return model\n", " \n", "estimators = []\n", "estimators.append(('standardize', StandardScaler()))\n", "estimators.append(('mlp', KerasClassifier(build_fn=optimal_nn, epochs=50, batch_size=5, verbose=0)))\n", "pipeline = Pipeline(estimators)\n", "kfold = StratifiedKFold(n_splits=2, shuffle=True, random_state=seed)\n", "results = cross_val_score(pipeline, X, y, cv=kfold)\n", "\n", "\n", "\n", "# CAUTION: the y_pred here does not necessarily return the same predictions with the results above. \n", "# Generally, cross_val_score and cross_predict form two different scikit_learn functions, therefore\n", "# obey their own rules; their difference is mainly how the data is split in batches.\n", "# Nevertheless, it forms an approximation of the cross_val_score. Remember, cross_val_predict fits\n", "# each data sample EXACTLY ONCE in a test set and the confusion_matrix returns the prediction for this \n", "# data sample that given time (hence explains why we have: len(y_pred)=len(y)=1440). 
\n", "\n", "y_pred = cross_val_predict(pipeline, X, y, cv=kfold)\n", "conf_mat = confusion_matrix(y, y_pred)\n", "print(\"Smaller Standardized NN: %.2f%% (%.2f%%)\" % (results.mean()*100, results.std()*100))\n", "print(conf_mat)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "VERY Interesting fact : Without using the StandardScaler() function our model is really poor. In particular, it hits 63% percent without the StandardScaler() and furthermore it doesn't recognize the y-values equal to 1 (i.e., when there is a problem); in other words it's not working. Below the first cell contains the non-standardized version and the second when StandardScaler() is considered. Also we provide a classification report for comparison. " ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " 0 0.63 1.00 0.77 227\n", " 1 0.00 0.00 0.00 133\n", "\n", " micro avg 0.63 0.63 0.63 360\n", " macro avg 0.32 0.50 0.39 360\n", "weighted avg 0.40 0.63 0.49 360\n", "\n" ] } ], "source": [ "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import classification_report\n", "\n", "mlp = KerasClassifier(build_fn=optimal_nn, epochs=100, batch_size=5, verbose=0)\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25)\n", "mlp.fit(X_train,y_train)\n", "\n", "predictions = mlp.predict(X_test)\n", "\n", "print(classification_report(y_test,predictions))" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " 0 0.99 0.77 0.87 640\n", " 1 0.35 0.96 0.51 80\n", "\n", " micro avg 0.79 0.79 0.79 720\n", " macro avg 0.67 0.87 0.69 720\n", "weighted avg 0.92 0.79 0.83 720\n", "\n", "\n", "\n", "[[495 145]\n", " [ 3 77]]\n" ] }, { "ename": "AttributeError", "evalue": "'KerasClassifier' object has no attribute 'save'", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 49\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mconfusion_matrix\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my_test\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mpredictions\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 50\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 51\u001b[1;33m \u001b[0mmlp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msave\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"mlp_classifier.model\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[1;31mAttributeError\u001b[0m: 'KerasClassifier' object has no attribute 'save'" ] } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "import warnings\n", "from keras.models import Sequential\n", "from keras.layers import Dense, Dropout\n", "from keras.optimizers import SGD\n", "from keras.wrappers.scikit_learn import KerasClassifier\n", "from sklearn.model_selection import cross_val_score, cross_val_predict\n", "from sklearn.preprocessing import LabelEncoder, StandardScaler\n", "from sklearn.model_selection import StratifiedKFold\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.metrics import 
confusion_matrix\n", "from keras import regularizers\n", "from imblearn.over_sampling import SMOTE\n", "\n", "df = pd.read_csv(\"NN_training_data.csv\")\n", "\n", "del df[\"Unnamed: 0\"]\n", "del df[\"_time\"]\n", "\n", "dataset = df.values\n", "\n", "X = df[[\"NoB\",\"200\"]]\n", "y = df[\"Target\"]\n", "\n", "mlp = KerasClassifier(build_fn=optimal_nn, epochs=100, batch_size=5, verbose=0)\n", "\n", "from sklearn.model_selection import train_test_split\n", "X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25)\n", "\n", "smt=SMOTE()\n", "X_train, y_train = smt.fit_sample(X_train,y_train)\n", "np.bincount(y_train)\n", "\n", "scaler = StandardScaler()\n", "# Fit only to the training data\n", "scaler.fit(X_train)\n", "\n", "# Now apply the transformations to the data:\n", "X_train = scaler.transform(X_train)\n", "X_test = scaler.transform(X_test)\n", "\n", "mlp.fit(X_train,y_train)\n", "predictions = mlp.predict(X_test)\n", "\n", "from sklearn.metrics import classification_report, confusion_matrix\n", "print(classification_report(y_test,predictions))\n", "print(\"\\n\")\n", "print(confusion_matrix(y_test,predictions))\n", "\n", "mlp.save(\"mlp_classifier.model\")" ] }, { "cell_type": "raw", "metadata": {}, "source": [ "Here we print also the weights and biases of the model." ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train on 2304 samples, validate on 576 samples\n", "Epoch 1/5\n", "2304/2304 [==============================] - 2s 822us/step - loss: 0.3760 - acc: 0.8772 - val_loss: 0.1549 - val_acc: 1.0000\n", "Epoch 2/5\n", "2304/2304 [==============================] - 1s 282us/step - loss: 0.3749 - acc: 0.8772 - val_loss: 0.1309 - val_acc: 1.0000\n", "Epoch 3/5\n", "2304/2304 [==============================] - 1s 295us/step - loss: 0.3744 - acc: 0.8772 - val_loss: 0.1326 - val_acc: 1.0000\n", "Epoch 4/5\n", "2304/2304 [==============================] - 1s 283us/step - loss: 0.3745 - acc: 0.8772 - val_loss: 0.1337 - val_acc: 1.0000\n", "Epoch 5/5\n", "2304/2304 [==============================] - 1s 282us/step - loss: 0.3744 - acc: 0.8772 - val_loss: 0.1336 - val_acc: 1.0000\n", "[array([[-2.9095856e-03, -2.1770846e-03, -1.9934693e-16, 5.5312752e-03,\n", " 1.8497715e-02],\n", " [ 4.6612010e-03, 7.4044587e-03, -8.8255713e-03, 1.1704177e-02,\n", " 1.3716479e-02]], dtype=float32), array([ 0.00893806, 0.00683938, -0.01635896, 0.00989568, 0.01338646],\n", " dtype=float32), array([[ 0.26589656],\n", " [-0.11232933],\n", " [ 0.84088373],\n", " [-0.72920924],\n", " [-0.45201716]], dtype=float32), array([-0.08870715], dtype=float32)]\n" ] } ], "source": [ "import numpy\n", "import pandas as pd\n", "import warnings\n", "from keras.models import Sequential\n", "from keras.layers import Dense, Dropout\n", "from keras.optimizers import SGD\n", "from keras.wrappers.scikit_learn import KerasClassifier\n", "from sklearn.model_selection import cross_val_score\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.model_selection import StratifiedKFold\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.pipeline import Pipeline\n", "from keras import regularizers\n", "from keras.callbacks import EarlyStopping, ModelCheckpoint\n", "import tensorflow as tf\n", "\n", "warnings.filterwarnings(\"ignore\")\n", "\n", "seed = 7\n", "numpy.random.seed(seed)\n", "\n", "df = pd.read_csv(\"NN_training_data.csv\")\n", "\n", "del df[\"Unnamed: 0\"]\n", "del df[\"_time\"]\n", "\n", 
"dataset = df.values\n", "\n", "X = df[[\"NoB\",\"200\"]]\n", "y = df[\"Target\"]\n", "\n", "# Sequential\n", "model = Sequential()\n", "\n", "# Neural network\n", "model.add(Dense(5, input_dim=2, kernel_initializer='normal', activation='tanh', kernel_regularizer=regularizers.l2(0.001)))\n", "# model.add(Dropout(0.2))\n", "model.add(Dense(1, activation='sigmoid', kernel_regularizer=regularizers.l2(0.001)))\n", "\n", "\n", "# Compile model\n", "# sgd = optimizers.SGD(lr=0.01, decay=0.0, momentum=0.0, nesterov=False)\n", "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", "\n", "# callback=tf.keras.callbacks.EarlyStopping(monitor='val_acc', patience=10)\n", "\n", "# Fit model\n", "history = model.fit(X, y, epochs=5, validation_split=0.2, batch_size=5)\n", "\n", "# tf.keras.callbacks.ModelCheckpoint('NN_anomaly_alter.model', monitor='val_acc', verbose=0, save_best_only=False, save_weights_only=False, mode='auto', period=1)\n", "\n", "print(model.get_weights())\n", "\n", "model.save('NN_anomaly_1.model')" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [], "source": [ "import tensorflow as tf\n", "model1 = tf.keras.models.load_model(\"NN_anomaly_nice.model\")\n", "# model2 = tf.keras.models.load_model(\"NN_anomaly_nice.model\")\n", "# model3 = tf.keras.models.load_model(\"NN_anomaly.model\")\n", "X=pd.read_csv(\"01.09.2019_NN_test_data.csv\")\n" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [], "source": [ "prediction1=model1.predict(X)\n", "# prediction2=model2.predict(X1)\n", "# prediction3=model3.predict(X1)" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [], "source": [ "list_1=[]\n", "for i in range(len(prediction1)):\n", " if prediction1[i][0] > 0.5:\n", " list_1.append(i)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\"NN_training_data.csv\")" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0_timeNoB200loginsTarget
00.0Fri Jul 12 00:0014933916970
11.0Fri Jul 12 00:0112938547820
22.0Fri Jul 12 00:0211431607610
33.0Fri Jul 12 00:039534857920
44.0Fri Jul 12 00:042831367390
\n", "
" ], "text/plain": [ " Unnamed: 0 _time NoB 200 logins Target\n", "0 0.0 Fri Jul 12 00:00 149 3391 697 0\n", "1 1.0 Fri Jul 12 00:01 129 3854 782 0\n", "2 2.0 Fri Jul 12 00:02 114 3160 761 0\n", "3 3.0 Fri Jul 12 00:03 95 3485 792 0\n", "4 4.0 Fri Jul 12 00:04 28 3136 739 0" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }