{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# ANOMALY DETECTION ALGORITHM " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## From unsupervised to supervised learning" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let df denote a dataframe with two features; number of bets per minute and 200's. We first fit that data into a probability distribution. After that we detect the outliers and finally we replace the extreme values (like load tests, big events etc.) with an average value. This mitigates the noise of our data. " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import itertools\n", "import logging\n", "import warnings\n", "import scipy.stats as st\n", "import statsmodels as sm\n", "import math\n", "# import urllib3\n", "from statistics import mean\n", "from scipy.stats import *\n", "from splunk_hec_handler import SplunkHecHandler\n", "from datetime import datetime, timedelta\n", "\n", "%matplotlib inline\n", "\n", "df = pd.read_csv(\"sportsbook_NN_data_spare.csv\")\n", "\n", "# urllib3.disable_warnings()\n", "\n", "# Create models from data\n", "\n", "def best_fit_distribution(data, bins=200, ax=None):\n", " \"\"\"Model data by finding best fit distribution to data\"\"\"\n", " # Get histogram of original data\n", " y, x = np.histogram(data, bins=bins, density=True)\n", " x = (x + np.roll(x, -1))[:-1] / 2.0\n", "\n", " # Distributions to check\n", " DISTRIBUTIONS = [ \n", " st.alpha,st.anglit,st.arcsine,st.beta,st.betaprime,st.bradford,st.burr,st.cauchy,st.chi,st.chi2,st.cosine,\n", " st.dgamma,st.dweibull,st.erlang,st.expon,st.exponnorm,st.exponweib,st.exponpow,st.f,st.fatiguelife,st.fisk,\n", " st.foldcauchy,st.foldnorm,st.genlogistic,st.genpareto,st.gennorm,st.genexpon,\n", " st.genextreme,st.gausshyper,st.gamma,st.gengamma,st.genhalflogistic,st.gilbrat,st.gompertz,st.gumbel_r,\n", " st.gumbel_l,st.halfcauchy,st.halflogistic\n", " ]\n", "\n", "\n", " # Best holders\n", " best_distribution = st.norm\n", " best_params = (0.0, 1.0)\n", " best_sse = np.inf\n", "\n", " \n", " list_s = []\n", " # Estimate distribution parameters from data\n", " for distribution in DISTRIBUTIONS:\n", "\n", " # Try to fit the distribution\n", " try:\n", " # Ignore warnings from data that can't be fit\n", " with warnings.catch_warnings():\n", " warnings.filterwarnings('ignore')\n", "\n", " # fit dist to data\n", " params = distribution.fit(data)\n", "\n", " # Separate parts of parameters\n", " arg = params[:-2]\n", " loc = params[-2]\n", " scale = params[-1]\n", "\n", " # Calculate fitted PDF and error with fit in distribution\n", " pdf = distribution.pdf(x, loc=loc, scale=scale, *arg)\n", " sse = np.sum(np.power(y - pdf, 2.0))\n", "\n", " # if axis pass in add to plot\n", " try:\n", " if ax:\n", " pd.Series(pdf, x).plot(ax=ax)\n", " end\n", " except Exception:\n", " pass\n", "\n", " # identify if this distribution is better\n", " if best_sse > sse > 0:\n", " best_distribution = distribution\n", " best_params = params\n", " best_sse = sse\n", "\n", " except Exception:\n", " pass\n", " \n", " list_s.append((distribution.name,sse))\n", "\n", " z, w = zip(*list_s)\n", " \n", " data_frame = pd.DataFrame({'Dist': z, 'Error': w})\n", " print(data_frame.to_string())\n", " print(\"\\n\")\n", " print((list(z)[list(w).index(min(list(w)))],min(list(w))))\n", " print(\"\\n\")\n", " return (best_distribution.name, best_params)\n", "\n", "def make_pdf(dist, params, 
size=10000):\n", " \"\"\"Generate distributions's Probability Distribution Function \"\"\"\n", "\n", " # Separate parts of parameters\n", " arg = params[:-2]\n", " loc = params[-2]\n", " scale = params[-1]\n", "\n", " # Get sane start and end points of distribution\n", " start = dist.ppf(0.01, *arg, loc=loc, scale=scale) if arg else dist.ppf(0.01, loc=loc, scale=scale)\n", " end = dist.ppf(0.99, *arg, loc=loc, scale=scale) if arg else dist.ppf(0.99, loc=loc, scale=scale)\n", "\n", " # Build PDF and turn into pandas Series\n", " x = np.linspace(start, end, size)\n", " y = dist.pdf(x, loc=loc, scale=scale, *arg)\n", " pdf = pd.Series(y, x)\n", "\n", " return pdf\n", "\n", "# Load data from statsmodels datasets\n", "data = pd.Series(df[\"NoB\"]).iloc[-1440:]\n", "\n", "# Find best fit distribution\n", "best_fit_name, best_fit_params = best_fit_distribution(data, 200) \n", "best_dist = getattr(st, best_fit_name)\n", "\n", "# Make PDF with best params \n", "pdf = make_pdf(best_dist, best_fit_params)\n", "\n", "######################################################### Second Part ########################################################## \n", "\n", "# the code below disregards the ouliers in 200's; there are discrete cases where huge peaks appear (no-obvious reason) \n", "q = df[\"count_200\"].quantile(0.90)\n", "\n", "index_list_quantile = []\n", "\n", "for j in range(len(df[\"count_200\"])):\n", " if df[\"count_200\"][j] >= int(q):\n", " index_list_quantile.append(j)\n", "\n", "df[\"count_200\"].replace(df[\"count_200\"][index_list_quantile], df[\"count_200\"].mean(), inplace=True) \n", "\n", "# we define our main frame with Sportsbook Web number of bets per minute over some period of Time (here is 4 months approx.)\n", "main_df = df[[\"_time\", \"NoB\"]]\n", "\n", "Time_list = []\n", "dist = best_dist\n", "\n", "# takes the last day of the sample and creates a list with all minutes. 
\n", "# In principle we must be able to use epoch Time for constant data flow.\n", "for i in main_df[\"_time\"]: \n", " Time_list.append(i)\n", " \n", "# Time_ = map(lambda x : x.split(\" \")[1], Time_list)\n", "Time = Time_list \n", "\n", "# we enumerate the Time-list so that we can exploit previous elements\n", "\n", "listTime =[]\n", "\n", "for i in range(len(Time)):\n", " listTime.append((Time[i],i))\n", " \n", "# we split the listTime in to distinct lists for later use; j_1 contains \"%H:%M\", j_2 contains range(0,1441)\n", "J_1, J_2 = zip(*listTime)\n", "j_1 = list(J_1)\n", "j_2 = list(J_2)\n", "\n", "for i in range(len(j_1)):\n", " j_1[i] = j_1[i].split()[3]\n", " \n", "j_1 = j_1[:1440] \n", "\n", "def series(j):\n", " return list(main_df[main_df[\"_time\"].str.contains(j)][\"NoB\"])\n", "\n", "def fun_200(j):\n", " return list(df[df[\"_time\"].str.contains(j)][\"count_200\"].apply(lambda x : int(x)))\n", "\n", "def time(j): \n", " return list(main_df[main_df[\"_time\"].str.contains(j)][\"_time\"])\n", "\n", "def perc(mean,y): \n", " return (100*(y-mean))/mean\n", "\n", "list_character=[]\n", "list_char=[]\n", "list_1=[]\n", "list_2=[]\n", "list_3=[]\n", "\n", "for j in range(len(j_1)):\n", " \n", " mean_j = mean(series(j_1[j])[:-1])\n", " list_1.append(series(j_1[j])[-1])\n", " list_2.append(fun_200(j_1[j])[-1])\n", " list_3.append(time(j_1[j])[-1])\n", " \n", " if (perc(mean_j, series(j_1[j])[-1]) <= -20): \n", " \n", " if (max(series(j_1[j])) < best_dist.interval(0.95, *best_fit_params)[1]): \n", " \n", " if (perc(mean_j, series(j_1[j])[-1]) <= -70) & (perc(mean(series(j_1[j-1])), series(j_1[j-1])[-1]) <= -70) & (perc(mean(series(j_1[j-2])), series(j_1[j-2])[-1]) <= -70):\n", " x1 = \"High risk. The percentage drop is lower than {}% for more than 3 minutes.\".format(j_1[j])\n", " list_char.append(x1)\n", " print(x_1)\n", " \n", " \n", " elif (perc(mean(fun_200(j_1[j])[:-1]),fun_200(j_1[j])[-1]) < -20): \n", " x2 = \"There is a {} % drop at {} o'clock and {} drop at 200's\".format(math.ceil(perc(mean_j,series(j_1[j])[-1])),j_1[j],math.ceil(perc(mean(fun_200(j_1[j])[:-1]),fun_200(j_1[j])[-1])))\n", " list_char.append(x2)\n", " print(x2)\n", " \n", " else:\n", " print(\"{} We have a normal decrease.\".format(j_1[j])) \n", " else:\n", " if ((max(series(j_1[j-1])) > best_dist.interval(0.95, *best_fit_params)[1]) |\n", " (max(series(j_1[j-2])) > best_dist.interval(0.95, *best_fit_params)[1])):\n", " x3 = \"{} Low Risk\".format(j_1[j])\n", " list_char.append(x3)\n", " print(x3)\n", " \n", " else: \n", " x4 = \"{} No problem with the channel\".format(j_1[j])\n", " list_char.append(x4)\n", " print(x4)\n", " \n", " if 'drop' in list_char[-1]:\n", " list_character.append(1)\n", " else:\n", " list_character.append(0)\n", " \n", "df_ = pd.DataFrame({'_time':list_3, 'NoB':list_1, '200':list_2})\n", "df_train = pd.concat([df_.reset_index(drop=True),pd.DataFrame({'Characters':list_character}).reset_index(drop=True)], axis=1)\n", "\n", "df_train.to_csv('NN_training_data.csv', sep=',') " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Anomaly detection - Neural Network." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The first attempt is to use a simple logistic regression model;\n", "although it performs (unexpectedly) well, regression cannot self-learn. \n", "Therefore we move to more complex solutions." 
] }, { "cell_type": "code", "execution_count": 2, "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using TensorFlow backend.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "0.7993055555555556\n", "\n", "\n", " precision recall f1-score support\n", "\n", " 0 0.99 0.80 0.88 654\n", " 1 0.31 0.92 0.47 66\n", "\n", " micro avg 0.81 0.81 0.81 720\n", " macro avg 0.65 0.86 0.67 720\n", "weighted avg 0.93 0.81 0.84 720\n", "\n", "\n", "\n", "[[520 134]\n", " [ 5 61]]\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\dtomlinson\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", " FutureWarning)\n" ] } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import classification_report, confusion_matrix\n", "from imblearn.over_sampling import SMOTE\n", "\n", "df = pd.read_csv(\"NN_training_data.csv\")\n", "del df[\"Unnamed: 0\"]\n", "del df[\"_time\"]\n", "\n", "X = df[[\"NoB\",\"200\"]]\n", "y = df[\"Target\"]\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)\n", "\n", "smt = SMOTE()\n", "X_train, y_train = smt.fit_sample(X_train, y_train)\n", "\n", "log_reg = LogisticRegression()\n", "log_reg.fit(X_train, y_train)\n", "predictions=log_reg.predict(X_test)\n", "print(log_reg.score(X,y))\n", "print(\"\\n\")\n", "print(classification_report(y_test,predictions))\n", "print(\"\\n\")\n", "print(confusion_matrix(y_test,predictions))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We implement the MLPClassifier from scikit-learn. \n", "This is the neural network analogue of tensorflow in sklearn.\n", "It doesn't have the same functionality, thus if we don't hit \n", "a high score we won't use it further alone. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We implement other libraries from the sklearn deep learning toolkit. \n", "In the cell below we use also Stohastic Gradient Descent and Gradient \n", "Boosting classifier." 
] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.6763888888888889\n", "\n", "\n", "[[223 22]\n", " [ 10 105]]\n", "\n", "\n", " precision recall f1-score support\n", "\n", " 0 0.96 0.91 0.93 245\n", " 1 0.83 0.91 0.87 115\n", "\n", " micro avg 0.91 0.91 0.91 360\n", " macro avg 0.89 0.91 0.90 360\n", "weighted avg 0.92 0.91 0.91 360\n", "\n", "0.6763888888888889\n", "\n", "\n", "0.6763888888888889\n" ] } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "from sklearn.ensemble import GradientBoostingClassifier\n", "from sklearn.linear_model import SGDClassifier\n", "\n", "df = pd.read_csv(\"NN_training_data.csv\")\n", "del df[\"Unnamed: 0\"]\n", "del df[\"_time\"]\n", "\n", "X = df[[\"NoB\",\"200\"]]\n", "y = df[\"Target\"]\n", "\n", "from sklearn.preprocessing import StandardScaler\n", "scaler = StandardScaler()\n", "# Fit only to the training data\n", "scaler.fit(X_train)\n", "\n", "# Now apply the transformations to the data:\n", "X_train = scaler.transform(X_train)\n", "X_test = scaler.transform(X_test)\n", "\n", "from sklearn.neural_network import MLPClassifier\n", "from imblearn.over_sampling import SMOTE\n", "\n", "mlp = MLPClassifier(hidden_layer_sizes=(5,5), activation=\"relu\")\n", "\n", "mlp.fit(X_train,y_train)\n", "\n", "predictions = mlp.predict(X_test)\n", "\n", "print(mlp.score(X,y))\n", "print(\"\\n\")\n", "print(confusion_matrix(y_test,predictions))\n", "print(\"\\n\")\n", "print(classification_report(y_test,predictions))\n", "\n", "# mlp.coefs_\n", "# mlp.intercepts_\n", "\n", "model = GradientBoostingClassifier()\n", "model.fit(X_train, y_train)\n", "model.predict(X_test)\n", "\n", "print(model.score(X,y))\n", "print(\"\\n\")\n", "\n", "\n", "stohastic_model = SGDClassifier(loss=\"hinge\", penalty=\"l2\", max_iter=5)\n", "stohastic_model.fit(X_train, y_train)\n", "stohastic_model.predict(X_test)\n", "\n", "print(stohastic_model.score(X,y))\n", "\n", "# print(confusion_matrix(y_test,predictions))\n", "# print(\"\\n\")\n", "# print(classification_report(y_test,predictions))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Neural Network - Keras/Pandas/Numpy" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The first cell contains the baseline of the neural network. We \n", "only feed a neural network of sixty-neurons in the hidden leayer \n", "to check the accuracy generically. The second cell consists a \n", "Standardized version of the first. There is no other difference. \n", "Nevertheless with that small change we hit a better result (thus \n", "we keep that change). \n", "\n", "The third cell computationally is more efficient than the other \n", "two; we use a thirty-layer neural network (reduce it by half) and \n", "the width of the neural network smaller." 
] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Results: 69.36% (21.95%)\n" ] } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "import warnings\n", "from keras.models import Sequential\n", "from keras.layers import Dense\n", "from keras.wrappers.scikit_learn import KerasClassifier\n", "from sklearn.model_selection import cross_val_score\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.model_selection import StratifiedKFold\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.pipeline import Pipeline\n", "\n", "warnings.filterwarnings(\"ignore\")\n", "\n", "seed = 7\n", "numpy.random.seed(seed)\n", "\n", "df = pd.read_csv(\"NN_training_data.csv\")\n", "\n", "del df[\"Unnamed: 0\"]\n", "del df[\"_time\"]\n", "\n", "dataset = df.values\n", "\n", "X = df[[\"NoB\",\"200\"]]\n", "y = df[\"Target\"]\n", "\n", "# baseline model\n", "def create_baseline():\n", "# create model\n", " model = Sequential()\n", " model.add(Dense(60, input_dim=2, kernel_initializer='normal', activation='relu'))\n", " model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))\n", "# Compile model\n", " model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", " return model\n", "\n", "# evaluate model with standardized dataset\n", "estimator = KerasClassifier(build_fn=create_baseline, epochs=100, batch_size=5, verbose=0)\n", "kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)\n", "results = cross_val_score(estimator, X, y, cv=kfold)\n", "print(\"Results: %.2f%% (%.2f%%)\" % (results.mean()*100, results.std()*100))" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Standardized: 91.11% (3.06%)\n" ] } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "import warnings\n", "from keras.models import Sequential\n", "from keras.layers import Dense\n", "from keras.wrappers.scikit_learn import KerasClassifier\n", "from sklearn.model_selection import cross_val_score\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.model_selection import StratifiedKFold\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.pipeline import Pipeline\n", "\n", "warnings.filterwarnings(\"ignore\")\n", "\n", "seed = 7\n", "numpy.random.seed(seed)\n", "\n", "df = pd.read_csv(\"NN_training_data.csv\")\n", "\n", "del df[\"Unnamed: 0\"]\n", "del df[\"_time\"]\n", "\n", "dataset = df.values\n", "\n", "X = df[[\"NoB\",\"200\"]]\n", "y = df[\"Target\"]\n", "\n", "estimators = []\n", "estimators.append(('standardize', StandardScaler()))\n", "estimators.append(('mlp', KerasClassifier(build_fn=create_baseline, epochs=100, batch_size=5, verbose=0)))\n", "pipeline = Pipeline(estimators)\n", "kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)\n", "results = cross_val_score(pipeline, X, y, cv=kfold)\n", "print(\"Standardized: %.2f%% (%.2f%%)\" % (results.mean()*100, results.std()*100))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This is the most efficient NN we constructed so far. It's not \"very\" deep; fact that reflects the low complexity of our data. Nevertheless it takes ~5 minutes (when n_splits of cross_val_score has been set to 10) to evaluate the mean precision of the model." 
] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using TensorFlow backend.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "WARNING:tensorflow:From C:\\Users\\dtomlinson\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\tensorflow\\python\\framework\\op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Colocations handled automatically by placer.\n", "WARNING:tensorflow:From C:\\Users\\dtomlinson\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\tensorflow\\python\\ops\\math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Use tf.cast instead.\n", "Smaller Standardized NN: 91.11% (0.27%)\n", "[[2597 0]\n", " [ 245 38]]\n" ] } ], "source": [ "import numpy\n", "import pandas as pd\n", "import warnings\n", "from keras.models import Sequential\n", "from keras.layers import Dense, Dropout\n", "from keras.optimizers import SGD\n", "from keras.wrappers.scikit_learn import KerasClassifier\n", "from sklearn.model_selection import cross_val_score, cross_val_predict\n", "from sklearn.preprocessing import LabelEncoder, StandardScaler\n", "from sklearn.model_selection import StratifiedKFold\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.metrics import confusion_matrix\n", "from keras import regularizers\n", "\n", "warnings.filterwarnings(\"ignore\")\n", "\n", "seed = 7\n", "numpy.random.seed(seed)\n", "\n", "df = pd.read_csv(\"NN_training_data.csv\")\n", "\n", "del df[\"Unnamed: 0\"]\n", "del df[\"_time\"]\n", "\n", "dataset = df.values\n", "\n", "X = df[[\"NoB\",\"200\"]]\n", "y = df[\"Target\"]\n", "\n", "# smaller model\n", "def optimal_nn():\n", "# create model\n", " model = Sequential()\n", " model.add(Dense(5, input_dim=2, kernel_initializer='normal', activation='relu', kernel_regularizer=regularizers.l2(0.001)))\n", " model.add(Dense(5, activation='relu', kernel_regularizer=regularizers.l2(0.001)))\n", " model.add(Dense(5, activation='relu', kernel_regularizer=regularizers.l2(0.001)))\n", " model.add(Dense(1, activation='sigmoid', kernel_regularizer=regularizers.l2(0.001)))\n", "# Compile model\n", " model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", " return model\n", " \n", "estimators = []\n", "estimators.append(('standardize', StandardScaler()))\n", "estimators.append(('mlp', KerasClassifier(build_fn=optimal_nn, epochs=50, batch_size=5, verbose=0)))\n", "pipeline = Pipeline(estimators)\n", "kfold = StratifiedKFold(n_splits=2, shuffle=True, random_state=seed)\n", "results = cross_val_score(pipeline, X, y, cv=kfold)\n", "\n", "\n", "\n", "# CAUTION: the y_pred here does not necessarily return the same predictions with the results above. \n", "# Generally, cross_val_score and cross_predict form two different scikit_learn functions, therefore\n", "# obey their own rules; their difference is mainly how the data is split in batches.\n", "# Nevertheless, it forms an approximation of the cross_val_score. Remember, cross_val_predict fits\n", "# each data sample EXACTLY ONCE in a test set and the confusion_matrix returns the prediction for this \n", "# data sample that given time (hence explains why we have: len(y_pred)=len(y)=1440). 
\n", "\n", "y_pred = cross_val_predict(pipeline, X, y, cv=kfold)\n", "conf_mat = confusion_matrix(y, y_pred)\n", "print(\"Smaller Standardized NN: %.2f%% (%.2f%%)\" % (results.mean()*100, results.std()*100))\n", "print(conf_mat)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "VERY Interesting fact : Without using the StandardScaler() function our model is really poor. In particular, it hits 63% percent without the StandardScaler() and furthermore it doesn't recognize the y-values equal to 1 (i.e., when there is a problem); in other words it's not working. Below the first cell contains the non-standardized version and the second when StandardScaler() is considered. Also we provide a classification report for comparison. " ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " 0 0.63 1.00 0.77 227\n", " 1 0.00 0.00 0.00 133\n", "\n", " micro avg 0.63 0.63 0.63 360\n", " macro avg 0.32 0.50 0.39 360\n", "weighted avg 0.40 0.63 0.49 360\n", "\n" ] } ], "source": [ "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import classification_report\n", "\n", "mlp = KerasClassifier(build_fn=optimal_nn, epochs=100, batch_size=5, verbose=0)\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25)\n", "mlp.fit(X_train,y_train)\n", "\n", "predictions = mlp.predict(X_test)\n", "\n", "print(classification_report(y_test,predictions))" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " 0 0.99 0.77 0.87 640\n", " 1 0.35 0.96 0.51 80\n", "\n", " micro avg 0.79 0.79 0.79 720\n", " macro avg 0.67 0.87 0.69 720\n", "weighted avg 0.92 0.79 0.83 720\n", "\n", "\n", "\n", "[[495 145]\n", " [ 3 77]]\n" ] }, { "ename": "AttributeError", "evalue": "'KerasClassifier' object has no attribute 'save'", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 49\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mconfusion_matrix\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my_test\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mpredictions\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 50\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 51\u001b[1;33m \u001b[0mmlp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msave\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"mlp_classifier.model\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[1;31mAttributeError\u001b[0m: 'KerasClassifier' object has no attribute 'save'" ] } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "import warnings\n", "from keras.models import Sequential\n", "from keras.layers import Dense, Dropout\n", "from keras.optimizers import SGD\n", "from keras.wrappers.scikit_learn import KerasClassifier\n", "from sklearn.model_selection import cross_val_score, cross_val_predict\n", "from sklearn.preprocessing import LabelEncoder, StandardScaler\n", "from sklearn.model_selection import StratifiedKFold\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.metrics import 
confusion_matrix\n", "from keras import regularizers\n", "from imblearn.over_sampling import SMOTE\n", "\n", "df = pd.read_csv(\"NN_training_data.csv\")\n", "\n", "del df[\"Unnamed: 0\"]\n", "del df[\"_time\"]\n", "\n", "dataset = df.values\n", "\n", "X = df[[\"NoB\",\"200\"]]\n", "y = df[\"Target\"]\n", "\n", "mlp = KerasClassifier(build_fn=optimal_nn, epochs=100, batch_size=5, verbose=0)\n", "\n", "from sklearn.model_selection import train_test_split\n", "X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25)\n", "\n", "smt=SMOTE()\n", "X_train, y_train = smt.fit_sample(X_train,y_train)\n", "np.bincount(y_train)\n", "\n", "scaler = StandardScaler()\n", "# Fit only to the training data\n", "scaler.fit(X_train)\n", "\n", "# Now apply the transformations to the data:\n", "X_train = scaler.transform(X_train)\n", "X_test = scaler.transform(X_test)\n", "\n", "mlp.fit(X_train,y_train)\n", "predictions = mlp.predict(X_test)\n", "\n", "from sklearn.metrics import classification_report, confusion_matrix\n", "print(classification_report(y_test,predictions))\n", "print(\"\\n\")\n", "print(confusion_matrix(y_test,predictions))\n", "\n", "mlp.save(\"mlp_classifier.model\")" ] }, { "cell_type": "raw", "metadata": {}, "source": [ "Here we print also the weights and biases of the model." ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train on 2304 samples, validate on 576 samples\n", "Epoch 1/5\n", "2304/2304 [==============================] - 2s 822us/step - loss: 0.3760 - acc: 0.8772 - val_loss: 0.1549 - val_acc: 1.0000\n", "Epoch 2/5\n", "2304/2304 [==============================] - 1s 282us/step - loss: 0.3749 - acc: 0.8772 - val_loss: 0.1309 - val_acc: 1.0000\n", "Epoch 3/5\n", "2304/2304 [==============================] - 1s 295us/step - loss: 0.3744 - acc: 0.8772 - val_loss: 0.1326 - val_acc: 1.0000\n", "Epoch 4/5\n", "2304/2304 [==============================] - 1s 283us/step - loss: 0.3745 - acc: 0.8772 - val_loss: 0.1337 - val_acc: 1.0000\n", "Epoch 5/5\n", "2304/2304 [==============================] - 1s 282us/step - loss: 0.3744 - acc: 0.8772 - val_loss: 0.1336 - val_acc: 1.0000\n", "[array([[-2.9095856e-03, -2.1770846e-03, -1.9934693e-16, 5.5312752e-03,\n", " 1.8497715e-02],\n", " [ 4.6612010e-03, 7.4044587e-03, -8.8255713e-03, 1.1704177e-02,\n", " 1.3716479e-02]], dtype=float32), array([ 0.00893806, 0.00683938, -0.01635896, 0.00989568, 0.01338646],\n", " dtype=float32), array([[ 0.26589656],\n", " [-0.11232933],\n", " [ 0.84088373],\n", " [-0.72920924],\n", " [-0.45201716]], dtype=float32), array([-0.08870715], dtype=float32)]\n" ] } ], "source": [ "import numpy\n", "import pandas as pd\n", "import warnings\n", "from keras.models import Sequential\n", "from keras.layers import Dense, Dropout\n", "from keras.optimizers import SGD\n", "from keras.wrappers.scikit_learn import KerasClassifier\n", "from sklearn.model_selection import cross_val_score\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.model_selection import StratifiedKFold\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.pipeline import Pipeline\n", "from keras import regularizers\n", "from keras.callbacks import EarlyStopping, ModelCheckpoint\n", "import tensorflow as tf\n", "\n", "warnings.filterwarnings(\"ignore\")\n", "\n", "seed = 7\n", "numpy.random.seed(seed)\n", "\n", "df = pd.read_csv(\"NN_training_data.csv\")\n", "\n", "del df[\"Unnamed: 0\"]\n", "del df[\"_time\"]\n", "\n", 
"dataset = df.values\n", "\n", "X = df[[\"NoB\",\"200\"]]\n", "y = df[\"Target\"]\n", "\n", "# Sequential\n", "model = Sequential()\n", "\n", "# Neural network\n", "model.add(Dense(5, input_dim=2, kernel_initializer='normal', activation='tanh', kernel_regularizer=regularizers.l2(0.001)))\n", "# model.add(Dropout(0.2))\n", "model.add(Dense(1, activation='sigmoid', kernel_regularizer=regularizers.l2(0.001)))\n", "\n", "\n", "# Compile model\n", "# sgd = optimizers.SGD(lr=0.01, decay=0.0, momentum=0.0, nesterov=False)\n", "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", "\n", "# callback=tf.keras.callbacks.EarlyStopping(monitor='val_acc', patience=10)\n", "\n", "# Fit model\n", "history = model.fit(X, y, epochs=5, validation_split=0.2, batch_size=5)\n", "\n", "# tf.keras.callbacks.ModelCheckpoint('NN_anomaly_alter.model', monitor='val_acc', verbose=0, save_best_only=False, save_weights_only=False, mode='auto', period=1)\n", "\n", "print(model.get_weights())\n", "\n", "model.save('NN_anomaly_1.model')" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [], "source": [ "import tensorflow as tf\n", "model1 = tf.keras.models.load_model(\"NN_anomaly_nice.model\")\n", "# model2 = tf.keras.models.load_model(\"NN_anomaly_nice.model\")\n", "# model3 = tf.keras.models.load_model(\"NN_anomaly.model\")\n", "X=pd.read_csv(\"01.09.2019_NN_test_data.csv\")\n" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [], "source": [ "prediction1=model1.predict(X)\n", "# prediction2=model2.predict(X1)\n", "# prediction3=model3.predict(X1)" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [], "source": [ "list_1=[]\n", "for i in range(len(prediction1)):\n", " if prediction1[i][0] > 0.5:\n", " list_1.append(i)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\"NN_training_data.csv\")" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0_timeNoB200loginsTarget
00.0Fri Jul 12 00:0014933916970
11.0Fri Jul 12 00:0112938547820
22.0Fri Jul 12 00:0211431607610
33.0Fri Jul 12 00:039534857920
44.0Fri Jul 12 00:042831367390
\n", "
" ], "text/plain": [ " Unnamed: 0 _time NoB 200 logins Target\n", "0 0.0 Fri Jul 12 00:00 149 3391 697 0\n", "1 1.0 Fri Jul 12 00:01 129 3854 782 0\n", "2 2.0 Fri Jul 12 00:02 114 3160 761 0\n", "3 3.0 Fri Jul 12 00:03 95 3485 792 0\n", "4 4.0 Fri Jul 12 00:04 28 3136 739 0" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }