"""Exploratory summary statistics for the Social Network Ads dataset.

Loads ``/data/Social_Network_Ads.csv`` relative to the current working
directory, shuffles the rows, splits the features/target into train and test
sets, and prints overall and per-``Purchased``-group descriptive statistics.
"""
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import trim_mean, kurtosis
from scipy.stats.mstats import mode, gmean, hmean
# from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


def linebreak():
    """Print a separator line to visually split sections of the output."""
    print('\n ============================================== \n')


matplotlib.rcParams['backend'] = 'TkAgg'

# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# 3.8 removed the old names. Prefer the new name when it is available and
# otherwise use the historical one (which raises, as before, if missing).
_style_name = 'seaborn-v0_8-dark-palette'
if _style_name not in plt.style.available:
    _style_name = 'seaborn-dark-palette'
plt.style.use(_style_name)

# The CSV is resolved relative to the directory the script is launched from.
path = os.getcwd()
data_file = '/data/Social_Network_Ads.csv'
df = pd.read_csv(path + data_file)
# df = pd.DataFrame(df)

# Shuffle all rows (frac=1 samples every row) and rebuild a clean 0..n-1 index.
df = df.sample(frac=1).reset_index(drop=True)

print('{} rows. {} cols.'.format(df.shape[0], df.shape[1]))
linebreak()

print(df.iloc[0:10, :])
linebreak()

# Columns used as features throughout the rest of the script.
feature_cols = ['Age', 'EstimatedSalary']

X = df[feature_cols]
y = df['Purchased'].to_frame()

print('X equals:')
print(X.iloc[0:5])
linebreak()

print('y equals:')
print(y[0:5])
linebreak()

# Hold out 33% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# Overall descriptive statistics, without the identifier column.
description = df.describe().drop(columns=['User ID'])

# Everything below is computed per value of the 'Purchased' column.
description_grouped = df.groupby(['Purchased'])

# NOTE: multiple columns must be selected with a *list*; the tuple form
# ``grouped['Age', 'EstimatedSalary']`` was deprecated in pandas 1.0 and
# raises in pandas 2.0.
description_grouped_split = description_grouped[feature_cols]\
    .describe().unstack()

description_grouped_mode = description_grouped['Age'].apply(mode, axis=None)

# 10th / 50th / 90th percentiles per group.
df_quartile_slary = df.groupby('Purchased')['EstimatedSalary']\
    .quantile([.1, .5, .9])
df_quartile_age = df.groupby('Purchased')['Age'].quantile([.1, .5, .9])

# The extra positional argument (.1) is forwarded to trim_mean as the
# proportion to cut from each tail.
df_trimmed_mean = description_grouped[feature_cols].\
    aggregate(trim_mean, .1)

# pandas maps np.median / np.std / np.mean onto its own group-by
# implementations (sample std, ddof=1) and warns that this mapping will go
# away; passing the string names keeps that behaviour and the same column
# labels without the warning.
df_summary = description_grouped[feature_cols]\
    .aggregate(['median', 'std', 'mean', gmean, hmean])

df_var = description_grouped[feature_cols].var()

# Count of missing values per column.
df_null = df.isna().sum()

print(description)
linebreak()

print(description_grouped_split)
linebreak()

print(description_grouped_mode)
linebreak()

print(df_quartile_slary)
print(df_quartile_age)
linebreak()

print(df_trimmed_mean)
linebreak()

print(df_summary)
linebreak()

print(df_var)
linebreak()

print(df_null)