85 lines
2.0 KiB
Python
85 lines
2.0 KiB
Python
import pandas as pd
|
|
import os
|
|
import matplotlib
|
|
import matplotlib.pyplot as plt
|
|
# from sklearn.preprocessing import StandardScaler
|
|
from sklearn.model_selection import train_test_split
|
|
import numpy as np
|
|
from scipy.stats import trim_mean, kurtosis
|
|
from scipy.stats.mstats import mode, gmean, hmean
|
|
|
|
|
|
def linebreak():
    """Print a visual separator between sections of console output.

    Writes a blank line, a row of ``=`` characters, and another blank
    line to standard output. Takes no arguments and returns ``None``.
    """
    print('\n ============================================== \n')
|
|
|
|
|
|
# Request the Tk-based interactive backend for any figures drawn later.
matplotlib.rcParams['backend'] = 'TkAgg'

# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# 3.8 removed the old names, so use whichever spelling this install provides.
plt.style.use(
    'seaborn-dark-palette'
    if 'seaborn-dark-palette' in plt.style.available
    else 'seaborn-v0_8-dark-palette'
)
|
|
|
|
# The CSV is looked up under the current working directory. `data_file`
# carries its own leading slash, so the two parts are simply concatenated.
path = os.getcwd()
data_file = '/data/Social_Network_Ads.csv'

df = pd.read_csv(''.join([path, data_file]))

# Draw every row in random order, then renumber the index from zero.
df = df.sample(frac=1).reset_index(drop=True)

# Report the size of the loaded table.
print('{} rows. {} cols.'.format(*df.shape))
|
|
|
|
linebreak()
# Preview the first ten rows of the frame.
print(df.head(10))

linebreak()
# Features are the two predictor columns; the target is kept as a
# one-column DataFrame rather than a Series.
X = df[['Age', 'EstimatedSalary']]
y = df[['Purchased']]

print('X equals:')
print(X.head())
linebreak()
print('y equals:')
print(y.head())
linebreak()

# Hold out 33% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
)
|
|
|
|
# Summary statistics for the whole table, with the 'User ID' column dropped
# from the result.
description = df.describe().drop(columns=['User ID'])

# Rows grouped by the 'Purchased' label; reused by the summaries below.
description_grouped = df.groupby(['Purchased'])

# NOTE: several columns must be selected from a GroupBy with a *list*.
# The tuple form grouped['Age', 'EstimatedSalary'] was deprecated in
# pandas 1.0 and raises ValueError from pandas 2.0 onwards.
description_grouped_split = description_grouped[['Age', 'EstimatedSalary']]\
    .describe().unstack()

# scipy.stats.mstats.mode applied to the 'Age' values of each group
# (axis=None is forwarded to `mode`).
description_grouped_mode = description_grouped['Age'].apply(mode, axis=None)

# 10th / 50th / 90th percentiles per group.
# (The spelling "slary" is kept because the report section prints this name.)
df_quartile_slary = df.groupby('Purchased')['EstimatedSalary']\
    .quantile([.1, .5, .9])
df_quartile_age = df.groupby('Purchased')['Age'].quantile([.1, .5, .9])

# Trimmed mean per group; .1 is forwarded to trim_mean as `proportiontocut`.
df_trimmed_mean = description_grouped[['Age', 'EstimatedSalary']]\
    .aggregate(trim_mean, .1)

# Median, standard deviation, and arithmetic / geometric / harmonic means.
# The pandas string names replace np.median / np.std / np.mean: pandas
# already mapped those callables to these built-ins (so results and column
# labels are unchanged), and passing the NumPy functions now triggers a
# FutureWarning about that mapping going away.
df_summary = description_grouped[['Age', 'EstimatedSalary']]\
    .aggregate(['median', 'std', 'mean', gmean, hmean])

# Variance per group.
df_var = description_grouped[['Age', 'EstimatedSalary']].var()

# Count of missing values in each column.
df_null = df.isna().sum()
|
|
|
|
# Each inner tuple is one report section; tables within a section are printed
# back to back, and a separator is printed between consecutive sections.
_report_sections = (
    (description,),
    (description_grouped_split,),
    (description_grouped_mode,),
    (df_quartile_slary, df_quartile_age),
    (df_trimmed_mean,),
    (df_summary,),
    (df_var,),
    (df_null,),
)

for _position, _section in enumerate(_report_sections):
    # No separator before the first section (and none after the last).
    if _position > 0:
        linebreak()
    for _table in _section:
        print(_table)
|