adding all files done so far

This commit is contained in:
2019-07-10 20:18:31 +01:00
parent 13c0e9cb4d
commit e3ac390e8b
76 changed files with 8644 additions and 0 deletions

View File

@@ -0,0 +1,84 @@
import pandas as pd
import os
import matplotlib
import matplotlib.pyplot as plt
# from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
from scipy.stats import trim_mean, kurtosis
from scipy.stats.mstats import mode, gmean, hmean
def linebreak():
    """Print a visual separator between sections of console output.

    Writes a row of ``=`` characters, preceded and followed by a blank
    line, to standard output. Takes no arguments and returns ``None``.
    """
    print('\n ============================================== \n')
# Request the Tk-based interactive backend for any figures created later.
matplotlib.rcParams['backend'] = 'TkAgg'
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# 3.8 removed the old names, so use whichever spelling this install provides.
_style_name = 'seaborn-dark-palette'
if _style_name not in plt.style.available:
    _style_name = 'seaborn-v0_8-dark-palette'
plt.style.use(_style_name)
# The dataset is looked up relative to the current working directory, so the
# script has to be started from the directory that contains 'data/'.
path = os.getcwd()
data_file = '/data/Social_Network_Ads.csv'
df = pd.read_csv('{}{}'.format(path, data_file))
# df = pd.DataFrame(df)

# Shuffle every row (frac=1 samples the whole frame), then renumber the index
# from zero and discard the old one.
df = df.sample(frac=1)
df = df.reset_index(drop=True)

# Report the frame's dimensions followed by a preview of its first ten rows.
print('{} rows. {} cols.'.format(*df.shape))
linebreak()
print(df.head(10))
linebreak()
# Feature matrix: the two columns used as predictors.
feature_columns = ['Age', 'EstimatedSalary']
X = df[feature_columns]
# Target kept as a single-column DataFrame rather than a Series.
y = df['Purchased'].to_frame()

# Preview the first five rows of the features and of the target.
print('X equals:')
print(X.head(5))
linebreak()
print('y equals:')
print(y.head(5))
linebreak()

# Hold out 33% of the rows for testing; no random_state is set, so the split
# differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
)
# Summary statistics for the whole frame, with the 'User ID' column left out.
description = df.describe().drop(columns=['User ID'])

# Rows grouped by the value of 'Purchased'; reused by the statistics below.
description_grouped = df.groupby(['Purchased'])

# NOTE: several columns are selected from the GroupBy with a *list* (double
# brackets). The tuple form grouped['Age', 'EstimatedSalary'] was deprecated
# in pandas 1.0 and raises ValueError from pandas 2.0 onwards.
description_grouped_split = (
    description_grouped[['Age', 'EstimatedSalary']].describe().unstack()
)

# Mode of 'Age' within each group; axis=None is forwarded to scipy's mode().
description_grouped_mode = description_grouped['Age'].apply(mode, axis=None)

# 10th, 50th and 90th percentiles per group. The 'slary' spelling of the
# variable name is kept in case other code refers to it.
df_quartile_slary = (
    df.groupby('Purchased')['EstimatedSalary'].quantile([.1, .5, .9])
)
df_quartile_age = df.groupby('Purchased')['Age'].quantile([.1, .5, .9])

# Trimmed mean per group; .1 is forwarded to trim_mean as its second
# positional argument (the proportion to cut from each end).
df_trimmed_mean = (
    description_grouped[['Age', 'EstimatedSalary']].aggregate(trim_mean, .1)
)

# Median, standard deviation, and arithmetic, geometric and harmonic means
# per group.
df_summary = description_grouped[['Age', 'EstimatedSalary']].aggregate(
    [np.median, np.std, np.mean, gmean, hmean]
)

# Variance per group.
df_var = description_grouped[['Age', 'EstimatedSalary']].var()

# Number of missing values in each column of the full frame.
df_null = df.isna().sum()
# Each inner tuple is one report section: its tables are printed back to
# back and the section is closed with a separator line.
report_sections = (
    (description,),
    (description_grouped_split,),
    (description_grouped_mode,),
    (df_quartile_slary, df_quartile_age),
    (df_trimmed_mean,),
    (df_summary,),
    (df_var,),
)
for section_tables in report_sections:
    for table in section_tables:
        print(table)
    linebreak()

# The missing-value counts come last and are not followed by a separator.
print(df_null)