# setup
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from collections import Counter
from matplotlib.ticker import MaxNLocator
from patsy import dmatrices
from sklearn import linear_model
from statistics import mode
from statsmodels.multivariate.manova import MANOVA
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Load only the two amount columns from the cleaned transactions export.
col_list = ["Credits", "Debit"]
df = pd.read_csv(
    "FCM_642_Raw_Data_10_000_Transactions_clean.csv",
    usecols=col_list,
)

# Preview the raw rows; missing amounts are still NaN at this point.
print(df.head(10))

# Replace missing amounts with 0 in both columns.
df = df.fillna({name: 0 for name in col_list})

# Summary statistics (count, mean, std, quartiles) computed on the
# zero-filled columns, so the zeros are included in the mean and std.
print(df.describe())
# Output of print(df.head(10)):
#     Credits     Debit
# 0   75100.0       NaN
# 1       NaN  127106.0
# 2   72300.0       NaN
# 3  121100.0       NaN
# 4  339300.0       NaN
# 5   96500.0       NaN
# 6   23900.0       NaN
# 7       NaN  181594.0
# 8   44100.0       NaN
# 9   49500.0       NaN
#
# Output of print(df.describe()):
#              Credits          Debit
# count   10000.000000   10000.000000
# mean    40581.300000   64676.300660
# std     74308.512526   84180.214463
# min         0.000000       0.000000
# 25%         0.000000       0.000000
# 50%         0.000000   38007.190000
# 75%     61100.000000   98686.000000
# max    389300.000000  438360.000000
# Draw a box plot of the Credits and Debit columns on a 15x8 inch figure.
_, box_axes = plt.subplots(figsize=(15, 8))
df.boxplot(ax=box_axes)
# Output: <AxesSubplot:>
# Reshape to long format (one row per amount, labelled by its source column)
# and keep only strictly positive amounts, which drops the zero-filled rows.
df2 = (
    df.melt(var_name="groups", value_name="vals")
    .loc[lambda long_frame: long_frame["vals"] > 0]
)
print(df2)

# Violin plot of the positive amounts, one violin per column.
plt.figure(figsize=(15, 8))
ax = sns.violinplot(data=df2, x="groups", y="vals")
# Output of print(df2):
#         groups      vals
# 0      Credits   75100.0
# 2      Credits   72300.0
# 3      Credits  121100.0
# 4      Credits  339300.0
# 5      Credits   96500.0
# ...        ...       ...
# 19816    Debit  117110.0
# 19817    Debit   26166.0
# 19818    Debit   38040.0
# 19819    Debit   90062.0
# 19820    Debit   78302.0
#
# [10000 rows x 2 columns]