In [1]:
# setup
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

from collections import Counter
from matplotlib.ticker import MaxNLocator
from patsy import dmatrices
from sklearn import linear_model
from statistics import mode
from statsmodels.multivariate.manova import MANOVA
from statsmodels.stats.outliers_influence import variance_inflation_factor
In [2]:
# Read only the credit and debit amount columns from the transactions file.
col_list = ["Credits", "Debit"]
df = pd.read_csv(
    "FCM_642_Raw_Data_10_000_Transactions_clean.csv",
    usecols=col_list,
)
# Preview the first rows as loaded, i.e. before missing values are replaced.
print(df.head(10))
# Replace missing entries with 0 in each of the selected columns.
df = df.fillna(dict.fromkeys(col_list, 0))
# describe() reports count, mean, std, min, quartiles and max per column;
# the 0 fill values are part of every statistic shown here.
print(df.describe())
    Credits     Debit
0   75100.0       NaN
1       NaN  127106.0
2   72300.0       NaN
3  121100.0       NaN
4  339300.0       NaN
5   96500.0       NaN
6   23900.0       NaN
7       NaN  181594.0
8   44100.0       NaN
9   49500.0       NaN
             Credits          Debit
count   10000.000000   10000.000000
mean    40581.300000   64676.300660
std     74308.512526   84180.214463
min         0.000000       0.000000
25%         0.000000       0.000000
50%         0.000000   38007.190000
75%     61100.000000   98686.000000
max    389300.000000  438360.000000
In [7]:
# Create box plots of Credits and Debit. (This cell was previously labelled
# "violin plots", but DataFrame.boxplot draws box-and-whisker plots.)
# NOTE: df is plotted as-is, so if its missing entries were filled with 0
# (as done where it is loaded), those zeros are included in each box.
fig, ax = plt.subplots(figsize=(15, 8))
# Label the axes before drawing so the drawing call remains the cell's last
# expression and the cell's displayed result is still the Axes object.
ax.set_title("Distribution of Credits and Debit")
ax.set_ylabel("Amount")
df.boxplot(ax=ax)
Out[7]:
<AxesSubplot:>
In [6]:
# Reshape df to long form: one row per (column name, value) pair, with the
# original column name stored in 'groups' and the value in 'vals'.
# Then keep only strictly positive values, which drops the 0 entries.
df2 = (
    df.melt(var_name='groups', value_name='vals')
    .loc[lambda long_frame: long_frame["vals"] > 0]
)
print (df2)
# Draw one violin per group on a new 15x8 figure.
plt.figure(figsize=(15,8))
ax = sns.violinplot(data=df2, x="groups", y="vals")
        groups      vals
0      Credits   75100.0
2      Credits   72300.0
3      Credits  121100.0
4      Credits  339300.0
5      Credits   96500.0
...        ...       ...
19816    Debit  117110.0
19817    Debit   26166.0
19818    Debit   38040.0
19819    Debit   90062.0
19820    Debit   78302.0

[10000 rows x 2 columns]
In [ ]: