In [22]:
# setup
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

from patsy import dmatrices
from sklearn import linear_model
from statistics import mode
from statsmodels.multivariate.manova import MANOVA
from statsmodels.stats.outliers_influence import variance_inflation_factor
In [23]:
# read data from source, keeping only the columns used in the analysis
col_list = ["timestamp", "company", "title", "gender", "totalyearlycompensation", "location", "cityid", "yearsofexperience", "yearsatcompany"]
df = pd.read_csv("../DSC503-StatisticalMethods/stemSalaries.csv", usecols=col_list)

# keep rows with positive compensation and a non-null gender value,
# then restrict gender to Male/Female entries; show a sample of the result
keep = (df["totalyearlycompensation"] > 0) & df["gender"].notna()
df = df[keep]
df = df[df["gender"].str.contains(r'Male|Female')]
print(df.head(10))
          timestamp         company              title  \
0     2/8/2020 8:44          Amazon  Software Engineer   
1    8/16/2019 8:42          Amazon  Software Engineer   
2    5/17/2021 9:12          Amazon  Software Engineer   
3   9/14/2019 13:41            Uber  Software Engineer   
4    9/16/2019 8:52            Uber  Software Engineer   
6    12/4/2020 9:06            Uber  Software Engineer   
7     8/1/2021 4:57            Uber  Software Engineer   
9    3/23/2021 6:07         Grubhub  Software Engineer   
10   6/4/2021 23:55          Andela  Software Engineer   
11   3/20/2020 2:29  DXC Technology  Software Engineer   

    totalyearlycompensation                 location  yearsofexperience  \
0                    109000      Aachen, NW, Germany                3.0   
1                    112000      Aachen, NW, Germany                5.0   
2                    114000      Aachen, NW, Germany                4.0   
3                    123000      Aarhus, AR, Denmark                3.0   
4                    200000      Aarhus, AR, Denmark                3.0   
6                    376000      Aarhus, AR, Denmark                4.0   
7                    418000      Aarhus, AR, Denmark               10.0   
9                    173000             Abingdon, MD                8.0   
10                    66000         Accra, AA, Ghana                5.0   
11                    49000  Adelaide, SA, Australia                4.0   

    yearsatcompany gender  cityid  
0              1.0   Male    3645  
1              0.0   Male    3645  
2              0.0   Male    3645  
3              1.0   Male   12962  
4              3.0   Male   12962  
6              4.0   Male   12962  
7              0.0   Male   12962  
9              4.0   Male   14910  
10             2.0   Male    3949  
11             4.0   Male    1312  
In [24]:
# use describe to get count/mean/std/quantiles for the numeric columns
print(df.describe())

# report the mode (statistics.mode) and sample variance (ddof=1) of each numeric
# column; a loop replaces the eight copy-pasted print statements while producing
# byte-identical output lines
numeric_cols = ["totalyearlycompensation", "cityid", "yearsofexperience", "yearsatcompany"]
for col in numeric_cols:
    print(f"Mode of {col}: ", mode(df[col]))
for col in numeric_cols:
    print(f"Variance of {col}: ", np.var(df[col], ddof=1))
       totalyearlycompensation  yearsofexperience  yearsatcompany  \
count             4.270100e+04       42701.000000    42701.000000   
mean              2.096069e+05           7.098784        2.677312   
std               1.347096e+05           5.851934        3.250897   
min               1.000000e+04           0.000000        0.000000   
25%               1.310000e+05           3.000000        0.000000   
50%               1.830000e+05           6.000000        2.000000   
75%               2.550000e+05          10.000000        4.000000   
max               4.980000e+06          45.000000       40.000000   

             cityid  
count  42701.000000  
mean    9935.687150  
std     6915.210395  
min        0.000000  
25%     7351.000000  
50%     8198.000000  
75%    11521.000000  
max    47926.000000  
Mode of totalyearlycompensation:  200000
Mode of cityid:  11527
Mode of yearsofexperience:  5.0
Mode of yearsatcompany:  0.0
Variance of totalyearlycompensation:  18146679952.80156
Variance of cityid:  47820134.81366836
Variance of yearsofexperience:  34.24513682132668
Variance of yearsatcompany:  10.568328747081715
In [25]:
# setup variables to treat as simple linear regressions
# (X1 and Y are reused later by the multiple-regression cell)
X1 = df.company.astype('category').cat.codes
Y = df["totalyearlycompensation"].astype(int)

# scatter plot with a fitted trendline; y is reused by the later SLR cells
x = X1.to_numpy()
y = df["totalyearlycompensation"].to_numpy()
slope, intercept = np.polyfit(x, y, 1)
plt.plot(x, y, "o")
plt.plot(x, slope * x + intercept, "r", linewidth=3)
plt.ylim([0, 900000])
plt.xlabel("Company")
plt.ylabel("Total Yearly Compensation")
print("Slope: ", slope, "    Intercept: ", intercept)
Slope:  -12.930704978764007     Intercept:  217122.25266743836
In [26]:
# setup variables to treat as simple linear regressions
# (X2 is reused later by the multiple-regression cell)
X2 = df.title.astype('category').cat.codes

# scatter plot with a fitted trendline (y comes from the company cell above)
x = X2.to_numpy()
slope, intercept = np.polyfit(x, y, 1)
plt.plot(x, y, "o")
plt.plot(x, slope * x + intercept, "r", linewidth=3)
plt.ylim([0, 900000])
plt.xlabel("Position")
plt.ylabel("Total Yearly Compensation")
print("Slope: ", slope, "    Intercept: ", intercept)
Slope:  2730.394098249505     Intercept:  183053.69848981927
In [27]:
# setup variables to treat as simple linear regressions
# (X3 is reused later by the multiple-regression cell)
X3 = df.gender.astype('category').cat.codes

# scatter plot with a fitted trendline (y comes from the company cell above)
x = X3.to_numpy()
slope, intercept = np.polyfit(x, y, 1)
plt.plot(x, y, "o")
plt.plot(x, slope * x + intercept, "r", linewidth=3)
plt.ylim([0, 900000])
plt.xlabel("Gender")
plt.ylabel("Total Yearly Compensation")
print("Slope: ", slope, "    Intercept: ", intercept)
Slope:  17327.389415765203     Intercept:  195119.5885126424
In [28]:
# setup variables to treat as simple linear regressions
# (X4 is reused later by the multiple-regression cell)
X4 = df.location.astype('category').cat.codes

# scatter plot with a fitted trendline (y comes from the company cell above)
x = X4.to_numpy()
slope, intercept = np.polyfit(x, y, 1)
plt.plot(x, y, "o")
plt.plot(x, slope * x + intercept, "r", linewidth=3)
plt.ylim([0, 900000])
plt.xlabel("Location")
plt.ylabel("Total Yearly Compensation")
print("Slope: ", slope, "    Intercept: ", intercept)
Slope:  107.63347771237846     Intercept:  151369.95145899727
In [29]:
# setup variables to treat as simple linear regressions
# (X5 is reused later by the multiple-regression cell; already numeric)
X5 = df["cityid"]

# scatter plot with a fitted trendline (y comes from the company cell above)
x = X5.to_numpy()
slope, intercept = np.polyfit(x, y, 1)
plt.plot(x, y, "o")
plt.plot(x, slope * x + intercept, "r", linewidth=3)
plt.ylim([0, 900000])
plt.xlabel("CityID")
plt.ylabel("Total Yearly Compensation")
print("Slope: ", slope, "    Intercept: ", intercept)
Slope:  -2.1374757315351656     Intercept:  230844.18475228664
In [30]:
# setup variables to treat as simple linear regressions
# (X6 is reused later by the multiple-regression cell; already numeric)
X6 = df["yearsofexperience"]

# scatter plot with a fitted trendline (y comes from the company cell above)
x = X6.to_numpy()
slope, intercept = np.polyfit(x, y, 1)
plt.plot(x, y, "o")
plt.plot(x, slope * x + intercept, "r", linewidth=3)
plt.ylim([0, 900000])
plt.xlabel("Years Experience")
plt.ylabel("Total Yearly Compensation")
print("Slope: ", slope, "    Intercept: ", intercept)
Slope:  9482.568242161939     Intercept:  142292.18989560747
In [31]:
# setup variables to treat as simple linear regressions
# (X7 is reused later by the multiple-regression cell; already numeric)
X7 = df["yearsatcompany"]

# scatter plot with a fitted trendline (y comes from the company cell above)
x = X7.to_numpy()
slope, intercept = np.polyfit(x, y, 1)
plt.plot(x, y, "o")
plt.plot(x, slope * x + intercept, "r", linewidth=3)
plt.ylim([0, 900000])
plt.xlabel("Years at Company")
plt.ylabel("Total Yearly Compensation")
print("Slope: ", slope, "    Intercept: ", intercept)
Slope:  6787.984723670039     Intercept:  191433.3406365643
In [32]:
# combine the seven predictors into one design matrix; naming the columns
# (instead of the anonymous 0-6 produced by the bare zip) makes the OLS
# summary below self-describing
feature_names = ["company", "title", "gender", "location", "cityid", "yearsofexperience", "yearsatcompany"]
X = pd.DataFrame(list(zip(X1, X2, X3, X4, X5, X6, X7)), columns=feature_names)

# sklearn fit for point estimates of the intercept and coefficients
reg = linear_model.LinearRegression()
reg.fit(X, Y)
print('Intercept: ', reg.intercept_)
print('Coefficent Values: ', reg.coef_)

# refit with statsmodels OLS to get standard errors, t-stats, and diagnostics
smX = sm.add_constant(X)
smY = list(Y)
model = sm.OLS(smY, smX).fit()
predictions = model.predict(smX)  # NOTE(review): computed but not used later in this cell
print(model.summary())
Intercept:  94734.62058271191
Coefficent Values:  [-1.35838020e+01  1.34705595e+03  8.03710251e+03  1.04257209e+02
 -1.85040543e+00  1.02034478e+04 -2.80501615e+03]
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.226
Model:                            OLS   Adj. R-squared:                  0.226
Method:                 Least Squares   F-statistic:                     1782.
Date:                Fri, 10 Dec 2021   Prob (F-statistic):               0.00
Time:                        18:24:26   Log-Likelihood:            -5.5945e+05
No. Observations:               42701   AIC:                         1.119e+06
Df Residuals:                   42693   BIC:                         1.119e+06
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       9.473e+04   2843.568     33.315      0.000    8.92e+04       1e+05
0            -13.5838      1.463     -9.284      0.000     -16.452     -10.716
1           1347.0560    185.750      7.252      0.000     982.982    1711.130
2           8037.1025   1563.852      5.139      0.000    4971.922    1.11e+04
3            104.2572      2.271     45.907      0.000      99.806     108.709
4             -1.8504      0.083    -22.272      0.000      -2.013      -1.688
5            1.02e+04    115.388     88.428      0.000    9977.286    1.04e+04
6          -2805.0161    207.146    -13.541      0.000   -3211.026   -2399.007
==============================================================================
Omnibus:                    52088.268   Durbin-Watson:                   0.502
Prob(Omnibus):                  0.000   Jarque-Bera (JB):         45021627.321
Skew:                           5.894   Prob(JB):                         0.00
Kurtosis:                     161.636   Cond. No.                     6.19e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 6.19e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
In [33]:
# MANOVA for multivariate analysis.
# NOTE(review): in statsmodels MANOVA.from_formula the LEFT side of '~' is the set
# of dependent variables, but this formula places the predictors (company, title,
# gender, location, ...) there and totalyearlycompensation on the right. Several of
# those left-side columns are strings at this point (they are only converted to
# category codes in the VIF cell below), and the printed output is degenerate
# (negative F values, astronomically large Wilks' lambda) — the formula
# orientation and variable coding should be verified against the intended model.
manova = MANOVA.from_formula('company + title + gender + location + cityid + yearsofexperience + yearsatcompany ~ totalyearlycompensation', data = df)
print(manova.mv_test())
                                   Multivariate linear model
================================================================================================
                                                                                                
------------------------------------------------------------------------------------------------
       Intercept                      Value                  Num DF    Den DF    F Value  Pr > F
------------------------------------------------------------------------------------------------
          Wilks' lambda -8586912603946734563658563584.0000 1658.0000 41038.0000  -24.7515 1.0000
         Pillai's trace                       6732109.8121 1658.0000 41038.0000  -24.7515 1.0000
 Hotelling-Lawley trace                           -16.2960 1658.0000 41038.0000 -403.3517 1.0000
    Roy's greatest root                             0.8921 1658.0000 41038.0000   22.0813 0.0000
------------------------------------------------------------------------------------------------
                                                                                                
-------------------------------------------------------------------------------------------------
  totalyearlycompensation             Value                Num DF     Den DF     F Value   Pr > F
-------------------------------------------------------------------------------------------------
            Wilks' lambda  63364490840984669650944.0000  1759.0000  40937.0000   -23.2729  1.0000
           Pillai's trace                  3040888.1009  1759.0000  40937.0000   -23.2729  1.0000
   Hotelling-Lawley trace                       -8.8055  1759.0000  40937.0000  -204.9299  1.0000
      Roy's greatest root                        0.5788  1759.0000  40937.0000    13.4692  0.0000
================================================================================================

In [34]:
# VIF test for multicollinearity among the predictors.
# BUG FIX: the original `dfTest = df` only aliased the frame, so the category-code
# assignments below silently overwrote df's string columns (company, title, gender,
# location) for every cell run afterwards; an explicit copy keeps df intact.
dfTest = df.copy()
dfTest['company'] = dfTest.company.astype('category').cat.codes
dfTest['title'] = dfTest.title.astype('category').cat.codes
dfTest['gender'] = dfTest.gender.astype('category').cat.codes
dfTest['location'] = dfTest.location.astype('category').cat.codes

# build the design matrix with patsy, then compute one VIF per design column
y, X = dmatrices('totalyearlycompensation ~ company + title + gender + location + cityid + yearsofexperience + yearsatcompany', data = dfTest, return_type = 'dataframe')

dfVif = pd.DataFrame()
dfVif['variable'] = X.columns
dfVif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print(dfVif)
            variable        VIF
0          Intercept  24.581337
1            company   1.001698
2              title   1.018020
3             gender   1.018877
4           location   1.005509
5             cityid   1.003424
6  yearsofexperience   1.386071
7     yearsatcompany   1.378561
In [ ]: