# setup
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from patsy import dmatrices
from sklearn import linear_model
from statistics import mode
from statsmodels.multivariate.manova import MANOVA
from statsmodels.stats.outliers_influence import variance_inflation_factor
# read data from source
col_list = ["timestamp", "company", "title", "gender", "totalyearlycompensation", "location", "cityid", "yearsofexperience", "yearsatcompany"]
df = pd.read_csv("../DSC503-StatisticalMethods/stemSalaries.csv", usecols=col_list)
# remove zero values from totalyearlycompensation and only keep male and female genders and check head of dataframe
df = df[df["totalyearlycompensation"] > 0]
df = df[df["gender"].notna()]
df = df[df.gender.str.contains(r'Male|Female')]
print(df.head(10))
timestamp company title \ 0 2/8/2020 8:44 Amazon Software Engineer 1 8/16/2019 8:42 Amazon Software Engineer 2 5/17/2021 9:12 Amazon Software Engineer 3 9/14/2019 13:41 Uber Software Engineer 4 9/16/2019 8:52 Uber Software Engineer 6 12/4/2020 9:06 Uber Software Engineer 7 8/1/2021 4:57 Uber Software Engineer 9 3/23/2021 6:07 Grubhub Software Engineer 10 6/4/2021 23:55 Andela Software Engineer 11 3/20/2020 2:29 DXC Technology Software Engineer totalyearlycompensation location yearsofexperience \ 0 109000 Aachen, NW, Germany 3.0 1 112000 Aachen, NW, Germany 5.0 2 114000 Aachen, NW, Germany 4.0 3 123000 Aarhus, AR, Denmark 3.0 4 200000 Aarhus, AR, Denmark 3.0 6 376000 Aarhus, AR, Denmark 4.0 7 418000 Aarhus, AR, Denmark 10.0 9 173000 Abingdon, MD 8.0 10 66000 Accra, AA, Ghana 5.0 11 49000 Adelaide, SA, Australia 4.0 yearsatcompany gender cityid 0 1.0 Male 3645 1 0.0 Male 3645 2 0.0 Male 3645 3 1.0 Male 12962 4 3.0 Male 12962 6 4.0 Male 12962 7 0.0 Male 12962 9 4.0 Male 14910 10 2.0 Male 3949 11 4.0 Male 1312
# use describe to get mean and standard deviations
print(df.describe())
# get mode and variance using built in stats library
print("Mode of totalyearlycompensation: ", mode(df["totalyearlycompensation"]))
print("Mode of cityid: ", mode(df["cityid"]))
print("Mode of yearsofexperience: ", mode(df["yearsofexperience"]))
print("Mode of yearsatcompany: ", mode(df["yearsatcompany"]))
print("Variance of totalyearlycompensation: ", np.var(df["totalyearlycompensation"], ddof=1))
print("Variance of cityid: ", np.var(df["cityid"], ddof=1))
print("Variance of yearsofexperience: ", np.var(df["yearsofexperience"], ddof=1))
print("Variance of yearsatcompany: ", np.var(df["yearsatcompany"], ddof=1))
totalyearlycompensation yearsofexperience yearsatcompany \ count 4.270100e+04 42701.000000 42701.000000 mean 2.096069e+05 7.098784 2.677312 std 1.347096e+05 5.851934 3.250897 min 1.000000e+04 0.000000 0.000000 25% 1.310000e+05 3.000000 0.000000 50% 1.830000e+05 6.000000 2.000000 75% 2.550000e+05 10.000000 4.000000 max 4.980000e+06 45.000000 40.000000 cityid count 42701.000000 mean 9935.687150 std 6915.210395 min 0.000000 25% 7351.000000 50% 8198.000000 75% 11521.000000 max 47926.000000 Mode of totalyearlycompensation: 200000 Mode of cityid: 11527 Mode of yearsofexperience: 5.0 Mode of yearsatcompany: 0.0 Variance of totalyearlycompensation: 18146679952.80156 Variance of cityid: 47820134.81366836 Variance of yearsofexperience: 34.24513682132668 Variance of yearsatcompany: 10.568328747081715
#setup variables to treat as simple linear regressions
X1 = df.company.astype('category').cat.codes
Y = df["totalyearlycompensation"].astype(int)
#plot points for overview with trendline
x = X1.to_numpy()
y = df["totalyearlycompensation"].to_numpy()
plt.plot(x, y, "o")
plt.ylim([0, 900000])
plt.xlabel("Company")
plt.ylabel("Total Yearly Compensation")
m, b = np.polyfit(x, y, 1)
plt.plot(x, m*x + b, "r", linewidth = 3)
print("Slope: ", m, " Intercept: ", b)
Slope: -12.930704978764007 Intercept: 217122.25266743836
#setup variables to treat as simple linear regressions
X2 = df.title.astype('category').cat.codes
#plot points for overview with trendline
x = X2.to_numpy()
plt.plot(x, y, "o")
plt.ylim([0, 900000])
plt.xlabel("Position")
plt.ylabel("Total Yearly Compensation")
m, b = np.polyfit(x, y, 1)
plt.plot(x, m*x + b, "r", linewidth = 3)
print("Slope: ", m, " Intercept: ", b)
Slope: 2730.394098249505 Intercept: 183053.69848981927
#setup variables to treat as simple linear regressions
X3 = df.gender.astype('category').cat.codes
#plot points for overview with trendline
x = X3.to_numpy()
plt.plot(x, y, "o")
plt.ylim([0, 900000])
plt.xlabel("Gender")
plt.ylabel("Total Yearly Compensation")
m, b = np.polyfit(x, y, 1)
plt.plot(x, m*x + b, "r", linewidth = 3)
print("Slope: ", m, " Intercept: ", b)
Slope: 17327.389415765203 Intercept: 195119.5885126424
#setup variables to treat as simple linear regressions
X4 = df.location.astype('category').cat.codes
#plot points for overview with trendline
x = X4.to_numpy()
plt.plot(x, y, "o")
plt.ylim([0, 900000])
plt.xlabel("Location")
plt.ylabel("Total Yearly Compensation")
m, b = np.polyfit(x, y, 1)
plt.plot(x, m*x + b, "r", linewidth = 3)
print("Slope: ", m, " Intercept: ", b)
Slope: 107.63347771237846 Intercept: 151369.95145899727
#setup variables to treat as simple linear regressions
X5 = df["cityid"]
#plot points for overview with trendline
x = X5.to_numpy()
plt.plot(x, y, "o")
plt.ylim([0, 900000])
plt.xlabel("CityID")
plt.ylabel("Total Yearly Compensation")
m, b = np.polyfit(x, y, 1)
plt.plot(x, m*x + b, "r", linewidth = 3)
print("Slope: ", m, " Intercept: ", b)
Slope: -2.1374757315351656 Intercept: 230844.18475228664
#setup variables to treat as simple linear regressions
X6 = df["yearsofexperience"]
#plot points for overview with trendline
x = X6.to_numpy()
plt.plot(x, y, "o")
plt.ylim([0, 900000])
plt.xlabel("Years Experience")
plt.ylabel("Total Yearly Compensation")
m, b = np.polyfit(x, y, 1)
plt.plot(x, m*x + b, "r", linewidth = 3)
print("Slope: ", m, " Intercept: ", b)
Slope: 9482.568242161939 Intercept: 142292.18989560747
#setup variables to treat as simple linear regressions
X7 = df["yearsatcompany"]
#plot points for overview with trendline
x = X7.to_numpy()
plt.plot(x, y, "o")
plt.ylim([0, 900000])
plt.xlabel("Years at Company")
plt.ylabel("Total Yearly Compensation")
m, b = np.polyfit(x, y, 1)
plt.plot(x, m*x + b, "r", linewidth = 3)
print("Slope: ", m, " Intercept: ", b)
Slope: 6787.984723670039 Intercept: 191433.3406365643
#zip values and handle multi linear regression
X = pd.DataFrame(list(zip(X1,X2,X3,X4,X5,X6,X7)))
reg = linear_model.LinearRegression()
reg.fit(X,Y)
print('Intercept: ', reg.intercept_)
print('Coefficent Values: ', reg.coef_)
smX = sm.add_constant(X)
smY = list(Y)
model = sm.OLS(smY,smX).fit()
predictions = model.predict(smX)
print(model.summary())
Intercept: 94734.62058271191 Coefficent Values: [-1.35838020e+01 1.34705595e+03 8.03710251e+03 1.04257209e+02 -1.85040543e+00 1.02034478e+04 -2.80501615e+03] OLS Regression Results ============================================================================== Dep. Variable: y R-squared: 0.226 Model: OLS Adj. R-squared: 0.226 Method: Least Squares F-statistic: 1782. Date: Fri, 10 Dec 2021 Prob (F-statistic): 0.00 Time: 18:24:26 Log-Likelihood: -5.5945e+05 No. Observations: 42701 AIC: 1.119e+06 Df Residuals: 42693 BIC: 1.119e+06 Df Model: 7 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ const 9.473e+04 2843.568 33.315 0.000 8.92e+04 1e+05 0 -13.5838 1.463 -9.284 0.000 -16.452 -10.716 1 1347.0560 185.750 7.252 0.000 982.982 1711.130 2 8037.1025 1563.852 5.139 0.000 4971.922 1.11e+04 3 104.2572 2.271 45.907 0.000 99.806 108.709 4 -1.8504 0.083 -22.272 0.000 -2.013 -1.688 5 1.02e+04 115.388 88.428 0.000 9977.286 1.04e+04 6 -2805.0161 207.146 -13.541 0.000 -3211.026 -2399.007 ============================================================================== Omnibus: 52088.268 Durbin-Watson: 0.502 Prob(Omnibus): 0.000 Jarque-Bera (JB): 45021627.321 Skew: 5.894 Prob(JB): 0.00 Kurtosis: 161.636 Cond. No. 6.19e+04 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 6.19e+04. This might indicate that there are strong multicollinearity or other numerical problems.
#anova model for multivariate analysis
manova = MANOVA.from_formula('company + title + gender + location + cityid + yearsofexperience + yearsatcompany ~ totalyearlycompensation', data = df)
print(manova.mv_test())
Multivariate linear model ================================================================================================ ------------------------------------------------------------------------------------------------ Intercept Value Num DF Den DF F Value Pr > F ------------------------------------------------------------------------------------------------ Wilks' lambda -8586912603946734563658563584.0000 1658.0000 41038.0000 -24.7515 1.0000 Pillai's trace 6732109.8121 1658.0000 41038.0000 -24.7515 1.0000 Hotelling-Lawley trace -16.2960 1658.0000 41038.0000 -403.3517 1.0000 Roy's greatest root 0.8921 1658.0000 41038.0000 22.0813 0.0000 ------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------- totalyearlycompensation Value Num DF Den DF F Value Pr > F ------------------------------------------------------------------------------------------------- Wilks' lambda 63364490840984669650944.0000 1759.0000 40937.0000 -23.2729 1.0000 Pillai's trace 3040888.1009 1759.0000 40937.0000 -23.2729 1.0000 Hotelling-Lawley trace -8.8055 1759.0000 40937.0000 -204.9299 1.0000 Roy's greatest root 0.5788 1759.0000 40937.0000 13.4692 0.0000 ================================================================================================
#VIF test
dfTest = df
dfTest['company'] = df.company.astype('category').cat.codes
dfTest['title'] = df.title.astype('category').cat.codes
dfTest['gender'] = df.gender.astype('category').cat.codes
dfTest['location'] = df.location.astype('category').cat.codes
y,X = dmatrices('totalyearlycompensation ~ company + title + gender + location + cityid + yearsofexperience + yearsatcompany', data = dfTest, return_type = 'dataframe')
dfVif = pd.DataFrame()
dfVif['variable'] = X.columns
dfVif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print(dfVif)
variable VIF 0 Intercept 24.581337 1 company 1.001698 2 title 1.018020 3 gender 1.018877 4 location 1.005509 5 cityid 1.003424 6 yearsofexperience 1.386071 7 yearsatcompany 1.378561