# setup
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.tools as tools
from collections import Counter
from matplotlib.ticker import MaxNLocator
from patsy import dmatrices
from sklearn.cluster import KMeans
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.svm import SVR
from statistics import mode
from statsmodels.multivariate.manova import MANOVA
from statsmodels.stats.outliers_influence import variance_inflation_factor
pd.options.mode.chained_assignment = None
# read data for Reed and religiosity datasets at state level
reed_col_list = ["stateId","stateName","statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
,"antiTransLegislationRiskIndex122022","antiTransLegislationRiskIndex112022"
,"transAdultPop2016","transAdultPercent2016","transAdultPop2022","transAdultPercent2022"
,"religionImportantPew2014","worshipWeeklyPew2014","prayDailyPew2014","certainAboutGodPew2014"
,"overallReligiosityPew2014","veryReligiousStatista2017","moderatelyReligiousStatista2017"
,"nonreligiousStatista2017","relLibScore2022","relLibVote2022","relLibVax2022"
,"relLibHealth2022","relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022"
]
reed_index_list = ["stateId","stateName","antiTransLegislationRiskIndex32023"
,"antiTransLegislationRiskIndex122022","antiTransLegislationRiskIndex112022"
]
trans_pop_list = ["stateId","stateName","statePopulation2020","statePopulation2023","transAdultPop2016"
,"transAdultPercent2016","transAdultPop2022","transAdultPercent2022"
]
religiosity_2014_list = ["stateId","stateName","religionImportantPew2014","worshipWeeklyPew2014"
,"prayDailyPew2014","certainAboutGodPew2014","overallReligiosityPew2014"
]
religiosity_2017_list = ["stateId","stateName","veryReligiousStatista2017","moderatelyReligiousStatista2017"
,"nonreligiousStatista2017"
]
religiosity_2022_list = ["stateId","stateName","relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022"
,"relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022"
]
reedFulldf = pd.read_csv("P:\\UticaMSDS\\DSC680-ResearchPracticum\\dataSets\\stateDatasetReed.csv", usecols=reed_col_list)
reedFulldf = reedFulldf[reedFulldf["stateId"] != 11]
reedIndexdf = pd.read_csv("P:\\UticaMSDS\\DSC680-ResearchPracticum\\dataSets\\stateDatasetReed.csv", usecols=reed_index_list)
reedIndexdf = reedIndexdf[reedIndexdf["stateId"] != 11]
transStatePopdf = pd.read_csv("P:\\UticaMSDS\\DSC680-ResearchPracticum\\dataSets\\stateDatasetReed.csv", usecols=trans_pop_list)
transStatePopdf = transStatePopdf[transStatePopdf["stateId"] != 11]
religious2014df = pd.read_csv("P:\\UticaMSDS\\DSC680-ResearchPracticum\\dataSets\\stateDatasetReed.csv", usecols=religiosity_2014_list)
religious2014df = religious2014df[religious2014df["stateId"] != 11]
religious2017df = pd.read_csv("P:\\UticaMSDS\\DSC680-ResearchPracticum\\dataSets\\stateDatasetReed.csv", usecols=religiosity_2017_list)
religious2017df = religious2017df[religious2017df["stateId"] != 11]
religious2022df = pd.read_csv("P:\\UticaMSDS\\DSC680-ResearchPracticum\\dataSets\\stateDatasetReed.csv", usecols=religiosity_2022_list)
religious2022df = religious2022df[religious2022df["stateId"] != 11]
#print(reedFulldf.head())
#create function to get count of unique values in column and get percentages
def countCol(df, dfCol):
tempdf = df[dfCol]
namecount = dfCol + "count"
namepercent = dfCol + "percent"
tempdf[namecount] = df[dfCol].value_counts()
tempdf[namepercent] = df[dfCol].value_counts(normalize=True)*100
return tempdf
# use describe to get mean and standard deviations of dataframe data
def describeDF(df, dfCol):
print(df.describe())
# get mode and variance using built in stats library
for colName in dfCol:
if(df[colName].dtypes != object):
print("Mode of ",colName,": ", mode(df[colName]))
print("Variance of ",colName,": ", np.var(df[colName], ddof=1))
print()
def combinedf(df, dfCol, dfName):
retdf = pd.DataFrame()
for colName in dfCol:
namecount = colName + "count"
namepercent = colName + "percent"
dfNameColCount = dfName + namecount
dfNameColPercent = dfName + namepercent
tempdf = countCol(df,colName)
retdf[dfNameColCount] = tempdf[namecount]
retdf[dfNameColPercent] = tempdf[namepercent]
return retdf
#function to allow grouping gender identity on 3 values
def basicGenMarker(asab, gender):
if(asab == 1 and gender == 1):
return "Cisgender Man"
elif (asab == 2 and gender == 2):
return "Cisgender Woman"
else:
return "Transgender"
#function to print covariance map based on given columns
def printCovariance(df,dfCol,colLabels,title):
cols = dfCol
stdsc = StandardScaler()
X_std = stdsc.fit_transform(df[cols].iloc[:,range(0,len(dfCol))].values)
cov_mat = np.cov(X_std.T)
plt.figure(figsize=(7,7))
sns.set(font_scale=1)
hm = sns.heatmap(cov_mat, cbar = True, annot = True, square = True, fmt = ".2f", cmap = "vlag",
annot_kws={"size":12}, yticklabels = colLabels, xticklabels = colLabels, cbar_kws={"shrink": 0.5})
#plt.title(title)
plt.tight_layout()
plt.show()
#function to build classifier matrix after testing NB model
# Build Naive Bayes Classifer to sort and classify data
# split datasets into training and test sets
def printNBClassifierOutcome(X,y,trainSize,trainState,matrixTitle):
size = trainSize
state = trainState
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-size, random_state=state)
# scale input data for training if necessary for better predictions
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)
# initiale, train and test the BNB
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
pred = bnb.predict(X_test)
#check accuracy
bnb_accuracy = metrics.accuracy_score(pred, y_test)
printConfusionMatrix(y_test, pred,matrixTitle)
#function to build classifier matrix after testing SVM model
#build SVM
# split datasets into training and test sets
def printSVMClassifierOutcome(X,y,trainSize,trainState,matrixTitle):
size = trainSize
state = trainState
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-size, random_state=state)
# scale input data for training if necessary for better predictions
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)
# build and train model
clf = SVC(kernel = "linear")
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
#check accuracy
clf_accuracy = metrics.accuracy_score(pred, y_test)
printConfusionMatrix(y_test, pred,matrixTitle)
def printConfusionMatrix(y_test, pred,matrixTitle):
#confusion matrix
cfm = metrics.confusion_matrix(y_test, pred)
fig, ax = plt.subplots(figsize=(6,6))
ax.matshow(cfm, cmap = plt.cm.Purples, alpha=0.3)
for i in range(cfm.shape[0]):
for j in range(cfm.shape[1]):
ax.text(x=j, y=i,s=cfm[i, j], va='center',
ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize = 16)
plt.ylabel('Actuals', fontsize = 16)
plt.title(matrixTitle, fontsize = 14)
plt.show()
print(metrics.classification_report(y_test, pred, zero_division = 0))
nEstimators = 500
decPrecision = 4
maxDepth = 3
def printRFRClassifierOutcome(X,y,trainSize,trainState, featTitle):
size = trainSize
state = trainState
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-size, random_state=state)
sc_X = StandardScaler()
sc_y = StandardScaler()
sc_X_train = sc_X.fit_transform(X_train)
sc_y_train = sc_y.fit_transform(y_train.values.reshape(len(y_train),1))
sc_y_train = sc_y_train
rf_regressor = RandomForestRegressor(n_estimators = nEstimators, random_state = state)
rf_regressor.fit(X_train, y_train)
rf_y_pred = rf_regressor.predict(X_test)
np.set_printoptions(precision=decPrecision)
print("Random Forest (" + str(nEstimators) + " Tree) Regression Accuracy: " + str(round(r2_score(y_test, rf_y_pred), decPrecision)))
#rf_regressor.feature_names_in_
featureDf = pd.DataFrame({"Features" : rf_regressor.feature_names_in_, "Importance" : rf_regressor.feature_importances_})
featureDf = featureDf.sort_values(by=["Importance"], ascending=False)
print(featureDf)
#plot bar chart of importance
f, ax = plt.subplots(figsize=(20,12))
sns.barplot(x=featureDf["Features"], y=featureDf["Importance"], palette="flare")
plt.xlabel('Features', fontsize = 16)
plt.ylabel('Importance', fontsize = 16)
plt.title(featTitle, fontsize=16)
plt.xticks(rotation=45)
#for val in plt.containers:
#plt.bar_label(val)
plt.show()
#rfc = RandomForestClassifier(n_estimators=nEstimators, max_depth=maxDepth, random_state=state)
#rfc.fit(X_train, y_train)
#features = X.columns.values
#classes = ['Cisgender man', 'Cisgender Woman', 'Transgender']
#for estimator in rfc.estimators_:
#print(estimator)
#plt.figure(figsize=(20,10))
#tree.plot_tree(estimator, feature_names=features, class_names=classes, fontsize=10, filled=True, rounded=True)
#plt.show()
maxIter=1000000000
def LogRegressionOutcome(X,y,trainSize,trainState,featTitle):
size = trainSize
state = trainState
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1-size, random_state = state)
sc_X = StandardScaler()
sc_y = StandardScaler()
sc_X_train = sc_X.fit_transform(X_train)
sc_y_train = sc_y.fit_transform(y_train.values.reshape(len(y_train),1))
sc_y_train = sc_y_train
log_regression = LogisticRegression(solver="newton-cg", random_state=state, penalty="l2", C=0.01, max_iter=maxIter).fit(X_train,y_train)
print("Logistic Regression Accuracy: " + str(round(log_regression.score(X_test,y_test), decPrecision)))
x_train_const = tools.add_constant(X_train)
coefArray = []
for ind in range(log_regression.coef_.shape[0]):
if ind == 0:
featFor = "Cisgender Men"
elif ind == 1:
featFor = "Cisgender Women"
else:
featFor = "Transgender"
fullFeatTitle = "Logistic Regression " + featFor + " Feature Important for " + featTitle
for x in log_regression.coef_[ind]:
#print(np.exp(x)/(1 + np.exp(x)))
coefArray.append(np.exp(x)/(1 + np.exp(x)))
featureDf = pd.DataFrame(zip(clean_feature_list, np.transpose(coefArray)), columns=["Features", "Coefficients"])
featureDf = featureDf.sort_values(by=["Coefficients"], ascending=False)
print(featureDf)
#plot bar chart of importance
f, ax = plt.subplots(figsize=(20,12))
sns.barplot(x=featureDf["Features"], y=featureDf["Coefficients"], palette="flare")
plt.xlabel('Features', fontsize = 16)
plt.ylabel('Coefficients', fontsize = 16)
plt.title(fullFeatTitle, fontsize=16)
plt.xticks(rotation=45)
#for val in plt.containers:
#plt.bar_label(val)
plt.show()
describeDF(reedIndexdf, reed_index_list)
stateId antiTransLegislationRiskIndex32023 \ count 50.000000 50.00000 mean 29.320000 2.08000 std 15.782243 1.60153 min 1.000000 0.00000 25% 17.250000 1.00000 50% 29.500000 2.00000 75% 41.750000 4.00000 max 56.000000 4.00000 antiTransLegislationRiskIndex122022 \ count 50.000000 mean 1.860000 std 1.340271 min 0.000000 25% 1.000000 50% 2.000000 75% 3.000000 max 4.000000 antiTransLegislationRiskIndex112022 count 50.00000 mean 1.82000 std 1.33539 min 0.00000 25% 1.00000 50% 2.00000 75% 3.00000 max 4.00000 Mode of stateId : 1 Variance of stateId : 249.0791836734694 Mode of antiTransLegislationRiskIndex32023 : 4 Variance of antiTransLegislationRiskIndex32023 : 2.564897959183673 Mode of antiTransLegislationRiskIndex122022 : 3 Variance of antiTransLegislationRiskIndex122022 : 1.7963265306122445 Mode of antiTransLegislationRiskIndex112022 : 1 Variance of antiTransLegislationRiskIndex112022 : 1.7832653061224486
reed_cov_list = ["stateId","antiTransLegislationRiskIndex32023"
,"antiTransLegislationRiskIndex122022","antiTransLegislationRiskIndex112022"
]
reed_label = ["stateId","RiskIndex32023"
,"RiskIndex122022","RiskIndex112022"]
printCovariance(reedIndexdf, reed_cov_list, reed_label, "Anti-Transgender Legislation Risk Index Covariance Matrix")
describeDF(transStatePopdf, trans_pop_list)
stateId statePopulation2020 statePopulation2023 transAdultPop2016 \ count 50.000000 5.000000e+01 5.000000e+01 50.00000 mean 29.320000 6.615242e+06 8.960485e+06 27654.00000 std 15.782243 7.436124e+06 1.907631e+07 36854.01958 min 1.000000 5.768510e+05 5.808170e+05 1400.00000 25% 17.250000 1.869706e+06 1.940934e+06 6375.00000 50% 29.500000 4.581796e+06 4.625424e+06 19450.00000 75% 41.750000 7.566836e+06 7.844464e+06 31037.50000 max 56.000000 3.953822e+07 1.309280e+08 218400.00000 transAdultPercent2016 transAdultPop2022 transAdultPercent2022 count 50.000000 50.000000 50.000000 mean 0.530400 26638.000000 0.531800 std 0.121722 29080.703259 0.126889 min 0.300000 2100.000000 0.200000 25% 0.432500 7025.000000 0.442500 50% 0.535000 16950.000000 0.525000 75% 0.610000 33225.000000 0.600000 max 0.780000 150100.000000 0.870000 Mode of stateId : 1 Variance of stateId : 249.0791836734694 Mode of statePopulation2020 : 5024279 Variance of statePopulation2020 : 55295936980950.49 Mode of statePopulation2023 : 5097641 Variance of statePopulation2023 : 363905671623562.75 Mode of transAdultPop2016 : 2700 Variance of transAdultPop2016 : 1358218759.1836734 Mode of transAdultPercent2016 : 0.43 Variance of transAdultPercent2016 : 0.014816163265306125 Mode of transAdultPop2022 : 6300 Variance of transAdultPop2022 : 845687302.0408163 Mode of transAdultPercent2022 : 0.6 Variance of transAdultPercent2022 : 0.016100775510204078
pop_cov_list = ["stateId","statePopulation2020","statePopulation2023","transAdultPop2016","transAdultPop2022"]
pop_label = ["stateId","TotalPop2020","TotalPop2023","TransPop2016","TransPop2022"]
printCovariance(transStatePopdf, pop_cov_list, pop_label, "Population by State Covariance Matrix")
describeDF(religious2014df, religiosity_2014_list)
stateId religionImportantPew2014 worshipWeeklyPew2014 \ count 50.000000 50.000000 50.000000 mean 29.320000 0.527000 0.359400 std 15.782243 0.107499 0.075035 min 1.000000 0.320000 0.210000 25% 17.250000 0.452500 0.310000 50% 29.500000 0.510000 0.355000 75% 41.750000 0.597500 0.390000 max 56.000000 0.770000 0.530000 prayDailyPew2014 certainAboutGodPew2014 overallReligiosityPew2014 count 50.000000 50.000000 50.000000 mean 0.541400 0.633600 0.547000 std 0.094286 0.095271 0.107423 min 0.330000 0.400000 0.330000 25% 0.490000 0.575000 0.482500 50% 0.530000 0.630000 0.540000 75% 0.607500 0.690000 0.625000 max 0.750000 0.820000 0.770000 Mode of stateId : 1 Variance of stateId : 249.0791836734694 Mode of religionImportantPew2014 : 0.44 Variance of religionImportantPew2014 : 0.011556122448979588 Mode of worshipWeeklyPew2014 : 0.34 Variance of worshipWeeklyPew2014 : 0.005630244897959184 Mode of prayDailyPew2014 : 0.51 Variance of prayDailyPew2014 : 0.008889836734693879 Mode of certainAboutGodPew2014 : 0.61 Variance of certainAboutGodPew2014 : 0.009076571428571429 Mode of overallReligiosityPew2014 : 0.54 Variance of overallReligiosityPew2014 : 0.011539795918367344
religiosity_cov_2014_list = ["stateId","religionImportantPew2014","worshipWeeklyPew2014"
,"prayDailyPew2014","certainAboutGodPew2014","overallReligiosityPew2014"]
rel_2014_label = ["stateId","VeryImportant","WorshipWeekly","PrayDaily","CertainAboutGod","Overall"]
printCovariance(religious2014df, religiosity_cov_2014_list, rel_2014_label, "Pew 2014 Religiosity Covariance Matrix")
describeDF(religious2017df, religiosity_2017_list)
stateId veryReligiousStatista2017 moderatelyReligiousStatista2017 \ count 50.000000 50.000000 50.000000 mean 29.320000 0.371600 0.287200 std 15.782243 0.090449 0.030442 min 1.000000 0.160000 0.160000 25% 17.250000 0.310000 0.270000 50% 29.500000 0.365000 0.295000 75% 41.750000 0.437500 0.300000 max 56.000000 0.590000 0.330000 nonreligiousStatista2017 count 50.000000 mean 0.342000 std 0.099857 min 0.120000 25% 0.290000 50% 0.340000 75% 0.397500 max 0.590000 Mode of stateId : 1 Variance of stateId : 249.0791836734694 Mode of veryReligiousStatista2017 : 0.28 Variance of veryReligiousStatista2017 : 0.008181061224489796 Mode of moderatelyReligiousStatista2017 : 0.3 Variance of moderatelyReligiousStatista2017 : 0.0009266938775510202 Mode of nonreligiousStatista2017 : 0.33 Variance of nonreligiousStatista2017 : 0.00997142857142857
religiosity_cov_2017_list = ["stateId","veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"]
rel_2017_label = ["stateId","Very","Moderate","Nonreligious"]
printCovariance(religious2017df, religiosity_cov_2017_list, rel_2017_label, "Gallup 2017 Religiosity Covariance Matrix")
describeDF(religious2022df, religiosity_2022_list)
stateId relLibScore2022 relLibVote2022 relLibVax2022 \ count 50.000000 50.000000 50.000000 50.000000 mean 29.320000 0.393948 0.800000 0.900000 std 15.782243 0.133298 0.404061 0.303046 min 1.000000 0.155800 0.000000 0.000000 25% 17.250000 0.314950 1.000000 1.000000 50% 29.500000 0.371200 1.000000 1.000000 75% 41.750000 0.467550 1.000000 1.000000 max 56.000000 0.818200 1.000000 1.000000 relLibHealth2022 relLibHealthMandate2022 relLibMarriage2022 \ count 50.000000 50.000000 50.000000 mean 6.760000 0.640000 1.160000 std 4.023198 0.484873 1.489555 min 0.000000 0.000000 0.000000 25% 4.250000 0.000000 0.000000 50% 5.500000 1.000000 0.000000 75% 9.000000 1.000000 3.000000 max 20.000000 1.000000 5.000000 relLibRfra2022 count 50.000000 mean 0.480000 std 0.504672 min 0.000000 25% 0.000000 50% 0.000000 75% 1.000000 max 1.000000 Mode of stateId : 1 Variance of stateId : 249.0791836734694 Mode of relLibScore2022 : 0.3377 Variance of relLibScore2022 : 0.01776847438367347 Mode of relLibVote2022 : 1.0 Variance of relLibVote2022 : 0.16326530612244897 Mode of relLibVax2022 : 1.0 Variance of relLibVax2022 : 0.09183673469387756 Mode of relLibHealth2022 : 5.0 Variance of relLibHealth2022 : 16.186122448979592 Mode of relLibHealthMandate2022 : 1.0 Variance of relLibHealthMandate2022 : 0.2351020408163265 Mode of relLibMarriage2022 : 0.0 Variance of relLibMarriage2022 : 2.218775510204082 Mode of relLibRfra2022 : 0.0 Variance of relLibRfra2022 : 0.25469387755102035
religiosity_cov_2022_list = ["stateId","relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022"
,"relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022"]
rel_2022_label = ["stateId","Score","Vote","Vax","Health","HealthMandate","Marriage","Rfra"]
printCovariance(religious2022df, religiosity_cov_2022_list, rel_2022_label, "Religious Liberty 2022 Covariance Matrix")
# read self-identification Census Pulse Survey data
pulse_col_list = ["SCRAM","WEEK","EST_ST","TBIRTH_YEAR","EEDUC","AEDUC","EGENID_BIRTH","AGENID_BIRTH"
,"GENID_DESCRIBE","SEXUAL_ORIENTATION","INCOME","ENDDATE","EDUCATION","ASSIGNEDGENDER"
,"CHOSENGENDER","SEXUALORIENTATION","INCOMEMIN"]
pulse_num_col_list = ["WEEK","EST_ST","TBIRTH_YEAR","EEDUC","AEDUC","EGENID_BIRTH","AGENID_BIRTH"
,"GENID_DESCRIBE","SEXUAL_ORIENTATION","INCOME"]
pulsedf = pd.read_csv("P:\\UticaMSDS\\DSC680-ResearchPracticum\\dataSets\\pulseModFull.csv", usecols=pulse_col_list)
print("Full count of data pulse data: ", (pulsedf["GENID_DESCRIBE"] > -100).sum())
print()
countDistrictofColumbia = (pulsedf["EST_ST"] == 11).sum()
print("Count of participants in District of Columbia: ", countDistrictofColumbia)
print()
countMissingGender = (pulsedf["GENID_DESCRIBE"] < 0).sum()
print("Count of missing or unreported gender identity: ", countMissingGender)
print()
countMissingSexuality = (pulsedf["SEXUAL_ORIENTATION"] < 0).sum()
print("Count of missing or unreported sexuality: ", countMissingSexuality)
print()
countMissingIncome = (pulsedf["INCOME"] < 0).sum()
print("Count of missing or unreported minimum income: ", countMissingIncome)
print()
#remove DC residents
pulsedf = pulsedf[pulsedf["EST_ST"] != 11]
#remove missing gender identity based on under 10% of total for better results
pulsedf = pulsedf[pulsedf["GENID_DESCRIBE"].isin([1,2,3,4])]
#remove missing sexuality based on under 10% of total for better results
pulsedf = pulsedf[pulsedf["SEXUAL_ORIENTATION"].isin([1,2,3,4,5])]
#remove missing income based on null values for better results
pulsedf = pulsedf[pulsedf["INCOME"].isin([1,2,3,4,5,6,7,8])]
pulsedfCount = len(pulsedf.index)
print("Count after row removal: ", pulsedfCount)
Full count of data pulse data: 1341164 Count of participants in District of Columbia: 17702 Count of missing or unreported gender identity: 17691 Count of missing or unreported sexuality: 24617 Count of missing or unreported minimum income: 263337 Count after row removal: 1048575
describeDF(pulsedf,pulse_num_col_list)
WEEK EST_ST TBIRTH_YEAR EEDUC AEDUC \ count 1.048575e+06 1.048575e+06 1.048575e+06 1.048575e+06 1.048575e+06 mean 4.391728e+01 2.854206e+01 1.968571e+03 5.367118e+00 1.994822e+00 std 6.120523e+00 1.640470e+01 1.575978e+01 1.436177e+00 7.176839e-02 min 3.400000e+01 1.000000e+00 1.933000e+03 1.000000e+00 1.000000e+00 25% 3.900000e+01 1.300000e+01 1.956000e+03 4.000000e+00 2.000000e+00 50% 4.300000e+01 2.800000e+01 1.968000e+03 6.000000e+00 2.000000e+00 75% 4.900000e+01 4.200000e+01 1.981000e+03 7.000000e+00 2.000000e+00 max 5.400000e+01 5.600000e+01 2.005000e+03 7.000000e+00 2.000000e+00 EGENID_BIRTH AGENID_BIRTH GENID_DESCRIBE SEXUAL_ORIENTATION \ count 1.048575e+06 1.048575e+06 1.048575e+06 1.048575e+06 mean 1.581580e+00 1.997595e+00 1.609426e+00 2.069715e+00 std 4.933001e-01 4.898353e-02 5.507229e-01 4.835032e-01 min 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 25% 1.000000e+00 2.000000e+00 1.000000e+00 2.000000e+00 50% 2.000000e+00 2.000000e+00 2.000000e+00 2.000000e+00 75% 2.000000e+00 2.000000e+00 2.000000e+00 2.000000e+00 max 2.000000e+00 2.000000e+00 4.000000e+00 5.000000e+00 INCOME count 1.048575e+06 mean 4.620572e+00 std 2.128103e+00 min 1.000000e+00 25% 3.000000e+00 50% 5.000000e+00 75% 6.000000e+00 max 8.000000e+00 Mode of WEEK : 43 Variance of WEEK : 37.460803320626816 Mode of EST_ST : 6 Variance of EST_ST : 269.1142209085299 Mode of TBIRTH_YEAR : 1955 Variance of TBIRTH_YEAR : 248.3706702974734 Mode of EEDUC : 6 Variance of EEDUC : 2.0626035639994904 Mode of AEDUC : 2 Variance of AEDUC : 0.005150701178258357 Mode of EGENID_BIRTH : 2 Variance of EGENID_BIRTH : 0.2433449743390272 Mode of AGENID_BIRTH : 2 Variance of AGENID_BIRTH : 0.0023993863704273163 Mode of GENID_DESCRIBE : 2 Variance of GENID_DESCRIBE : 0.3032957446696943 Mode of SEXUAL_ORIENTATION : 2 Variance of SEXUAL_ORIENTATION : 0.23377538647520063 Mode of INCOME : 6 Variance of INCOME : 4.528821091548032
pulse_cov_list = ["WEEK","EST_ST","TBIRTH_YEAR","EEDUC","EGENID_BIRTH","GENID_DESCRIBE","SEXUAL_ORIENTATION","INCOME"]
pulse_label_list = ["WEEK","EST_ST","BIRTH_YEAR","EDUC","SEX_AT_BIRTH","GENDERID","SEXUALITY","INCOME"]
printCovariance(pulsedf, pulse_cov_list, pulse_label_list, "USCB Pulse Survey Covariance Matrix")
# further analysis of pulse data
pulseIncomedf = pd.DataFrame()
pulseIncomedf["INCOMEMIN"] = pulsedf["INCOMEMIN"].astype(float)
print(pulseIncomedf.describe())
print("Mode of INCOMEMIN: ", mode(pulseIncomedf["INCOMEMIN"]))
print("Variance of INCOMEMIN: ", np.var(pulseIncomedf["INCOMEMIN"], ddof=1))
del pulseIncomedf
INCOMEMIN count 1.048575e+06 mean 7.935849e+04 std 5.884945e+04 min 0.000000e+00 25% 3.500000e+04 50% 7.500000e+04 75% 1.000000e+05 max 2.000000e+05 Mode of INCOMEMIN: 100000.0 Variance of INCOMEMIN: 3463257810.0526085
# look at income data based on gender
#clean data
pulseIncomeStatsdf = pulsedf
#remove missing values from table for income and force to number
pulseIncomeStatsdf = pulseIncomeStatsdf[pulseIncomeStatsdf["INCOMEMIN"].astype(str).str.isdigit()]
pulseIncomeStatsdf["INCOMEMIN"] = pd.to_numeric(pulseIncomeStatsdf["INCOMEMIN"], errors='coerce')
#remove unreported or missing chosen gender
pulseIncomeStatsdf = pulseIncomeStatsdf[pulseIncomeStatsdf["GENID_DESCRIBE"] > 0]
#print(pulseIncomeStatsdf.head())
pulseIncomeStatsdf = pulseIncomeStatsdf[pulseIncomeStatsdf["SEXUAL_ORIENTATION"] > 0]
pulse_income_col = ["WEEK","EST_ST","TBIRTH_YEAR","EEDUC","EGENID_BIRTH"
,"GENID_DESCRIBE","SEXUAL_ORIENTATION","INCOMEMIN"]
describeDF(pulseIncomeStatsdf, pulse_income_col)
WEEK EST_ST TBIRTH_YEAR EEDUC AEDUC \ count 1.048575e+06 1.048575e+06 1.048575e+06 1.048575e+06 1.048575e+06 mean 4.391728e+01 2.854206e+01 1.968571e+03 5.367118e+00 1.994822e+00 std 6.120523e+00 1.640470e+01 1.575978e+01 1.436177e+00 7.176839e-02 min 3.400000e+01 1.000000e+00 1.933000e+03 1.000000e+00 1.000000e+00 25% 3.900000e+01 1.300000e+01 1.956000e+03 4.000000e+00 2.000000e+00 50% 4.300000e+01 2.800000e+01 1.968000e+03 6.000000e+00 2.000000e+00 75% 4.900000e+01 4.200000e+01 1.981000e+03 7.000000e+00 2.000000e+00 max 5.400000e+01 5.600000e+01 2.005000e+03 7.000000e+00 2.000000e+00 EGENID_BIRTH AGENID_BIRTH GENID_DESCRIBE SEXUAL_ORIENTATION \ count 1.048575e+06 1.048575e+06 1.048575e+06 1.048575e+06 mean 1.581580e+00 1.997595e+00 1.609426e+00 2.069715e+00 std 4.933001e-01 4.898353e-02 5.507229e-01 4.835032e-01 min 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 25% 1.000000e+00 2.000000e+00 1.000000e+00 2.000000e+00 50% 2.000000e+00 2.000000e+00 2.000000e+00 2.000000e+00 75% 2.000000e+00 2.000000e+00 2.000000e+00 2.000000e+00 max 2.000000e+00 2.000000e+00 4.000000e+00 5.000000e+00 INCOME INCOMEMIN count 1.048575e+06 1.048575e+06 mean 4.620572e+00 7.935849e+04 std 2.128103e+00 5.884945e+04 min 1.000000e+00 0.000000e+00 25% 3.000000e+00 3.500000e+04 50% 5.000000e+00 7.500000e+04 75% 6.000000e+00 1.000000e+05 max 8.000000e+00 2.000000e+05 Mode of WEEK : 43 Variance of WEEK : 37.460803320626816 Mode of EST_ST : 6 Variance of EST_ST : 269.1142209085299 Mode of TBIRTH_YEAR : 1955 Variance of TBIRTH_YEAR : 248.3706702974734 Mode of EEDUC : 6 Variance of EEDUC : 2.0626035639994904 Mode of EGENID_BIRTH : 2 Variance of EGENID_BIRTH : 0.2433449743390272 Mode of GENID_DESCRIBE : 2 Variance of GENID_DESCRIBE : 0.3032957446696943 Mode of SEXUAL_ORIENTATION : 2 Variance of SEXUAL_ORIENTATION : 0.23377538647520063 Mode of INCOMEMIN : 100000 Variance of INCOMEMIN : 3463257810.0526085
#build violin plot
plt.figure(figsize=(12,8))
sns.violinplot(x=pulseIncomeStatsdf["EGENID_BIRTH"],y=pulseIncomeStatsdf["INCOMEMIN"],palette="bright")
ax = plt.gca()
leg = ax.get_legend()
ax.set_xticklabels(["Assigned Male at Birth","Assigned Female at Birth"])
plt.ylabel("Income in Dollars")
plt.xlabel("")
plt.title("Minimum Yearly Reported Income")
plt.show()
#build violin plot
incomeGeniddf = pulseIncomeStatsdf
incomeGeniddf["CUR_GENID"] = incomeGeniddf.apply(lambda x: str(x["EGENID_BIRTH"]) + str(x["GENID_DESCRIBE"]), axis=1)
#print(incomeGeniddf)
plt.figure(figsize=(15,8))
sns.violinplot(x=incomeGeniddf["CUR_GENID"],y=incomeGeniddf["INCOMEMIN"],palette="bright")
ax = plt.gca()
ax.set_xticklabels(["Cisgender Women","Cisgender Men", "Nonbinary AFAB", "Nonbinary AMAB"
, "Transgender AFAB", "Transgender AMAB", "Transgender FTM", "Transgender MTF"])
plt.ylabel("Income in Dollars")
plt.xlabel("")
plt.title("Minimum Yearly Reported Income by Gender Identity and Sex Assigned at Birth")
plt.show()
#income comparison based on assigned gender at birth
pulseIncomeAMABdf = pulseIncomeStatsdf
pulseIncomeAMABdf = pulseIncomeAMABdf[pulseIncomeAMABdf["EGENID_BIRTH"] == 1]
describeDF(pulseIncomeAMABdf, pulse_income_col)
pulseIncomeAFABdf = pulseIncomeStatsdf
pulseIncomeAFABdf = pulseIncomeAFABdf[pulseIncomeAFABdf["EGENID_BIRTH"] == 2]
describeDF(pulseIncomeAFABdf, pulse_income_col)
WEEK EST_ST TBIRTH_YEAR EEDUC \ count 438745.000000 438745.000000 438745.000000 438745.000000 mean 44.064304 28.533891 1967.503153 5.435818 std 6.133248 16.498385 16.139392 1.424280 min 34.000000 1.000000 1933.000000 1.000000 25% 39.000000 13.000000 1954.000000 4.000000 50% 44.000000 28.000000 1966.000000 6.000000 75% 50.000000 44.000000 1981.000000 7.000000 max 54.000000 56.000000 2005.000000 7.000000 AEDUC EGENID_BIRTH AGENID_BIRTH GENID_DESCRIBE \ count 438745.000000 438745.0 438745.000000 438745.000000 mean 1.994215 1.0 1.997568 1.034546 std 0.075837 0.0 0.049255 0.300578 min 1.000000 1.0 1.000000 1.000000 25% 2.000000 1.0 2.000000 1.000000 50% 2.000000 1.0 2.000000 1.000000 75% 2.000000 1.0 2.000000 1.000000 max 2.000000 1.0 2.000000 4.000000 SEXUAL_ORIENTATION INCOME INCOMEMIN count 438745.000000 438745.000000 438745.000000 mean 2.032830 4.966391 88654.389224 std 0.475219 2.088031 60425.293955 min 1.000000 1.000000 0.000000 25% 2.000000 4.000000 50000.000000 50% 2.000000 5.000000 75000.000000 75% 2.000000 6.000000 100000.000000 max 5.000000 8.000000 200000.000000 Mode of WEEK : 43 Variance of WEEK : 37.616725915359496 Mode of EST_ST : 6 Variance of EST_ST : 272.19670766553105 Mode of TBIRTH_YEAR : 1955 Variance of TBIRTH_YEAR : 260.47997895673205 Mode of EEDUC : 6 Variance of EEDUC : 2.0285745651188547 Mode of EGENID_BIRTH : 1 Variance of EGENID_BIRTH : 0.0 Mode of GENID_DESCRIBE : 1 Variance of GENID_DESCRIBE : 0.0903474059981086 Mode of SEXUAL_ORIENTATION : 2 Variance of SEXUAL_ORIENTATION : 0.22583355356615814 Mode of INCOMEMIN : 100000 Variance of INCOMEMIN : 3651216149.53128 WEEK EST_ST TBIRTH_YEAR EEDUC \ count 609830.000000 609830.000000 609830.000000 609830.000000 mean 43.811497 28.547931 1969.338557 5.317692 std 6.109169 16.336979 15.435331 1.442654 min 34.000000 1.000000 1933.000000 1.000000 25% 39.000000 13.000000 1957.000000 4.000000 50% 43.000000 28.000000 1969.000000 6.000000 75% 49.000000 42.000000 1982.000000 7.000000 max 54.000000 56.000000 2005.000000 7.000000 AEDUC EGENID_BIRTH AGENID_BIRTH GENID_DESCRIBE \ count 609830.000000 609830.0 609830.000000 609830.000000 mean 1.995259 2.0 1.997614 2.023026 std 0.068689 0.0 0.048788 0.218328 min 1.000000 2.0 1.000000 1.000000 25% 2.000000 2.0 2.000000 2.000000 50% 2.000000 2.0 2.000000 2.000000 75% 2.000000 2.0 2.000000 2.000000 max 2.000000 2.0 2.000000 4.000000 SEXUAL_ORIENTATION INCOME INCOMEMIN count 609830.000000 609830.000000 609830.000000 mean 2.096251 4.371771 72670.506535 std 0.487654 2.121895 56755.052819 min 1.000000 1.000000 0.000000 25% 2.000000 3.000000 35000.000000 50% 2.000000 4.000000 50000.000000 75% 2.000000 6.000000 100000.000000 max 5.000000 8.000000 200000.000000 Mode of WEEK : 43 Variance of WEEK : 37.32194368751015 Mode of EST_ST : 6 Variance of EST_ST : 266.89687185333827 Mode of TBIRTH_YEAR : 1955 Variance of TBIRTH_YEAR : 238.24944848073915 Mode of EEDUC : 6 Variance of EEDUC : 2.081250683388053 Mode of EGENID_BIRTH : 2 Variance of EGENID_BIRTH : 0.0 Mode of GENID_DESCRIBE : 2 Variance of GENID_DESCRIBE : 0.047666915897604446 Mode of SEXUAL_ORIENTATION : 2 Variance of SEXUAL_ORIENTATION : 0.23780655025122638 Mode of INCOMEMIN : 50000 Variance of INCOMEMIN : 3221136020.4754634
# build datasets based on chosen gender
col_for_counts = ["WEEK"]
pulseStateReduceddf = pulseIncomeStatsdf
#print("Total set week counts:")
pulseStateCountdf = combinedf(pulseStateReduceddf,col_for_counts,"All")
#print(pulseStateCountdf)
pulseStateTotalsdf = pd.DataFrame()
pulseStateTotalsdf["AllWEEKcount"] = pulseStateCountdf["AllWEEKcount"].astype(int)
del pulseStateCountdf
pulseCisMendf = pulseIncomeStatsdf
pulseCisMendf = pulseCisMendf[pulseCisMendf["EGENID_BIRTH"] == 1]
pulseCisMendf = pulseCisMendf[pulseCisMendf["GENID_DESCRIBE"] == 1]
describeDF(pulseCisMendf, pulse_income_col)
#print("Cis men week counts:")
pulseCisMenReducedDf = combinedf(pulseCisMendf,col_for_counts,"CisMen")
pulseStateTotalsdf["CisMenWEEKcount"] = pulseCisMenReducedDf["CisMenWEEKcount"].astype(int)
#print(pulseStateTotalsdf["CisMenWEEKcount"])
WEEK EST_ST TBIRTH_YEAR EEDUC \ count 432414.000000 432414.000000 432414.000000 432414.000000 mean 44.064441 28.527571 1967.402887 5.441038 std 6.133263 16.496388 16.082041 1.421493 min 34.000000 1.000000 1933.000000 1.000000 25% 39.000000 13.000000 1954.000000 4.000000 50% 44.000000 28.000000 1966.000000 6.000000 75% 50.000000 44.000000 1981.000000 7.000000 max 54.000000 56.000000 2005.000000 7.000000 AEDUC EGENID_BIRTH AGENID_BIRTH GENID_DESCRIBE \ count 432414.000000 432414.0 432414.000000 432414.0 mean 1.994237 1.0 1.998985 1.0 std 0.075695 0.0 0.031847 0.0 min 1.000000 1.0 1.000000 1.0 25% 2.000000 1.0 2.000000 1.0 50% 2.000000 1.0 2.000000 1.0 75% 2.000000 1.0 2.000000 1.0 max 2.000000 1.0 2.000000 1.0 SEXUAL_ORIENTATION INCOME INCOMEMIN count 432414.000000 432414.000000 432414.000000 mean 2.019733 4.980276 88989.105348 std 0.440445 2.082161 60375.536150 min 1.000000 1.000000 0.000000 25% 2.000000 4.000000 50000.000000 50% 2.000000 5.000000 75000.000000 75% 2.000000 7.000000 150000.000000 max 5.000000 8.000000 200000.000000 Mode of WEEK : 43 Variance of WEEK : 37.6169203141804 Mode of EST_ST : 6 Variance of EST_ST : 272.1308119804907 Mode of TBIRTH_YEAR : 1955 Variance of TBIRTH_YEAR : 258.63203797251515 Mode of EEDUC : 6 Variance of EEDUC : 2.0206427755641716 Mode of EGENID_BIRTH : 1 Variance of EGENID_BIRTH : 0.0 Mode of GENID_DESCRIBE : 1 Variance of GENID_DESCRIBE : 0.0 Mode of SEXUAL_ORIENTATION : 2 Variance of SEXUAL_ORIENTATION : 0.19399188938993783 Mode of INCOMEMIN : 100000 Variance of INCOMEMIN : 3645205365.415228
pulseCisWomendf = pulseIncomeStatsdf
pulseCisWomendf = pulseCisWomendf[pulseCisWomendf["EGENID_BIRTH"] == 2]
pulseCisWomendf = pulseCisWomendf[pulseCisWomendf["GENID_DESCRIBE"] == 2]
describeDF(pulseCisWomendf, pulse_income_col)
#print("Cis women week counts:")
pulseCisWomenReducedDf = combinedf(pulseCisWomendf,col_for_counts,"CisWomen")
pulseStateTotalsdf["CisWomenWEEKcount"] = pulseCisWomenReducedDf["CisWomenWEEKcount"].astype(int)
#print(pulseStateTotalsdf["CisWomenWEEKcount"])
WEEK EST_ST TBIRTH_YEAR EEDUC \ count 599899.000000 599899.000000 599899.000000 599899.000000 mean 43.804849 28.541668 1969.179439 5.321482 std 6.108812 16.333975 15.348571 1.440654 min 34.000000 1.000000 1933.000000 1.000000 25% 39.000000 13.000000 1957.000000 4.000000 50% 43.000000 28.000000 1969.000000 6.000000 75% 49.000000 42.000000 1982.000000 7.000000 max 54.000000 56.000000 2005.000000 7.000000 AEDUC EGENID_BIRTH AGENID_BIRTH GENID_DESCRIBE \ count 599899.000000 599899.0 599899.000000 599899.0 mean 1.995281 2.0 1.998725 2.0 std 0.068534 0.0 0.035687 0.0 min 1.000000 2.0 1.000000 2.0 25% 2.000000 2.0 2.000000 2.0 50% 2.000000 2.0 2.000000 2.0 75% 2.000000 2.0 2.000000 2.0 max 2.000000 2.0 2.000000 2.0 SEXUAL_ORIENTATION INCOME INCOMEMIN count 599899.000000 599899.000000 599899.000000 mean 2.085014 4.384903 72975.084139 std 0.461465 2.119228 56763.424571 min 1.000000 1.000000 0.000000 25% 2.000000 3.000000 35000.000000 50% 2.000000 4.000000 50000.000000 75% 2.000000 6.000000 100000.000000 max 5.000000 8.000000 200000.000000 Mode of WEEK : 43 Variance of WEEK : 37.317581415112876 Mode of EST_ST : 6 Variance of EST_ST : 266.79872778314314 Mode of TBIRTH_YEAR : 1955 Variance of TBIRTH_YEAR : 235.5786372658161 Mode of EEDUC : 6 Variance of EEDUC : 2.07548426437156 Mode of EGENID_BIRTH : 2 Variance of EGENID_BIRTH : 0.0 Mode of GENID_DESCRIBE : 2 Variance of GENID_DESCRIBE : 0.0 Mode of SEXUAL_ORIENTATION : 2 Variance of SEXUAL_ORIENTATION : 0.2129499850843681 Mode of INCOMEMIN : 50000 Variance of INCOMEMIN : 3222086368.9871078
#print("Cisgender week counts:")
cisdf = pd.DataFrame()
cisdf["CisgenderWEEKcount"] = pulseStateTotalsdf.loc[:,["CisMenWEEKcount","CisWomenWEEKcount"]].sum(axis=1)
pulseStateTotalsdf["CisgenderWEEKcount"] = cisdf["CisgenderWEEKcount"].astype(int)
#print(pulseStateTotalsdf)
pulseTranswomendf = pulseIncomeStatsdf
pulseTranswomendf = pulseTranswomendf[pulseTranswomendf["EGENID_BIRTH"] == 1]
pulseTranswomendf = pulseTranswomendf[pulseTranswomendf["GENID_DESCRIBE"].isin([2,3])]
describeDF(pulseTranswomendf, pulse_income_col)
#print("Trans women week counts:")
pulseTranswomenReduceddf = combinedf(pulseTranswomendf,col_for_counts,"TransWomen")
#print(pulseTranswomenReduceddf)
pulseStateTotalsdf["TransWomenWEEKcount"] = pulseTranswomenReduceddf["TransWomenWEEKcount"].astype(int)
WEEK EST_ST TBIRTH_YEAR EEDUC AEDUC \ count 2654.000000 2654.000000 2654.000000 2654.000000 2654.000000 mean 44.383572 29.207611 1976.794650 4.939337 1.993595 std 6.134856 16.413115 18.514133 1.537229 0.079792 min 34.000000 1.000000 1933.000000 1.000000 1.000000 25% 40.000000 16.000000 1962.000000 4.000000 2.000000 50% 44.000000 29.000000 1982.000000 5.000000 2.000000 75% 50.000000 42.000000 1993.000000 6.000000 2.000000 max 54.000000 56.000000 2005.000000 7.000000 2.000000 EGENID_BIRTH AGENID_BIRTH GENID_DESCRIBE SEXUAL_ORIENTATION \ count 2654.0 2654.000000 2654.000000 2654.00000 mean 1.0 1.794650 2.554635 2.64318 std 0.0 0.404034 0.497100 1.16700 min 1.0 1.000000 2.000000 1.00000 25% 1.0 2.000000 2.000000 2.00000 50% 1.0 2.000000 3.000000 3.00000 75% 1.0 2.000000 3.000000 4.00000 max 1.0 2.000000 3.000000 5.00000 INCOME INCOMEMIN count 2654.000000 2654.000000 mean 3.669932 56865.109269 std 2.190414 55358.774687 min 1.000000 0.000000 25% 2.000000 25000.000000 50% 4.000000 50000.000000 75% 5.000000 75000.000000 max 8.000000 200000.000000 Mode of WEEK : 52 Variance of WEEK : 37.6364582501901 Mode of EST_ST : 6 Variance of EST_ST : 269.39035290414995 Mode of TBIRTH_YEAR : 1992 Variance of TBIRTH_YEAR : 342.7731187425988 Mode of EEDUC : 4 Variance of EEDUC : 2.363073212535268 Mode of EGENID_BIRTH : 1 Variance of EGENID_BIRTH : 0.0 Mode of GENID_DESCRIBE : 3 Variance of GENID_DESCRIBE : 0.24710817771523663 Mode of SEXUAL_ORIENTATION : 3 Variance of SEXUAL_ORIENTATION : 1.3618890161739803 Mode of INCOMEMIN : 0 Variance of INCOMEMIN : 3064593934.8353977
pulseTransmendf = pulseIncomeStatsdf
pulseTransmendf = pulseTransmendf[pulseTransmendf["EGENID_BIRTH"] == 2]
pulseTransmendf = pulseTransmendf[pulseTransmendf["GENID_DESCRIBE"].isin([1,3])]
describeDF(pulseTransmendf, pulse_income_col)
#print("Trans men week counts:")
pulseTransmenReduceddf = combinedf(pulseTransmendf,col_for_counts,"TransMen")
#print(pulseTransmenReduceddf)
pulseStateTotalsdf["TransMenWEEKcount"] = pulseTransmenReduceddf["TransMenWEEKcount"].astype(int)
WEEK EST_ST TBIRTH_YEAR EEDUC AEDUC \ count 3444.000000 3444.000000 3444.000000 3444.000000 3444.000000 mean 44.622242 29.090012 1982.988095 5.067364 1.994483 std 6.068434 16.591153 17.827721 1.547615 0.074081 min 34.000000 1.000000 1933.000000 1.000000 1.000000 25% 40.000000 15.000000 1974.000000 4.000000 2.000000 50% 45.000000 29.000000 1990.000000 6.000000 2.000000 75% 50.000000 42.000000 1996.000000 6.000000 2.000000 max 54.000000 56.000000 2005.000000 7.000000 2.000000 EGENID_BIRTH AGENID_BIRTH GENID_DESCRIBE SEXUAL_ORIENTATION \ count 3444.0 3444.000000 3444.000000 3444.000000 mean 2.0 1.828688 2.310105 2.852497 std 0.0 0.376836 0.950841 1.133170 min 2.0 1.000000 1.000000 1.000000 25% 2.0 2.000000 1.000000 2.000000 50% 2.0 2.000000 3.000000 3.000000 75% 2.0 2.000000 3.000000 4.000000 max 2.0 2.000000 3.000000 5.000000 INCOME INCOMEMIN count 3444.000000 3444.000000 mean 3.504355 52778.745645 std 2.122240 53090.947248 min 1.000000 0.000000 25% 2.000000 25000.000000 50% 3.000000 35000.000000 75% 5.000000 75000.000000 max 8.000000 200000.000000 Mode of WEEK : 54 Variance of WEEK : 36.82588913592966 Mode of EST_ST : 6 Variance of EST_ST : 275.26636549507276 Mode of TBIRTH_YEAR : 1996 Variance of TBIRTH_YEAR : 317.82762471819984 Mode of EEDUC : 6 Variance of EEDUC : 2.3951123034735593 Mode of EGENID_BIRTH : 2 Variance of EGENID_BIRTH : 0.0 Mode of GENID_DESCRIBE : 3 Variance of GENID_DESCRIBE : 0.9040976945597842 Mode of SEXUAL_ORIENTATION : 3 Variance of SEXUAL_ORIENTATION : 1.2840745062361207 Mode of INCOMEMIN : 0 Variance of INCOMEMIN : 2818648679.692473
pulseNonedf = pulseIncomeStatsdf
pulseNonedf = pulseNonedf[pulseNonedf["GENID_DESCRIBE"] == 4]
describeDF(pulseNonedf, pulse_income_col)
#print("Non-Binary week counts:")
pulseNoneReduceddf = combinedf(pulseNonedf,col_for_counts,"Enby")
#print(pulseNoneReduceddf)
pulseStateTotalsdf["EnbyWEEKcount"] = pulseNoneReduceddf["EnbyWEEKcount"].astype(int)
WEEK EST_ST TBIRTH_YEAR EEDUC AEDUC \ count 10164.000000 10164.000000 10164.000000 10164.000000 10164.000000 mean 43.931425 28.821822 1975.280500 5.129083 1.993113 std 6.128902 16.585780 17.485799 1.551461 0.082706 min 34.000000 1.000000 1933.000000 1.000000 1.000000 25% 39.000000 13.000000 1962.000000 4.000000 2.000000 50% 44.000000 29.000000 1978.000000 6.000000 2.000000 75% 49.000000 44.000000 1990.000000 6.000000 2.000000 max 54.000000 56.000000 2005.000000 7.000000 2.000000 EGENID_BIRTH AGENID_BIRTH GENID_DESCRIBE SEXUAL_ORIENTATION \ count 10164.000000 10164.000000 10164.0 10164.000000 mean 1.638233 1.981995 4.0 2.878099 std 0.480535 0.132975 0.0 1.199313 min 1.000000 1.000000 4.0 1.000000 25% 1.000000 2.000000 4.0 2.000000 50% 2.000000 2.000000 4.0 2.000000 75% 2.000000 2.000000 4.0 4.000000 max 2.000000 2.000000 4.0 5.000000 INCOME INCOMEMIN count 10164.000000 10164.000000 mean 3.853503 61277.056277 std 2.212131 56845.106327 min 1.000000 0.000000 25% 2.000000 25000.000000 50% 4.000000 50000.000000 75% 6.000000 100000.000000 max 8.000000 200000.000000 Mode of WEEK : 41 Variance of WEEK : 37.56343628567068 Mode of EST_ST : 6 Variance of EST_ST : 275.0880960203078 Mode of TBIRTH_YEAR : 1993 Variance of TBIRTH_YEAR : 305.75315311040043 Mode of EEDUC : 6 Variance of EEDUC : 2.407029720940252 Mode of EGENID_BIRTH : 2 Variance of EGENID_BIRTH : 0.23091436232464746 Mode of GENID_DESCRIBE : 4 Variance of GENID_DESCRIBE : 0.0 Mode of SEXUAL_ORIENTATION : 2 Variance of SEXUAL_ORIENTATION : 1.4383513604283242 Mode of INCOMEMIN : 0 Variance of INCOMEMIN : 3231366113.305501
#print("NonCisgender week counts:")
transdf = pd.DataFrame()
transdf["NonCisgenderWEEKcount"] = pulseStateTotalsdf.loc[:,["TransWomenWEEKcount","TransMenWEEKcount","EnbyWEEKcount"]].sum(axis=1)
pulseStateTotalsdf["NonCisgenderWEEKcount"] = transdf["NonCisgenderWEEKcount"].astype(int)
#print(pulseStateTotalsdf)
del transdf
print("Percentages:")
pulseStateTotalsdf.assign(CisPercent = lambda x: (round(x["CisgenderWEEKcount"]/x["AllWEEKcount"] * 100,2)))
pulseStateTotalsdf.assign(NonCisPercent = lambda x: (round(x["NonCisgenderWEEKcount"]/x["AllWEEKcount"] * 100,2)))
Percentages:
AllWEEKcount | CisMenWEEKcount | CisWomenWEEKcount | CisgenderWEEKcount | TransWomenWEEKcount | TransMenWEEKcount | EnbyWEEKcount | NonCisgenderWEEKcount | NonCisPercent | |
---|---|---|---|---|---|---|---|---|---|
43 | 64543 | 25986 | 37672 | 63658 | 131 | 188 | 566 | 885 | 1.37 |
41 | 61390 | 24939 | 35525 | 60464 | 162 | 185 | 579 | 926 | 1.51 |
42 | 60221 | 24204 | 35106 | 59310 | 144 | 198 | 569 | 911 | 1.51 |
54 | 60220 | 25198 | 34080 | 59278 | 153 | 230 | 559 | 942 | 1.56 |
52 | 56169 | 24487 | 30771 | 55258 | 172 | 196 | 543 | 911 | 1.62 |
53 | 54574 | 22845 | 30918 | 53763 | 131 | 190 | 490 | 811 | 1.49 |
36 | 53108 | 21142 | 31193 | 52335 | 108 | 147 | 518 | 773 | 1.46 |
35 | 52930 | 21095 | 31041 | 52136 | 148 | 142 | 504 | 794 | 1.50 |
34 | 49604 | 19756 | 29066 | 48822 | 122 | 138 | 522 | 782 | 1.58 |
44 | 49565 | 20076 | 28742 | 48818 | 112 | 160 | 475 | 747 | 1.51 |
37 | 49090 | 19691 | 28689 | 48380 | 93 | 135 | 482 | 710 | 1.45 |
51 | 48800 | 21354 | 26597 | 47951 | 153 | 173 | 523 | 849 | 1.74 |
46 | 48346 | 19406 | 28206 | 47612 | 113 | 155 | 466 | 734 | 1.52 |
40 | 47908 | 20526 | 26667 | 47193 | 123 | 149 | 443 | 715 | 1.49 |
45 | 47676 | 19242 | 27636 | 46878 | 145 | 169 | 484 | 798 | 1.67 |
38 | 45991 | 18625 | 26685 | 45310 | 101 | 152 | 428 | 681 | 1.48 |
47 | 44593 | 17972 | 25899 | 43871 | 114 | 147 | 461 | 722 | 1.62 |
39 | 43767 | 17899 | 25256 | 43155 | 91 | 106 | 415 | 612 | 1.40 |
49 | 40405 | 17954 | 21784 | 39738 | 128 | 158 | 381 | 667 | 1.65 |
48 | 36405 | 14994 | 20714 | 35708 | 113 | 178 | 406 | 697 | 1.91 |
50 | 33270 | 15023 | 17652 | 32675 | 97 | 148 | 350 | 595 | 1.79 |
pulseWeekTotalsdf = pulseStateTotalsdf
print(pulseWeekTotalsdf.head())
print()
state_count_col = ["AllWEEKcount","CisgenderWEEKcount","NonCisgenderWEEKcount"
,"CisMenWEEKcount","CisWomenWEEKcount"
,"TransWomenWEEKcount","TransMenWEEKcount","EnbyWEEKcount"]
describeDF(pulseWeekTotalsdf,state_count_col)
AllWEEKcount CisMenWEEKcount CisWomenWEEKcount CisgenderWEEKcount \ 43 64543 25986 37672 63658 41 61390 24939 35525 60464 42 60221 24204 35106 59310 54 60220 25198 34080 59278 52 56169 24487 30771 55258 TransWomenWEEKcount TransMenWEEKcount EnbyWEEKcount \ 43 131 188 566 41 162 185 579 42 144 198 569 54 153 230 559 52 172 196 543 NonCisgenderWEEKcount 43 885 41 926 42 911 54 942 52 911 AllWEEKcount CisMenWEEKcount CisWomenWEEKcount CisgenderWEEKcount \ count 21.000000 21.000000 21.000000 21.000000 mean 49932.142857 20591.142857 28566.619048 49157.761905 std 8010.431825 3130.041123 4924.881364 7920.842814 min 33270.000000 14994.000000 17652.000000 32675.000000 25% 45991.000000 18625.000000 26597.000000 45310.000000 50% 49090.000000 20076.000000 28689.000000 48380.000000 75% 54574.000000 22845.000000 31041.000000 53763.000000 max 64543.000000 25986.000000 37672.000000 63658.000000 TransWomenWEEKcount TransMenWEEKcount EnbyWEEKcount \ count 21.000000 21.000000 21.00000 mean 126.380952 164.000000 484.00000 std 23.341971 27.597101 63.95389 min 91.000000 106.000000 350.00000 25% 112.000000 147.000000 443.00000 50% 123.000000 158.000000 484.00000 75% 145.000000 185.000000 523.00000 max 172.000000 230.000000 579.00000 NonCisgenderWEEKcount count 21.000000 mean 774.380952 std 101.523631 min 595.000000 25% 710.000000 50% 773.000000 75% 849.000000 max 942.000000 Mode of AllWEEKcount : 64543 Variance of AllWEEKcount : 64167018.02857144 Mode of CisgenderWEEKcount : 63658 Variance of CisgenderWEEKcount : 62739750.89047618 Mode of NonCisgenderWEEKcount : 911 Variance of NonCisgenderWEEKcount : 10307.04761904762 Mode of CisMenWEEKcount : 25986 Variance of CisMenWEEKcount : 9797157.42857143 Mode of CisWomenWEEKcount : 37672 Variance of CisWomenWEEKcount : 24254456.447619047 Mode of TransWomenWEEKcount : 131 Variance of TransWomenWEEKcount : 544.8476190476191 Mode of TransMenWEEKcount : 147 Variance of TransMenWEEKcount : 761.6 Mode of EnbyWEEKcount : 566 Variance of EnbyWEEKcount : 4090.1
# build datasets based on chosen gender
col_for_counts = ["EST_ST"]
pulseStateReduceddf = pulseIncomeStatsdf
#print("Total set state counts:")
pulseStateCountdf = combinedf(pulseStateReduceddf,col_for_counts,"All")
#print(pulseStateCountdf)
pulseStateTotalsdf = pd.DataFrame()
pulseStateTotalsdf["AllEST_STcount"] = pulseStateCountdf["AllEST_STcount"].astype(int)
del pulseStateCountdf
pulseCisMendf = pulseIncomeStatsdf
pulseCisMendf = pulseCisMendf[pulseCisMendf["EGENID_BIRTH"] == 1]
pulseCisMendf = pulseCisMendf[pulseCisMendf["GENID_DESCRIBE"] == 1]
#print("Cis men state counts:")
pulseCisMenReducedDf = combinedf(pulseCisMendf,col_for_counts,"CisMen")
pulseStateTotalsdf["CisMenEST_STcount"] = pulseCisMenReducedDf["CisMenEST_STcount"].astype(int)
#print(pulseStateTotalsdf["CisMenEST_STcount"])
pulseCisWomendf = pulseIncomeStatsdf
pulseCisWomendf = pulseCisWomendf[pulseCisWomendf["EGENID_BIRTH"] == 2]
pulseCisWomendf = pulseCisWomendf[pulseCisWomendf["GENID_DESCRIBE"] == 2]
#print("Cis men state counts:")
pulseCisWomenReducedDf = combinedf(pulseCisWomendf,col_for_counts,"CisWomen")
pulseStateTotalsdf["CisWomenEST_STcount"] = pulseCisWomenReducedDf["CisWomenEST_STcount"].astype(int)
#print(pulseStateTotalsdf["CisWomenEST_STcount"])
#print("Cisgender state counts:")
cisdf = pd.DataFrame()
cisdf["CisgenderEST_STcount"] = pulseStateTotalsdf.loc[:,["CisMenEST_STcount","CisWomenEST_STcount"]].sum(axis=1)
pulseStateTotalsdf["CisgenderEST_STcount"] = cisdf["CisgenderEST_STcount"].astype(int)
#print(pulseStateTotalsdf)
pulseTranswomendf = pulseIncomeStatsdf
pulseTranswomendf = pulseTranswomendf[pulseTranswomendf["EGENID_BIRTH"] == 1]
pulseTranswomendf = pulseTranswomendf[pulseTranswomendf["GENID_DESCRIBE"].isin([2,3])]
#print("Trans women state counts:")
pulseTranswomenReduceddf = combinedf(pulseTranswomendf,col_for_counts,"TransWomen")
#print(pulseTranswomenReduceddf)
pulseStateTotalsdf["TransWomenEST_STcount"] = pulseTranswomenReduceddf["TransWomenEST_STcount"].astype(int)
pulseTransmendf = pulseIncomeStatsdf
pulseTransmendf = pulseTransmendf[pulseTransmendf["EGENID_BIRTH"] == 2]
pulseTransmendf = pulseTransmendf[pulseTransmendf["GENID_DESCRIBE"].isin([1,3])]
#print("Trans men column counts:")
pulseTransmenReduceddf = combinedf(pulseTransmendf,col_for_counts,"TransMen")
#print(pulseTransmenReduceddf)
pulseStateTotalsdf["TransMenEST_STcount"] = pulseTransmenReduceddf["TransMenEST_STcount"].astype(int)
pulseNonedf = pulseIncomeStatsdf
pulseNonedf = pulseNonedf[pulseNonedf["GENID_DESCRIBE"] == 4]
#print("Non-Binary column counts:")
pulseNoneReduceddf = combinedf(pulseNonedf,col_for_counts,"Enby")
#print(pulseNoneReduceddf)
pulseStateTotalsdf["EnbyEST_STcount"] = pulseNoneReduceddf["EnbyEST_STcount"].astype(int)
print("NonCisgender state counts:")
transdf = pd.DataFrame()
transdf["NonCisgenderEST_STcount"] = pulseStateTotalsdf.loc[:,["TransWomenEST_STcount","TransMenEST_STcount","EnbyEST_STcount"]].sum(axis=1)
pulseStateTotalsdf["NonCisgenderEST_STcount"] = transdf["NonCisgenderEST_STcount"].astype(int)
#print(pulseStateTotalsdf)
del transdf
NonCisgender state counts:
print("Percentages:")
pulseStateTotalsdf.assign(CisPercent = lambda x: (round(x["CisgenderEST_STcount"]/x["AllEST_STcount"] * 100,2)))
pulseStateTotalsdf.assign(NonCisPercent = lambda x: (round(x["NonCisgenderEST_STcount"]/x["AllEST_STcount"] * 100,2)))
Percentages:
AllEST_STcount | CisMenEST_STcount | CisWomenEST_STcount | CisgenderEST_STcount | TransWomenEST_STcount | TransMenEST_STcount | EnbyEST_STcount | NonCisgenderEST_STcount | NonCisPercent | |
---|---|---|---|---|---|---|---|---|---|
6 | 79763 | 35075 | 43358 | 78433 | 196 | 292 | 842 | 1330 | 1.67 |
48 | 52968 | 22829 | 29432 | 52261 | 118 | 136 | 453 | 707 | 1.33 |
53 | 44216 | 19190 | 24043 | 43233 | 161 | 216 | 606 | 983 | 2.22 |
12 | 36164 | 15525 | 20207 | 35732 | 69 | 76 | 287 | 432 | 1.19 |
25 | 29233 | 12153 | 16581 | 28734 | 97 | 106 | 296 | 499 | 1.71 |
41 | 29039 | 11455 | 16841 | 28296 | 139 | 188 | 416 | 743 | 2.56 |
26 | 28883 | 11989 | 16456 | 28445 | 80 | 91 | 267 | 438 | 1.52 |
49 | 28771 | 12504 | 15855 | 28359 | 81 | 79 | 252 | 412 | 1.43 |
51 | 28311 | 12243 | 15679 | 27922 | 62 | 71 | 256 | 389 | 1.37 |
4 | 28004 | 11663 | 15922 | 27585 | 63 | 83 | 273 | 419 | 1.50 |
8 | 27907 | 11805 | 15629 | 27434 | 70 | 110 | 293 | 473 | 1.69 |
42 | 26712 | 11195 | 15118 | 26313 | 67 | 95 | 237 | 399 | 1.49 |
13 | 24722 | 9982 | 14380 | 24362 | 58 | 85 | 217 | 360 | 1.46 |
24 | 24421 | 9971 | 14084 | 24055 | 40 | 82 | 244 | 366 | 1.50 |
27 | 23848 | 10004 | 13467 | 23471 | 78 | 80 | 219 | 377 | 1.58 |
17 | 22360 | 9488 | 12471 | 21959 | 80 | 76 | 245 | 401 | 1.79 |
20 | 19673 | 7807 | 11564 | 19371 | 49 | 73 | 180 | 302 | 1.54 |
36 | 19474 | 8180 | 10947 | 19127 | 44 | 82 | 221 | 347 | 1.78 |
16 | 19306 | 7815 | 11212 | 19027 | 42 | 67 | 170 | 279 | 1.45 |
35 | 19302 | 7508 | 11472 | 18980 | 45 | 72 | 205 | 322 | 1.67 |
34 | 19089 | 8364 | 10506 | 18870 | 32 | 34 | 153 | 219 | 1.15 |
37 | 18992 | 7637 | 11100 | 18737 | 33 | 53 | 169 | 255 | 1.34 |
18 | 18836 | 7441 | 11123 | 18564 | 39 | 59 | 174 | 272 | 1.44 |
29 | 18833 | 7387 | 11145 | 18532 | 55 | 60 | 186 | 301 | 1.60 |
55 | 18774 | 7712 | 10750 | 18462 | 50 | 84 | 178 | 312 | 1.66 |
47 | 18081 | 7053 | 10763 | 17816 | 29 | 64 | 172 | 265 | 1.47 |
9 | 17796 | 7118 | 10424 | 17542 | 44 | 53 | 157 | 254 | 1.43 |
39 | 17452 | 7106 | 10094 | 17200 | 46 | 51 | 155 | 252 | 1.44 |
19 | 17153 | 6676 | 10241 | 16917 | 38 | 50 | 148 | 236 | 1.38 |
33 | 16854 | 7144 | 9473 | 16617 | 44 | 50 | 143 | 237 | 1.41 |
32 | 16779 | 7124 | 9422 | 16546 | 36 | 46 | 151 | 233 | 1.39 |
2 | 16617 | 6761 | 9546 | 16307 | 51 | 60 | 199 | 310 | 1.87 |
40 | 16149 | 6216 | 9686 | 15902 | 41 | 50 | 156 | 247 | 1.53 |
31 | 16114 | 6469 | 9396 | 15865 | 31 | 57 | 161 | 249 | 1.55 |
45 | 15821 | 6122 | 9491 | 15613 | 26 | 38 | 144 | 208 | 1.31 |
21 | 14878 | 5790 | 8900 | 14690 | 43 | 39 | 106 | 188 | 1.26 |
5 | 13482 | 5176 | 8145 | 13321 | 28 | 36 | 97 | 161 | 1.19 |
1 | 13238 | 5261 | 7807 | 13068 | 26 | 35 | 109 | 170 | 1.28 |
30 | 12483 | 4962 | 7352 | 12314 | 31 | 33 | 105 | 169 | 1.35 |
22 | 12068 | 4465 | 7413 | 11878 | 33 | 25 | 132 | 190 | 1.57 |
50 | 11646 | 4572 | 6880 | 11452 | 34 | 50 | 110 | 194 | 1.67 |
10 | 11563 | 4573 | 6833 | 11406 | 21 | 28 | 108 | 157 | 1.36 |
15 | 11375 | 4895 | 6309 | 11204 | 30 | 30 | 111 | 171 | 1.50 |
23 | 11128 | 4338 | 6605 | 10943 | 35 | 41 | 109 | 185 | 1.66 |
46 | 10867 | 4453 | 6284 | 10737 | 26 | 18 | 86 | 130 | 1.20 |
56 | 10844 | 4325 | 6360 | 10685 | 28 | 25 | 106 | 159 | 1.47 |
54 | 10685 | 3943 | 6606 | 10549 | 23 | 30 | 83 | 136 | 1.27 |
44 | 9601 | 3822 | 5595 | 9417 | 24 | 47 | 113 | 184 | 1.92 |
28 | 9301 | 3342 | 5846 | 9188 | 14 | 19 | 80 | 113 | 1.21 |
38 | 8999 | 3786 | 5086 | 8872 | 24 | 19 | 84 | 127 | 1.41 |
print(pulseStateTotalsdf.head())
print()
state_count_col = ["AllEST_STcount","CisgenderEST_STcount","NonCisgenderEST_STcount"
,"CisMenEST_STcount","CisWomenEST_STcount"
,"TransWomenEST_STcount","TransMenEST_STcount","EnbyEST_STcount"]
describeDF(pulseStateTotalsdf,state_count_col)
AllEST_STcount CisMenEST_STcount CisWomenEST_STcount \ 6 79763 35075 43358 48 52968 22829 29432 53 44216 19190 24043 12 36164 15525 20207 25 29233 12153 16581 CisgenderEST_STcount TransWomenEST_STcount TransMenEST_STcount \ 6 78433 196 292 48 52261 118 136 53 43233 161 216 12 35732 69 76 25 28734 97 106 EnbyEST_STcount NonCisgenderEST_STcount 6 842 1330 48 453 707 53 606 983 12 287 432 25 296 499 AllEST_STcount CisMenEST_STcount CisWomenEST_STcount \ count 50.000000 50.000000 50.000000 mean 20971.500000 8648.280000 11997.980000 std 12222.406664 5449.401873 6575.810556 min 8999.000000 3342.000000 5086.000000 25% 13299.000000 5197.250000 7891.500000 50% 18427.500000 7265.500000 10628.000000 75% 24646.750000 9998.500000 14306.000000 max 79763.000000 35075.000000 43358.000000 CisgenderEST_STcount TransWomenEST_STcount TransMenEST_STcount \ count 50.000000 50.000000 50.000000 mean 20646.260000 53.080000 68.880000 std 12013.928975 36.049762 50.080604 min 8872.000000 14.000000 18.000000 25% 13131.250000 31.000000 38.250000 50% 18139.000000 42.500000 58.000000 75% 24285.250000 62.750000 81.500000 max 78433.000000 196.000000 292.000000 EnbyEST_STcount NonCisgenderEST_STcount count 50.000000 50.000000 mean 203.280000 325.240000 std 135.915946 219.943366 min 80.000000 113.000000 25% 111.500000 188.500000 50% 169.500000 260.000000 75% 242.250000 386.000000 max 842.000000 1330.000000 Mode of AllEST_STcount : 79763 Variance of AllEST_STcount : 149387224.66326532 Mode of CisgenderEST_STcount : 78433 Variance of CisgenderEST_STcount : 144334489.42081633 Mode of NonCisgenderEST_STcount : 1330 Variance of NonCisgenderEST_STcount : 48375.08408163265 Mode of CisMenEST_STcount : 35075 Variance of CisMenEST_STcount : 29695980.777142856 Mode of CisWomenEST_STcount : 43358 Variance of CisWomenEST_STcount : 43241284.46897959 Mode of TransWomenEST_STcount : 44 Variance of TransWomenEST_STcount : 1299.585306122449 Mode of TransMenEST_STcount : 50 Variance of TransMenEST_STcount : 2508.0669387755106 Mode of EnbyEST_STcount : 106 Variance of EnbyEST_STcount : 18473.14448979592
#add basic gender column
pulseMungedf = pulseIncomeStatsdf
pulseMungedf["CUR_GENID"] = pulseMungedf.apply(lambda x: basicGenMarker(x["EGENID_BIRTH"],x["GENID_DESCRIBE"]), axis=1)
#replace state column name to allow merging
pulseMungedf.rename(columns={"EST_ST":"stateId"}, inplace=True)
print(pulseMungedf.head())
pulseMungedf["stateId"] = pulseMungedf["stateId"].astype(int)
reedFulldf["stateId"] = reedFulldf["stateId"].astype(int)
pulseMungedf = pd.merge(pulseMungedf, reedFulldf, on="stateId", how='inner')
print(pulseMungedf.head())
#list(pulseMungedf.columns)
SCRAM WEEK stateId TBIRTH_YEAR EEDUC AEDUC EGENID_BIRTH \ 1 V340000002 34 4 1982 7 2 2 3 V340000004 34 31 1957 4 2 1 4 V340000005 34 45 1962 5 2 2 5 V340000006 34 8 1956 7 2 1 6 V340000007 34 41 1982 7 2 2 AGENID_BIRTH GENID_DESCRIBE SEXUAL_ORIENTATION INCOME ENDDATE \ 1 2 2 2 7 8/2/2021 3 2 1 2 6 8/2/2021 4 2 2 2 4 8/2/2021 5 2 1 2 7 8/2/2021 6 2 2 2 8 8/2/2021 EDUCATION ASSIGNEDGENDER CHOSENGENDER SEXUALORIENTATION \ 1 Graduate degree female female straight 3 some college male male straight 4 Associate's degree female female straight 5 Graduate degree male male straight 6 Graduate degree female female straight INCOMEMIN CUR_GENID 1 150000 Cisgender Woman 3 100000 Cisgender Man 4 50000 Cisgender Woman 5 150000 Cisgender Man 6 200000 Cisgender Woman SCRAM WEEK stateId TBIRTH_YEAR EEDUC AEDUC EGENID_BIRTH \ 0 V340000002 34 4 1982 7 2 2 1 V340000076 34 4 1986 6 2 1 2 V340000087 34 4 1945 6 2 1 3 V340000238 34 4 1966 6 2 2 4 V340000281 34 4 1973 4 2 2 AGENID_BIRTH GENID_DESCRIBE SEXUAL_ORIENTATION ... \ 0 2 2 2 ... 1 2 1 2 ... 2 2 1 2 ... 3 2 2 2 ... 4 2 2 2 ... veryReligiousStatista2017 moderatelyReligiousStatista2017 \ 0 0.31 0.31 1 0.31 0.31 2 0.31 0.31 3 0.31 0.31 4 0.31 0.31 nonreligiousStatista2017 relLibScore2022 relLibVote2022 relLibVax2022 \ 0 0.39 0.4156 1.0 1.0 1 0.39 0.4156 1.0 1.0 2 0.39 0.4156 1.0 1.0 3 0.39 0.4156 1.0 1.0 4 0.39 0.4156 1.0 1.0 relLibHealth2022 relLibHealthMandate2022 relLibMarriage2022 relLibRfra2022 0 4.0 1.0 0.0 1.0 1 4.0 1.0 0.0 1.0 2 4.0 1.0 0.0 1.0 3 4.0 1.0 0.0 1.0 4 4.0 1.0 0.0 1.0 [5 rows x 43 columns]
munge_col_list = ["SCRAM","WEEK","stateId","TBIRTH_YEAR","EEDUC","AEDUC","EGENID_BIRTH","AGENID_BIRTH",
"GENID_DESCRIBE","SEXUAL_ORIENTATION","INCOME","ENDDATE","EDUCATION","ASSIGNEDGENDER",
"CHOSENGENDER","SEXUALORIENTATION","INCOMEMIN","CUR_GENID",
"stateName","statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023",
"antiTransLegislationRiskIndex122022","antiTransLegislationRiskIndex112022","transAdultPop2016",
"transAdultPercent2016","transAdultPop2022","transAdultPercent2022","religionImportantPew2014",
"worshipWeeklyPew2014","prayDailyPew2014","certainAboutGodPew2014","overallReligiosityPew2014",
"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017",
"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022",
"relLibMarriage2022","relLibRfra2022"]
#sample from the dataset based on the "CUR_GENID" and stateId columns,
#looking for (3gender*50state)*113samples = 16,950 rows per run
seed_value = 19
random.seed(seed_value)
rand_int = random.randint(0,1000)
modelSampledf = pulseMungedf.groupby(["CUR_GENID","stateId"]).sample(n=113,random_state=rand_int)
describeDF(modelSampledf,munge_col_list)
WEEK stateId TBIRTH_YEAR EEDUC AEDUC \ count 16950.000000 16950.000000 16950.000000 16950.000000 16950.000000 mean 44.072153 29.320000 1971.105959 5.254572 1.995162 std 6.117397 15.624084 16.989544 1.487861 0.069388 min 34.000000 1.000000 1933.000000 1.000000 1.000000 25% 39.000000 17.000000 1957.000000 4.000000 2.000000 50% 44.000000 29.500000 1971.000000 6.000000 2.000000 75% 50.000000 42.000000 1985.000000 7.000000 2.000000 max 54.000000 56.000000 2005.000000 7.000000 2.000000 EGENID_BIRTH AGENID_BIRTH GENID_DESCRIBE SEXUAL_ORIENTATION \ count 16950.000000 16950.00000 16950.000000 16950.000000 mean 1.533510 1.97056 2.136224 2.308024 std 0.498891 0.16904 1.122674 0.850734 min 1.000000 1.00000 1.000000 1.000000 25% 1.000000 2.00000 1.000000 2.000000 50% 2.000000 2.00000 2.000000 2.000000 75% 2.000000 2.00000 3.000000 2.000000 max 2.000000 2.00000 4.000000 5.000000 INCOME ... veryReligiousStatista2017 \ count 16950.000000 ... 16950.000000 mean 4.256342 ... 0.371600 std 2.169266 ... 0.089543 min 1.000000 ... 0.160000 25% 2.000000 ... 0.310000 50% 4.000000 ... 0.365000 75% 6.000000 ... 0.440000 max 8.000000 ... 0.590000 moderatelyReligiousStatista2017 nonreligiousStatista2017 \ count 16950.000000 16950.000000 mean 0.287200 0.342000 std 0.030137 0.098856 min 0.160000 0.120000 25% 0.270000 0.290000 50% 0.295000 0.340000 75% 0.300000 0.400000 max 0.330000 0.590000 relLibScore2022 relLibVote2022 relLibVax2022 relLibHealth2022 \ count 16950.000000 16950.000000 16950.000000 16950.00000 mean 0.393948 0.800000 0.900000 6.76000 std 0.131963 0.400012 0.300009 3.98288 min 0.155800 0.000000 0.000000 0.00000 25% 0.311700 1.000000 1.000000 4.00000 50% 0.371200 1.000000 1.000000 5.50000 75% 0.476200 1.000000 1.000000 9.00000 max 0.818200 1.000000 1.000000 20.00000 relLibHealthMandate2022 relLibMarriage2022 relLibRfra2022 count 16950.000000 16950.000000 16950.000000 mean 0.640000 1.160000 0.480000 std 0.480014 1.474628 0.499615 min 0.000000 0.000000 0.000000 25% 0.000000 0.000000 0.000000 50% 1.000000 0.000000 0.000000 75% 1.000000 3.000000 1.000000 max 1.000000 5.000000 1.000000 [8 rows x 35 columns] Mode of WEEK : 54 Variance of WEEK : 37.422547430596495 Mode of stateId : 1 Variance of stateId : 244.11200188801695 Mode of TBIRTH_YEAR : 1960 Variance of TBIRTH_YEAR : 288.6446219936923 Mode of EEDUC : 6 Variance of EEDUC : 2.2137306418648626 Mode of AEDUC : 2 Variance of AEDUC : 0.00481463825799801 Mode of EGENID_BIRTH : 2 Variance of EGENID_BIRTH : 0.24889174203157796 Mode of AGENID_BIRTH : 2 Variance of AGENID_BIRTH : 0.02857452802620946 Mode of GENID_DESCRIBE : 1 Variance of GENID_DESCRIBE : 1.2603963861043481 Mode of SEXUAL_ORIENTATION : 2 Variance of SEXUAL_ORIENTATION : 0.7237482323771591 Mode of INCOME : 4 Variance of INCOME : 4.705716751155775 Mode of INCOMEMIN : 50000 Variance of INCOMEMIN : 3305722837.0866547 Mode of statePopulation2020 : 5024279 Variance of statePopulation2020 : 54193215481182.87 Mode of statePopulation2023 : 5097641 Variance of statePopulation2023 : 356648599406395.7 Mode of antiTransLegislationRiskIndex32023 : 4 Variance of antiTransLegislationRiskIndex32023 : 2.513748303734734 Mode of antiTransLegislationRiskIndex122022 : 3 Variance of antiTransLegislationRiskIndex122022 : 1.7605038645347806 Mode of antiTransLegislationRiskIndex112022 : 1 Variance of antiTransLegislationRiskIndex112022 : 1.7477031093279838 Mode of transAdultPop2016 : 2700 Variance of transAdultPop2016 : 1331132916.9154522 Mode of transAdultPercent2016 : 0.43 Variance of transAdultPercent2016 : 0.014520696678270103 Mode of transAdultPop2022 : 6300 Variance of transAdultPop2022 : 828822454.0798867 Mode of transAdultPercent2022 : 0.6 Variance of transAdultPercent2022 : 0.015779690955218594 Mode of religionImportantPew2014 : 0.44 Variance of religionImportantPew2014 : 0.011325668181013627 Mode of worshipWeeklyPew2014 : 0.34 Variance of worshipWeeklyPew2014 : 0.005517965543689893 Mode of prayDailyPew2014 : 0.51 Variance of prayDailyPew2014 : 0.008712554014986136 Mode of certainAboutGodPew2014 : 0.61 Variance of certainAboutGodPew2014 : 0.008895564812083307 Mode of overallReligiosityPew2014 : 0.54 Variance of overallReligiosityPew2014 : 0.011309667237005131 Mode of veryReligiousStatista2017 : 0.28 Variance of veryReligiousStatista2017 : 0.008017913033217302 Mode of moderatelyReligiousStatista2017 : 0.3 Variance of moderatelyReligiousStatista2017 : 0.0009082135819222371 Mode of nonreligiousStatista2017 : 0.33 Variance of nonreligiousStatista2017 : 0.009772576553188979 Mode of relLibScore2022 : 0.3377 Variance of relLibScore2022 : 0.0174141322784353 Mode of relLibVote2022 : 1.0 Variance of relLibVote2022 : 0.16000944008496082 Mode of relLibVax2022 : 1.0 Variance of relLibVax2022 : 0.09000531004779044 Mode of relLibHealth2022 : 5.0 Variance of relLibHealth2022 : 15.86333589002301 Mode of relLibHealthMandate2022 : 1.0 Variance of relLibHealthMandate2022 : 0.2304135937223435 Mode of relLibMarriage2022 : 0.0 Variance of relLibMarriage2022 : 2.174528290754617 Mode of relLibRfra2022 : 0.0 Variance of relLibRfra2022 : 0.2496147265325388
#convert all values that will be used to model to categories or numbers
col_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","EGENID_BIRTH"
,"GENID_DESCRIBE","SEXUAL_ORIENTATION","INCOMEMIN","CUR_GENID"
,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
,"transAdultPop2022","overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
dfClean = modelSampledf[col_list].copy()
print(dfClean.dtypes)
# convert text columns to category values
dfClean["CUR_GENID"] = dfClean["CUR_GENID"].astype("category")
dfClean["CUR_GENID_CAT"] = dfClean["CUR_GENID"].cat.codes
print(dfClean.describe())
WEEK int64 stateId int32 TBIRTH_YEAR int64 EEDUC int64 EGENID_BIRTH int64 GENID_DESCRIBE int64 SEXUAL_ORIENTATION int64 INCOMEMIN int64 CUR_GENID object statePopulation2020 int64 statePopulation2023 int64 antiTransLegislationRiskIndex32023 int64 transAdultPop2022 int64 overallReligiosityPew2014 float64 veryReligiousStatista2017 float64 moderatelyReligiousStatista2017 float64 nonreligiousStatista2017 float64 relLibScore2022 float64 relLibVote2022 float64 relLibVax2022 float64 relLibHealth2022 float64 relLibHealthMandate2022 float64 relLibMarriage2022 float64 relLibRfra2022 float64 dtype: object WEEK stateId TBIRTH_YEAR EEDUC EGENID_BIRTH \ count 16950.000000 16950.000000 16950.000000 16950.000000 16950.000000 mean 44.072153 29.320000 1971.105959 5.254572 1.533510 std 6.117397 15.624084 16.989544 1.487861 0.498891 min 34.000000 1.000000 1933.000000 1.000000 1.000000 25% 39.000000 17.000000 1957.000000 4.000000 1.000000 50% 44.000000 29.500000 1971.000000 6.000000 2.000000 75% 50.000000 42.000000 1985.000000 7.000000 2.000000 max 54.000000 56.000000 2005.000000 7.000000 2.000000 GENID_DESCRIBE SEXUAL_ORIENTATION INCOMEMIN statePopulation2020 \ count 16950.000000 16950.000000 16950.000000 1.695000e+04 mean 2.136224 2.308024 70282.595870 6.615242e+06 std 1.122674 0.850734 57495.415792 7.361604e+06 min 1.000000 1.000000 0.000000 5.768510e+05 25% 1.000000 2.000000 25000.000000 1.839106e+06 50% 2.000000 2.000000 50000.000000 4.581796e+06 75% 3.000000 2.000000 100000.000000 7.705281e+06 max 4.000000 5.000000 200000.000000 3.953822e+07 statePopulation2023 ... moderatelyReligiousStatista2017 \ count 1.695000e+04 ... 16950.000000 mean 8.960485e+06 ... 0.287200 std 1.888514e+07 ... 0.030137 min 5.808170e+05 ... 0.160000 25% 1.920562e+06 ... 0.270000 50% 4.625424e+06 ... 0.295000 75% 7.999503e+06 ... 0.300000 max 1.309280e+08 ... 0.330000 nonreligiousStatista2017 relLibScore2022 relLibVote2022 \ count 16950.000000 16950.000000 16950.000000 mean 0.342000 0.393948 0.800000 std 0.098856 0.131963 0.400012 min 0.120000 0.155800 0.000000 25% 0.290000 0.311700 1.000000 50% 0.340000 0.371200 1.000000 75% 0.400000 0.476200 1.000000 max 0.590000 0.818200 1.000000 relLibVax2022 relLibHealth2022 relLibHealthMandate2022 \ count 16950.000000 16950.00000 16950.000000 mean 0.900000 6.76000 0.640000 std 0.300009 3.98288 0.480014 min 0.000000 0.00000 0.000000 25% 1.000000 4.00000 0.000000 50% 1.000000 5.50000 1.000000 75% 1.000000 9.00000 1.000000 max 1.000000 20.00000 1.000000 relLibMarriage2022 relLibRfra2022 CUR_GENID_CAT count 16950.000000 16950.000000 16950.000000 mean 1.160000 0.480000 1.000000 std 1.474628 0.499615 0.816521 min 0.000000 0.000000 0.000000 25% 0.000000 0.000000 0.000000 50% 0.000000 0.000000 1.000000 75% 3.000000 1.000000 2.000000 max 5.000000 1.000000 2.000000 [8 rows x 24 columns]
# Build kNN Classifier to sort and classify data
# reduce dimensionality based on experimentation and hypothesis criteria
# split dataset into dependent(features) and independent(target) variable
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","EGENID_BIRTH"
,"SEXUAL_ORIENTATION","INCOMEMIN"
,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
,"transAdultPop2022","overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
trainSize = 0.3
trainState = 1
# split datasets into training and test sets
size = 0.3
state = 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=size, random_state=state)
# scale input data for training if necessary for better predictions
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)
# prep data
cv_count = 18
max_neighbors = 200 #arbitrary magic number
parameters = {"n_neighbors": np.arange(1, max_neighbors)}
# run regression
knnr = KNeighborsClassifier(n_neighbors=cv_count,weights='distance')
# best neighbor count found in testing at 127
# use gridsearch to test all values for best n_neighbors number and highest accuracy
#knnr_gscv = GridSearchCV(knnr, parameters, cv=cv_count)
#knnr_gscv.fit(X.values, y.values)
#print("Best value for neighbor count found: ",knnr_gscv.best_params_)
#print("Best Average Accuracy found: ",knnr_gscv.best_score_)
# Build the new model
# split dataset into dependent(features) and independent(target) variable
#params = knnr_gscv.best_params_
#n_count = int(params['n_neighbors'])
n_count = 18
# splt datasets into training and test sets
size = 0.3
state = 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=size, random_state=state)
# run regression
knnr = KNeighborsClassifier(n_neighbors=n_count,weights='distance')
knnr.fit(X_train.values, y_train.values)
pred = knnr.predict(X_test.values)
# confusion matrix for visualization is available, but unnecessary for this dataset
cfm = metrics.confusion_matrix(y_test, pred)
fig, ax = plt.subplots(figsize=(6, 6))
ax.matshow(cfm, cmap = plt.cm.Purples, alpha=0.3)
for i in range(cfm.shape[0]):
for j in range(cfm.shape[1]):
ax.text(x=j, y=i,s=cfm[i, j], va='center',
ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize = 16)
plt.ylabel('Actuals', fontsize = 16)
plt.title('kNN Confusion Matrix', fontsize = 16)
plt.show()
print(metrics.classification_report(y_test, pred, zero_division = 0))
precision recall f1-score support 0 0.42 0.43 0.43 1721 1 0.36 0.38 0.37 1650 2 0.48 0.43 0.45 1714 accuracy 0.42 5085 macro avg 0.42 0.42 0.42 5085 weighted avg 0.42 0.42 0.42 5085
printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix')
precision recall f1-score support 0 0.84 0.97 0.90 3976 1 0.75 0.93 0.83 3896 2 0.83 0.51 0.63 3993 accuracy 0.80 11865 macro avg 0.81 0.80 0.79 11865 weighted avg 0.81 0.80 0.79 11865
# initialize, train and test the GNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
pred = gnb.predict(X_test)
#check accuracy
gnb_accuracy = metrics.accuracy_score(pred, y_test)
#confusion matrix
cfm = metrics.confusion_matrix(y_test, pred)
fig, ax = plt.subplots(figsize=(6,6))
ax.matshow(cfm, cmap = plt.cm.Purples, alpha=0.3)
for i in range(cfm.shape[0]):
for j in range(cfm.shape[1]):
ax.text(x=j, y=i,s=cfm[i, j], va='center',
ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize = 16)
plt.ylabel('Actuals', fontsize = 16)
plt.title('Gaussian NB Confusion Matrix', fontsize = 16)
plt.show()
print(metrics.classification_report(y_test, pred, zero_division = 0))
precision recall f1-score support 0 0.44 0.41 0.42 1721 1 0.32 0.15 0.20 1650 2 0.40 0.63 0.49 1714 accuracy 0.40 5085 macro avg 0.39 0.40 0.37 5085 weighted avg 0.39 0.40 0.38 5085
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix')
precision recall f1-score support 0 0.84 0.97 0.90 3976 1 0.69 0.98 0.81 3896 2 0.87 0.39 0.54 3993 accuracy 0.78 11865 macro avg 0.80 0.78 0.75 11865 weighted avg 0.80 0.78 0.75 11865
# Setup for A/B testing
# remove "SEXUAL_ORIENTATION" column
# split dataset into dependent(features) and independent(target) variable
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","EGENID_BIRTH","INCOMEMIN"
,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
,"transAdultPop2022","overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Sexuality Removed')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Sexuality Removed')
precision recall f1-score support 0 0.74 0.90 0.81 3976 1 0.65 0.89 0.75 3896 2 0.52 0.22 0.31 3993 accuracy 0.67 11865 macro avg 0.64 0.67 0.62 11865 weighted avg 0.64 0.67 0.62 11865
precision recall f1-score support 0 0.71 1.00 0.83 3976 1 0.69 0.90 0.78 3896 2 0.68 0.20 0.31 3993 accuracy 0.70 11865 macro avg 0.69 0.70 0.64 11865 weighted avg 0.69 0.70 0.64 11865
# Setup for A/B testing
# remove "EEDUC","INCOMEMIN" column
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","SEXUAL_ORIENTATION"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"antiTransLegislationRiskIndex32023"
,"overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Education and Income Removed')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Education and Income Removed')
precision recall f1-score support 0 0.84 0.97 0.90 3976 1 0.75 0.93 0.83 3896 2 0.83 0.51 0.63 3993 accuracy 0.80 11865 macro avg 0.81 0.80 0.79 11865 weighted avg 0.81 0.80 0.79 11865
precision recall f1-score support 0 0.84 0.97 0.90 3976 1 0.69 0.98 0.81 3896 2 0.87 0.39 0.54 3993 accuracy 0.78 11865 macro avg 0.80 0.78 0.75 11865 weighted avg 0.80 0.78 0.75 11865
# Setup for A/B testing
# remove ,"statePopulation2020","statePopulation2023","transAdultPop2022" column
#population table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"antiTransLegislationRiskIndex32023"
,"overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Population Removed')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Population Table Removed')
precision recall f1-score support 0 0.74 0.90 0.81 3976 1 0.65 0.90 0.75 3896 2 0.52 0.21 0.30 3993 accuracy 0.67 11865 macro avg 0.64 0.67 0.62 11865 weighted avg 0.64 0.67 0.62 11865
precision recall f1-score support 0 0.71 1.00 0.83 3976 1 0.69 0.91 0.78 3896 2 0.69 0.20 0.31 3993 accuracy 0.70 11865 macro avg 0.70 0.70 0.64 11865 weighted avg 0.70 0.70 0.64 11865
# Setup for A/B testing
# remove ,"antiTransLegislationRiskIndex32023" column
#anti-trans legislation table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Anti-Trans Legislation Removed')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Anti-Trans Legislation Table Removed')
precision recall f1-score support 0 0.74 0.90 0.81 3976 1 0.65 0.89 0.75 3896 2 0.52 0.22 0.31 3993 accuracy 0.67 11865 macro avg 0.64 0.67 0.63 11865 weighted avg 0.64 0.67 0.62 11865
precision recall f1-score support 0 0.71 1.00 0.83 3976 1 0.69 0.90 0.78 3896 2 0.68 0.20 0.31 3993 accuracy 0.70 11865 macro avg 0.70 0.70 0.64 11865 weighted avg 0.70 0.70 0.64 11865
# Setup for A/B testing
# remove ,"overallReligiosityPew2014" column
#Pew 2014 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"antiTransLegislationRiskIndex32023"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Pew 2014 Removed')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Pew 2014 Table Removed')
precision recall f1-score support 0 0.74 0.90 0.81 3976 1 0.65 0.89 0.75 3896 2 0.52 0.22 0.31 3993 accuracy 0.67 11865 macro avg 0.64 0.67 0.62 11865 weighted avg 0.64 0.67 0.62 11865
precision recall f1-score support 0 0.71 1.00 0.83 3976 1 0.69 0.91 0.78 3896 2 0.68 0.20 0.31 3993 accuracy 0.70 11865 macro avg 0.70 0.70 0.64 11865 weighted avg 0.70 0.70 0.64 11865
# Setup for A/B testing
# remove ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017" column
#Statista 2017 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"antiTransLegislationRiskIndex32023"
,"overallReligiosityPew2014"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Statista 2017 Removed')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Statista 2017 Table Removed')
precision recall f1-score support 0 0.74 0.90 0.81 3976 1 0.65 0.90 0.75 3896 2 0.52 0.22 0.31 3993 accuracy 0.67 11865 macro avg 0.64 0.67 0.62 11865 weighted avg 0.64 0.67 0.62 11865
precision recall f1-score support 0 0.71 1.00 0.83 3976 1 0.69 0.90 0.78 3896 2 0.68 0.20 0.31 3993 accuracy 0.70 11865 macro avg 0.69 0.70 0.64 11865 weighted avg 0.69 0.70 0.64 11865
# Setup for A/B testing
# remove ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014" column
#Statista 2017 and Pew 2014 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"antiTransLegislationRiskIndex32023"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Statista 2017 and Pew 2014 Removed')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Statista 2017 and Pew 2014 Table Removed')
precision recall f1-score support 0 0.74 0.90 0.81 3976 1 0.65 0.90 0.75 3896 2 0.52 0.22 0.31 3993 accuracy 0.67 11865 macro avg 0.64 0.67 0.62 11865 weighted avg 0.64 0.67 0.62 11865
precision recall f1-score support 0 0.71 1.00 0.83 3976 1 0.69 0.90 0.78 3896 2 0.68 0.20 0.31 3993 accuracy 0.70 11865 macro avg 0.69 0.70 0.64 11865 weighted avg 0.69 0.70 0.64 11865
# Setup for A/B testing
# remove ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022" column
#Religious Liberty table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"antiTransLegislationRiskIndex32023"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Religious Liberty Removed')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Religious Liberty Table Removed')
precision recall f1-score support 0 0.74 0.90 0.81 3976 1 0.65 0.89 0.75 3896 2 0.52 0.23 0.32 3993 accuracy 0.67 11865 macro avg 0.64 0.67 0.63 11865 weighted avg 0.64 0.67 0.63 11865
precision recall f1-score support 0 0.71 1.00 0.83 3976 1 0.69 0.91 0.78 3896 2 0.69 0.20 0.31 3993 accuracy 0.70 11865 macro avg 0.70 0.70 0.64 11865 weighted avg 0.70 0.70 0.64 11865
# Setup for A/B testing
#,"statePopulation2020","statePopulation2023","transAdultPop2022"
#,"antiTransLegislationRiskIndex32023"
#,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014"
# remove ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022" column
#Religious Liberty table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Only Pulse Data')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Only Pulse Data')
precision recall f1-score support 0 0.74 0.90 0.81 3976 1 0.65 0.89 0.76 3896 2 0.53 0.23 0.32 3993 accuracy 0.67 11865 macro avg 0.64 0.67 0.63 11865 weighted avg 0.64 0.67 0.63 11865
precision recall f1-score support 0 0.71 1.00 0.83 3976 1 0.62 1.00 0.76 3896 2 0.00 0.00 0.00 3993 accuracy 0.66 11865 macro avg 0.44 0.67 0.53 11865 weighted avg 0.44 0.66 0.53 11865
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","EGENID_BIRTH"
,"SEXUAL_ORIENTATION","INCOMEMIN"
,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
,"transAdultPop2022","overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
# regression pick
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
sc_X = StandardScaler()
sc_y = StandardScaler()
sc_X_train = sc_X.fit_transform(X_train)
sc_y_train = sc_y.fit_transform(y_train.values.reshape(len(y_train),1))
sc_y_train = sc_y_train
log_regression = LogisticRegression(solver="newton-cg", random_state=0).fit(X_train,y_train)
print("Training set score: " + str(round(log_regression.score(X_train,y_train), decPrecision)))
print("Test set score: " + str(round(log_regression.score(X_test,y_test), decPrecision)))
#featTitle = "Logistic Regression Feature Coefficients"
#print(log_regression.coef_)
#print(log_regression.coef_.shape[0])
#print(log_regression.coef_[0])
coefArray = []
for x in log_regression.coef_[0]:
#print(np.exp(x)/(1 + np.exp(x)))
coefArray.append(np.exp(x))
featureDf = pd.DataFrame(zip(clean_feature_list, np.transpose(coefArray)), columns=["featureNames", "Coefficients"])
print(featureDf)
Training set score: 0.68 Test set score: 0.6743 featureNames Coefficients 0 WEEK 1.002763 1 stateId 1.000906 2 TBIRTH_YEAR 1.003440 3 EEDUC 0.005272 4 EGENID_BIRTH 1.041176 5 SEXUAL_ORIENTATION 1.000001
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:210: ConvergenceWarning: newton-cg failed to converge. Increase the number of iterations. warnings.warn(
log_regression = LogisticRegression(solver="sag", random_state=0).fit(X_train,y_train)
print("Training set score: " + str(round(log_regression.score(X_train,y_train), decPrecision)))
print("Test set score: " + str(round(log_regression.score(X_test,y_test), decPrecision)))
#featTitle = "Logistic Regression Feature Coefficients"
#print(log_regression.coef_)
#print(log_regression.coef_.shape[0])
#print(log_regression.coef_[0])
coefArray = []
for x in log_regression.coef_[0]:
#print(np.exp(x)/(1 + np.exp(x)))
coefArray.append(np.exp(x)/(1 + np.exp(x)))
featureDf = pd.DataFrame(zip(clean_feature_list, np.transpose(coefArray)), columns=["featureNames", "Coefficients"])
print(featureDf)
Training set score: 0.4144 Test set score: 0.411 featureNames Coefficients 0 WEEK 0.500001 1 stateId 0.500001 2 TBIRTH_YEAR 0.499962 3 EEDUC 0.499995 4 EGENID_BIRTH 0.500000 5 SEXUAL_ORIENTATION 0.500001
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn(
log_regression = LogisticRegression(solver="saga", random_state=0).fit(X_train,y_train)
print("Training set score: " + str(round(log_regression.score(X_train,y_train), decPrecision)))
print("Test set score: " + str(round(log_regression.score(X_test,y_test), decPrecision)))
#featTitle = "Logistic Regression Feature Coefficients"
#print(log_regression.coef_)
#print(log_regression.coef_.shape[0])
#print(log_regression.coef_[0])
coefArray = []
for x in log_regression.coef_[0]:
#print(np.exp(x)/(1 + np.exp(x)))
coefArray.append(np.exp(x)/(1 + np.exp(x)))
featureDf = pd.DataFrame(zip(clean_feature_list, np.transpose(coefArray)), columns=["featureNames", "Coefficients"])
print(featureDf)
Training set score: 0.4144 Test set score: 0.411 featureNames Coefficients 0 WEEK 0.500000 1 stateId 0.500000 2 TBIRTH_YEAR 0.499962 3 EEDUC 0.499997 4 EGENID_BIRTH 0.500000 5 SEXUAL_ORIENTATION 0.500001
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn(
log_regression = LogisticRegression(solver="lbfgs", random_state=0).fit(X_train,y_train)
print("Training set score: " + str(round(log_regression.score(X_train,y_train), decPrecision)))
print("Test set score: " + str(round(log_regression.score(X_test,y_test), decPrecision)))
#featTitle = "Logistic Regression Feature Coefficients"
#print(log_regression.coef_)
#print(log_regression.coef_.shape[0])
#print(log_regression.coef_[0])
coefArray = []
for x in log_regression.coef_[0]:
#print(np.exp(x)/(1 + np.exp(x)))
coefArray.append(np.exp(x)/(1 + np.exp(x)))
featureDf = pd.DataFrame(zip(clean_feature_list, np.transpose(coefArray)), columns=["featureNames", "Coefficients"])
print(featureDf)
Training set score: 0.4144 Test set score: 0.411 featureNames Coefficients 0 WEEK 0.499999 1 stateId 0.499999 2 TBIRTH_YEAR 0.499962 3 EEDUC 0.500000 4 EGENID_BIRTH 0.500000 5 SEXUAL_ORIENTATION 0.500001
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","EGENID_BIRTH"
,"SEXUAL_ORIENTATION","INCOMEMIN"
,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
,"transAdultPop2022","overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Full Dataset")
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1-0.3, random_state = 0)
#sc_X = StandardScaler()
#sc_y = StandardScaler()
#sc_X_train = sc_X.fit_transform(X_train)
#sc_y_train = sc_y.fit_transform(y_train.values.reshape(len(y_train),1))
#sc_y_train = sc_y_train
#maxIter=1000000000
#log_regression = LogisticRegression(max_iter=max_iter)
#solvers = ["liblinear","newton-cg","sag","saga","lbfgs"]
#penalty=["l2"]
#cVals=[0.01,0.1,1.0,10.0,100.0]
#grid=dict(solver=solvers,penalty=penalty,C=cVals)
#cv=RepeatedStratifiedKFold(n_splits=10,n_repeats=3,random_state=state)
#grid_search = GridSearchCV(estimator=log_regression, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
#grid_result = grid_search.fit(X,y)
# summarize results
#print("Accuracy rate of Logistic Regression: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
#Accuracy rate of Logistic Regression: 0.797286 using {'C': 0.01, 'penalty': 'l2', 'solver': 'newton-cg'}
#log_regression = LogisticRegression(solver="newton-cg", random_state=state, penalty="l2", C=0.01, max_iter=maxIter).fit(X_train,y_train)
#print(log_regression.coef_)
#print(log_regression.coef_.shape[0])
#print(log_regression.coef_[0])
#print("Training set score: " + str(round(log_regression.score(X_train,y_train), decPrecision)))
#print("Test set score: " + str(round(log_regression.score(X_test,y_test), decPrecision)))
#coefArray = []
#for ind in range(log_regression.coef_.shape[0]):
# if ind == 0:
# featFor = "Cisgender Men"
# elif ind == 1:
# featFor = "Cisgender Women"
# else:
# featFor = "Transgender"
# featTitle = "Logistic Regression " + featFor + " Feature Coefficients"
# for x in log_regression.coef_[ind]:
# #print(np.exp(x)/(1 + np.exp(x)))
# coefArray.append(np.exp(x)/(1 + np.exp(x)))
# featureDf = pd.DataFrame(zip(clean_feature_list, np.transpose(coefArray)), columns=["Features", "Coefficients"])
# featureDf = featureDf.sort_values(by=["Coefficients"], ascending=False)
# print(featureDf)
# #plot bar chart of importance
# f, ax = plt.subplots(figsize=(20,12))
# sns.barplot(x=featureDf["Features"], y=featureDf["Coefficients"], palette="flare")
# plt.title(featTitle, fontsize=14)
# plt.xticks(rotation=45)
# plt.show()
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge warn('The line search algorithm did not converge', LineSearchWarning) C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge warn('The line search algorithm did not converge', LineSearchWarning) C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging warn(msg, LineSearchWarning) C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed warnings.warn("Line Search failed")
Logistic Regression Accuracy: 0.791 Features Coefficients 3 EEDUC 0.510278 11 overallReligiosityPew2014 0.504194 12 veryReligiousStatista2017 0.501816 13 moderatelyReligiousStatista2017 0.501757 21 relLibRfra2022 0.501642 15 relLibScore2022 0.500698 20 relLibMarriage2022 0.500558 2 TBIRTH_YEAR 0.500448 18 relLibHealth2022 0.500004 6 INCOMEMIN 0.500001 7 statePopulation2020 0.500000 8 statePopulation2023 0.500000 10 transAdultPop2022 0.499999 0 WEEK 0.499949 1 stateId 0.499818 19 relLibHealthMandate2022 0.499021 14 nonreligiousStatista2017 0.499015 9 antiTransLegislationRiskIndex32023 0.498912 17 relLibVax2022 0.497051 16 relLibVote2022 0.491821 5 SEXUAL_ORIENTATION 0.406382 4 EGENID_BIRTH 0.112777
Features Coefficients 3 EEDUC 0.510278 11 overallReligiosityPew2014 0.504194 12 veryReligiousStatista2017 0.501816 13 moderatelyReligiousStatista2017 0.501757 21 relLibRfra2022 0.501642 15 relLibScore2022 0.500698 20 relLibMarriage2022 0.500558 2 TBIRTH_YEAR 0.500448 18 relLibHealth2022 0.500004 6 INCOMEMIN 0.500001 7 statePopulation2020 0.500000 8 statePopulation2023 0.500000 10 transAdultPop2022 0.499999 0 WEEK 0.499949 1 stateId 0.499818 19 relLibHealthMandate2022 0.499021 14 nonreligiousStatista2017 0.499015 9 antiTransLegislationRiskIndex32023 0.498912 17 relLibVax2022 0.497051 16 relLibVote2022 0.491821 5 SEXUAL_ORIENTATION 0.406382 4 EGENID_BIRTH 0.112777
Features Coefficients 3 EEDUC 0.510278 11 overallReligiosityPew2014 0.504194 12 veryReligiousStatista2017 0.501816 13 moderatelyReligiousStatista2017 0.501757 21 relLibRfra2022 0.501642 15 relLibScore2022 0.500698 20 relLibMarriage2022 0.500558 2 TBIRTH_YEAR 0.500448 18 relLibHealth2022 0.500004 6 INCOMEMIN 0.500001 7 statePopulation2020 0.500000 8 statePopulation2023 0.500000 10 transAdultPop2022 0.499999 0 WEEK 0.499949 1 stateId 0.499818 19 relLibHealthMandate2022 0.499021 14 nonreligiousStatista2017 0.499015 9 antiTransLegislationRiskIndex32023 0.498912 17 relLibVax2022 0.497051 16 relLibVote2022 0.491821 5 SEXUAL_ORIENTATION 0.406382 4 EGENID_BIRTH 0.112777
# Setup for A/B testing
# remove "SEXUAL_ORIENTATION" column
# split dataset into dependent(features) and independent(target) variable
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","EGENID_BIRTH","INCOMEMIN"
,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
,"transAdultPop2022","overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Sexuality Removed")
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge warn('The line search algorithm did not converge', LineSearchWarning) C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge warn('The line search algorithm did not converge', LineSearchWarning) C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging warn(msg, LineSearchWarning) C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed warnings.warn("Line Search failed")
Logistic Regression Accuracy: 0.6689 Features Coefficients 3 EEDUC 0.511605 20 relLibRfra2022 0.505645 10 overallReligiosityPew2014 0.504704 12 moderatelyReligiousStatista2017 0.502584 19 relLibMarriage2022 0.501647 11 veryReligiousStatista2017 0.501373 0 WEEK 0.500593 2 TBIRTH_YEAR 0.500314 14 relLibScore2022 0.500257 8 antiTransLegislationRiskIndex32023 0.500143 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 9 transAdultPop2022 0.500000 17 relLibHealth2022 0.499961 1 stateId 0.499648 13 nonreligiousStatista2017 0.499627 16 relLibVax2022 0.497257 18 relLibHealthMandate2022 0.496243 15 relLibVote2022 0.489497 4 EGENID_BIRTH 0.112855
Features Coefficients 3 EEDUC 0.511605 20 relLibRfra2022 0.505645 10 overallReligiosityPew2014 0.504704 12 moderatelyReligiousStatista2017 0.502584 19 relLibMarriage2022 0.501647 11 veryReligiousStatista2017 0.501373 0 WEEK 0.500593 2 TBIRTH_YEAR 0.500314 14 relLibScore2022 0.500257 8 antiTransLegislationRiskIndex32023 0.500143 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 9 transAdultPop2022 0.500000 17 relLibHealth2022 0.499961 1 stateId 0.499648 13 nonreligiousStatista2017 0.499627 16 relLibVax2022 0.497257 18 relLibHealthMandate2022 0.496243 15 relLibVote2022 0.489497 4 EGENID_BIRTH 0.112855
Features Coefficients 3 EEDUC 0.511605 20 relLibRfra2022 0.505645 10 overallReligiosityPew2014 0.504704 12 moderatelyReligiousStatista2017 0.502584 19 relLibMarriage2022 0.501647 11 veryReligiousStatista2017 0.501373 0 WEEK 0.500593 2 TBIRTH_YEAR 0.500314 14 relLibScore2022 0.500257 8 antiTransLegislationRiskIndex32023 0.500143 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 9 transAdultPop2022 0.500000 17 relLibHealth2022 0.499961 1 stateId 0.499648 13 nonreligiousStatista2017 0.499627 16 relLibVax2022 0.497257 18 relLibHealthMandate2022 0.496243 15 relLibVote2022 0.489497 4 EGENID_BIRTH 0.112855
# Setup for A/B testing
# remove "EGENID_BIRTH", column
# split dataset into dependent(features) and independent(target) variable
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","SEXUAL_ORIENTATION","INCOMEMIN"
,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
,"transAdultPop2022","overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Sexuality Removed")
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge warn('The line search algorithm did not converge', LineSearchWarning) C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge warn('The line search algorithm did not converge', LineSearchWarning) C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging warn(msg, LineSearchWarning) C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed warnings.warn("Line Search failed")
Logistic Regression Accuracy: 0.5102 Features Coefficients 3 EEDUC 0.510443 10 overallReligiosityPew2014 0.503726 18 relLibHealthMandate2022 0.502791 20 relLibRfra2022 0.502573 11 veryReligiousStatista2017 0.501265 12 moderatelyReligiousStatista2017 0.501013 8 antiTransLegislationRiskIndex32023 0.500868 0 WEEK 0.500678 14 relLibScore2022 0.500242 2 TBIRTH_YEAR 0.500045 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 9 transAdultPop2022 0.499999 1 stateId 0.499939 13 nonreligiousStatista2017 0.499419 17 relLibHealth2022 0.498873 19 relLibMarriage2022 0.497932 16 relLibVax2022 0.497623 15 relLibVote2022 0.490879 4 SEXUAL_ORIENTATION 0.402553
Features Coefficients 3 EEDUC 0.510443 10 overallReligiosityPew2014 0.503726 18 relLibHealthMandate2022 0.502791 20 relLibRfra2022 0.502573 11 veryReligiousStatista2017 0.501265 12 moderatelyReligiousStatista2017 0.501013 8 antiTransLegislationRiskIndex32023 0.500868 0 WEEK 0.500678 14 relLibScore2022 0.500242 2 TBIRTH_YEAR 0.500045 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 9 transAdultPop2022 0.499999 1 stateId 0.499939 13 nonreligiousStatista2017 0.499419 17 relLibHealth2022 0.498873 19 relLibMarriage2022 0.497932 16 relLibVax2022 0.497623 15 relLibVote2022 0.490879 4 SEXUAL_ORIENTATION 0.402553
Features Coefficients 3 EEDUC 0.510443 10 overallReligiosityPew2014 0.503726 18 relLibHealthMandate2022 0.502791 20 relLibRfra2022 0.502573 11 veryReligiousStatista2017 0.501265 12 moderatelyReligiousStatista2017 0.501013 8 antiTransLegislationRiskIndex32023 0.500868 0 WEEK 0.500678 14 relLibScore2022 0.500242 2 TBIRTH_YEAR 0.500045 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 9 transAdultPop2022 0.499999 1 stateId 0.499939 13 nonreligiousStatista2017 0.499419 17 relLibHealth2022 0.498873 19 relLibMarriage2022 0.497932 16 relLibVax2022 0.497623 15 relLibVote2022 0.490879 4 SEXUAL_ORIENTATION 0.402553
# Setup for A/B testing
# remove "EEDUC","INCOMEMIN" column
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","SEXUAL_ORIENTATION"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"antiTransLegislationRiskIndex32023"
,"overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Education and Income Removed")
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge warn('The line search algorithm did not converge', LineSearchWarning) C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge warn('The line search algorithm did not converge', LineSearchWarning) C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging warn(msg, LineSearchWarning) C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed warnings.warn("Line Search failed")
Logistic Regression Accuracy: 0.7853 Features Coefficients 9 overallReligiosityPew2014 0.503246 11 moderatelyReligiousStatista2017 0.501590 19 relLibRfra2022 0.501426 10 veryReligiousStatista2017 0.501360 13 relLibScore2022 0.500738 18 relLibMarriage2022 0.500612 2 TBIRTH_YEAR 0.500497 15 relLibVax2022 0.500336 16 relLibHealth2022 0.500105 5 statePopulation2020 0.500000 6 statePopulation2023 0.500000 7 transAdultPop2022 0.499999 1 stateId 0.499872 0 WEEK 0.499861 12 nonreligiousStatista2017 0.499604 17 relLibHealthMandate2022 0.499491 8 antiTransLegislationRiskIndex32023 0.496482 14 relLibVote2022 0.494724 4 SEXUAL_ORIENTATION 0.403340 3 EGENID_BIRTH 0.114981
Features Coefficients 9 overallReligiosityPew2014 0.503246 11 moderatelyReligiousStatista2017 0.501590 19 relLibRfra2022 0.501426 10 veryReligiousStatista2017 0.501360 13 relLibScore2022 0.500738 18 relLibMarriage2022 0.500612 2 TBIRTH_YEAR 0.500497 15 relLibVax2022 0.500336 16 relLibHealth2022 0.500105 5 statePopulation2020 0.500000 6 statePopulation2023 0.500000 7 transAdultPop2022 0.499999 1 stateId 0.499872 0 WEEK 0.499861 12 nonreligiousStatista2017 0.499604 17 relLibHealthMandate2022 0.499491 8 antiTransLegislationRiskIndex32023 0.496482 14 relLibVote2022 0.494724 4 SEXUAL_ORIENTATION 0.403340 3 EGENID_BIRTH 0.114981
Features Coefficients 9 overallReligiosityPew2014 0.503246 11 moderatelyReligiousStatista2017 0.501590 19 relLibRfra2022 0.501426 10 veryReligiousStatista2017 0.501360 13 relLibScore2022 0.500738 18 relLibMarriage2022 0.500612 2 TBIRTH_YEAR 0.500497 15 relLibVax2022 0.500336 16 relLibHealth2022 0.500105 5 statePopulation2020 0.500000 6 statePopulation2023 0.500000 7 transAdultPop2022 0.499999 1 stateId 0.499872 0 WEEK 0.499861 12 nonreligiousStatista2017 0.499604 17 relLibHealthMandate2022 0.499491 8 antiTransLegislationRiskIndex32023 0.496482 14 relLibVote2022 0.494724 4 SEXUAL_ORIENTATION 0.403340 3 EGENID_BIRTH 0.114981
# Setup for A/B testing
# remove ,"statePopulation2020","statePopulation2023","transAdultPop2022" column
#population table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"antiTransLegislationRiskIndex32023"
,"overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Population Removed")
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge warn('The line search algorithm did not converge', LineSearchWarning) C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge warn('The line search algorithm did not converge', LineSearchWarning) C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed warnings.warn("Line Search failed")
Logistic Regression Accuracy: 0.7093 Features Coefficients 4 EEDUC 0.509532 17 relLibRfra2022 0.505297 16 relLibMarriage2022 0.502339 7 overallReligiosityPew2014 0.502115 6 antiTransLegislationRiskIndex32023 0.501436 9 moderatelyReligiousStatista2017 0.501296 0 WEEK 0.500696 8 veryReligiousStatista2017 0.500224 11 relLibScore2022 0.500088 5 INCOMEMIN 0.500001 14 relLibHealth2022 0.499730 1 stateId 0.499727 10 nonreligiousStatista2017 0.498659 2 TBIRTH_YEAR 0.498232 15 relLibHealthMandate2022 0.495723 13 relLibVax2022 0.493200 12 relLibVote2022 0.489051 3 EGENID_BIRTH 0.111752
Features Coefficients 4 EEDUC 0.509532 17 relLibRfra2022 0.505297 16 relLibMarriage2022 0.502339 7 overallReligiosityPew2014 0.502115 6 antiTransLegislationRiskIndex32023 0.501436 9 moderatelyReligiousStatista2017 0.501296 0 WEEK 0.500696 8 veryReligiousStatista2017 0.500224 11 relLibScore2022 0.500088 5 INCOMEMIN 0.500001 14 relLibHealth2022 0.499730 1 stateId 0.499727 10 nonreligiousStatista2017 0.498659 2 TBIRTH_YEAR 0.498232 15 relLibHealthMandate2022 0.495723 13 relLibVax2022 0.493200 12 relLibVote2022 0.489051 3 EGENID_BIRTH 0.111752
Features Coefficients 4 EEDUC 0.509532 17 relLibRfra2022 0.505297 16 relLibMarriage2022 0.502339 7 overallReligiosityPew2014 0.502115 6 antiTransLegislationRiskIndex32023 0.501436 9 moderatelyReligiousStatista2017 0.501296 0 WEEK 0.500696 8 veryReligiousStatista2017 0.500224 11 relLibScore2022 0.500088 5 INCOMEMIN 0.500001 14 relLibHealth2022 0.499730 1 stateId 0.499727 10 nonreligiousStatista2017 0.498659 2 TBIRTH_YEAR 0.498232 15 relLibHealthMandate2022 0.495723 13 relLibVax2022 0.493200 12 relLibVote2022 0.489051 3 EGENID_BIRTH 0.111752
# Setup for A/B testing
# remove ,"antiTransLegislationRiskIndex32023" column
#anti-trans legislation table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Anti-Trans Legislation Removed")
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge warn('The line search algorithm did not converge', LineSearchWarning) C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge warn('The line search algorithm did not converge', LineSearchWarning) C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging warn(msg, LineSearchWarning) C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed warnings.warn("Line Search failed")
Logistic Regression Accuracy: 0.6684 Features Coefficients 4 EEDUC 0.511643 19 relLibRfra2022 0.505801 9 overallReligiosityPew2014 0.505390 11 moderatelyReligiousStatista2017 0.502597 10 veryReligiousStatista2017 0.502021 18 relLibMarriage2022 0.501604 0 WEEK 0.500581 13 relLibScore2022 0.500353 2 TBIRTH_YEAR 0.500315 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 8 transAdultPop2022 0.500000 16 relLibHealth2022 0.499941 1 stateId 0.499647 12 nonreligiousStatista2017 0.498976 15 relLibVax2022 0.497316 17 relLibHealthMandate2022 0.496308 14 relLibVote2022 0.489418 3 EGENID_BIRTH 0.112749
Features Coefficients 4 EEDUC 0.511643 19 relLibRfra2022 0.505801 9 overallReligiosityPew2014 0.505390 11 moderatelyReligiousStatista2017 0.502597 10 veryReligiousStatista2017 0.502021 18 relLibMarriage2022 0.501604 0 WEEK 0.500581 13 relLibScore2022 0.500353 2 TBIRTH_YEAR 0.500315 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 8 transAdultPop2022 0.500000 16 relLibHealth2022 0.499941 1 stateId 0.499647 12 nonreligiousStatista2017 0.498976 15 relLibVax2022 0.497316 17 relLibHealthMandate2022 0.496308 14 relLibVote2022 0.489418 3 EGENID_BIRTH 0.112749
Features Coefficients 4 EEDUC 0.511643 19 relLibRfra2022 0.505801 9 overallReligiosityPew2014 0.505390 11 moderatelyReligiousStatista2017 0.502597 10 veryReligiousStatista2017 0.502021 18 relLibMarriage2022 0.501604 0 WEEK 0.500581 13 relLibScore2022 0.500353 2 TBIRTH_YEAR 0.500315 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 8 transAdultPop2022 0.500000 16 relLibHealth2022 0.499941 1 stateId 0.499647 12 nonreligiousStatista2017 0.498976 15 relLibVax2022 0.497316 17 relLibHealthMandate2022 0.496308 14 relLibVote2022 0.489418 3 EGENID_BIRTH 0.112749
# Setup for A/B testing
# remove ,"overallReligiosityPew2014" column
#Pew 2014 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"antiTransLegislationRiskIndex32023"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Pew 2014 Removed")
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge warn('The line search algorithm did not converge', LineSearchWarning) C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge warn('The line search algorithm did not converge', LineSearchWarning) C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging warn(msg, LineSearchWarning) C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed warnings.warn("Line Search failed")
Logistic Regression Accuracy: 0.6694 Features Coefficients 4 EEDUC 0.511619 19 relLibRfra2022 0.505808 11 moderatelyReligiousStatista2017 0.502805 18 relLibMarriage2022 0.501627 10 veryReligiousStatista2017 0.501534 0 WEEK 0.500607 13 relLibScore2022 0.500326 2 TBIRTH_YEAR 0.500317 9 antiTransLegislationRiskIndex32023 0.500300 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 8 transAdultPop2022 0.500000 16 relLibHealth2022 0.499978 1 stateId 0.499649 12 nonreligiousStatista2017 0.499524 15 relLibVax2022 0.497192 17 relLibHealthMandate2022 0.496067 14 relLibVote2022 0.489147 3 EGENID_BIRTH 0.111512
Features Coefficients 4 EEDUC 0.511619 19 relLibRfra2022 0.505808 11 moderatelyReligiousStatista2017 0.502805 18 relLibMarriage2022 0.501627 10 veryReligiousStatista2017 0.501534 0 WEEK 0.500607 13 relLibScore2022 0.500326 2 TBIRTH_YEAR 0.500317 9 antiTransLegislationRiskIndex32023 0.500300 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 8 transAdultPop2022 0.500000 16 relLibHealth2022 0.499978 1 stateId 0.499649 12 nonreligiousStatista2017 0.499524 15 relLibVax2022 0.497192 17 relLibHealthMandate2022 0.496067 14 relLibVote2022 0.489147 3 EGENID_BIRTH 0.111512
Features Coefficients 4 EEDUC 0.511619 19 relLibRfra2022 0.505808 11 moderatelyReligiousStatista2017 0.502805 18 relLibMarriage2022 0.501627 10 veryReligiousStatista2017 0.501534 0 WEEK 0.500607 13 relLibScore2022 0.500326 2 TBIRTH_YEAR 0.500317 9 antiTransLegislationRiskIndex32023 0.500300 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 8 transAdultPop2022 0.500000 16 relLibHealth2022 0.499978 1 stateId 0.499649 12 nonreligiousStatista2017 0.499524 15 relLibVax2022 0.497192 17 relLibHealthMandate2022 0.496067 14 relLibVote2022 0.489147 3 EGENID_BIRTH 0.111512
# Setup for A/B testing
# remove ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017" column
#Statista 2017 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"antiTransLegislationRiskIndex32023"
,"overallReligiosityPew2014"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Statista 2017 Removed")
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge warn('The line search algorithm did not converge', LineSearchWarning) C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge warn('The line search algorithm did not converge', LineSearchWarning) C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging warn(msg, LineSearchWarning) C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed warnings.warn("Line Search failed")
Logistic Regression Accuracy: 0.6689 Features Coefficients 4 EEDUC 0.511610 17 relLibRfra2022 0.505686 10 overallReligiosityPew2014 0.504812 16 relLibMarriage2022 0.501656 0 WEEK 0.500590 2 TBIRTH_YEAR 0.500315 11 relLibScore2022 0.500276 9 antiTransLegislationRiskIndex32023 0.500180 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 8 transAdultPop2022 0.500000 14 relLibHealth2022 0.499969 1 stateId 0.499648 13 relLibVax2022 0.497284 15 relLibHealthMandate2022 0.496187 12 relLibVote2022 0.489382 3 EGENID_BIRTH 0.112520
Features Coefficients 4 EEDUC 0.511610 17 relLibRfra2022 0.505686 10 overallReligiosityPew2014 0.504812 16 relLibMarriage2022 0.501656 0 WEEK 0.500590 2 TBIRTH_YEAR 0.500315 11 relLibScore2022 0.500276 9 antiTransLegislationRiskIndex32023 0.500180 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 8 transAdultPop2022 0.500000 14 relLibHealth2022 0.499969 1 stateId 0.499648 13 relLibVax2022 0.497284 15 relLibHealthMandate2022 0.496187 12 relLibVote2022 0.489382 3 EGENID_BIRTH 0.112520
Features Coefficients 4 EEDUC 0.511610 17 relLibRfra2022 0.505686 10 overallReligiosityPew2014 0.504812 16 relLibMarriage2022 0.501656 0 WEEK 0.500590 2 TBIRTH_YEAR 0.500315 11 relLibScore2022 0.500276 9 antiTransLegislationRiskIndex32023 0.500180 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 8 transAdultPop2022 0.500000 14 relLibHealth2022 0.499969 1 stateId 0.499648 13 relLibVax2022 0.497284 15 relLibHealthMandate2022 0.496187 12 relLibVote2022 0.489382 3 EGENID_BIRTH 0.112520
# Setup for A/B testing
# remove ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014" column
#Statista 2017 and Pew 2014 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"antiTransLegislationRiskIndex32023"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Statista 2017 and Pew 2014 Removed")
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge warn('The line search algorithm did not converge', LineSearchWarning) C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge warn('The line search algorithm did not converge', LineSearchWarning) C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging warn(msg, LineSearchWarning) C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed warnings.warn("Line Search failed")
Logistic Regression Accuracy: 0.6802 Features Coefficients 4 EEDUC 0.510759 16 relLibRfra2022 0.506450 9 antiTransLegislationRiskIndex32023 0.501529 15 relLibMarriage2022 0.501230 0 WEEK 0.500592 13 relLibHealth2022 0.500099 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 8 transAdultPop2022 0.500000 2 TBIRTH_YEAR 0.499922 1 stateId 0.499648 14 relLibHealthMandate2022 0.497649 12 relLibVax2022 0.494921 10 relLibScore2022 0.491488 11 relLibVote2022 0.488917 3 EGENID_BIRTH 0.109164
Features Coefficients 4 EEDUC 0.510759 16 relLibRfra2022 0.506450 9 antiTransLegislationRiskIndex32023 0.501529 15 relLibMarriage2022 0.501230 0 WEEK 0.500592 13 relLibHealth2022 0.500099 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 8 transAdultPop2022 0.500000 2 TBIRTH_YEAR 0.499922 1 stateId 0.499648 14 relLibHealthMandate2022 0.497649 12 relLibVax2022 0.494921 10 relLibScore2022 0.491488 11 relLibVote2022 0.488917 3 EGENID_BIRTH 0.109164
Features Coefficients 4 EEDUC 0.510759 16 relLibRfra2022 0.506450 9 antiTransLegislationRiskIndex32023 0.501529 15 relLibMarriage2022 0.501230 0 WEEK 0.500592 13 relLibHealth2022 0.500099 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 8 transAdultPop2022 0.500000 2 TBIRTH_YEAR 0.499922 1 stateId 0.499648 14 relLibHealthMandate2022 0.497649 12 relLibVax2022 0.494921 10 relLibScore2022 0.491488 11 relLibVote2022 0.488917 3 EGENID_BIRTH 0.109164
# Setup for A/B testing
# remove ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022" column
#Religious Liberty table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"antiTransLegislationRiskIndex32023"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Religious Liberty Removed")
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge warn('The line search algorithm did not converge', LineSearchWarning) C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge warn('The line search algorithm did not converge', LineSearchWarning) C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging warn(msg, LineSearchWarning) C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed warnings.warn("Line Search failed")
Logistic Regression Accuracy: 0.6761 Features Coefficients 4 EEDUC 0.510815 13 overallReligiosityPew2014 0.506299 11 moderatelyReligiousStatista2017 0.504494 10 veryReligiousStatista2017 0.501713 9 antiTransLegislationRiskIndex32023 0.500621 0 WEEK 0.500606 12 nonreligiousStatista2017 0.500278 2 TBIRTH_YEAR 0.500069 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 8 transAdultPop2022 0.499999 1 stateId 0.499662 3 EGENID_BIRTH 0.111579
Features Coefficients 4 EEDUC 0.510815 13 overallReligiosityPew2014 0.506299 11 moderatelyReligiousStatista2017 0.504494 10 veryReligiousStatista2017 0.501713 9 antiTransLegislationRiskIndex32023 0.500621 0 WEEK 0.500606 12 nonreligiousStatista2017 0.500278 2 TBIRTH_YEAR 0.500069 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 8 transAdultPop2022 0.499999 1 stateId 0.499662 3 EGENID_BIRTH 0.111579
Features Coefficients 4 EEDUC 0.510815 13 overallReligiosityPew2014 0.506299 11 moderatelyReligiousStatista2017 0.504494 10 veryReligiousStatista2017 0.501713 9 antiTransLegislationRiskIndex32023 0.500621 0 WEEK 0.500606 12 nonreligiousStatista2017 0.500278 2 TBIRTH_YEAR 0.500069 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 8 transAdultPop2022 0.499999 1 stateId 0.499662 3 EGENID_BIRTH 0.111579
# Setup for A/B testing
#,"statePopulation2020","statePopulation2023","transAdultPop2022"
#,"antiTransLegislationRiskIndex32023"
#,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014"
# remove ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022" column
#Religious Liberty table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Only Pulse Data")
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge warn('The line search algorithm did not converge', LineSearchWarning) C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge warn('The line search algorithm did not converge', LineSearchWarning)
Logistic Regression Accuracy: 0.7099 Features Coefficients 4 EEDUC 0.509491 0 WEEK 0.500709 5 INCOMEMIN 0.500002 1 stateId 0.499759 2 TBIRTH_YEAR 0.498227 3 EGENID_BIRTH 0.111782
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging warn(msg, LineSearchWarning) C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed warnings.warn("Line Search failed")
Features Coefficients 4 EEDUC 0.509491 0 WEEK 0.500709 5 INCOMEMIN 0.500002 1 stateId 0.499759 2 TBIRTH_YEAR 0.498227 3 EGENID_BIRTH 0.111782
Features Coefficients 4 EEDUC 0.509491 0 WEEK 0.500709 5 INCOMEMIN 0.500002 1 stateId 0.499759 2 TBIRTH_YEAR 0.498227 3 EGENID_BIRTH 0.111782
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","EGENID_BIRTH"
,"SEXUAL_ORIENTATION","INCOMEMIN"
,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
,"transAdultPop2022","overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState, "Random Forest Feature Importance")
Random Forest (500 Tree) Regression Accuracy: 0.4966 Features Importance 5 SEXUAL_ORIENTATION 0.283867 4 EGENID_BIRTH 0.247087 2 TBIRTH_YEAR 0.115527 0 WEEK 0.082819 6 INCOMEMIN 0.051162 3 EEDUC 0.041775 1 stateId 0.024697 8 statePopulation2023 0.019611 15 relLibScore2022 0.017565 7 statePopulation2020 0.014974 11 overallReligiosityPew2014 0.014905 10 transAdultPop2022 0.014747 18 relLibHealth2022 0.014445 13 moderatelyReligiousStatista2017 0.011986 12 veryReligiousStatista2017 0.011783 14 nonreligiousStatista2017 0.011340 9 antiTransLegislationRiskIndex32023 0.007486 20 relLibMarriage2022 0.005403 19 relLibHealthMandate2022 0.002955 21 relLibRfra2022 0.002738 16 relLibVote2022 0.002267 17 relLibVax2022 0.000862
# Setup for A/B testing
# remove "SEXUAL_ORIENTATION" column
# split dataset into dependent(features) and independent(target) variable
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","EGENID_BIRTH","INCOMEMIN"
,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
,"transAdultPop2022","overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Sexuality Removed")
Random Forest (500 Tree) Regression Accuracy: 0.2711 Features Importance 4 EGENID_BIRTH 0.239197 2 TBIRTH_YEAR 0.200776 0 WEEK 0.127075 5 INCOMEMIN 0.087728 3 EEDUC 0.068879 1 stateId 0.041188 14 relLibScore2022 0.030655 7 statePopulation2023 0.025917 9 transAdultPop2022 0.023027 10 overallReligiosityPew2014 0.022958 6 statePopulation2020 0.021619 17 relLibHealth2022 0.021323 12 moderatelyReligiousStatista2017 0.019417 11 veryReligiousStatista2017 0.018438 13 nonreligiousStatista2017 0.017602 8 antiTransLegislationRiskIndex32023 0.011535 19 relLibMarriage2022 0.009055 20 relLibRfra2022 0.004458 18 relLibHealthMandate2022 0.004111 15 relLibVote2022 0.003277 16 relLibVax2022 0.001768
# Setup for A/B testing
# remove "EEDUC","INCOMEMIN" column
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","SEXUAL_ORIENTATION"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"antiTransLegislationRiskIndex32023"
,"overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Education and Income Removed")
Random Forest (500 Tree) Regression Accuracy: 0.4744 Features Importance 4 SEXUAL_ORIENTATION 0.285457 3 EGENID_BIRTH 0.248089 2 TBIRTH_YEAR 0.166847 0 WEEK 0.121240 1 stateId 0.024385 6 statePopulation2023 0.018840 13 relLibScore2022 0.017829 16 relLibHealth2022 0.014760 5 statePopulation2020 0.014618 7 transAdultPop2022 0.014434 9 overallReligiosityPew2014 0.014231 10 veryReligiousStatista2017 0.012056 12 nonreligiousStatista2017 0.012010 11 moderatelyReligiousStatista2017 0.011817 8 antiTransLegislationRiskIndex32023 0.008044 18 relLibMarriage2022 0.005661 17 relLibHealthMandate2022 0.003300 19 relLibRfra2022 0.002953 14 relLibVote2022 0.002526 15 relLibVax2022 0.000904
# Setup for A/B testing
# remove ,"statePopulation2020","statePopulation2023","transAdultPop2022" column
#population table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"antiTransLegislationRiskIndex32023"
,"overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Population Removed")
Random Forest (500 Tree) Regression Accuracy: 0.2717 Features Importance 3 EGENID_BIRTH 0.239197 2 TBIRTH_YEAR 0.200083 0 WEEK 0.127960 5 INCOMEMIN 0.087268 4 EEDUC 0.069257 1 stateId 0.054054 11 relLibScore2022 0.041764 7 overallReligiosityPew2014 0.030534 14 relLibHealth2022 0.028542 9 moderatelyReligiousStatista2017 0.025326 8 veryReligiousStatista2017 0.024941 10 nonreligiousStatista2017 0.023795 6 antiTransLegislationRiskIndex32023 0.016182 16 relLibMarriage2022 0.011925 17 relLibRfra2022 0.006150 15 relLibHealthMandate2022 0.005858 12 relLibVote2022 0.004419 13 relLibVax2022 0.002745
# Setup for A/B testing
# remove ,"antiTransLegislationRiskIndex32023" column
#anti-trans legislation table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Anti-Trans Legislation Removed")
Random Forest (500 Tree) Regression Accuracy: 0.271 Features Importance 3 EGENID_BIRTH 0.239197 2 TBIRTH_YEAR 0.201075 0 WEEK 0.127463 5 INCOMEMIN 0.087390 4 EEDUC 0.069082 1 stateId 0.042216 13 relLibScore2022 0.031651 7 statePopulation2023 0.026780 8 transAdultPop2022 0.024179 9 overallReligiosityPew2014 0.024081 6 statePopulation2020 0.021919 16 relLibHealth2022 0.021759 11 moderatelyReligiousStatista2017 0.020444 10 veryReligiousStatista2017 0.019535 12 nonreligiousStatista2017 0.019072 18 relLibMarriage2022 0.009757 19 relLibRfra2022 0.004788 17 relLibHealthMandate2022 0.004328 14 relLibVote2022 0.003453 15 relLibVax2022 0.001831
# Setup for A/B testing
# remove ,"overallReligiosityPew2014" column
#Pew 2014 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"antiTransLegislationRiskIndex32023"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Pew 2014 Removed")
Random Forest (500 Tree) Regression Accuracy: 0.2723 Features Importance 3 EGENID_BIRTH 0.239197 2 TBIRTH_YEAR 0.200986 0 WEEK 0.127489 5 INCOMEMIN 0.087619 4 EEDUC 0.068820 1 stateId 0.043777 13 relLibScore2022 0.032480 7 statePopulation2023 0.027228 8 transAdultPop2022 0.024436 16 relLibHealth2022 0.023153 6 statePopulation2020 0.023089 12 nonreligiousStatista2017 0.022216 10 veryReligiousStatista2017 0.021502 11 moderatelyReligiousStatista2017 0.020735 9 antiTransLegislationRiskIndex32023 0.012801 18 relLibMarriage2022 0.009710 19 relLibRfra2022 0.004750 17 relLibHealthMandate2022 0.004410 14 relLibVote2022 0.003699 15 relLibVax2022 0.001903
# Setup for A/B testing
# remove ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017" column
#Statista 2017 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"antiTransLegislationRiskIndex32023"
,"overallReligiosityPew2014"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Statista 2017 Removed")
Random Forest (500 Tree) Regression Accuracy: 0.2722 Features Importance 3 EGENID_BIRTH 0.239197 2 TBIRTH_YEAR 0.201996 0 WEEK 0.128081 5 INCOMEMIN 0.087045 4 EEDUC 0.068836 1 stateId 0.048270 11 relLibScore2022 0.036531 10 overallReligiosityPew2014 0.034922 7 statePopulation2023 0.031234 8 transAdultPop2022 0.028197 14 relLibHealth2022 0.026239 6 statePopulation2020 0.025789 9 antiTransLegislationRiskIndex32023 0.015637 16 relLibMarriage2022 0.011671 17 relLibRfra2022 0.005378 15 relLibHealthMandate2022 0.004677 12 relLibVote2022 0.004106 13 relLibVax2022 0.002194
# Setup for A/B testing
# remove ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014" column
#Statista 2017 and Pew 2014 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"antiTransLegislationRiskIndex32023"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Statista 2017 and Pew 2014 Removed")
Random Forest (500 Tree) Regression Accuracy: 0.274 Features Importance 3 EGENID_BIRTH 0.239197 2 TBIRTH_YEAR 0.202868 0 WEEK 0.128257 5 INCOMEMIN 0.087695 4 EEDUC 0.068477 1 stateId 0.055199 10 relLibScore2022 0.040815 7 statePopulation2023 0.034473 8 transAdultPop2022 0.031218 13 relLibHealth2022 0.030359 6 statePopulation2020 0.028814 9 antiTransLegislationRiskIndex32023 0.020202 15 relLibMarriage2022 0.013167 16 relLibRfra2022 0.006054 11 relLibVote2022 0.005467 14 relLibHealthMandate2022 0.005383 12 relLibVax2022 0.002356
# Setup for A/B testing
# remove ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022" column
#Religious Liberty table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"antiTransLegislationRiskIndex32023"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Religious Liberty Removed")
Random Forest (500 Tree) Regression Accuracy: 0.2721 Features Importance 3 EGENID_BIRTH 0.239197 2 TBIRTH_YEAR 0.201343 0 WEEK 0.129005 5 INCOMEMIN 0.087782 4 EEDUC 0.069254 1 stateId 0.054400 7 statePopulation2023 0.034165 13 overallReligiosityPew2014 0.031626 8 transAdultPop2022 0.031116 6 statePopulation2020 0.029118 11 moderatelyReligiousStatista2017 0.026989 10 veryReligiousStatista2017 0.025652 12 nonreligiousStatista2017 0.024772 9 antiTransLegislationRiskIndex32023 0.015582
# Setup for A/B testing
#,"statePopulation2020","statePopulation2023","transAdultPop2022"
#,"antiTransLegislationRiskIndex32023"
#,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014"
# remove ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022" column
#Religious Liberty table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Only Pulse Data")
Random Forest (500 Tree) Regression Accuracy: 0.2661 Features Importance 3 EGENID_BIRTH 0.239197 2 TBIRTH_YEAR 0.226661 1 stateId 0.201604 0 WEEK 0.152386 5 INCOMEMIN 0.098260 4 EEDUC 0.081893