# setup
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.tools as tools

from collections import Counter

from matplotlib.ticker import MaxNLocator

from patsy import dmatrices

from sklearn.cluster import KMeans
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn import tree
from sklearn.tree import DecisionTreeRegressor

from sklearn import metrics
from sklearn.metrics import r2_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.svm import SVR

from statistics import mode
from statsmodels.multivariate.manova import MANOVA
from statsmodels.stats.outliers_influence import variance_inflation_factor

pd.options.mode.chained_assignment = None


# read data for Reed and religiosity datasets at state level
reed_col_list = ["stateId","stateName","statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
                 ,"antiTransLegislationRiskIndex122022","antiTransLegislationRiskIndex112022"
                 ,"transAdultPop2016","transAdultPercent2016","transAdultPop2022","transAdultPercent2022"
                 ,"religionImportantPew2014","worshipWeeklyPew2014","prayDailyPew2014","certainAboutGodPew2014"
                 ,"overallReligiosityPew2014","veryReligiousStatista2017","moderatelyReligiousStatista2017"
                 ,"nonreligiousStatista2017","relLibScore2022","relLibVote2022","relLibVax2022"
                 ,"relLibHealth2022","relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022"
                ]

reed_index_list = ["stateId","stateName","antiTransLegislationRiskIndex32023"
                   ,"antiTransLegislationRiskIndex122022","antiTransLegislationRiskIndex112022"
                  ]
trans_pop_list = ["stateId","stateName","statePopulation2020","statePopulation2023","transAdultPop2016"
                  ,"transAdultPercent2016","transAdultPop2022","transAdultPercent2022"
                 ]
religiosity_2014_list = ["stateId","stateName","religionImportantPew2014","worshipWeeklyPew2014"
                         ,"prayDailyPew2014","certainAboutGodPew2014","overallReligiosityPew2014"
                        ]
religiosity_2017_list = ["stateId","stateName","veryReligiousStatista2017","moderatelyReligiousStatista2017"
                         ,"nonreligiousStatista2017"
                        ]
religiosity_2022_list = ["stateId","stateName","relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022"
                         ,"relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022"
                        ]

reedFulldf = pd.read_csv("P:\\UticaMSDS\\DSC680-ResearchPracticum\\dataSets\\stateDatasetReed.csv", usecols=reed_col_list)
reedFulldf = reedFulldf[reedFulldf["stateId"] != 11]
reedIndexdf = pd.read_csv("P:\\UticaMSDS\\DSC680-ResearchPracticum\\dataSets\\stateDatasetReed.csv", usecols=reed_index_list)
reedIndexdf = reedIndexdf[reedIndexdf["stateId"] != 11]
transStatePopdf = pd.read_csv("P:\\UticaMSDS\\DSC680-ResearchPracticum\\dataSets\\stateDatasetReed.csv", usecols=trans_pop_list)
transStatePopdf = transStatePopdf[transStatePopdf["stateId"] != 11]
religious2014df = pd.read_csv("P:\\UticaMSDS\\DSC680-ResearchPracticum\\dataSets\\stateDatasetReed.csv", usecols=religiosity_2014_list)
religious2014df = religious2014df[religious2014df["stateId"] != 11]
religious2017df = pd.read_csv("P:\\UticaMSDS\\DSC680-ResearchPracticum\\dataSets\\stateDatasetReed.csv", usecols=religiosity_2017_list)
religious2017df = religious2017df[religious2017df["stateId"] != 11]
religious2022df = pd.read_csv("P:\\UticaMSDS\\DSC680-ResearchPracticum\\dataSets\\stateDatasetReed.csv", usecols=religiosity_2022_list)
religious2022df = religious2022df[religious2022df["stateId"] != 11]

#print(reedFulldf.head())


#create function to get count of unique values in column and get percentages
def countCol(df, dfCol):
    tempdf = df[dfCol]
    namecount = dfCol + "count"
    namepercent = dfCol + "percent"
    tempdf[namecount] = df[dfCol].value_counts()
    tempdf[namepercent] = df[dfCol].value_counts(normalize=True)*100
    return tempdf


# use describe to get mean and standard deviations of dataframe data
def describeDF(df, dfCol):
    print(df.describe())
    # get mode and variance using built in stats library
    for colName in dfCol:
        if(df[colName].dtypes != object):
            print("Mode of ",colName,": ", mode(df[colName]))
            print("Variance of ",colName,": ", np.var(df[colName], ddof=1))
    print()


def combinedf(df, dfCol, dfName):
    retdf = pd.DataFrame()
    for colName in dfCol:
        namecount = colName + "count"
        namepercent = colName + "percent"
        dfNameColCount = dfName + namecount
        dfNameColPercent = dfName + namepercent
        tempdf = countCol(df,colName)
        retdf[dfNameColCount] = tempdf[namecount]
        retdf[dfNameColPercent] = tempdf[namepercent]
    return retdf


#function to allow grouping gender identity on 3 values
def basicGenMarker(asab, gender):
    if(asab == 1 and gender == 1): 
        return "Cisgender Man"
    elif (asab == 2 and gender == 2):
        return "Cisgender Woman"
    else:
        return "Transgender"


#function to print covariance map based on given columns
def printCovariance(df,dfCol,colLabels,title):
    cols = dfCol
    stdsc = StandardScaler()
    X_std = stdsc.fit_transform(df[cols].iloc[:,range(0,len(dfCol))].values)
    cov_mat = np.cov(X_std.T)
    
    plt.figure(figsize=(7,7))
    sns.set(font_scale=1)
    hm = sns.heatmap(cov_mat, cbar = True, annot = True, square = True, fmt = ".2f", cmap = "vlag",
                    annot_kws={"size":12}, yticklabels = colLabels, xticklabels = colLabels, cbar_kws={"shrink": 0.5})
    #plt.title(title)
    plt.tight_layout()
    plt.show()


#function to build classifier matrix after testing NB model
# Build Naive Bayes Classifer to sort and classify data
# split datasets into training and test sets
def printNBClassifierOutcome(X,y,trainSize,trainState,matrixTitle):
    size = trainSize
    state = trainState
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-size, random_state=state)

    # scale input data for training if necessary for better predictions
    X_train = sc_X.fit_transform(X_train)
    X_test = sc_X.fit_transform(X_test)

    # initiale, train and test the BNB
    bnb = BernoulliNB()
    bnb.fit(X_train, y_train)
    pred = bnb.predict(X_test)

    #check accuracy
    bnb_accuracy = metrics.accuracy_score(pred, y_test)

    printConfusionMatrix(y_test, pred,matrixTitle)


#function to build classifier matrix after testing SVM model
#build SVM
# split datasets into training and test sets
def printSVMClassifierOutcome(X,y,trainSize,trainState,matrixTitle):
    size = trainSize
    state = trainState
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-size, random_state=state)

    # scale input data for training if necessary for better predictions
    X_train = sc_X.fit_transform(X_train)
    X_test = sc_X.fit_transform(X_test)

    # build and train model
    clf = SVC(kernel = "linear")
    clf.fit(X_train, y_train)

    pred = clf.predict(X_test)

    #check accuracy
    clf_accuracy = metrics.accuracy_score(pred, y_test)

    printConfusionMatrix(y_test, pred,matrixTitle)


def printConfusionMatrix(y_test, pred,matrixTitle):
    #confusion matrix
    cfm = metrics.confusion_matrix(y_test, pred)
    fig, ax = plt.subplots(figsize=(6,6))
    ax.matshow(cfm, cmap = plt.cm.Purples, alpha=0.3)
    for i in range(cfm.shape[0]):
        for j in range(cfm.shape[1]):
            ax.text(x=j, y=i,s=cfm[i, j], va='center', 
                    ha='center', size='xx-large')

    plt.xlabel('Predictions', fontsize = 16)
    plt.ylabel('Actuals', fontsize = 16)
    plt.title(matrixTitle, fontsize = 14)
    plt.show()

    print(metrics.classification_report(y_test, pred, zero_division = 0))


nEstimators = 500
decPrecision = 4
maxDepth = 3

def printRFRClassifierOutcome(X,y,trainSize,trainState, featTitle):
    size = trainSize
    state = trainState
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-size, random_state=state)
    sc_X = StandardScaler()
    sc_y = StandardScaler()
    sc_X_train = sc_X.fit_transform(X_train)
    sc_y_train = sc_y.fit_transform(y_train.values.reshape(len(y_train),1))
    sc_y_train = sc_y_train

    rf_regressor = RandomForestRegressor(n_estimators = nEstimators, random_state = state)
    rf_regressor.fit(X_train, y_train)
    rf_y_pred = rf_regressor.predict(X_test)
    np.set_printoptions(precision=decPrecision)

    print("Random Forest (" + str(nEstimators) + " Tree) Regression Accuracy: " + str(round(r2_score(y_test, rf_y_pred), decPrecision)))
    
    #rf_regressor.feature_names_in_
    featureDf = pd.DataFrame({"Features" : rf_regressor.feature_names_in_, "Importance" : rf_regressor.feature_importances_})
    featureDf = featureDf.sort_values(by=["Importance"], ascending=False)
    print(featureDf)
    
    #plot bar chart of importance
    f, ax = plt.subplots(figsize=(20,12))
    sns.barplot(x=featureDf["Features"], y=featureDf["Importance"], palette="flare")
    plt.xlabel('Features', fontsize = 16)
    plt.ylabel('Importance', fontsize = 16)
    plt.title(featTitle, fontsize=16)
    plt.xticks(rotation=45)
    #for val in plt.containers:
        #plt.bar_label(val)
    
    plt.show()
    
    #rfc = RandomForestClassifier(n_estimators=nEstimators, max_depth=maxDepth, random_state=state)
    #rfc.fit(X_train, y_train)
    #features = X.columns.values
    #classes = ['Cisgender man', 'Cisgender Woman', 'Transgender']
    #for estimator in rfc.estimators_:
        #print(estimator)
        #plt.figure(figsize=(20,10))
        #tree.plot_tree(estimator, feature_names=features, class_names=classes, fontsize=10, filled=True, rounded=True)
        #plt.show()


maxIter=1000000000

def LogRegressionOutcome(X,y,trainSize,trainState,featTitle):
    size = trainSize
    state = trainState
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1-size, random_state = state)
    sc_X = StandardScaler()
    sc_y = StandardScaler()
    sc_X_train = sc_X.fit_transform(X_train)
    sc_y_train = sc_y.fit_transform(y_train.values.reshape(len(y_train),1))
    sc_y_train = sc_y_train
    
    log_regression = LogisticRegression(solver="newton-cg", random_state=state, penalty="l2", C=0.01, max_iter=maxIter).fit(X_train,y_train)
    print("Logistic Regression Accuracy: " + str(round(log_regression.score(X_test,y_test), decPrecision)))

    x_train_const = tools.add_constant(X_train)
    
    coefArray = []
    for ind in range(log_regression.coef_.shape[0]):
        if ind == 0:
            featFor = "Cisgender Men"
        elif ind == 1:
            featFor = "Cisgender Women"
        else:
            featFor = "Transgender"
        fullFeatTitle = "Logistic Regression " + featFor + " Feature Important for " + featTitle

        for x in log_regression.coef_[ind]:
            #print(np.exp(x)/(1 + np.exp(x)))
            coefArray.append(np.exp(x)/(1 + np.exp(x)))

        featureDf = pd.DataFrame(zip(clean_feature_list, np.transpose(coefArray)), columns=["Features", "Coefficients"])
        featureDf = featureDf.sort_values(by=["Coefficients"], ascending=False)
        print(featureDf)
    
        #plot bar chart of importance
        f, ax = plt.subplots(figsize=(20,12))
        sns.barplot(x=featureDf["Features"], y=featureDf["Coefficients"], palette="flare")
        plt.xlabel('Features', fontsize = 16)
        plt.ylabel('Coefficients', fontsize = 16)
        plt.title(fullFeatTitle, fontsize=16)
        plt.xticks(rotation=45)
        #for val in plt.containers:
            #plt.bar_label(val)
    
        plt.show()


describeDF(reedIndexdf, reed_index_list)

         stateId  antiTransLegislationRiskIndex32023  \
count  50.000000                            50.00000   
mean   29.320000                             2.08000   
std    15.782243                             1.60153   
min     1.000000                             0.00000   
25%    17.250000                             1.00000   
50%    29.500000                             2.00000   
75%    41.750000                             4.00000   
max    56.000000                             4.00000   

       antiTransLegislationRiskIndex122022  \
count                            50.000000   
mean                              1.860000   
std                               1.340271   
min                               0.000000   
25%                               1.000000   
50%                               2.000000   
75%                               3.000000   
max                               4.000000   

       antiTransLegislationRiskIndex112022  
count                             50.00000  
mean                               1.82000  
std                                1.33539  
min                                0.00000  
25%                                1.00000  
50%                                2.00000  
75%                                3.00000  
max                                4.00000  
Mode of  stateId :  1
Variance of  stateId :  249.0791836734694
Mode of  antiTransLegislationRiskIndex32023 :  4
Variance of  antiTransLegislationRiskIndex32023 :  2.564897959183673
Mode of  antiTransLegislationRiskIndex122022 :  3
Variance of  antiTransLegislationRiskIndex122022 :  1.7963265306122445
Mode of  antiTransLegislationRiskIndex112022 :  1
Variance of  antiTransLegislationRiskIndex112022 :  1.7832653061224486


reed_cov_list = ["stateId","antiTransLegislationRiskIndex32023"
                   ,"antiTransLegislationRiskIndex122022","antiTransLegislationRiskIndex112022"
                  ]
reed_label = ["stateId","RiskIndex32023"
             ,"RiskIndex122022","RiskIndex112022"]
printCovariance(reedIndexdf, reed_cov_list, reed_label, "Anti-Transgender Legislation Risk Index Covariance Matrix")


describeDF(transStatePopdf, trans_pop_list)

         stateId  statePopulation2020  statePopulation2023  transAdultPop2016  \
count  50.000000         5.000000e+01         5.000000e+01           50.00000   
mean   29.320000         6.615242e+06         8.960485e+06        27654.00000   
std    15.782243         7.436124e+06         1.907631e+07        36854.01958   
min     1.000000         5.768510e+05         5.808170e+05         1400.00000   
25%    17.250000         1.869706e+06         1.940934e+06         6375.00000   
50%    29.500000         4.581796e+06         4.625424e+06        19450.00000   
75%    41.750000         7.566836e+06         7.844464e+06        31037.50000   
max    56.000000         3.953822e+07         1.309280e+08       218400.00000   

       transAdultPercent2016  transAdultPop2022  transAdultPercent2022  
count              50.000000          50.000000              50.000000  
mean                0.530400       26638.000000               0.531800  
std                 0.121722       29080.703259               0.126889  
min                 0.300000        2100.000000               0.200000  
25%                 0.432500        7025.000000               0.442500  
50%                 0.535000       16950.000000               0.525000  
75%                 0.610000       33225.000000               0.600000  
max                 0.780000      150100.000000               0.870000  
Mode of  stateId :  1
Variance of  stateId :  249.0791836734694
Mode of  statePopulation2020 :  5024279
Variance of  statePopulation2020 :  55295936980950.49
Mode of  statePopulation2023 :  5097641
Variance of  statePopulation2023 :  363905671623562.75
Mode of  transAdultPop2016 :  2700
Variance of  transAdultPop2016 :  1358218759.1836734
Mode of  transAdultPercent2016 :  0.43
Variance of  transAdultPercent2016 :  0.014816163265306125
Mode of  transAdultPop2022 :  6300
Variance of  transAdultPop2022 :  845687302.0408163
Mode of  transAdultPercent2022 :  0.6
Variance of  transAdultPercent2022 :  0.016100775510204078


pop_cov_list = ["stateId","statePopulation2020","statePopulation2023","transAdultPop2016","transAdultPop2022"]
pop_label = ["stateId","TotalPop2020","TotalPop2023","TransPop2016","TransPop2022"]
printCovariance(transStatePopdf, pop_cov_list, pop_label, "Population by State Covariance Matrix")


describeDF(religious2014df, religiosity_2014_list)

         stateId  religionImportantPew2014  worshipWeeklyPew2014  \
count  50.000000                 50.000000             50.000000   
mean   29.320000                  0.527000              0.359400   
std    15.782243                  0.107499              0.075035   
min     1.000000                  0.320000              0.210000   
25%    17.250000                  0.452500              0.310000   
50%    29.500000                  0.510000              0.355000   
75%    41.750000                  0.597500              0.390000   
max    56.000000                  0.770000              0.530000   

       prayDailyPew2014  certainAboutGodPew2014  overallReligiosityPew2014  
count         50.000000               50.000000                  50.000000  
mean           0.541400                0.633600                   0.547000  
std            0.094286                0.095271                   0.107423  
min            0.330000                0.400000                   0.330000  
25%            0.490000                0.575000                   0.482500  
50%            0.530000                0.630000                   0.540000  
75%            0.607500                0.690000                   0.625000  
max            0.750000                0.820000                   0.770000  
Mode of  stateId :  1
Variance of  stateId :  249.0791836734694
Mode of  religionImportantPew2014 :  0.44
Variance of  religionImportantPew2014 :  0.011556122448979588
Mode of  worshipWeeklyPew2014 :  0.34
Variance of  worshipWeeklyPew2014 :  0.005630244897959184
Mode of  prayDailyPew2014 :  0.51
Variance of  prayDailyPew2014 :  0.008889836734693879
Mode of  certainAboutGodPew2014 :  0.61
Variance of  certainAboutGodPew2014 :  0.009076571428571429
Mode of  overallReligiosityPew2014 :  0.54
Variance of  overallReligiosityPew2014 :  0.011539795918367344


religiosity_cov_2014_list = ["stateId","religionImportantPew2014","worshipWeeklyPew2014"
                         ,"prayDailyPew2014","certainAboutGodPew2014","overallReligiosityPew2014"]
rel_2014_label = ["stateId","VeryImportant","WorshipWeekly","PrayDaily","CertainAboutGod","Overall"]
printCovariance(religious2014df, religiosity_cov_2014_list, rel_2014_label, "Pew 2014 Religiosity Covariance Matrix")


describeDF(religious2017df, religiosity_2017_list)

         stateId  veryReligiousStatista2017  moderatelyReligiousStatista2017  \
count  50.000000                  50.000000                        50.000000   
mean   29.320000                   0.371600                         0.287200   
std    15.782243                   0.090449                         0.030442   
min     1.000000                   0.160000                         0.160000   
25%    17.250000                   0.310000                         0.270000   
50%    29.500000                   0.365000                         0.295000   
75%    41.750000                   0.437500                         0.300000   
max    56.000000                   0.590000                         0.330000   

       nonreligiousStatista2017  
count                 50.000000  
mean                   0.342000  
std                    0.099857  
min                    0.120000  
25%                    0.290000  
50%                    0.340000  
75%                    0.397500  
max                    0.590000  
Mode of  stateId :  1
Variance of  stateId :  249.0791836734694
Mode of  veryReligiousStatista2017 :  0.28
Variance of  veryReligiousStatista2017 :  0.008181061224489796
Mode of  moderatelyReligiousStatista2017 :  0.3
Variance of  moderatelyReligiousStatista2017 :  0.0009266938775510202
Mode of  nonreligiousStatista2017 :  0.33
Variance of  nonreligiousStatista2017 :  0.00997142857142857


religiosity_cov_2017_list = ["stateId","veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"]
rel_2017_label = ["stateId","Very","Moderate","Nonreligious"]
printCovariance(religious2017df, religiosity_cov_2017_list, rel_2017_label, "Gallup 2017 Religiosity Covariance Matrix")


describeDF(religious2022df, religiosity_2022_list)

         stateId  relLibScore2022  relLibVote2022  relLibVax2022  \
count  50.000000        50.000000       50.000000      50.000000   
mean   29.320000         0.393948        0.800000       0.900000   
std    15.782243         0.133298        0.404061       0.303046   
min     1.000000         0.155800        0.000000       0.000000   
25%    17.250000         0.314950        1.000000       1.000000   
50%    29.500000         0.371200        1.000000       1.000000   
75%    41.750000         0.467550        1.000000       1.000000   
max    56.000000         0.818200        1.000000       1.000000   

       relLibHealth2022  relLibHealthMandate2022  relLibMarriage2022  \
count         50.000000                50.000000           50.000000   
mean           6.760000                 0.640000            1.160000   
std            4.023198                 0.484873            1.489555   
min            0.000000                 0.000000            0.000000   
25%            4.250000                 0.000000            0.000000   
50%            5.500000                 1.000000            0.000000   
75%            9.000000                 1.000000            3.000000   
max           20.000000                 1.000000            5.000000   

       relLibRfra2022  
count       50.000000  
mean         0.480000  
std          0.504672  
min          0.000000  
25%          0.000000  
50%          0.000000  
75%          1.000000  
max          1.000000  
Mode of  stateId :  1
Variance of  stateId :  249.0791836734694
Mode of  relLibScore2022 :  0.3377
Variance of  relLibScore2022 :  0.01776847438367347
Mode of  relLibVote2022 :  1.0
Variance of  relLibVote2022 :  0.16326530612244897
Mode of  relLibVax2022 :  1.0
Variance of  relLibVax2022 :  0.09183673469387756
Mode of  relLibHealth2022 :  5.0
Variance of  relLibHealth2022 :  16.186122448979592
Mode of  relLibHealthMandate2022 :  1.0
Variance of  relLibHealthMandate2022 :  0.2351020408163265
Mode of  relLibMarriage2022 :  0.0
Variance of  relLibMarriage2022 :  2.218775510204082
Mode of  relLibRfra2022 :  0.0
Variance of  relLibRfra2022 :  0.25469387755102035


religiosity_cov_2022_list = ["stateId","relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022"
                         ,"relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022"]
rel_2022_label = ["stateId","Score","Vote","Vax","Health","HealthMandate","Marriage","Rfra"]
printCovariance(religious2022df, religiosity_cov_2022_list, rel_2022_label, "Religious Liberty 2022 Covariance Matrix")


# read self-identification Census Pulse Survey data
pulse_col_list = ["SCRAM","WEEK","EST_ST","TBIRTH_YEAR","EEDUC","AEDUC","EGENID_BIRTH","AGENID_BIRTH"
                  ,"GENID_DESCRIBE","SEXUAL_ORIENTATION","INCOME","ENDDATE","EDUCATION","ASSIGNEDGENDER"
                  ,"CHOSENGENDER","SEXUALORIENTATION","INCOMEMIN"]
pulse_num_col_list = ["WEEK","EST_ST","TBIRTH_YEAR","EEDUC","AEDUC","EGENID_BIRTH","AGENID_BIRTH"
                      ,"GENID_DESCRIBE","SEXUAL_ORIENTATION","INCOME"]

pulsedf = pd.read_csv("P:\\UticaMSDS\\DSC680-ResearchPracticum\\dataSets\\pulseModFull.csv", usecols=pulse_col_list)
print("Full count of data pulse data: ", (pulsedf["GENID_DESCRIBE"] > -100).sum())
print()
countDistrictofColumbia = (pulsedf["EST_ST"] == 11).sum()
print("Count of participants in District of Columbia: ", countDistrictofColumbia)
print()
countMissingGender = (pulsedf["GENID_DESCRIBE"] < 0).sum()
print("Count of missing or unreported gender identity: ", countMissingGender)
print()
countMissingSexuality = (pulsedf["SEXUAL_ORIENTATION"] < 0).sum()
print("Count of missing or unreported sexuality: ", countMissingSexuality)
print()
countMissingIncome = (pulsedf["INCOME"] < 0).sum()
print("Count of missing or unreported minimum income: ", countMissingIncome)
print()
#remove DC residents
pulsedf = pulsedf[pulsedf["EST_ST"] != 11]
#remove missing gender identity based on under 10% of total for better results
pulsedf = pulsedf[pulsedf["GENID_DESCRIBE"].isin([1,2,3,4])]
#remove missing sexuality based on under 10% of total for better results
pulsedf = pulsedf[pulsedf["SEXUAL_ORIENTATION"].isin([1,2,3,4,5])]
#remove missing income based on null values for better results
pulsedf = pulsedf[pulsedf["INCOME"].isin([1,2,3,4,5,6,7,8])]

pulsedfCount = len(pulsedf.index)
print("Count after row removal: ", pulsedfCount)

Full count of data pulse data:  1341164

Count of participants in District of Columbia:  17702

Count of missing or unreported gender identity:  17691

Count of missing or unreported sexuality:  24617

Count of missing or unreported minimum income:  263337

Count after row removal:  1048575


describeDF(pulsedf,pulse_num_col_list)

               WEEK        EST_ST   TBIRTH_YEAR         EEDUC         AEDUC  \
count  1.048575e+06  1.048575e+06  1.048575e+06  1.048575e+06  1.048575e+06   
mean   4.391728e+01  2.854206e+01  1.968571e+03  5.367118e+00  1.994822e+00   
std    6.120523e+00  1.640470e+01  1.575978e+01  1.436177e+00  7.176839e-02   
min    3.400000e+01  1.000000e+00  1.933000e+03  1.000000e+00  1.000000e+00   
25%    3.900000e+01  1.300000e+01  1.956000e+03  4.000000e+00  2.000000e+00   
50%    4.300000e+01  2.800000e+01  1.968000e+03  6.000000e+00  2.000000e+00   
75%    4.900000e+01  4.200000e+01  1.981000e+03  7.000000e+00  2.000000e+00   
max    5.400000e+01  5.600000e+01  2.005000e+03  7.000000e+00  2.000000e+00   

       EGENID_BIRTH  AGENID_BIRTH  GENID_DESCRIBE  SEXUAL_ORIENTATION  \
count  1.048575e+06  1.048575e+06    1.048575e+06        1.048575e+06   
mean   1.581580e+00  1.997595e+00    1.609426e+00        2.069715e+00   
std    4.933001e-01  4.898353e-02    5.507229e-01        4.835032e-01   
min    1.000000e+00  1.000000e+00    1.000000e+00        1.000000e+00   
25%    1.000000e+00  2.000000e+00    1.000000e+00        2.000000e+00   
50%    2.000000e+00  2.000000e+00    2.000000e+00        2.000000e+00   
75%    2.000000e+00  2.000000e+00    2.000000e+00        2.000000e+00   
max    2.000000e+00  2.000000e+00    4.000000e+00        5.000000e+00   

             INCOME  
count  1.048575e+06  
mean   4.620572e+00  
std    2.128103e+00  
min    1.000000e+00  
25%    3.000000e+00  
50%    5.000000e+00  
75%    6.000000e+00  
max    8.000000e+00  
Mode of  WEEK :  43
Variance of  WEEK :  37.460803320626816
Mode of  EST_ST :  6
Variance of  EST_ST :  269.1142209085299
Mode of  TBIRTH_YEAR :  1955
Variance of  TBIRTH_YEAR :  248.3706702974734
Mode of  EEDUC :  6
Variance of  EEDUC :  2.0626035639994904
Mode of  AEDUC :  2
Variance of  AEDUC :  0.005150701178258357
Mode of  EGENID_BIRTH :  2
Variance of  EGENID_BIRTH :  0.2433449743390272
Mode of  AGENID_BIRTH :  2
Variance of  AGENID_BIRTH :  0.0023993863704273163
Mode of  GENID_DESCRIBE :  2
Variance of  GENID_DESCRIBE :  0.3032957446696943
Mode of  SEXUAL_ORIENTATION :  2
Variance of  SEXUAL_ORIENTATION :  0.23377538647520063
Mode of  INCOME :  6
Variance of  INCOME :  4.528821091548032


pulse_cov_list = ["WEEK","EST_ST","TBIRTH_YEAR","EEDUC","EGENID_BIRTH","GENID_DESCRIBE","SEXUAL_ORIENTATION","INCOME"]
pulse_label_list = ["WEEK","EST_ST","BIRTH_YEAR","EDUC","SEX_AT_BIRTH","GENDERID","SEXUALITY","INCOME"]
printCovariance(pulsedf, pulse_cov_list, pulse_label_list, "USCB Pulse Survey Covariance Matrix")


# further analysis of pulse data
pulseIncomedf = pd.DataFrame()
pulseIncomedf["INCOMEMIN"] = pulsedf["INCOMEMIN"].astype(float)
print(pulseIncomedf.describe())

print("Mode of INCOMEMIN: ", mode(pulseIncomedf["INCOMEMIN"]))
print("Variance of INCOMEMIN: ", np.var(pulseIncomedf["INCOMEMIN"], ddof=1))

del pulseIncomedf

          INCOMEMIN
count  1.048575e+06
mean   7.935849e+04
std    5.884945e+04
min    0.000000e+00
25%    3.500000e+04
50%    7.500000e+04
75%    1.000000e+05
max    2.000000e+05
Mode of INCOMEMIN:  100000.0
Variance of INCOMEMIN:  3463257810.0526085


# look at income data based on gender
#clean data
pulseIncomeStatsdf = pulsedf
#remove missing values from table for income and force to number
pulseIncomeStatsdf = pulseIncomeStatsdf[pulseIncomeStatsdf["INCOMEMIN"].astype(str).str.isdigit()]
pulseIncomeStatsdf["INCOMEMIN"] = pd.to_numeric(pulseIncomeStatsdf["INCOMEMIN"], errors='coerce')
#remove unreported or missing chosen gender
pulseIncomeStatsdf = pulseIncomeStatsdf[pulseIncomeStatsdf["GENID_DESCRIBE"] > 0]
#print(pulseIncomeStatsdf.head())

pulseIncomeStatsdf = pulseIncomeStatsdf[pulseIncomeStatsdf["SEXUAL_ORIENTATION"] > 0]

pulse_income_col = ["WEEK","EST_ST","TBIRTH_YEAR","EEDUC","EGENID_BIRTH"
                    ,"GENID_DESCRIBE","SEXUAL_ORIENTATION","INCOMEMIN"]
describeDF(pulseIncomeStatsdf, pulse_income_col)

               WEEK        EST_ST   TBIRTH_YEAR         EEDUC         AEDUC  \
count  1.048575e+06  1.048575e+06  1.048575e+06  1.048575e+06  1.048575e+06   
mean   4.391728e+01  2.854206e+01  1.968571e+03  5.367118e+00  1.994822e+00   
std    6.120523e+00  1.640470e+01  1.575978e+01  1.436177e+00  7.176839e-02   
min    3.400000e+01  1.000000e+00  1.933000e+03  1.000000e+00  1.000000e+00   
25%    3.900000e+01  1.300000e+01  1.956000e+03  4.000000e+00  2.000000e+00   
50%    4.300000e+01  2.800000e+01  1.968000e+03  6.000000e+00  2.000000e+00   
75%    4.900000e+01  4.200000e+01  1.981000e+03  7.000000e+00  2.000000e+00   
max    5.400000e+01  5.600000e+01  2.005000e+03  7.000000e+00  2.000000e+00   

       EGENID_BIRTH  AGENID_BIRTH  GENID_DESCRIBE  SEXUAL_ORIENTATION  \
count  1.048575e+06  1.048575e+06    1.048575e+06        1.048575e+06   
mean   1.581580e+00  1.997595e+00    1.609426e+00        2.069715e+00   
std    4.933001e-01  4.898353e-02    5.507229e-01        4.835032e-01   
min    1.000000e+00  1.000000e+00    1.000000e+00        1.000000e+00   
25%    1.000000e+00  2.000000e+00    1.000000e+00        2.000000e+00   
50%    2.000000e+00  2.000000e+00    2.000000e+00        2.000000e+00   
75%    2.000000e+00  2.000000e+00    2.000000e+00        2.000000e+00   
max    2.000000e+00  2.000000e+00    4.000000e+00        5.000000e+00   

             INCOME     INCOMEMIN  
count  1.048575e+06  1.048575e+06  
mean   4.620572e+00  7.935849e+04  
std    2.128103e+00  5.884945e+04  
min    1.000000e+00  0.000000e+00  
25%    3.000000e+00  3.500000e+04  
50%    5.000000e+00  7.500000e+04  
75%    6.000000e+00  1.000000e+05  
max    8.000000e+00  2.000000e+05  
Mode of  WEEK :  43
Variance of  WEEK :  37.460803320626816
Mode of  EST_ST :  6
Variance of  EST_ST :  269.1142209085299
Mode of  TBIRTH_YEAR :  1955
Variance of  TBIRTH_YEAR :  248.3706702974734
Mode of  EEDUC :  6
Variance of  EEDUC :  2.0626035639994904
Mode of  EGENID_BIRTH :  2
Variance of  EGENID_BIRTH :  0.2433449743390272
Mode of  GENID_DESCRIBE :  2
Variance of  GENID_DESCRIBE :  0.3032957446696943
Mode of  SEXUAL_ORIENTATION :  2
Variance of  SEXUAL_ORIENTATION :  0.23377538647520063
Mode of  INCOMEMIN :  100000
Variance of  INCOMEMIN :  3463257810.0526085


#build violin plot
plt.figure(figsize=(12,8))
sns.violinplot(x=pulseIncomeStatsdf["EGENID_BIRTH"],y=pulseIncomeStatsdf["INCOMEMIN"],palette="bright")

ax = plt.gca()
leg = ax.get_legend()
ax.set_xticklabels(["Assigned Male at Birth","Assigned Female at Birth"])
plt.ylabel("Income in Dollars")
plt.xlabel("")
plt.title("Minimum Yearly Reported Income")
plt.show()


#build violin plot
incomeGeniddf = pulseIncomeStatsdf
incomeGeniddf["CUR_GENID"] = incomeGeniddf.apply(lambda x: str(x["EGENID_BIRTH"]) + str(x["GENID_DESCRIBE"]), axis=1)
#print(incomeGeniddf)

plt.figure(figsize=(15,8))
sns.violinplot(x=incomeGeniddf["CUR_GENID"],y=incomeGeniddf["INCOMEMIN"],palette="bright")

ax = plt.gca()
ax.set_xticklabels(["Cisgender Women","Cisgender Men", "Nonbinary AFAB", "Nonbinary AMAB"
           , "Transgender AFAB", "Transgender AMAB", "Transgender FTM", "Transgender MTF"])
plt.ylabel("Income in Dollars")
plt.xlabel("")
plt.title("Minimum Yearly Reported Income by Gender Identity and Sex Assigned at Birth")
plt.show()


#income comparison based on assigned gender at birth
pulseIncomeAMABdf = pulseIncomeStatsdf
pulseIncomeAMABdf = pulseIncomeAMABdf[pulseIncomeAMABdf["EGENID_BIRTH"] == 1]
describeDF(pulseIncomeAMABdf, pulse_income_col)

pulseIncomeAFABdf = pulseIncomeStatsdf
pulseIncomeAFABdf = pulseIncomeAFABdf[pulseIncomeAFABdf["EGENID_BIRTH"] == 2]
describeDF(pulseIncomeAFABdf, pulse_income_col)

                WEEK         EST_ST    TBIRTH_YEAR          EEDUC  \
count  438745.000000  438745.000000  438745.000000  438745.000000   
mean       44.064304      28.533891    1967.503153       5.435818   
std         6.133248      16.498385      16.139392       1.424280   
min        34.000000       1.000000    1933.000000       1.000000   
25%        39.000000      13.000000    1954.000000       4.000000   
50%        44.000000      28.000000    1966.000000       6.000000   
75%        50.000000      44.000000    1981.000000       7.000000   
max        54.000000      56.000000    2005.000000       7.000000   

               AEDUC  EGENID_BIRTH   AGENID_BIRTH  GENID_DESCRIBE  \
count  438745.000000      438745.0  438745.000000   438745.000000   
mean        1.994215           1.0       1.997568        1.034546   
std         0.075837           0.0       0.049255        0.300578   
min         1.000000           1.0       1.000000        1.000000   
25%         2.000000           1.0       2.000000        1.000000   
50%         2.000000           1.0       2.000000        1.000000   
75%         2.000000           1.0       2.000000        1.000000   
max         2.000000           1.0       2.000000        4.000000   

       SEXUAL_ORIENTATION         INCOME      INCOMEMIN  
count       438745.000000  438745.000000  438745.000000  
mean             2.032830       4.966391   88654.389224  
std              0.475219       2.088031   60425.293955  
min              1.000000       1.000000       0.000000  
25%              2.000000       4.000000   50000.000000  
50%              2.000000       5.000000   75000.000000  
75%              2.000000       6.000000  100000.000000  
max              5.000000       8.000000  200000.000000  
Mode of  WEEK :  43
Variance of  WEEK :  37.616725915359496
Mode of  EST_ST :  6
Variance of  EST_ST :  272.19670766553105
Mode of  TBIRTH_YEAR :  1955
Variance of  TBIRTH_YEAR :  260.47997895673205
Mode of  EEDUC :  6
Variance of  EEDUC :  2.0285745651188547
Mode of  EGENID_BIRTH :  1
Variance of  EGENID_BIRTH :  0.0
Mode of  GENID_DESCRIBE :  1
Variance of  GENID_DESCRIBE :  0.0903474059981086
Mode of  SEXUAL_ORIENTATION :  2
Variance of  SEXUAL_ORIENTATION :  0.22583355356615814
Mode of  INCOMEMIN :  100000
Variance of  INCOMEMIN :  3651216149.53128

                WEEK         EST_ST    TBIRTH_YEAR          EEDUC  \
count  609830.000000  609830.000000  609830.000000  609830.000000   
mean       43.811497      28.547931    1969.338557       5.317692   
std         6.109169      16.336979      15.435331       1.442654   
min        34.000000       1.000000    1933.000000       1.000000   
25%        39.000000      13.000000    1957.000000       4.000000   
50%        43.000000      28.000000    1969.000000       6.000000   
75%        49.000000      42.000000    1982.000000       7.000000   
max        54.000000      56.000000    2005.000000       7.000000   

               AEDUC  EGENID_BIRTH   AGENID_BIRTH  GENID_DESCRIBE  \
count  609830.000000      609830.0  609830.000000   609830.000000   
mean        1.995259           2.0       1.997614        2.023026   
std         0.068689           0.0       0.048788        0.218328   
min         1.000000           2.0       1.000000        1.000000   
25%         2.000000           2.0       2.000000        2.000000   
50%         2.000000           2.0       2.000000        2.000000   
75%         2.000000           2.0       2.000000        2.000000   
max         2.000000           2.0       2.000000        4.000000   

       SEXUAL_ORIENTATION         INCOME      INCOMEMIN  
count       609830.000000  609830.000000  609830.000000  
mean             2.096251       4.371771   72670.506535  
std              0.487654       2.121895   56755.052819  
min              1.000000       1.000000       0.000000  
25%              2.000000       3.000000   35000.000000  
50%              2.000000       4.000000   50000.000000  
75%              2.000000       6.000000  100000.000000  
max              5.000000       8.000000  200000.000000  
Mode of  WEEK :  43
Variance of  WEEK :  37.32194368751015
Mode of  EST_ST :  6
Variance of  EST_ST :  266.89687185333827
Mode of  TBIRTH_YEAR :  1955
Variance of  TBIRTH_YEAR :  238.24944848073915
Mode of  EEDUC :  6
Variance of  EEDUC :  2.081250683388053
Mode of  EGENID_BIRTH :  2
Variance of  EGENID_BIRTH :  0.0
Mode of  GENID_DESCRIBE :  2
Variance of  GENID_DESCRIBE :  0.047666915897604446
Mode of  SEXUAL_ORIENTATION :  2
Variance of  SEXUAL_ORIENTATION :  0.23780655025122638
Mode of  INCOMEMIN :  50000
Variance of  INCOMEMIN :  3221136020.4754634


# build datasets based on chosen gender
col_for_counts = ["WEEK"]

pulseStateReduceddf = pulseIncomeStatsdf
#print("Total set week counts:")
pulseStateCountdf = combinedf(pulseStateReduceddf,col_for_counts,"All")
#print(pulseStateCountdf)

pulseStateTotalsdf = pd.DataFrame()
pulseStateTotalsdf["AllWEEKcount"] = pulseStateCountdf["AllWEEKcount"].astype(int)

del pulseStateCountdf


pulseCisMendf = pulseIncomeStatsdf
pulseCisMendf = pulseCisMendf[pulseCisMendf["EGENID_BIRTH"] == 1]
pulseCisMendf = pulseCisMendf[pulseCisMendf["GENID_DESCRIBE"] == 1]
describeDF(pulseCisMendf, pulse_income_col)

#print("Cis men week counts:")
pulseCisMenReducedDf = combinedf(pulseCisMendf,col_for_counts,"CisMen")

pulseStateTotalsdf["CisMenWEEKcount"] = pulseCisMenReducedDf["CisMenWEEKcount"].astype(int)
#print(pulseStateTotalsdf["CisMenWEEKcount"])

                WEEK         EST_ST    TBIRTH_YEAR          EEDUC  \
count  432414.000000  432414.000000  432414.000000  432414.000000   
mean       44.064441      28.527571    1967.402887       5.441038   
std         6.133263      16.496388      16.082041       1.421493   
min        34.000000       1.000000    1933.000000       1.000000   
25%        39.000000      13.000000    1954.000000       4.000000   
50%        44.000000      28.000000    1966.000000       6.000000   
75%        50.000000      44.000000    1981.000000       7.000000   
max        54.000000      56.000000    2005.000000       7.000000   

               AEDUC  EGENID_BIRTH   AGENID_BIRTH  GENID_DESCRIBE  \
count  432414.000000      432414.0  432414.000000        432414.0   
mean        1.994237           1.0       1.998985             1.0   
std         0.075695           0.0       0.031847             0.0   
min         1.000000           1.0       1.000000             1.0   
25%         2.000000           1.0       2.000000             1.0   
50%         2.000000           1.0       2.000000             1.0   
75%         2.000000           1.0       2.000000             1.0   
max         2.000000           1.0       2.000000             1.0   

       SEXUAL_ORIENTATION         INCOME      INCOMEMIN  
count       432414.000000  432414.000000  432414.000000  
mean             2.019733       4.980276   88989.105348  
std              0.440445       2.082161   60375.536150  
min              1.000000       1.000000       0.000000  
25%              2.000000       4.000000   50000.000000  
50%              2.000000       5.000000   75000.000000  
75%              2.000000       7.000000  150000.000000  
max              5.000000       8.000000  200000.000000  
Mode of  WEEK :  43
Variance of  WEEK :  37.6169203141804
Mode of  EST_ST :  6
Variance of  EST_ST :  272.1308119804907
Mode of  TBIRTH_YEAR :  1955
Variance of  TBIRTH_YEAR :  258.63203797251515
Mode of  EEDUC :  6
Variance of  EEDUC :  2.0206427755641716
Mode of  EGENID_BIRTH :  1
Variance of  EGENID_BIRTH :  0.0
Mode of  GENID_DESCRIBE :  1
Variance of  GENID_DESCRIBE :  0.0
Mode of  SEXUAL_ORIENTATION :  2
Variance of  SEXUAL_ORIENTATION :  0.19399188938993783
Mode of  INCOMEMIN :  100000
Variance of  INCOMEMIN :  3645205365.415228


pulseCisWomendf = pulseIncomeStatsdf
pulseCisWomendf = pulseCisWomendf[pulseCisWomendf["EGENID_BIRTH"] == 2]
pulseCisWomendf = pulseCisWomendf[pulseCisWomendf["GENID_DESCRIBE"] == 2]
describeDF(pulseCisWomendf, pulse_income_col)

#print("Cis women week counts:")
pulseCisWomenReducedDf = combinedf(pulseCisWomendf,col_for_counts,"CisWomen")

pulseStateTotalsdf["CisWomenWEEKcount"] = pulseCisWomenReducedDf["CisWomenWEEKcount"].astype(int)
#print(pulseStateTotalsdf["CisWomenWEEKcount"])

                WEEK         EST_ST    TBIRTH_YEAR          EEDUC  \
count  599899.000000  599899.000000  599899.000000  599899.000000   
mean       43.804849      28.541668    1969.179439       5.321482   
std         6.108812      16.333975      15.348571       1.440654   
min        34.000000       1.000000    1933.000000       1.000000   
25%        39.000000      13.000000    1957.000000       4.000000   
50%        43.000000      28.000000    1969.000000       6.000000   
75%        49.000000      42.000000    1982.000000       7.000000   
max        54.000000      56.000000    2005.000000       7.000000   

               AEDUC  EGENID_BIRTH   AGENID_BIRTH  GENID_DESCRIBE  \
count  599899.000000      599899.0  599899.000000        599899.0   
mean        1.995281           2.0       1.998725             2.0   
std         0.068534           0.0       0.035687             0.0   
min         1.000000           2.0       1.000000             2.0   
25%         2.000000           2.0       2.000000             2.0   
50%         2.000000           2.0       2.000000             2.0   
75%         2.000000           2.0       2.000000             2.0   
max         2.000000           2.0       2.000000             2.0   

       SEXUAL_ORIENTATION         INCOME      INCOMEMIN  
count       599899.000000  599899.000000  599899.000000  
mean             2.085014       4.384903   72975.084139  
std              0.461465       2.119228   56763.424571  
min              1.000000       1.000000       0.000000  
25%              2.000000       3.000000   35000.000000  
50%              2.000000       4.000000   50000.000000  
75%              2.000000       6.000000  100000.000000  
max              5.000000       8.000000  200000.000000  
Mode of  WEEK :  43
Variance of  WEEK :  37.317581415112876
Mode of  EST_ST :  6
Variance of  EST_ST :  266.79872778314314
Mode of  TBIRTH_YEAR :  1955
Variance of  TBIRTH_YEAR :  235.5786372658161
Mode of  EEDUC :  6
Variance of  EEDUC :  2.07548426437156
Mode of  EGENID_BIRTH :  2
Variance of  EGENID_BIRTH :  0.0
Mode of  GENID_DESCRIBE :  2
Variance of  GENID_DESCRIBE :  0.0
Mode of  SEXUAL_ORIENTATION :  2
Variance of  SEXUAL_ORIENTATION :  0.2129499850843681
Mode of  INCOMEMIN :  50000
Variance of  INCOMEMIN :  3222086368.9871078


#print("Cisgender week counts:")
cisdf = pd.DataFrame()
cisdf["CisgenderWEEKcount"] = pulseStateTotalsdf.loc[:,["CisMenWEEKcount","CisWomenWEEKcount"]].sum(axis=1)
pulseStateTotalsdf["CisgenderWEEKcount"] = cisdf["CisgenderWEEKcount"].astype(int)
#print(pulseStateTotalsdf)


pulseTranswomendf = pulseIncomeStatsdf
pulseTranswomendf = pulseTranswomendf[pulseTranswomendf["EGENID_BIRTH"] == 1]
pulseTranswomendf = pulseTranswomendf[pulseTranswomendf["GENID_DESCRIBE"].isin([2,3])]
describeDF(pulseTranswomendf, pulse_income_col)

#print("Trans women week counts:")
pulseTranswomenReduceddf = combinedf(pulseTranswomendf,col_for_counts,"TransWomen")
#print(pulseTranswomenReduceddf)

pulseStateTotalsdf["TransWomenWEEKcount"] = pulseTranswomenReduceddf["TransWomenWEEKcount"].astype(int)

              WEEK       EST_ST  TBIRTH_YEAR        EEDUC        AEDUC  \
count  2654.000000  2654.000000  2654.000000  2654.000000  2654.000000   
mean     44.383572    29.207611  1976.794650     4.939337     1.993595   
std       6.134856    16.413115    18.514133     1.537229     0.079792   
min      34.000000     1.000000  1933.000000     1.000000     1.000000   
25%      40.000000    16.000000  1962.000000     4.000000     2.000000   
50%      44.000000    29.000000  1982.000000     5.000000     2.000000   
75%      50.000000    42.000000  1993.000000     6.000000     2.000000   
max      54.000000    56.000000  2005.000000     7.000000     2.000000   

       EGENID_BIRTH  AGENID_BIRTH  GENID_DESCRIBE  SEXUAL_ORIENTATION  \
count        2654.0   2654.000000     2654.000000          2654.00000   
mean            1.0      1.794650        2.554635             2.64318   
std             0.0      0.404034        0.497100             1.16700   
min             1.0      1.000000        2.000000             1.00000   
25%             1.0      2.000000        2.000000             2.00000   
50%             1.0      2.000000        3.000000             3.00000   
75%             1.0      2.000000        3.000000             4.00000   
max             1.0      2.000000        3.000000             5.00000   

            INCOME      INCOMEMIN  
count  2654.000000    2654.000000  
mean      3.669932   56865.109269  
std       2.190414   55358.774687  
min       1.000000       0.000000  
25%       2.000000   25000.000000  
50%       4.000000   50000.000000  
75%       5.000000   75000.000000  
max       8.000000  200000.000000  
Mode of  WEEK :  52
Variance of  WEEK :  37.6364582501901
Mode of  EST_ST :  6
Variance of  EST_ST :  269.39035290414995
Mode of  TBIRTH_YEAR :  1992
Variance of  TBIRTH_YEAR :  342.7731187425988
Mode of  EEDUC :  4
Variance of  EEDUC :  2.363073212535268
Mode of  EGENID_BIRTH :  1
Variance of  EGENID_BIRTH :  0.0
Mode of  GENID_DESCRIBE :  3
Variance of  GENID_DESCRIBE :  0.24710817771523663
Mode of  SEXUAL_ORIENTATION :  3
Variance of  SEXUAL_ORIENTATION :  1.3618890161739803
Mode of  INCOMEMIN :  0
Variance of  INCOMEMIN :  3064593934.8353977


pulseTransmendf = pulseIncomeStatsdf
pulseTransmendf = pulseTransmendf[pulseTransmendf["EGENID_BIRTH"] == 2]
pulseTransmendf = pulseTransmendf[pulseTransmendf["GENID_DESCRIBE"].isin([1,3])]
describeDF(pulseTransmendf, pulse_income_col)

#print("Trans men week counts:")
pulseTransmenReduceddf = combinedf(pulseTransmendf,col_for_counts,"TransMen")
#print(pulseTransmenReduceddf)

pulseStateTotalsdf["TransMenWEEKcount"] = pulseTransmenReduceddf["TransMenWEEKcount"].astype(int)

              WEEK       EST_ST  TBIRTH_YEAR        EEDUC        AEDUC  \
count  3444.000000  3444.000000  3444.000000  3444.000000  3444.000000   
mean     44.622242    29.090012  1982.988095     5.067364     1.994483   
std       6.068434    16.591153    17.827721     1.547615     0.074081   
min      34.000000     1.000000  1933.000000     1.000000     1.000000   
25%      40.000000    15.000000  1974.000000     4.000000     2.000000   
50%      45.000000    29.000000  1990.000000     6.000000     2.000000   
75%      50.000000    42.000000  1996.000000     6.000000     2.000000   
max      54.000000    56.000000  2005.000000     7.000000     2.000000   

       EGENID_BIRTH  AGENID_BIRTH  GENID_DESCRIBE  SEXUAL_ORIENTATION  \
count        3444.0   3444.000000     3444.000000         3444.000000   
mean            2.0      1.828688        2.310105            2.852497   
std             0.0      0.376836        0.950841            1.133170   
min             2.0      1.000000        1.000000            1.000000   
25%             2.0      2.000000        1.000000            2.000000   
50%             2.0      2.000000        3.000000            3.000000   
75%             2.0      2.000000        3.000000            4.000000   
max             2.0      2.000000        3.000000            5.000000   

            INCOME      INCOMEMIN  
count  3444.000000    3444.000000  
mean      3.504355   52778.745645  
std       2.122240   53090.947248  
min       1.000000       0.000000  
25%       2.000000   25000.000000  
50%       3.000000   35000.000000  
75%       5.000000   75000.000000  
max       8.000000  200000.000000  
Mode of  WEEK :  54
Variance of  WEEK :  36.82588913592966
Mode of  EST_ST :  6
Variance of  EST_ST :  275.26636549507276
Mode of  TBIRTH_YEAR :  1996
Variance of  TBIRTH_YEAR :  317.82762471819984
Mode of  EEDUC :  6
Variance of  EEDUC :  2.3951123034735593
Mode of  EGENID_BIRTH :  2
Variance of  EGENID_BIRTH :  0.0
Mode of  GENID_DESCRIBE :  3
Variance of  GENID_DESCRIBE :  0.9040976945597842
Mode of  SEXUAL_ORIENTATION :  3
Variance of  SEXUAL_ORIENTATION :  1.2840745062361207
Mode of  INCOMEMIN :  0
Variance of  INCOMEMIN :  2818648679.692473


pulseNonedf = pulseIncomeStatsdf
pulseNonedf = pulseNonedf[pulseNonedf["GENID_DESCRIBE"] == 4]
describeDF(pulseNonedf, pulse_income_col)

#print("Non-Binary week counts:")
pulseNoneReduceddf = combinedf(pulseNonedf,col_for_counts,"Enby")
#print(pulseNoneReduceddf)

pulseStateTotalsdf["EnbyWEEKcount"] = pulseNoneReduceddf["EnbyWEEKcount"].astype(int)

               WEEK        EST_ST   TBIRTH_YEAR         EEDUC         AEDUC  \
count  10164.000000  10164.000000  10164.000000  10164.000000  10164.000000   
mean      43.931425     28.821822   1975.280500      5.129083      1.993113   
std        6.128902     16.585780     17.485799      1.551461      0.082706   
min       34.000000      1.000000   1933.000000      1.000000      1.000000   
25%       39.000000     13.000000   1962.000000      4.000000      2.000000   
50%       44.000000     29.000000   1978.000000      6.000000      2.000000   
75%       49.000000     44.000000   1990.000000      6.000000      2.000000   
max       54.000000     56.000000   2005.000000      7.000000      2.000000   

       EGENID_BIRTH  AGENID_BIRTH  GENID_DESCRIBE  SEXUAL_ORIENTATION  \
count  10164.000000  10164.000000         10164.0        10164.000000   
mean       1.638233      1.981995             4.0            2.878099   
std        0.480535      0.132975             0.0            1.199313   
min        1.000000      1.000000             4.0            1.000000   
25%        1.000000      2.000000             4.0            2.000000   
50%        2.000000      2.000000             4.0            2.000000   
75%        2.000000      2.000000             4.0            4.000000   
max        2.000000      2.000000             4.0            5.000000   

             INCOME      INCOMEMIN  
count  10164.000000   10164.000000  
mean       3.853503   61277.056277  
std        2.212131   56845.106327  
min        1.000000       0.000000  
25%        2.000000   25000.000000  
50%        4.000000   50000.000000  
75%        6.000000  100000.000000  
max        8.000000  200000.000000  
Mode of  WEEK :  41
Variance of  WEEK :  37.56343628567068
Mode of  EST_ST :  6
Variance of  EST_ST :  275.0880960203078
Mode of  TBIRTH_YEAR :  1993
Variance of  TBIRTH_YEAR :  305.75315311040043
Mode of  EEDUC :  6
Variance of  EEDUC :  2.407029720940252
Mode of  EGENID_BIRTH :  2
Variance of  EGENID_BIRTH :  0.23091436232464746
Mode of  GENID_DESCRIBE :  4
Variance of  GENID_DESCRIBE :  0.0
Mode of  SEXUAL_ORIENTATION :  2
Variance of  SEXUAL_ORIENTATION :  1.4383513604283242
Mode of  INCOMEMIN :  0
Variance of  INCOMEMIN :  3231366113.305501


#print("NonCisgender week counts:")
transdf = pd.DataFrame()
transdf["NonCisgenderWEEKcount"] = pulseStateTotalsdf.loc[:,["TransWomenWEEKcount","TransMenWEEKcount","EnbyWEEKcount"]].sum(axis=1)
pulseStateTotalsdf["NonCisgenderWEEKcount"] = transdf["NonCisgenderWEEKcount"].astype(int)
#print(pulseStateTotalsdf)
del transdf


print("Percentages:")
pulseStateTotalsdf.assign(CisPercent = lambda x: (round(x["CisgenderWEEKcount"]/x["AllWEEKcount"] * 100,2)))
pulseStateTotalsdf.assign(NonCisPercent = lambda x: (round(x["NonCisgenderWEEKcount"]/x["AllWEEKcount"] * 100,2)))

Percentages:


pulseWeekTotalsdf = pulseStateTotalsdf
print(pulseWeekTotalsdf.head())
print()
state_count_col = ["AllWEEKcount","CisgenderWEEKcount","NonCisgenderWEEKcount"
                   ,"CisMenWEEKcount","CisWomenWEEKcount"
                   ,"TransWomenWEEKcount","TransMenWEEKcount","EnbyWEEKcount"]
describeDF(pulseWeekTotalsdf,state_count_col)

    AllWEEKcount  CisMenWEEKcount  CisWomenWEEKcount  CisgenderWEEKcount  \
43         64543            25986              37672               63658   
41         61390            24939              35525               60464   
42         60221            24204              35106               59310   
54         60220            25198              34080               59278   
52         56169            24487              30771               55258   

    TransWomenWEEKcount  TransMenWEEKcount  EnbyWEEKcount  \
43                  131                188            566   
41                  162                185            579   
42                  144                198            569   
54                  153                230            559   
52                  172                196            543   

    NonCisgenderWEEKcount  
43                    885  
41                    926  
42                    911  
54                    942  
52                    911  

       AllWEEKcount  CisMenWEEKcount  CisWomenWEEKcount  CisgenderWEEKcount  \
count     21.000000        21.000000          21.000000           21.000000   
mean   49932.142857     20591.142857       28566.619048        49157.761905   
std     8010.431825      3130.041123        4924.881364         7920.842814   
min    33270.000000     14994.000000       17652.000000        32675.000000   
25%    45991.000000     18625.000000       26597.000000        45310.000000   
50%    49090.000000     20076.000000       28689.000000        48380.000000   
75%    54574.000000     22845.000000       31041.000000        53763.000000   
max    64543.000000     25986.000000       37672.000000        63658.000000   

       TransWomenWEEKcount  TransMenWEEKcount  EnbyWEEKcount  \
count            21.000000          21.000000       21.00000   
mean            126.380952         164.000000      484.00000   
std              23.341971          27.597101       63.95389   
min              91.000000         106.000000      350.00000   
25%             112.000000         147.000000      443.00000   
50%             123.000000         158.000000      484.00000   
75%             145.000000         185.000000      523.00000   
max             172.000000         230.000000      579.00000   

       NonCisgenderWEEKcount  
count              21.000000  
mean              774.380952  
std               101.523631  
min               595.000000  
25%               710.000000  
50%               773.000000  
75%               849.000000  
max               942.000000  
Mode of  AllWEEKcount :  64543
Variance of  AllWEEKcount :  64167018.02857144
Mode of  CisgenderWEEKcount :  63658
Variance of  CisgenderWEEKcount :  62739750.89047618
Mode of  NonCisgenderWEEKcount :  911
Variance of  NonCisgenderWEEKcount :  10307.04761904762
Mode of  CisMenWEEKcount :  25986
Variance of  CisMenWEEKcount :  9797157.42857143
Mode of  CisWomenWEEKcount :  37672
Variance of  CisWomenWEEKcount :  24254456.447619047
Mode of  TransWomenWEEKcount :  131
Variance of  TransWomenWEEKcount :  544.8476190476191
Mode of  TransMenWEEKcount :  147
Variance of  TransMenWEEKcount :  761.6
Mode of  EnbyWEEKcount :  566
Variance of  EnbyWEEKcount :  4090.1


# build datasets based on chosen gender
col_for_counts = ["EST_ST"]

pulseStateReduceddf = pulseIncomeStatsdf
#print("Total set state counts:")
pulseStateCountdf = combinedf(pulseStateReduceddf,col_for_counts,"All")
#print(pulseStateCountdf)

pulseStateTotalsdf = pd.DataFrame()
pulseStateTotalsdf["AllEST_STcount"] = pulseStateCountdf["AllEST_STcount"].astype(int)

del pulseStateCountdf


pulseCisMendf = pulseIncomeStatsdf
pulseCisMendf = pulseCisMendf[pulseCisMendf["EGENID_BIRTH"] == 1]
pulseCisMendf = pulseCisMendf[pulseCisMendf["GENID_DESCRIBE"] == 1]

#print("Cis men state counts:")
pulseCisMenReducedDf = combinedf(pulseCisMendf,col_for_counts,"CisMen")

pulseStateTotalsdf["CisMenEST_STcount"] = pulseCisMenReducedDf["CisMenEST_STcount"].astype(int)
#print(pulseStateTotalsdf["CisMenEST_STcount"])


pulseCisWomendf = pulseIncomeStatsdf
pulseCisWomendf = pulseCisWomendf[pulseCisWomendf["EGENID_BIRTH"] == 2]
pulseCisWomendf = pulseCisWomendf[pulseCisWomendf["GENID_DESCRIBE"] == 2]

#print("Cis men state counts:")
pulseCisWomenReducedDf = combinedf(pulseCisWomendf,col_for_counts,"CisWomen")

pulseStateTotalsdf["CisWomenEST_STcount"] = pulseCisWomenReducedDf["CisWomenEST_STcount"].astype(int)
#print(pulseStateTotalsdf["CisWomenEST_STcount"])


#print("Cisgender state counts:")
cisdf = pd.DataFrame()
cisdf["CisgenderEST_STcount"] = pulseStateTotalsdf.loc[:,["CisMenEST_STcount","CisWomenEST_STcount"]].sum(axis=1)
pulseStateTotalsdf["CisgenderEST_STcount"] = cisdf["CisgenderEST_STcount"].astype(int)
#print(pulseStateTotalsdf)


pulseTranswomendf = pulseIncomeStatsdf
pulseTranswomendf = pulseTranswomendf[pulseTranswomendf["EGENID_BIRTH"] == 1]
pulseTranswomendf = pulseTranswomendf[pulseTranswomendf["GENID_DESCRIBE"].isin([2,3])]

#print("Trans women state counts:")
pulseTranswomenReduceddf = combinedf(pulseTranswomendf,col_for_counts,"TransWomen")
#print(pulseTranswomenReduceddf)

pulseStateTotalsdf["TransWomenEST_STcount"] = pulseTranswomenReduceddf["TransWomenEST_STcount"].astype(int)


pulseTransmendf = pulseIncomeStatsdf
pulseTransmendf = pulseTransmendf[pulseTransmendf["EGENID_BIRTH"] == 2]
pulseTransmendf = pulseTransmendf[pulseTransmendf["GENID_DESCRIBE"].isin([1,3])]

#print("Trans men column counts:")
pulseTransmenReduceddf = combinedf(pulseTransmendf,col_for_counts,"TransMen")
#print(pulseTransmenReduceddf)

pulseStateTotalsdf["TransMenEST_STcount"] = pulseTransmenReduceddf["TransMenEST_STcount"].astype(int)


pulseNonedf = pulseIncomeStatsdf
pulseNonedf = pulseNonedf[pulseNonedf["GENID_DESCRIBE"] == 4]

#print("Non-Binary column counts:")
pulseNoneReduceddf = combinedf(pulseNonedf,col_for_counts,"Enby")
#print(pulseNoneReduceddf)

pulseStateTotalsdf["EnbyEST_STcount"] = pulseNoneReduceddf["EnbyEST_STcount"].astype(int)


print("NonCisgender state counts:")
transdf = pd.DataFrame()
transdf["NonCisgenderEST_STcount"] = pulseStateTotalsdf.loc[:,["TransWomenEST_STcount","TransMenEST_STcount","EnbyEST_STcount"]].sum(axis=1)
pulseStateTotalsdf["NonCisgenderEST_STcount"] = transdf["NonCisgenderEST_STcount"].astype(int)
#print(pulseStateTotalsdf)
del transdf

NonCisgender state counts:


print("Percentages:")
pulseStateTotalsdf.assign(CisPercent = lambda x: (round(x["CisgenderEST_STcount"]/x["AllEST_STcount"] * 100,2)))
pulseStateTotalsdf.assign(NonCisPercent = lambda x: (round(x["NonCisgenderEST_STcount"]/x["AllEST_STcount"] * 100,2)))

Percentages:


print(pulseStateTotalsdf.head())
print()
state_count_col = ["AllEST_STcount","CisgenderEST_STcount","NonCisgenderEST_STcount"
                   ,"CisMenEST_STcount","CisWomenEST_STcount"
                   ,"TransWomenEST_STcount","TransMenEST_STcount","EnbyEST_STcount"]
describeDF(pulseStateTotalsdf,state_count_col)

    AllEST_STcount  CisMenEST_STcount  CisWomenEST_STcount  \
6            79763              35075                43358   
48           52968              22829                29432   
53           44216              19190                24043   
12           36164              15525                20207   
25           29233              12153                16581   

    CisgenderEST_STcount  TransWomenEST_STcount  TransMenEST_STcount  \
6                  78433                    196                  292   
48                 52261                    118                  136   
53                 43233                    161                  216   
12                 35732                     69                   76   
25                 28734                     97                  106   

    EnbyEST_STcount  NonCisgenderEST_STcount  
6               842                     1330  
48              453                      707  
53              606                      983  
12              287                      432  
25              296                      499  

       AllEST_STcount  CisMenEST_STcount  CisWomenEST_STcount  \
count       50.000000          50.000000            50.000000   
mean     20971.500000        8648.280000         11997.980000   
std      12222.406664        5449.401873          6575.810556   
min       8999.000000        3342.000000          5086.000000   
25%      13299.000000        5197.250000          7891.500000   
50%      18427.500000        7265.500000         10628.000000   
75%      24646.750000        9998.500000         14306.000000   
max      79763.000000       35075.000000         43358.000000   

       CisgenderEST_STcount  TransWomenEST_STcount  TransMenEST_STcount  \
count             50.000000              50.000000            50.000000   
mean           20646.260000              53.080000            68.880000   
std            12013.928975              36.049762            50.080604   
min             8872.000000              14.000000            18.000000   
25%            13131.250000              31.000000            38.250000   
50%            18139.000000              42.500000            58.000000   
75%            24285.250000              62.750000            81.500000   
max            78433.000000             196.000000           292.000000   

       EnbyEST_STcount  NonCisgenderEST_STcount  
count        50.000000                50.000000  
mean        203.280000               325.240000  
std         135.915946               219.943366  
min          80.000000               113.000000  
25%         111.500000               188.500000  
50%         169.500000               260.000000  
75%         242.250000               386.000000  
max         842.000000              1330.000000  
Mode of  AllEST_STcount :  79763
Variance of  AllEST_STcount :  149387224.66326532
Mode of  CisgenderEST_STcount :  78433
Variance of  CisgenderEST_STcount :  144334489.42081633
Mode of  NonCisgenderEST_STcount :  1330
Variance of  NonCisgenderEST_STcount :  48375.08408163265
Mode of  CisMenEST_STcount :  35075
Variance of  CisMenEST_STcount :  29695980.777142856
Mode of  CisWomenEST_STcount :  43358
Variance of  CisWomenEST_STcount :  43241284.46897959
Mode of  TransWomenEST_STcount :  44
Variance of  TransWomenEST_STcount :  1299.585306122449
Mode of  TransMenEST_STcount :  50
Variance of  TransMenEST_STcount :  2508.0669387755106
Mode of  EnbyEST_STcount :  106
Variance of  EnbyEST_STcount :  18473.14448979592


#add basic gender column
pulseMungedf = pulseIncomeStatsdf
pulseMungedf["CUR_GENID"] = pulseMungedf.apply(lambda x: basicGenMarker(x["EGENID_BIRTH"],x["GENID_DESCRIBE"]), axis=1)


#replace state column name to allow merging
pulseMungedf.rename(columns={"EST_ST":"stateId"}, inplace=True)
print(pulseMungedf.head())
pulseMungedf["stateId"] = pulseMungedf["stateId"].astype(int)
reedFulldf["stateId"] = reedFulldf["stateId"].astype(int)

pulseMungedf = pd.merge(pulseMungedf, reedFulldf, on="stateId", how='inner')
print(pulseMungedf.head())

#list(pulseMungedf.columns)

        SCRAM  WEEK  stateId  TBIRTH_YEAR  EEDUC  AEDUC  EGENID_BIRTH  \
1  V340000002    34        4         1982      7      2             2   
3  V340000004    34       31         1957      4      2             1   
4  V340000005    34       45         1962      5      2             2   
5  V340000006    34        8         1956      7      2             1   
6  V340000007    34       41         1982      7      2             2   

   AGENID_BIRTH  GENID_DESCRIBE  SEXUAL_ORIENTATION  INCOME   ENDDATE  \
1             2               2                   2       7  8/2/2021   
3             2               1                   2       6  8/2/2021   
4             2               2                   2       4  8/2/2021   
5             2               1                   2       7  8/2/2021   
6             2               2                   2       8  8/2/2021   

            EDUCATION ASSIGNEDGENDER CHOSENGENDER SEXUALORIENTATION  \
1     Graduate degree         female       female          straight   
3        some college           male         male          straight   
4  Associate's degree         female       female          straight   
5     Graduate degree           male         male          straight   
6     Graduate degree         female       female          straight   

   INCOMEMIN        CUR_GENID  
1     150000  Cisgender Woman  
3     100000    Cisgender Man  
4      50000  Cisgender Woman  
5     150000    Cisgender Man  
6     200000  Cisgender Woman  
        SCRAM  WEEK  stateId  TBIRTH_YEAR  EEDUC  AEDUC  EGENID_BIRTH  \
0  V340000002    34        4         1982      7      2             2   
1  V340000076    34        4         1986      6      2             1   
2  V340000087    34        4         1945      6      2             1   
3  V340000238    34        4         1966      6      2             2   
4  V340000281    34        4         1973      4      2             2   

   AGENID_BIRTH  GENID_DESCRIBE  SEXUAL_ORIENTATION  ...  \
0             2               2                   2  ...   
1             2               1                   2  ...   
2             2               1                   2  ...   
3             2               2                   2  ...   
4             2               2                   2  ...   

   veryReligiousStatista2017 moderatelyReligiousStatista2017  \
0                       0.31                            0.31   
1                       0.31                            0.31   
2                       0.31                            0.31   
3                       0.31                            0.31   
4                       0.31                            0.31   

  nonreligiousStatista2017 relLibScore2022 relLibVote2022 relLibVax2022  \
0                     0.39          0.4156            1.0           1.0   
1                     0.39          0.4156            1.0           1.0   
2                     0.39          0.4156            1.0           1.0   
3                     0.39          0.4156            1.0           1.0   
4                     0.39          0.4156            1.0           1.0   

   relLibHealth2022 relLibHealthMandate2022 relLibMarriage2022  relLibRfra2022  
0               4.0                     1.0                0.0             1.0  
1               4.0                     1.0                0.0             1.0  
2               4.0                     1.0                0.0             1.0  
3               4.0                     1.0                0.0             1.0  
4               4.0                     1.0                0.0             1.0  

[5 rows x 43 columns]


munge_col_list = ["SCRAM","WEEK","stateId","TBIRTH_YEAR","EEDUC","AEDUC","EGENID_BIRTH","AGENID_BIRTH",
                  "GENID_DESCRIBE","SEXUAL_ORIENTATION","INCOME","ENDDATE","EDUCATION","ASSIGNEDGENDER",
                  "CHOSENGENDER","SEXUALORIENTATION","INCOMEMIN","CUR_GENID",
                  "stateName","statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023",
                  "antiTransLegislationRiskIndex122022","antiTransLegislationRiskIndex112022","transAdultPop2016",
                  "transAdultPercent2016","transAdultPop2022","transAdultPercent2022","religionImportantPew2014",
                  "worshipWeeklyPew2014","prayDailyPew2014","certainAboutGodPew2014","overallReligiosityPew2014",
                  "veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017",
                  "relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022",
                  "relLibMarriage2022","relLibRfra2022"]


#sample from the dataset based on the "CUR_GENID" and stateId columns, 
#looking for (3gender*50state)*113samples = 16,950 rows per run
seed_value = 19
random.seed(seed_value)
rand_int = random.randint(0,1000)
modelSampledf = pulseMungedf.groupby(["CUR_GENID","stateId"]).sample(n=113,random_state=rand_int)

describeDF(modelSampledf,munge_col_list)

               WEEK       stateId   TBIRTH_YEAR         EEDUC         AEDUC  \
count  16950.000000  16950.000000  16950.000000  16950.000000  16950.000000   
mean      44.072153     29.320000   1971.105959      5.254572      1.995162   
std        6.117397     15.624084     16.989544      1.487861      0.069388   
min       34.000000      1.000000   1933.000000      1.000000      1.000000   
25%       39.000000     17.000000   1957.000000      4.000000      2.000000   
50%       44.000000     29.500000   1971.000000      6.000000      2.000000   
75%       50.000000     42.000000   1985.000000      7.000000      2.000000   
max       54.000000     56.000000   2005.000000      7.000000      2.000000   

       EGENID_BIRTH  AGENID_BIRTH  GENID_DESCRIBE  SEXUAL_ORIENTATION  \
count  16950.000000   16950.00000    16950.000000        16950.000000   
mean       1.533510       1.97056        2.136224            2.308024   
std        0.498891       0.16904        1.122674            0.850734   
min        1.000000       1.00000        1.000000            1.000000   
25%        1.000000       2.00000        1.000000            2.000000   
50%        2.000000       2.00000        2.000000            2.000000   
75%        2.000000       2.00000        3.000000            2.000000   
max        2.000000       2.00000        4.000000            5.000000   

             INCOME  ...  veryReligiousStatista2017  \
count  16950.000000  ...               16950.000000   
mean       4.256342  ...                   0.371600   
std        2.169266  ...                   0.089543   
min        1.000000  ...                   0.160000   
25%        2.000000  ...                   0.310000   
50%        4.000000  ...                   0.365000   
75%        6.000000  ...                   0.440000   
max        8.000000  ...                   0.590000   

       moderatelyReligiousStatista2017  nonreligiousStatista2017  \
count                     16950.000000              16950.000000   
mean                          0.287200                  0.342000   
std                           0.030137                  0.098856   
min                           0.160000                  0.120000   
25%                           0.270000                  0.290000   
50%                           0.295000                  0.340000   
75%                           0.300000                  0.400000   
max                           0.330000                  0.590000   

       relLibScore2022  relLibVote2022  relLibVax2022  relLibHealth2022  \
count     16950.000000    16950.000000   16950.000000       16950.00000   
mean          0.393948        0.800000       0.900000           6.76000   
std           0.131963        0.400012       0.300009           3.98288   
min           0.155800        0.000000       0.000000           0.00000   
25%           0.311700        1.000000       1.000000           4.00000   
50%           0.371200        1.000000       1.000000           5.50000   
75%           0.476200        1.000000       1.000000           9.00000   
max           0.818200        1.000000       1.000000          20.00000   

       relLibHealthMandate2022  relLibMarriage2022  relLibRfra2022  
count             16950.000000        16950.000000    16950.000000  
mean                  0.640000            1.160000        0.480000  
std                   0.480014            1.474628        0.499615  
min                   0.000000            0.000000        0.000000  
25%                   0.000000            0.000000        0.000000  
50%                   1.000000            0.000000        0.000000  
75%                   1.000000            3.000000        1.000000  
max                   1.000000            5.000000        1.000000  

[8 rows x 35 columns]
Mode of  WEEK :  54
Variance of  WEEK :  37.422547430596495
Mode of  stateId :  1
Variance of  stateId :  244.11200188801695
Mode of  TBIRTH_YEAR :  1960
Variance of  TBIRTH_YEAR :  288.6446219936923
Mode of  EEDUC :  6
Variance of  EEDUC :  2.2137306418648626
Mode of  AEDUC :  2
Variance of  AEDUC :  0.00481463825799801
Mode of  EGENID_BIRTH :  2
Variance of  EGENID_BIRTH :  0.24889174203157796
Mode of  AGENID_BIRTH :  2
Variance of  AGENID_BIRTH :  0.02857452802620946
Mode of  GENID_DESCRIBE :  1
Variance of  GENID_DESCRIBE :  1.2603963861043481
Mode of  SEXUAL_ORIENTATION :  2
Variance of  SEXUAL_ORIENTATION :  0.7237482323771591
Mode of  INCOME :  4
Variance of  INCOME :  4.705716751155775
Mode of  INCOMEMIN :  50000
Variance of  INCOMEMIN :  3305722837.0866547
Mode of  statePopulation2020 :  5024279
Variance of  statePopulation2020 :  54193215481182.87
Mode of  statePopulation2023 :  5097641
Variance of  statePopulation2023 :  356648599406395.7
Mode of  antiTransLegislationRiskIndex32023 :  4
Variance of  antiTransLegislationRiskIndex32023 :  2.513748303734734
Mode of  antiTransLegislationRiskIndex122022 :  3
Variance of  antiTransLegislationRiskIndex122022 :  1.7605038645347806
Mode of  antiTransLegislationRiskIndex112022 :  1
Variance of  antiTransLegislationRiskIndex112022 :  1.7477031093279838
Mode of  transAdultPop2016 :  2700
Variance of  transAdultPop2016 :  1331132916.9154522
Mode of  transAdultPercent2016 :  0.43
Variance of  transAdultPercent2016 :  0.014520696678270103
Mode of  transAdultPop2022 :  6300
Variance of  transAdultPop2022 :  828822454.0798867
Mode of  transAdultPercent2022 :  0.6
Variance of  transAdultPercent2022 :  0.015779690955218594
Mode of  religionImportantPew2014 :  0.44
Variance of  religionImportantPew2014 :  0.011325668181013627
Mode of  worshipWeeklyPew2014 :  0.34
Variance of  worshipWeeklyPew2014 :  0.005517965543689893
Mode of  prayDailyPew2014 :  0.51
Variance of  prayDailyPew2014 :  0.008712554014986136
Mode of  certainAboutGodPew2014 :  0.61
Variance of  certainAboutGodPew2014 :  0.008895564812083307
Mode of  overallReligiosityPew2014 :  0.54
Variance of  overallReligiosityPew2014 :  0.011309667237005131
Mode of  veryReligiousStatista2017 :  0.28
Variance of  veryReligiousStatista2017 :  0.008017913033217302
Mode of  moderatelyReligiousStatista2017 :  0.3
Variance of  moderatelyReligiousStatista2017 :  0.0009082135819222371
Mode of  nonreligiousStatista2017 :  0.33
Variance of  nonreligiousStatista2017 :  0.009772576553188979
Mode of  relLibScore2022 :  0.3377
Variance of  relLibScore2022 :  0.0174141322784353
Mode of  relLibVote2022 :  1.0
Variance of  relLibVote2022 :  0.16000944008496082
Mode of  relLibVax2022 :  1.0
Variance of  relLibVax2022 :  0.09000531004779044
Mode of  relLibHealth2022 :  5.0
Variance of  relLibHealth2022 :  15.86333589002301
Mode of  relLibHealthMandate2022 :  1.0
Variance of  relLibHealthMandate2022 :  0.2304135937223435
Mode of  relLibMarriage2022 :  0.0
Variance of  relLibMarriage2022 :  2.174528290754617
Mode of  relLibRfra2022 :  0.0
Variance of  relLibRfra2022 :  0.2496147265325388


#convert all values that will be used to model to categories or numbers
col_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","EGENID_BIRTH"
            ,"GENID_DESCRIBE","SEXUAL_ORIENTATION","INCOMEMIN","CUR_GENID"
            ,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
            ,"transAdultPop2022","overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
dfClean = modelSampledf[col_list].copy()

print(dfClean.dtypes)

# convert text columns to category values
dfClean["CUR_GENID"] = dfClean["CUR_GENID"].astype("category")
dfClean["CUR_GENID_CAT"] = dfClean["CUR_GENID"].cat.codes

print(dfClean.describe())

WEEK                                    int64
stateId                                 int32
TBIRTH_YEAR                             int64
EEDUC                                   int64
EGENID_BIRTH                            int64
GENID_DESCRIBE                          int64
SEXUAL_ORIENTATION                      int64
INCOMEMIN                               int64
CUR_GENID                              object
statePopulation2020                     int64
statePopulation2023                     int64
antiTransLegislationRiskIndex32023      int64
transAdultPop2022                       int64
overallReligiosityPew2014             float64
veryReligiousStatista2017             float64
moderatelyReligiousStatista2017       float64
nonreligiousStatista2017              float64
relLibScore2022                       float64
relLibVote2022                        float64
relLibVax2022                         float64
relLibHealth2022                      float64
relLibHealthMandate2022               float64
relLibMarriage2022                    float64
relLibRfra2022                        float64
dtype: object
               WEEK       stateId   TBIRTH_YEAR         EEDUC  EGENID_BIRTH  \
count  16950.000000  16950.000000  16950.000000  16950.000000  16950.000000   
mean      44.072153     29.320000   1971.105959      5.254572      1.533510   
std        6.117397     15.624084     16.989544      1.487861      0.498891   
min       34.000000      1.000000   1933.000000      1.000000      1.000000   
25%       39.000000     17.000000   1957.000000      4.000000      1.000000   
50%       44.000000     29.500000   1971.000000      6.000000      2.000000   
75%       50.000000     42.000000   1985.000000      7.000000      2.000000   
max       54.000000     56.000000   2005.000000      7.000000      2.000000   

       GENID_DESCRIBE  SEXUAL_ORIENTATION      INCOMEMIN  statePopulation2020  \
count    16950.000000        16950.000000   16950.000000         1.695000e+04   
mean         2.136224            2.308024   70282.595870         6.615242e+06   
std          1.122674            0.850734   57495.415792         7.361604e+06   
min          1.000000            1.000000       0.000000         5.768510e+05   
25%          1.000000            2.000000   25000.000000         1.839106e+06   
50%          2.000000            2.000000   50000.000000         4.581796e+06   
75%          3.000000            2.000000  100000.000000         7.705281e+06   
max          4.000000            5.000000  200000.000000         3.953822e+07   

       statePopulation2023  ...  moderatelyReligiousStatista2017  \
count         1.695000e+04  ...                     16950.000000   
mean          8.960485e+06  ...                         0.287200   
std           1.888514e+07  ...                         0.030137   
min           5.808170e+05  ...                         0.160000   
25%           1.920562e+06  ...                         0.270000   
50%           4.625424e+06  ...                         0.295000   
75%           7.999503e+06  ...                         0.300000   
max           1.309280e+08  ...                         0.330000   

       nonreligiousStatista2017  relLibScore2022  relLibVote2022  \
count              16950.000000     16950.000000    16950.000000   
mean                   0.342000         0.393948        0.800000   
std                    0.098856         0.131963        0.400012   
min                    0.120000         0.155800        0.000000   
25%                    0.290000         0.311700        1.000000   
50%                    0.340000         0.371200        1.000000   
75%                    0.400000         0.476200        1.000000   
max                    0.590000         0.818200        1.000000   

       relLibVax2022  relLibHealth2022  relLibHealthMandate2022  \
count   16950.000000       16950.00000             16950.000000   
mean        0.900000           6.76000                 0.640000   
std         0.300009           3.98288                 0.480014   
min         0.000000           0.00000                 0.000000   
25%         1.000000           4.00000                 0.000000   
50%         1.000000           5.50000                 1.000000   
75%         1.000000           9.00000                 1.000000   
max         1.000000          20.00000                 1.000000   

       relLibMarriage2022  relLibRfra2022  CUR_GENID_CAT  
count        16950.000000    16950.000000   16950.000000  
mean             1.160000        0.480000       1.000000  
std              1.474628        0.499615       0.816521  
min              0.000000        0.000000       0.000000  
25%              0.000000        0.000000       0.000000  
50%              0.000000        0.000000       1.000000  
75%              3.000000        1.000000       2.000000  
max              5.000000        1.000000       2.000000  

[8 rows x 24 columns]


# Build kNN Classifier to sort and classify data
# reduce dimensionality based on experimentation and hypothesis criteria

# split dataset into dependent(features) and independent(target) variable
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","EGENID_BIRTH"
            ,"SEXUAL_ORIENTATION","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
            ,"transAdultPop2022","overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
trainSize = 0.3
trainState = 1


# split datasets into training and test sets
size = 0.3
state = 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=size, random_state=state)

# scale input data for training if necessary for better predictions
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)

# prep data
cv_count = 18
max_neighbors = 200 #arbitrary magic number
parameters = {"n_neighbors": np.arange(1, max_neighbors)}
# run regression
knnr = KNeighborsClassifier(n_neighbors=cv_count,weights='distance')

# best neighbor count found in testing at 127
# use gridsearch to test all values for best n_neighbors number and highest accuracy
#knnr_gscv = GridSearchCV(knnr, parameters, cv=cv_count)
#knnr_gscv.fit(X.values, y.values)
#print("Best value for neighbor count found: ",knnr_gscv.best_params_)
#print("Best Average Accuracy found: ",knnr_gscv.best_score_)


# Build the new model
# split dataset into dependent(features) and independent(target) variable
#params = knnr_gscv.best_params_
#n_count = int(params['n_neighbors'])
n_count = 18

# splt datasets into training and test sets
size = 0.3
state = 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=size, random_state=state)

# run regression
knnr = KNeighborsClassifier(n_neighbors=n_count,weights='distance')
knnr.fit(X_train.values, y_train.values)
pred = knnr.predict(X_test.values)

# confusion matrix for visualization is available, but unnecessary for this dataset
cfm = metrics.confusion_matrix(y_test, pred)
fig, ax = plt.subplots(figsize=(6, 6))
ax.matshow(cfm, cmap = plt.cm.Purples, alpha=0.3)
for i in range(cfm.shape[0]):
    for j in range(cfm.shape[1]):
        ax.text(x=j, y=i,s=cfm[i, j], va='center', 
                ha='center', size='xx-large')
 
plt.xlabel('Predictions', fontsize = 16)
plt.ylabel('Actuals', fontsize = 16)
plt.title('kNN Confusion Matrix', fontsize = 16)
plt.show()

print(metrics.classification_report(y_test, pred, zero_division = 0))

              precision    recall  f1-score   support

           0       0.42      0.43      0.43      1721
           1       0.36      0.38      0.37      1650
           2       0.48      0.43      0.45      1714

    accuracy                           0.42      5085
   macro avg       0.42      0.42      0.42      5085
weighted avg       0.42      0.42      0.42      5085


printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix')

              precision    recall  f1-score   support

           0       0.84      0.97      0.90      3976
           1       0.75      0.93      0.83      3896
           2       0.83      0.51      0.63      3993

    accuracy                           0.80     11865
   macro avg       0.81      0.80      0.79     11865
weighted avg       0.81      0.80      0.79     11865


# initialize, train and test the GNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
pred = gnb.predict(X_test)

#check accuracy
gnb_accuracy = metrics.accuracy_score(pred, y_test)

#confusion matrix
cfm = metrics.confusion_matrix(y_test, pred)
fig, ax = plt.subplots(figsize=(6,6))
ax.matshow(cfm, cmap = plt.cm.Purples, alpha=0.3)
for i in range(cfm.shape[0]):
    for j in range(cfm.shape[1]):
        ax.text(x=j, y=i,s=cfm[i, j], va='center', 
                ha='center', size='xx-large')

plt.xlabel('Predictions', fontsize = 16)
plt.ylabel('Actuals', fontsize = 16)
plt.title('Gaussian NB Confusion Matrix', fontsize = 16)
plt.show()

print(metrics.classification_report(y_test, pred, zero_division = 0))

              precision    recall  f1-score   support

           0       0.44      0.41      0.42      1721
           1       0.32      0.15      0.20      1650
           2       0.40      0.63      0.49      1714

    accuracy                           0.40      5085
   macro avg       0.39      0.40      0.37      5085
weighted avg       0.39      0.40      0.38      5085


printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix')

              precision    recall  f1-score   support

           0       0.84      0.97      0.90      3976
           1       0.69      0.98      0.81      3896
           2       0.87      0.39      0.54      3993

    accuracy                           0.78     11865
   macro avg       0.80      0.78      0.75     11865
weighted avg       0.80      0.78      0.75     11865


# Setup for A/B testing
# remove "SEXUAL_ORIENTATION" column
# split dataset into dependent(features) and independent(target) variable
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","EGENID_BIRTH","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
            ,"transAdultPop2022","overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]


printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Sexuality Removed')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Sexuality Removed')

              precision    recall  f1-score   support

           0       0.74      0.90      0.81      3976
           1       0.65      0.89      0.75      3896
           2       0.52      0.22      0.31      3993

    accuracy                           0.67     11865
   macro avg       0.64      0.67      0.62     11865
weighted avg       0.64      0.67      0.62     11865

              precision    recall  f1-score   support

           0       0.71      1.00      0.83      3976
           1       0.69      0.90      0.78      3896
           2       0.68      0.20      0.31      3993

    accuracy                           0.70     11865
   macro avg       0.69      0.70      0.64     11865
weighted avg       0.69      0.70      0.64     11865


# Setup for A/B testing
# remove "EEDUC","INCOMEMIN" column
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","SEXUAL_ORIENTATION"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"antiTransLegislationRiskIndex32023"
            ,"overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]


printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Education and Income Removed')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Education and Income Removed')

              precision    recall  f1-score   support

           0       0.84      0.97      0.90      3976
           1       0.75      0.93      0.83      3896
           2       0.83      0.51      0.63      3993

    accuracy                           0.80     11865
   macro avg       0.81      0.80      0.79     11865
weighted avg       0.81      0.80      0.79     11865

              precision    recall  f1-score   support

           0       0.84      0.97      0.90      3976
           1       0.69      0.98      0.81      3896
           2       0.87      0.39      0.54      3993

    accuracy                           0.78     11865
   macro avg       0.80      0.78      0.75     11865
weighted avg       0.80      0.78      0.75     11865


# Setup for A/B testing
# remove ,"statePopulation2020","statePopulation2023","transAdultPop2022" column
#population table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"antiTransLegislationRiskIndex32023"
            ,"overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]


printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Population Removed')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Population Table Removed')

              precision    recall  f1-score   support

           0       0.74      0.90      0.81      3976
           1       0.65      0.90      0.75      3896
           2       0.52      0.21      0.30      3993

    accuracy                           0.67     11865
   macro avg       0.64      0.67      0.62     11865
weighted avg       0.64      0.67      0.62     11865

              precision    recall  f1-score   support

           0       0.71      1.00      0.83      3976
           1       0.69      0.91      0.78      3896
           2       0.69      0.20      0.31      3993

    accuracy                           0.70     11865
   macro avg       0.70      0.70      0.64     11865
weighted avg       0.70      0.70      0.64     11865


# Setup for A/B testing
# remove ,"antiTransLegislationRiskIndex32023" column
#anti-trans legislation table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]


printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Anti-Trans Legislation Removed')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Anti-Trans Legislation Table Removed')

              precision    recall  f1-score   support

           0       0.74      0.90      0.81      3976
           1       0.65      0.89      0.75      3896
           2       0.52      0.22      0.31      3993

    accuracy                           0.67     11865
   macro avg       0.64      0.67      0.63     11865
weighted avg       0.64      0.67      0.62     11865

              precision    recall  f1-score   support

           0       0.71      1.00      0.83      3976
           1       0.69      0.90      0.78      3896
           2       0.68      0.20      0.31      3993

    accuracy                           0.70     11865
   macro avg       0.70      0.70      0.64     11865
weighted avg       0.70      0.70      0.64     11865


# Setup for A/B testing
# remove ,"overallReligiosityPew2014" column
#Pew 2014 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"antiTransLegislationRiskIndex32023"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]


printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Pew 2014 Removed')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Pew 2014 Table Removed')

              precision    recall  f1-score   support

           0       0.74      0.90      0.81      3976
           1       0.65      0.89      0.75      3896
           2       0.52      0.22      0.31      3993

    accuracy                           0.67     11865
   macro avg       0.64      0.67      0.62     11865
weighted avg       0.64      0.67      0.62     11865

              precision    recall  f1-score   support

           0       0.71      1.00      0.83      3976
           1       0.69      0.91      0.78      3896
           2       0.68      0.20      0.31      3993

    accuracy                           0.70     11865
   macro avg       0.70      0.70      0.64     11865
weighted avg       0.70      0.70      0.64     11865


# Setup for A/B testing
# remove ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017" column
#Statista 2017 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"antiTransLegislationRiskIndex32023"
            ,"overallReligiosityPew2014"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]


printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Statista 2017 Removed')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Statista 2017 Table Removed')

              precision    recall  f1-score   support

           0       0.74      0.90      0.81      3976
           1       0.65      0.90      0.75      3896
           2       0.52      0.22      0.31      3993

    accuracy                           0.67     11865
   macro avg       0.64      0.67      0.62     11865
weighted avg       0.64      0.67      0.62     11865

              precision    recall  f1-score   support

           0       0.71      1.00      0.83      3976
           1       0.69      0.90      0.78      3896
           2       0.68      0.20      0.31      3993

    accuracy                           0.70     11865
   macro avg       0.69      0.70      0.64     11865
weighted avg       0.69      0.70      0.64     11865


# Setup for A/B testing
# remove ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014" column
#Statista 2017 and Pew 2014 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"antiTransLegislationRiskIndex32023"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]


printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Statista 2017 and Pew 2014 Removed')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Statista 2017 and Pew 2014 Table Removed')

              precision    recall  f1-score   support

           0       0.74      0.90      0.81      3976
           1       0.65      0.90      0.75      3896
           2       0.52      0.22      0.31      3993

    accuracy                           0.67     11865
   macro avg       0.64      0.67      0.62     11865
weighted avg       0.64      0.67      0.62     11865

              precision    recall  f1-score   support

           0       0.71      1.00      0.83      3976
           1       0.69      0.90      0.78      3896
           2       0.68      0.20      0.31      3993

    accuracy                           0.70     11865
   macro avg       0.69      0.70      0.64     11865
weighted avg       0.69      0.70      0.64     11865


# Setup for A/B testing
# remove ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022" column
#Religious Liberty table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"antiTransLegislationRiskIndex32023"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]


printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Religious Liberty Removed')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Religious Liberty Table Removed')

              precision    recall  f1-score   support

           0       0.74      0.90      0.81      3976
           1       0.65      0.89      0.75      3896
           2       0.52      0.23      0.32      3993

    accuracy                           0.67     11865
   macro avg       0.64      0.67      0.63     11865
weighted avg       0.64      0.67      0.63     11865

              precision    recall  f1-score   support

           0       0.71      1.00      0.83      3976
           1       0.69      0.91      0.78      3896
           2       0.69      0.20      0.31      3993

    accuracy                           0.70     11865
   macro avg       0.70      0.70      0.64     11865
weighted avg       0.70      0.70      0.64     11865


# Setup for A/B testing
#,"statePopulation2020","statePopulation2023","transAdultPop2022"
#,"antiTransLegislationRiskIndex32023"
#,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014"
# remove ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022" column
#Religious Liberty table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]


printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Only Pulse Data')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Only Pulse Data')

              precision    recall  f1-score   support

           0       0.74      0.90      0.81      3976
           1       0.65      0.89      0.76      3896
           2       0.53      0.23      0.32      3993

    accuracy                           0.67     11865
   macro avg       0.64      0.67      0.63     11865
weighted avg       0.64      0.67      0.63     11865

              precision    recall  f1-score   support

           0       0.71      1.00      0.83      3976
           1       0.62      1.00      0.76      3896
           2       0.00      0.00      0.00      3993

    accuracy                           0.66     11865
   macro avg       0.44      0.67      0.53     11865
weighted avg       0.44      0.66      0.53     11865


clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","EGENID_BIRTH"
            ,"SEXUAL_ORIENTATION","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
            ,"transAdultPop2022","overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]

# regression pick

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
sc_X = StandardScaler()
sc_y = StandardScaler()
sc_X_train = sc_X.fit_transform(X_train)
sc_y_train = sc_y.fit_transform(y_train.values.reshape(len(y_train),1))
sc_y_train = sc_y_train


log_regression = LogisticRegression(solver="newton-cg", random_state=0).fit(X_train,y_train)
print("Training set score: " + str(round(log_regression.score(X_train,y_train), decPrecision)))
print("Test set score: " + str(round(log_regression.score(X_test,y_test), decPrecision)))

#featTitle = "Logistic Regression Feature Coefficients"
#print(log_regression.coef_)
#print(log_regression.coef_.shape[0])
#print(log_regression.coef_[0])
coefArray = []
for x in log_regression.coef_[0]:
    #print(np.exp(x)/(1 + np.exp(x)))
    coefArray.append(np.exp(x))

featureDf = pd.DataFrame(zip(clean_feature_list, np.transpose(coefArray)), columns=["featureNames", "Coefficients"])
print(featureDf)

Training set score: 0.68
Test set score: 0.6743
         featureNames  Coefficients
0                WEEK      1.002763
1             stateId      1.000906
2         TBIRTH_YEAR      1.003440
3               EEDUC      0.005272
4        EGENID_BIRTH      1.041176
5  SEXUAL_ORIENTATION      1.000001

C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:210: ConvergenceWarning: newton-cg failed to converge. Increase the number of iterations.
  warnings.warn(


log_regression = LogisticRegression(solver="sag", random_state=0).fit(X_train,y_train)
print("Training set score: " + str(round(log_regression.score(X_train,y_train), decPrecision)))
print("Test set score: " + str(round(log_regression.score(X_test,y_test), decPrecision)))

#featTitle = "Logistic Regression Feature Coefficients"
#print(log_regression.coef_)
#print(log_regression.coef_.shape[0])
#print(log_regression.coef_[0])
coefArray = []
for x in log_regression.coef_[0]:
    #print(np.exp(x)/(1 + np.exp(x)))
    coefArray.append(np.exp(x)/(1 + np.exp(x)))

featureDf = pd.DataFrame(zip(clean_feature_list, np.transpose(coefArray)), columns=["featureNames", "Coefficients"])
print(featureDf)

Training set score: 0.4144
Test set score: 0.411
         featureNames  Coefficients
0                WEEK      0.500001
1             stateId      0.500001
2         TBIRTH_YEAR      0.499962
3               EEDUC      0.499995
4        EGENID_BIRTH      0.500000
5  SEXUAL_ORIENTATION      0.500001

C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(


log_regression = LogisticRegression(solver="saga", random_state=0).fit(X_train,y_train)
print("Training set score: " + str(round(log_regression.score(X_train,y_train), decPrecision)))
print("Test set score: " + str(round(log_regression.score(X_test,y_test), decPrecision)))

#featTitle = "Logistic Regression Feature Coefficients"
#print(log_regression.coef_)
#print(log_regression.coef_.shape[0])
#print(log_regression.coef_[0])
coefArray = []
for x in log_regression.coef_[0]:
    #print(np.exp(x)/(1 + np.exp(x)))
    coefArray.append(np.exp(x)/(1 + np.exp(x)))

featureDf = pd.DataFrame(zip(clean_feature_list, np.transpose(coefArray)), columns=["featureNames", "Coefficients"])
print(featureDf)

Training set score: 0.4144
Test set score: 0.411
         featureNames  Coefficients
0                WEEK      0.500000
1             stateId      0.500000
2         TBIRTH_YEAR      0.499962
3               EEDUC      0.499997
4        EGENID_BIRTH      0.500000
5  SEXUAL_ORIENTATION      0.500001

C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(


log_regression = LogisticRegression(solver="lbfgs", random_state=0).fit(X_train,y_train)
print("Training set score: " + str(round(log_regression.score(X_train,y_train), decPrecision)))
print("Test set score: " + str(round(log_regression.score(X_test,y_test), decPrecision)))

#featTitle = "Logistic Regression Feature Coefficients"
#print(log_regression.coef_)
#print(log_regression.coef_.shape[0])
#print(log_regression.coef_[0])
coefArray = []
for x in log_regression.coef_[0]:
    #print(np.exp(x)/(1 + np.exp(x)))
    coefArray.append(np.exp(x)/(1 + np.exp(x)))

featureDf = pd.DataFrame(zip(clean_feature_list, np.transpose(coefArray)), columns=["featureNames", "Coefficients"])
print(featureDf)

Training set score: 0.4144
Test set score: 0.411
         featureNames  Coefficients
0                WEEK      0.499999
1             stateId      0.499999
2         TBIRTH_YEAR      0.499962
3               EEDUC      0.500000
4        EGENID_BIRTH      0.500000
5  SEXUAL_ORIENTATION      0.500001


clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","EGENID_BIRTH"
            ,"SEXUAL_ORIENTATION","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
            ,"transAdultPop2022","overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Full Dataset")
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1-0.3, random_state = 0)
#sc_X = StandardScaler()
#sc_y = StandardScaler()
#sc_X_train = sc_X.fit_transform(X_train)
#sc_y_train = sc_y.fit_transform(y_train.values.reshape(len(y_train),1))
#sc_y_train = sc_y_train

#maxIter=1000000000
#log_regression = LogisticRegression(max_iter=max_iter)
#solvers = ["liblinear","newton-cg","sag","saga","lbfgs"]
#penalty=["l2"]
#cVals=[0.01,0.1,1.0,10.0,100.0]

#grid=dict(solver=solvers,penalty=penalty,C=cVals)
#cv=RepeatedStratifiedKFold(n_splits=10,n_repeats=3,random_state=state)
#grid_search = GridSearchCV(estimator=log_regression, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
#grid_result = grid_search.fit(X,y)
# summarize results
#print("Accuracy rate of Logistic Regression: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
#Accuracy rate of Logistic Regression: 0.797286 using {'C': 0.01, 'penalty': 'l2', 'solver': 'newton-cg'}


#log_regression = LogisticRegression(solver="newton-cg", random_state=state, penalty="l2", C=0.01, max_iter=maxIter).fit(X_train,y_train)
#print(log_regression.coef_)
#print(log_regression.coef_.shape[0])
#print(log_regression.coef_[0])
#print("Training set score: " + str(round(log_regression.score(X_train,y_train), decPrecision)))
#print("Test set score: " + str(round(log_regression.score(X_test,y_test), decPrecision)))

#coefArray = []
#for ind in range(log_regression.coef_.shape[0]):
#    if ind == 0:
#        featFor = "Cisgender Men"
#    elif ind == 1:
#        featFor = "Cisgender Women"
#    else:
#        featFor = "Transgender"
#    featTitle = "Logistic Regression " + featFor + " Feature Coefficients"

#    for x in log_regression.coef_[ind]:
#        #print(np.exp(x)/(1 + np.exp(x)))
#        coefArray.append(np.exp(x)/(1 + np.exp(x)))

#    featureDf = pd.DataFrame(zip(clean_feature_list, np.transpose(coefArray)), columns=["Features", "Coefficients"])
#    featureDf = featureDf.sort_values(by=["Coefficients"], ascending=False)
#    print(featureDf)
    
#    #plot bar chart of importance
#    f, ax = plt.subplots(figsize=(20,12))
#    sns.barplot(x=featureDf["Features"], y=featureDf["Coefficients"], palette="flare")
#    plt.title(featTitle, fontsize=14)
#    plt.xticks(rotation=45)
    
#    plt.show()

C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging
  warn(msg, LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
  warnings.warn("Line Search failed")

Logistic Regression Accuracy: 0.791
                              Features  Coefficients
3                                EEDUC      0.510278
11           overallReligiosityPew2014      0.504194
12           veryReligiousStatista2017      0.501816
13     moderatelyReligiousStatista2017      0.501757
21                      relLibRfra2022      0.501642
15                     relLibScore2022      0.500698
20                  relLibMarriage2022      0.500558
2                          TBIRTH_YEAR      0.500448
18                    relLibHealth2022      0.500004
6                            INCOMEMIN      0.500001
7                  statePopulation2020      0.500000
8                  statePopulation2023      0.500000
10                   transAdultPop2022      0.499999
0                                 WEEK      0.499949
1                              stateId      0.499818
19             relLibHealthMandate2022      0.499021
14            nonreligiousStatista2017      0.499015
9   antiTransLegislationRiskIndex32023      0.498912
17                       relLibVax2022      0.497051
16                      relLibVote2022      0.491821
5                   SEXUAL_ORIENTATION      0.406382
4                         EGENID_BIRTH      0.112777

                              Features  Coefficients
3                                EEDUC      0.510278
11           overallReligiosityPew2014      0.504194
12           veryReligiousStatista2017      0.501816
13     moderatelyReligiousStatista2017      0.501757
21                      relLibRfra2022      0.501642
15                     relLibScore2022      0.500698
20                  relLibMarriage2022      0.500558
2                          TBIRTH_YEAR      0.500448
18                    relLibHealth2022      0.500004
6                            INCOMEMIN      0.500001
7                  statePopulation2020      0.500000
8                  statePopulation2023      0.500000
10                   transAdultPop2022      0.499999
0                                 WEEK      0.499949
1                              stateId      0.499818
19             relLibHealthMandate2022      0.499021
14            nonreligiousStatista2017      0.499015
9   antiTransLegislationRiskIndex32023      0.498912
17                       relLibVax2022      0.497051
16                      relLibVote2022      0.491821
5                   SEXUAL_ORIENTATION      0.406382
4                         EGENID_BIRTH      0.112777

                              Features  Coefficients
3                                EEDUC      0.510278
11           overallReligiosityPew2014      0.504194
12           veryReligiousStatista2017      0.501816
13     moderatelyReligiousStatista2017      0.501757
21                      relLibRfra2022      0.501642
15                     relLibScore2022      0.500698
20                  relLibMarriage2022      0.500558
2                          TBIRTH_YEAR      0.500448
18                    relLibHealth2022      0.500004
6                            INCOMEMIN      0.500001
7                  statePopulation2020      0.500000
8                  statePopulation2023      0.500000
10                   transAdultPop2022      0.499999
0                                 WEEK      0.499949
1                              stateId      0.499818
19             relLibHealthMandate2022      0.499021
14            nonreligiousStatista2017      0.499015
9   antiTransLegislationRiskIndex32023      0.498912
17                       relLibVax2022      0.497051
16                      relLibVote2022      0.491821
5                   SEXUAL_ORIENTATION      0.406382
4                         EGENID_BIRTH      0.112777


# Setup for A/B testing
# remove "SEXUAL_ORIENTATION" column
# split dataset into dependent(features) and independent(target) variable
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","EGENID_BIRTH","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
            ,"transAdultPop2022","overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Sexuality Removed")

C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging
  warn(msg, LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
  warnings.warn("Line Search failed")

Logistic Regression Accuracy: 0.6689
                              Features  Coefficients
3                                EEDUC      0.511605
20                      relLibRfra2022      0.505645
10           overallReligiosityPew2014      0.504704
12     moderatelyReligiousStatista2017      0.502584
19                  relLibMarriage2022      0.501647
11           veryReligiousStatista2017      0.501373
0                                 WEEK      0.500593
2                          TBIRTH_YEAR      0.500314
14                     relLibScore2022      0.500257
8   antiTransLegislationRiskIndex32023      0.500143
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
9                    transAdultPop2022      0.500000
17                    relLibHealth2022      0.499961
1                              stateId      0.499648
13            nonreligiousStatista2017      0.499627
16                       relLibVax2022      0.497257
18             relLibHealthMandate2022      0.496243
15                      relLibVote2022      0.489497
4                         EGENID_BIRTH      0.112855

                              Features  Coefficients
3                                EEDUC      0.511605
20                      relLibRfra2022      0.505645
10           overallReligiosityPew2014      0.504704
12     moderatelyReligiousStatista2017      0.502584
19                  relLibMarriage2022      0.501647
11           veryReligiousStatista2017      0.501373
0                                 WEEK      0.500593
2                          TBIRTH_YEAR      0.500314
14                     relLibScore2022      0.500257
8   antiTransLegislationRiskIndex32023      0.500143
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
9                    transAdultPop2022      0.500000
17                    relLibHealth2022      0.499961
1                              stateId      0.499648
13            nonreligiousStatista2017      0.499627
16                       relLibVax2022      0.497257
18             relLibHealthMandate2022      0.496243
15                      relLibVote2022      0.489497
4                         EGENID_BIRTH      0.112855

                              Features  Coefficients
3                                EEDUC      0.511605
20                      relLibRfra2022      0.505645
10           overallReligiosityPew2014      0.504704
12     moderatelyReligiousStatista2017      0.502584
19                  relLibMarriage2022      0.501647
11           veryReligiousStatista2017      0.501373
0                                 WEEK      0.500593
2                          TBIRTH_YEAR      0.500314
14                     relLibScore2022      0.500257
8   antiTransLegislationRiskIndex32023      0.500143
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
9                    transAdultPop2022      0.500000
17                    relLibHealth2022      0.499961
1                              stateId      0.499648
13            nonreligiousStatista2017      0.499627
16                       relLibVax2022      0.497257
18             relLibHealthMandate2022      0.496243
15                      relLibVote2022      0.489497
4                         EGENID_BIRTH      0.112855


# Setup for A/B testing
# remove "EGENID_BIRTH", column
# split dataset into dependent(features) and independent(target) variable
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","SEXUAL_ORIENTATION","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
            ,"transAdultPop2022","overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Sexuality Removed")

C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging
  warn(msg, LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
  warnings.warn("Line Search failed")

Logistic Regression Accuracy: 0.5102
                              Features  Coefficients
3                                EEDUC      0.510443
10           overallReligiosityPew2014      0.503726
18             relLibHealthMandate2022      0.502791
20                      relLibRfra2022      0.502573
11           veryReligiousStatista2017      0.501265
12     moderatelyReligiousStatista2017      0.501013
8   antiTransLegislationRiskIndex32023      0.500868
0                                 WEEK      0.500678
14                     relLibScore2022      0.500242
2                          TBIRTH_YEAR      0.500045
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
9                    transAdultPop2022      0.499999
1                              stateId      0.499939
13            nonreligiousStatista2017      0.499419
17                    relLibHealth2022      0.498873
19                  relLibMarriage2022      0.497932
16                       relLibVax2022      0.497623
15                      relLibVote2022      0.490879
4                   SEXUAL_ORIENTATION      0.402553

                              Features  Coefficients
3                                EEDUC      0.510443
10           overallReligiosityPew2014      0.503726
18             relLibHealthMandate2022      0.502791
20                      relLibRfra2022      0.502573
11           veryReligiousStatista2017      0.501265
12     moderatelyReligiousStatista2017      0.501013
8   antiTransLegislationRiskIndex32023      0.500868
0                                 WEEK      0.500678
14                     relLibScore2022      0.500242
2                          TBIRTH_YEAR      0.500045
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
9                    transAdultPop2022      0.499999
1                              stateId      0.499939
13            nonreligiousStatista2017      0.499419
17                    relLibHealth2022      0.498873
19                  relLibMarriage2022      0.497932
16                       relLibVax2022      0.497623
15                      relLibVote2022      0.490879
4                   SEXUAL_ORIENTATION      0.402553

                              Features  Coefficients
3                                EEDUC      0.510443
10           overallReligiosityPew2014      0.503726
18             relLibHealthMandate2022      0.502791
20                      relLibRfra2022      0.502573
11           veryReligiousStatista2017      0.501265
12     moderatelyReligiousStatista2017      0.501013
8   antiTransLegislationRiskIndex32023      0.500868
0                                 WEEK      0.500678
14                     relLibScore2022      0.500242
2                          TBIRTH_YEAR      0.500045
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
9                    transAdultPop2022      0.499999
1                              stateId      0.499939
13            nonreligiousStatista2017      0.499419
17                    relLibHealth2022      0.498873
19                  relLibMarriage2022      0.497932
16                       relLibVax2022      0.497623
15                      relLibVote2022      0.490879
4                   SEXUAL_ORIENTATION      0.402553


# Setup for A/B testing
# remove "EEDUC","INCOMEMIN" column
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","SEXUAL_ORIENTATION"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"antiTransLegislationRiskIndex32023"
            ,"overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Education and Income Removed")

C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging
  warn(msg, LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
  warnings.warn("Line Search failed")

Logistic Regression Accuracy: 0.7853
                              Features  Coefficients
9            overallReligiosityPew2014      0.503246
11     moderatelyReligiousStatista2017      0.501590
19                      relLibRfra2022      0.501426
10           veryReligiousStatista2017      0.501360
13                     relLibScore2022      0.500738
18                  relLibMarriage2022      0.500612
2                          TBIRTH_YEAR      0.500497
15                       relLibVax2022      0.500336
16                    relLibHealth2022      0.500105
5                  statePopulation2020      0.500000
6                  statePopulation2023      0.500000
7                    transAdultPop2022      0.499999
1                              stateId      0.499872
0                                 WEEK      0.499861
12            nonreligiousStatista2017      0.499604
17             relLibHealthMandate2022      0.499491
8   antiTransLegislationRiskIndex32023      0.496482
14                      relLibVote2022      0.494724
4                   SEXUAL_ORIENTATION      0.403340
3                         EGENID_BIRTH      0.114981

                              Features  Coefficients
9            overallReligiosityPew2014      0.503246
11     moderatelyReligiousStatista2017      0.501590
19                      relLibRfra2022      0.501426
10           veryReligiousStatista2017      0.501360
13                     relLibScore2022      0.500738
18                  relLibMarriage2022      0.500612
2                          TBIRTH_YEAR      0.500497
15                       relLibVax2022      0.500336
16                    relLibHealth2022      0.500105
5                  statePopulation2020      0.500000
6                  statePopulation2023      0.500000
7                    transAdultPop2022      0.499999
1                              stateId      0.499872
0                                 WEEK      0.499861
12            nonreligiousStatista2017      0.499604
17             relLibHealthMandate2022      0.499491
8   antiTransLegislationRiskIndex32023      0.496482
14                      relLibVote2022      0.494724
4                   SEXUAL_ORIENTATION      0.403340
3                         EGENID_BIRTH      0.114981

                              Features  Coefficients
9            overallReligiosityPew2014      0.503246
11     moderatelyReligiousStatista2017      0.501590
19                      relLibRfra2022      0.501426
10           veryReligiousStatista2017      0.501360
13                     relLibScore2022      0.500738
18                  relLibMarriage2022      0.500612
2                          TBIRTH_YEAR      0.500497
15                       relLibVax2022      0.500336
16                    relLibHealth2022      0.500105
5                  statePopulation2020      0.500000
6                  statePopulation2023      0.500000
7                    transAdultPop2022      0.499999
1                              stateId      0.499872
0                                 WEEK      0.499861
12            nonreligiousStatista2017      0.499604
17             relLibHealthMandate2022      0.499491
8   antiTransLegislationRiskIndex32023      0.496482
14                      relLibVote2022      0.494724
4                   SEXUAL_ORIENTATION      0.403340
3                         EGENID_BIRTH      0.114981


# Setup for A/B testing
# remove ,"statePopulation2020","statePopulation2023","transAdultPop2022" column
#population table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"antiTransLegislationRiskIndex32023"
            ,"overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Population Removed")

C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
  warnings.warn("Line Search failed")

Logistic Regression Accuracy: 0.7093
                              Features  Coefficients
4                                EEDUC      0.509532
17                      relLibRfra2022      0.505297
16                  relLibMarriage2022      0.502339
7            overallReligiosityPew2014      0.502115
6   antiTransLegislationRiskIndex32023      0.501436
9      moderatelyReligiousStatista2017      0.501296
0                                 WEEK      0.500696
8            veryReligiousStatista2017      0.500224
11                     relLibScore2022      0.500088
5                            INCOMEMIN      0.500001
14                    relLibHealth2022      0.499730
1                              stateId      0.499727
10            nonreligiousStatista2017      0.498659
2                          TBIRTH_YEAR      0.498232
15             relLibHealthMandate2022      0.495723
13                       relLibVax2022      0.493200
12                      relLibVote2022      0.489051
3                         EGENID_BIRTH      0.111752

                              Features  Coefficients
4                                EEDUC      0.509532
17                      relLibRfra2022      0.505297
16                  relLibMarriage2022      0.502339
7            overallReligiosityPew2014      0.502115
6   antiTransLegislationRiskIndex32023      0.501436
9      moderatelyReligiousStatista2017      0.501296
0                                 WEEK      0.500696
8            veryReligiousStatista2017      0.500224
11                     relLibScore2022      0.500088
5                            INCOMEMIN      0.500001
14                    relLibHealth2022      0.499730
1                              stateId      0.499727
10            nonreligiousStatista2017      0.498659
2                          TBIRTH_YEAR      0.498232
15             relLibHealthMandate2022      0.495723
13                       relLibVax2022      0.493200
12                      relLibVote2022      0.489051
3                         EGENID_BIRTH      0.111752

                              Features  Coefficients
4                                EEDUC      0.509532
17                      relLibRfra2022      0.505297
16                  relLibMarriage2022      0.502339
7            overallReligiosityPew2014      0.502115
6   antiTransLegislationRiskIndex32023      0.501436
9      moderatelyReligiousStatista2017      0.501296
0                                 WEEK      0.500696
8            veryReligiousStatista2017      0.500224
11                     relLibScore2022      0.500088
5                            INCOMEMIN      0.500001
14                    relLibHealth2022      0.499730
1                              stateId      0.499727
10            nonreligiousStatista2017      0.498659
2                          TBIRTH_YEAR      0.498232
15             relLibHealthMandate2022      0.495723
13                       relLibVax2022      0.493200
12                      relLibVote2022      0.489051
3                         EGENID_BIRTH      0.111752


# Setup for A/B testing
# remove ,"antiTransLegislationRiskIndex32023" column
#anti-trans legislation table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Anti-Trans Legislation Removed")

C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging
  warn(msg, LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
  warnings.warn("Line Search failed")

Logistic Regression Accuracy: 0.6684
                           Features  Coefficients
4                             EEDUC      0.511643
19                   relLibRfra2022      0.505801
9         overallReligiosityPew2014      0.505390
11  moderatelyReligiousStatista2017      0.502597
10        veryReligiousStatista2017      0.502021
18               relLibMarriage2022      0.501604
0                              WEEK      0.500581
13                  relLibScore2022      0.500353
2                       TBIRTH_YEAR      0.500315
5                         INCOMEMIN      0.500001
6               statePopulation2020      0.500000
7               statePopulation2023      0.500000
8                 transAdultPop2022      0.500000
16                 relLibHealth2022      0.499941
1                           stateId      0.499647
12         nonreligiousStatista2017      0.498976
15                    relLibVax2022      0.497316
17          relLibHealthMandate2022      0.496308
14                   relLibVote2022      0.489418
3                      EGENID_BIRTH      0.112749

                           Features  Coefficients
4                             EEDUC      0.511643
19                   relLibRfra2022      0.505801
9         overallReligiosityPew2014      0.505390
11  moderatelyReligiousStatista2017      0.502597
10        veryReligiousStatista2017      0.502021
18               relLibMarriage2022      0.501604
0                              WEEK      0.500581
13                  relLibScore2022      0.500353
2                       TBIRTH_YEAR      0.500315
5                         INCOMEMIN      0.500001
6               statePopulation2020      0.500000
7               statePopulation2023      0.500000
8                 transAdultPop2022      0.500000
16                 relLibHealth2022      0.499941
1                           stateId      0.499647
12         nonreligiousStatista2017      0.498976
15                    relLibVax2022      0.497316
17          relLibHealthMandate2022      0.496308
14                   relLibVote2022      0.489418
3                      EGENID_BIRTH      0.112749

                           Features  Coefficients
4                             EEDUC      0.511643
19                   relLibRfra2022      0.505801
9         overallReligiosityPew2014      0.505390
11  moderatelyReligiousStatista2017      0.502597
10        veryReligiousStatista2017      0.502021
18               relLibMarriage2022      0.501604
0                              WEEK      0.500581
13                  relLibScore2022      0.500353
2                       TBIRTH_YEAR      0.500315
5                         INCOMEMIN      0.500001
6               statePopulation2020      0.500000
7               statePopulation2023      0.500000
8                 transAdultPop2022      0.500000
16                 relLibHealth2022      0.499941
1                           stateId      0.499647
12         nonreligiousStatista2017      0.498976
15                    relLibVax2022      0.497316
17          relLibHealthMandate2022      0.496308
14                   relLibVote2022      0.489418
3                      EGENID_BIRTH      0.112749


# Setup for A/B testing
# remove ,"overallReligiosityPew2014" column
#Pew 2014 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"antiTransLegislationRiskIndex32023"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Pew 2014 Removed")

C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging
  warn(msg, LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
  warnings.warn("Line Search failed")

Logistic Regression Accuracy: 0.6694
                              Features  Coefficients
4                                EEDUC      0.511619
19                      relLibRfra2022      0.505808
11     moderatelyReligiousStatista2017      0.502805
18                  relLibMarriage2022      0.501627
10           veryReligiousStatista2017      0.501534
0                                 WEEK      0.500607
13                     relLibScore2022      0.500326
2                          TBIRTH_YEAR      0.500317
9   antiTransLegislationRiskIndex32023      0.500300
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
8                    transAdultPop2022      0.500000
16                    relLibHealth2022      0.499978
1                              stateId      0.499649
12            nonreligiousStatista2017      0.499524
15                       relLibVax2022      0.497192
17             relLibHealthMandate2022      0.496067
14                      relLibVote2022      0.489147
3                         EGENID_BIRTH      0.111512

                              Features  Coefficients
4                                EEDUC      0.511619
19                      relLibRfra2022      0.505808
11     moderatelyReligiousStatista2017      0.502805
18                  relLibMarriage2022      0.501627
10           veryReligiousStatista2017      0.501534
0                                 WEEK      0.500607
13                     relLibScore2022      0.500326
2                          TBIRTH_YEAR      0.500317
9   antiTransLegislationRiskIndex32023      0.500300
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
8                    transAdultPop2022      0.500000
16                    relLibHealth2022      0.499978
1                              stateId      0.499649
12            nonreligiousStatista2017      0.499524
15                       relLibVax2022      0.497192
17             relLibHealthMandate2022      0.496067
14                      relLibVote2022      0.489147
3                         EGENID_BIRTH      0.111512

                              Features  Coefficients
4                                EEDUC      0.511619
19                      relLibRfra2022      0.505808
11     moderatelyReligiousStatista2017      0.502805
18                  relLibMarriage2022      0.501627
10           veryReligiousStatista2017      0.501534
0                                 WEEK      0.500607
13                     relLibScore2022      0.500326
2                          TBIRTH_YEAR      0.500317
9   antiTransLegislationRiskIndex32023      0.500300
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
8                    transAdultPop2022      0.500000
16                    relLibHealth2022      0.499978
1                              stateId      0.499649
12            nonreligiousStatista2017      0.499524
15                       relLibVax2022      0.497192
17             relLibHealthMandate2022      0.496067
14                      relLibVote2022      0.489147
3                         EGENID_BIRTH      0.111512


# Setup for A/B testing
# remove ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017" column
#Statista 2017 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"antiTransLegislationRiskIndex32023"
            ,"overallReligiosityPew2014"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Statista 2017 Removed")

C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging
  warn(msg, LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
  warnings.warn("Line Search failed")

Logistic Regression Accuracy: 0.6689
                              Features  Coefficients
4                                EEDUC      0.511610
17                      relLibRfra2022      0.505686
10           overallReligiosityPew2014      0.504812
16                  relLibMarriage2022      0.501656
0                                 WEEK      0.500590
2                          TBIRTH_YEAR      0.500315
11                     relLibScore2022      0.500276
9   antiTransLegislationRiskIndex32023      0.500180
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
8                    transAdultPop2022      0.500000
14                    relLibHealth2022      0.499969
1                              stateId      0.499648
13                       relLibVax2022      0.497284
15             relLibHealthMandate2022      0.496187
12                      relLibVote2022      0.489382
3                         EGENID_BIRTH      0.112520

                              Features  Coefficients
4                                EEDUC      0.511610
17                      relLibRfra2022      0.505686
10           overallReligiosityPew2014      0.504812
16                  relLibMarriage2022      0.501656
0                                 WEEK      0.500590
2                          TBIRTH_YEAR      0.500315
11                     relLibScore2022      0.500276
9   antiTransLegislationRiskIndex32023      0.500180
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
8                    transAdultPop2022      0.500000
14                    relLibHealth2022      0.499969
1                              stateId      0.499648
13                       relLibVax2022      0.497284
15             relLibHealthMandate2022      0.496187
12                      relLibVote2022      0.489382
3                         EGENID_BIRTH      0.112520

                              Features  Coefficients
4                                EEDUC      0.511610
17                      relLibRfra2022      0.505686
10           overallReligiosityPew2014      0.504812
16                  relLibMarriage2022      0.501656
0                                 WEEK      0.500590
2                          TBIRTH_YEAR      0.500315
11                     relLibScore2022      0.500276
9   antiTransLegislationRiskIndex32023      0.500180
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
8                    transAdultPop2022      0.500000
14                    relLibHealth2022      0.499969
1                              stateId      0.499648
13                       relLibVax2022      0.497284
15             relLibHealthMandate2022      0.496187
12                      relLibVote2022      0.489382
3                         EGENID_BIRTH      0.112520


# Setup for A/B testing
# remove ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014" column
#Statista 2017 and Pew 2014 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"antiTransLegislationRiskIndex32023"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Statista 2017 and Pew 2014 Removed")

C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging
  warn(msg, LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
  warnings.warn("Line Search failed")

Logistic Regression Accuracy: 0.6802
                              Features  Coefficients
4                                EEDUC      0.510759
16                      relLibRfra2022      0.506450
9   antiTransLegislationRiskIndex32023      0.501529
15                  relLibMarriage2022      0.501230
0                                 WEEK      0.500592
13                    relLibHealth2022      0.500099
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
8                    transAdultPop2022      0.500000
2                          TBIRTH_YEAR      0.499922
1                              stateId      0.499648
14             relLibHealthMandate2022      0.497649
12                       relLibVax2022      0.494921
10                     relLibScore2022      0.491488
11                      relLibVote2022      0.488917
3                         EGENID_BIRTH      0.109164

                              Features  Coefficients
4                                EEDUC      0.510759
16                      relLibRfra2022      0.506450
9   antiTransLegislationRiskIndex32023      0.501529
15                  relLibMarriage2022      0.501230
0                                 WEEK      0.500592
13                    relLibHealth2022      0.500099
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
8                    transAdultPop2022      0.500000
2                          TBIRTH_YEAR      0.499922
1                              stateId      0.499648
14             relLibHealthMandate2022      0.497649
12                       relLibVax2022      0.494921
10                     relLibScore2022      0.491488
11                      relLibVote2022      0.488917
3                         EGENID_BIRTH      0.109164

                              Features  Coefficients
4                                EEDUC      0.510759
16                      relLibRfra2022      0.506450
9   antiTransLegislationRiskIndex32023      0.501529
15                  relLibMarriage2022      0.501230
0                                 WEEK      0.500592
13                    relLibHealth2022      0.500099
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
8                    transAdultPop2022      0.500000
2                          TBIRTH_YEAR      0.499922
1                              stateId      0.499648
14             relLibHealthMandate2022      0.497649
12                       relLibVax2022      0.494921
10                     relLibScore2022      0.491488
11                      relLibVote2022      0.488917
3                         EGENID_BIRTH      0.109164


# Setup for A/B testing
# remove ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022" column
#Religious Liberty table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"antiTransLegislationRiskIndex32023"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Religious Liberty Removed")

C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging
  warn(msg, LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
  warnings.warn("Line Search failed")

Logistic Regression Accuracy: 0.6761
                              Features  Coefficients
4                                EEDUC      0.510815
13           overallReligiosityPew2014      0.506299
11     moderatelyReligiousStatista2017      0.504494
10           veryReligiousStatista2017      0.501713
9   antiTransLegislationRiskIndex32023      0.500621
0                                 WEEK      0.500606
12            nonreligiousStatista2017      0.500278
2                          TBIRTH_YEAR      0.500069
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
8                    transAdultPop2022      0.499999
1                              stateId      0.499662
3                         EGENID_BIRTH      0.111579

                              Features  Coefficients
4                                EEDUC      0.510815
13           overallReligiosityPew2014      0.506299
11     moderatelyReligiousStatista2017      0.504494
10           veryReligiousStatista2017      0.501713
9   antiTransLegislationRiskIndex32023      0.500621
0                                 WEEK      0.500606
12            nonreligiousStatista2017      0.500278
2                          TBIRTH_YEAR      0.500069
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
8                    transAdultPop2022      0.499999
1                              stateId      0.499662
3                         EGENID_BIRTH      0.111579

                              Features  Coefficients
4                                EEDUC      0.510815
13           overallReligiosityPew2014      0.506299
11     moderatelyReligiousStatista2017      0.504494
10           veryReligiousStatista2017      0.501713
9   antiTransLegislationRiskIndex32023      0.500621
0                                 WEEK      0.500606
12            nonreligiousStatista2017      0.500278
2                          TBIRTH_YEAR      0.500069
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
8                    transAdultPop2022      0.499999
1                              stateId      0.499662
3                         EGENID_BIRTH      0.111579


# Setup for A/B testing
#,"statePopulation2020","statePopulation2023","transAdultPop2022"
#,"antiTransLegislationRiskIndex32023"
#,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014"
# remove ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022" column
#Religious Liberty table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Only Pulse Data")

C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)

Logistic Regression Accuracy: 0.7099
       Features  Coefficients
4         EEDUC      0.509491
0          WEEK      0.500709
5     INCOMEMIN      0.500002
1       stateId      0.499759
2   TBIRTH_YEAR      0.498227
3  EGENID_BIRTH      0.111782

C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging
  warn(msg, LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
  warnings.warn("Line Search failed")

       Features  Coefficients
4         EEDUC      0.509491
0          WEEK      0.500709
5     INCOMEMIN      0.500002
1       stateId      0.499759
2   TBIRTH_YEAR      0.498227
3  EGENID_BIRTH      0.111782

       Features  Coefficients
4         EEDUC      0.509491
0          WEEK      0.500709
5     INCOMEMIN      0.500002
1       stateId      0.499759
2   TBIRTH_YEAR      0.498227
3  EGENID_BIRTH      0.111782


clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","EGENID_BIRTH"
            ,"SEXUAL_ORIENTATION","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
            ,"transAdultPop2022","overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState, "Random Forest Feature Importance")

Random Forest (500 Tree) Regression Accuracy: 0.4966
                              Features  Importance
5                   SEXUAL_ORIENTATION    0.283867
4                         EGENID_BIRTH    0.247087
2                          TBIRTH_YEAR    0.115527
0                                 WEEK    0.082819
6                            INCOMEMIN    0.051162
3                                EEDUC    0.041775
1                              stateId    0.024697
8                  statePopulation2023    0.019611
15                     relLibScore2022    0.017565
7                  statePopulation2020    0.014974
11           overallReligiosityPew2014    0.014905
10                   transAdultPop2022    0.014747
18                    relLibHealth2022    0.014445
13     moderatelyReligiousStatista2017    0.011986
12           veryReligiousStatista2017    0.011783
14            nonreligiousStatista2017    0.011340
9   antiTransLegislationRiskIndex32023    0.007486
20                  relLibMarriage2022    0.005403
19             relLibHealthMandate2022    0.002955
21                      relLibRfra2022    0.002738
16                      relLibVote2022    0.002267
17                       relLibVax2022    0.000862


# Setup for A/B testing
# remove "SEXUAL_ORIENTATION" column
# split dataset into dependent(features) and independent(target) variable
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","EGENID_BIRTH","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
            ,"transAdultPop2022","overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Sexuality Removed")

Random Forest (500 Tree) Regression Accuracy: 0.2711
                              Features  Importance
4                         EGENID_BIRTH    0.239197
2                          TBIRTH_YEAR    0.200776
0                                 WEEK    0.127075
5                            INCOMEMIN    0.087728
3                                EEDUC    0.068879
1                              stateId    0.041188
14                     relLibScore2022    0.030655
7                  statePopulation2023    0.025917
9                    transAdultPop2022    0.023027
10           overallReligiosityPew2014    0.022958
6                  statePopulation2020    0.021619
17                    relLibHealth2022    0.021323
12     moderatelyReligiousStatista2017    0.019417
11           veryReligiousStatista2017    0.018438
13            nonreligiousStatista2017    0.017602
8   antiTransLegislationRiskIndex32023    0.011535
19                  relLibMarriage2022    0.009055
20                      relLibRfra2022    0.004458
18             relLibHealthMandate2022    0.004111
15                      relLibVote2022    0.003277
16                       relLibVax2022    0.001768


# Setup for A/B testing
# remove "EEDUC","INCOMEMIN" column
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","SEXUAL_ORIENTATION"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"antiTransLegislationRiskIndex32023"
            ,"overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Education and Income Removed")

Random Forest (500 Tree) Regression Accuracy: 0.4744
                              Features  Importance
4                   SEXUAL_ORIENTATION    0.285457
3                         EGENID_BIRTH    0.248089
2                          TBIRTH_YEAR    0.166847
0                                 WEEK    0.121240
1                              stateId    0.024385
6                  statePopulation2023    0.018840
13                     relLibScore2022    0.017829
16                    relLibHealth2022    0.014760
5                  statePopulation2020    0.014618
7                    transAdultPop2022    0.014434
9            overallReligiosityPew2014    0.014231
10           veryReligiousStatista2017    0.012056
12            nonreligiousStatista2017    0.012010
11     moderatelyReligiousStatista2017    0.011817
8   antiTransLegislationRiskIndex32023    0.008044
18                  relLibMarriage2022    0.005661
17             relLibHealthMandate2022    0.003300
19                      relLibRfra2022    0.002953
14                      relLibVote2022    0.002526
15                       relLibVax2022    0.000904


# Setup for A/B testing
# remove ,"statePopulation2020","statePopulation2023","transAdultPop2022" column
#population table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"antiTransLegislationRiskIndex32023"
            ,"overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Population Removed")

Random Forest (500 Tree) Regression Accuracy: 0.2717
                              Features  Importance
3                         EGENID_BIRTH    0.239197
2                          TBIRTH_YEAR    0.200083
0                                 WEEK    0.127960
5                            INCOMEMIN    0.087268
4                                EEDUC    0.069257
1                              stateId    0.054054
11                     relLibScore2022    0.041764
7            overallReligiosityPew2014    0.030534
14                    relLibHealth2022    0.028542
9      moderatelyReligiousStatista2017    0.025326
8            veryReligiousStatista2017    0.024941
10            nonreligiousStatista2017    0.023795
6   antiTransLegislationRiskIndex32023    0.016182
16                  relLibMarriage2022    0.011925
17                      relLibRfra2022    0.006150
15             relLibHealthMandate2022    0.005858
12                      relLibVote2022    0.004419
13                       relLibVax2022    0.002745


# Setup for A/B testing
# remove ,"antiTransLegislationRiskIndex32023" column
#anti-trans legislation table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Anti-Trans Legislation Removed")

Random Forest (500 Tree) Regression Accuracy: 0.271
                           Features  Importance
3                      EGENID_BIRTH    0.239197
2                       TBIRTH_YEAR    0.201075
0                              WEEK    0.127463
5                         INCOMEMIN    0.087390
4                             EEDUC    0.069082
1                           stateId    0.042216
13                  relLibScore2022    0.031651
7               statePopulation2023    0.026780
8                 transAdultPop2022    0.024179
9         overallReligiosityPew2014    0.024081
6               statePopulation2020    0.021919
16                 relLibHealth2022    0.021759
11  moderatelyReligiousStatista2017    0.020444
10        veryReligiousStatista2017    0.019535
12         nonreligiousStatista2017    0.019072
18               relLibMarriage2022    0.009757
19                   relLibRfra2022    0.004788
17          relLibHealthMandate2022    0.004328
14                   relLibVote2022    0.003453
15                    relLibVax2022    0.001831


# Setup for A/B testing
# remove ,"overallReligiosityPew2014" column
#Pew 2014 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"antiTransLegislationRiskIndex32023"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Pew 2014 Removed")

Random Forest (500 Tree) Regression Accuracy: 0.2723
                              Features  Importance
3                         EGENID_BIRTH    0.239197
2                          TBIRTH_YEAR    0.200986
0                                 WEEK    0.127489
5                            INCOMEMIN    0.087619
4                                EEDUC    0.068820
1                              stateId    0.043777
13                     relLibScore2022    0.032480
7                  statePopulation2023    0.027228
8                    transAdultPop2022    0.024436
16                    relLibHealth2022    0.023153
6                  statePopulation2020    0.023089
12            nonreligiousStatista2017    0.022216
10           veryReligiousStatista2017    0.021502
11     moderatelyReligiousStatista2017    0.020735
9   antiTransLegislationRiskIndex32023    0.012801
18                  relLibMarriage2022    0.009710
19                      relLibRfra2022    0.004750
17             relLibHealthMandate2022    0.004410
14                      relLibVote2022    0.003699
15                       relLibVax2022    0.001903


# Setup for A/B testing
# remove ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017" column
#Statista 2017 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"antiTransLegislationRiskIndex32023"
            ,"overallReligiosityPew2014"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Statista 2017 Removed")

Random Forest (500 Tree) Regression Accuracy: 0.2722
                              Features  Importance
3                         EGENID_BIRTH    0.239197
2                          TBIRTH_YEAR    0.201996
0                                 WEEK    0.128081
5                            INCOMEMIN    0.087045
4                                EEDUC    0.068836
1                              stateId    0.048270
11                     relLibScore2022    0.036531
10           overallReligiosityPew2014    0.034922
7                  statePopulation2023    0.031234
8                    transAdultPop2022    0.028197
14                    relLibHealth2022    0.026239
6                  statePopulation2020    0.025789
9   antiTransLegislationRiskIndex32023    0.015637
16                  relLibMarriage2022    0.011671
17                      relLibRfra2022    0.005378
15             relLibHealthMandate2022    0.004677
12                      relLibVote2022    0.004106
13                       relLibVax2022    0.002194


# Setup for A/B testing
# remove ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014" column
#Statista 2017 and Pew 2014 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"antiTransLegislationRiskIndex32023"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Statista 2017 and Pew 2014 Removed")

Random Forest (500 Tree) Regression Accuracy: 0.274
                              Features  Importance
3                         EGENID_BIRTH    0.239197
2                          TBIRTH_YEAR    0.202868
0                                 WEEK    0.128257
5                            INCOMEMIN    0.087695
4                                EEDUC    0.068477
1                              stateId    0.055199
10                     relLibScore2022    0.040815
7                  statePopulation2023    0.034473
8                    transAdultPop2022    0.031218
13                    relLibHealth2022    0.030359
6                  statePopulation2020    0.028814
9   antiTransLegislationRiskIndex32023    0.020202
15                  relLibMarriage2022    0.013167
16                      relLibRfra2022    0.006054
11                      relLibVote2022    0.005467
14             relLibHealthMandate2022    0.005383
12                       relLibVax2022    0.002356


# Setup for A/B testing
# remove ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022" column
#Religious Liberty table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"antiTransLegislationRiskIndex32023"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Religious Liberty Removed")

Random Forest (500 Tree) Regression Accuracy: 0.2721
                              Features  Importance
3                         EGENID_BIRTH    0.239197
2                          TBIRTH_YEAR    0.201343
0                                 WEEK    0.129005
5                            INCOMEMIN    0.087782
4                                EEDUC    0.069254
1                              stateId    0.054400
7                  statePopulation2023    0.034165
13           overallReligiosityPew2014    0.031626
8                    transAdultPop2022    0.031116
6                  statePopulation2020    0.029118
11     moderatelyReligiousStatista2017    0.026989
10           veryReligiousStatista2017    0.025652
12            nonreligiousStatista2017    0.024772
9   antiTransLegislationRiskIndex32023    0.015582


# Setup for A/B testing
#,"statePopulation2020","statePopulation2023","transAdultPop2022"
#,"antiTransLegislationRiskIndex32023"
#,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014"
# remove ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022" column
#Religious Liberty table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Only Pulse Data")

Random Forest (500 Tree) Regression Accuracy: 0.2661
       Features  Importance
3  EGENID_BIRTH    0.239197
2   TBIRTH_YEAR    0.226661
1       stateId    0.201604
0          WEEK    0.152386
5     INCOMEMIN    0.098260
4         EEDUC    0.081893

	AllWEEKcount	CisMenWEEKcount	CisWomenWEEKcount	CisgenderWEEKcount	TransWomenWEEKcount	TransMenWEEKcount	EnbyWEEKcount	NonCisgenderWEEKcount	NonCisPercent
43	64543	25986	37672	63658	131	188	566	885	1.37
41	61390	24939	35525	60464	162	185	579	926	1.51
42	60221	24204	35106	59310	144	198	569	911	1.51
54	60220	25198	34080	59278	153	230	559	942	1.56
52	56169	24487	30771	55258	172	196	543	911	1.62
53	54574	22845	30918	53763	131	190	490	811	1.49
36	53108	21142	31193	52335	108	147	518	773	1.46
35	52930	21095	31041	52136	148	142	504	794	1.50
34	49604	19756	29066	48822	122	138	522	782	1.58
44	49565	20076	28742	48818	112	160	475	747	1.51
37	49090	19691	28689	48380	93	135	482	710	1.45
51	48800	21354	26597	47951	153	173	523	849	1.74
46	48346	19406	28206	47612	113	155	466	734	1.52
40	47908	20526	26667	47193	123	149	443	715	1.49
45	47676	19242	27636	46878	145	169	484	798	1.67
38	45991	18625	26685	45310	101	152	428	681	1.48
47	44593	17972	25899	43871	114	147	461	722	1.62
39	43767	17899	25256	43155	91	106	415	612	1.40
49	40405	17954	21784	39738	128	158	381	667	1.65
48	36405	14994	20714	35708	113	178	406	697	1.91
50	33270	15023	17652	32675	97	148	350	595	1.79

	AllEST_STcount	CisMenEST_STcount	CisWomenEST_STcount	CisgenderEST_STcount	TransWomenEST_STcount	TransMenEST_STcount	EnbyEST_STcount	NonCisgenderEST_STcount	NonCisPercent
6	79763	35075	43358	78433	196	292	842	1330	1.67
48	52968	22829	29432	52261	118	136	453	707	1.33
53	44216	19190	24043	43233	161	216	606	983	2.22
12	36164	15525	20207	35732	69	76	287	432	1.19
25	29233	12153	16581	28734	97	106	296	499	1.71
41	29039	11455	16841	28296	139	188	416	743	2.56
26	28883	11989	16456	28445	80	91	267	438	1.52
49	28771	12504	15855	28359	81	79	252	412	1.43
51	28311	12243	15679	27922	62	71	256	389	1.37
4	28004	11663	15922	27585	63	83	273	419	1.50
8	27907	11805	15629	27434	70	110	293	473	1.69
42	26712	11195	15118	26313	67	95	237	399	1.49
13	24722	9982	14380	24362	58	85	217	360	1.46
24	24421	9971	14084	24055	40	82	244	366	1.50
27	23848	10004	13467	23471	78	80	219	377	1.58
17	22360	9488	12471	21959	80	76	245	401	1.79
20	19673	7807	11564	19371	49	73	180	302	1.54
36	19474	8180	10947	19127	44	82	221	347	1.78
16	19306	7815	11212	19027	42	67	170	279	1.45
35	19302	7508	11472	18980	45	72	205	322	1.67
34	19089	8364	10506	18870	32	34	153	219	1.15
37	18992	7637	11100	18737	33	53	169	255	1.34
18	18836	7441	11123	18564	39	59	174	272	1.44
29	18833	7387	11145	18532	55	60	186	301	1.60
55	18774	7712	10750	18462	50	84	178	312	1.66
47	18081	7053	10763	17816	29	64	172	265	1.47
9	17796	7118	10424	17542	44	53	157	254	1.43
39	17452	7106	10094	17200	46	51	155	252	1.44
19	17153	6676	10241	16917	38	50	148	236	1.38
33	16854	7144	9473	16617	44	50	143	237	1.41
32	16779	7124	9422	16546	36	46	151	233	1.39
2	16617	6761	9546	16307	51	60	199	310	1.87
40	16149	6216	9686	15902	41	50	156	247	1.53
31	16114	6469	9396	15865	31	57	161	249	1.55
45	15821	6122	9491	15613	26	38	144	208	1.31
21	14878	5790	8900	14690	43	39	106	188	1.26
5	13482	5176	8145	13321	28	36	97	161	1.19
1	13238	5261	7807	13068	26	35	109	170	1.28
30	12483	4962	7352	12314	31	33	105	169	1.35
22	12068	4465	7413	11878	33	25	132	190	1.57
50	11646	4572	6880	11452	34	50	110	194	1.67
10	11563	4573	6833	11406	21	28	108	157	1.36
15	11375	4895	6309	11204	30	30	111	171	1.50
23	11128	4338	6605	10943	35	41	109	185	1.66
46	10867	4453	6284	10737	26	18	86	130	1.20
56	10844	4325	6360	10685	28	25	106	159	1.47
54	10685	3943	6606	10549	23	30	83	136	1.27
44	9601	3822	5595	9417	24	47	113	184	1.92
28	9301	3342	5846	9188	14	19	80	113	1.21
38	8999	3786	5086	8872	24	19	84	127	1.41