In [1]:
# setup
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.tools as tools

from collections import Counter

from matplotlib.ticker import MaxNLocator

from patsy import dmatrices

from sklearn.cluster import KMeans
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn import tree
from sklearn.tree import DecisionTreeRegressor

from sklearn import metrics
from sklearn.metrics import r2_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.svm import SVR

from statistics import mode
from statsmodels.multivariate.manova import MANOVA
from statsmodels.stats.outliers_influence import variance_inflation_factor

pd.options.mode.chained_assignment = None
In [2]:
# read data for Reed and religiosity datasets at state level
reed_col_list = ["stateId","stateName","statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
                 ,"antiTransLegislationRiskIndex122022","antiTransLegislationRiskIndex112022"
                 ,"transAdultPop2016","transAdultPercent2016","transAdultPop2022","transAdultPercent2022"
                 ,"religionImportantPew2014","worshipWeeklyPew2014","prayDailyPew2014","certainAboutGodPew2014"
                 ,"overallReligiosityPew2014","veryReligiousStatista2017","moderatelyReligiousStatista2017"
                 ,"nonreligiousStatista2017","relLibScore2022","relLibVote2022","relLibVax2022"
                 ,"relLibHealth2022","relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022"
                ]

reed_index_list = ["stateId","stateName","antiTransLegislationRiskIndex32023"
                   ,"antiTransLegislationRiskIndex122022","antiTransLegislationRiskIndex112022"
                  ]
trans_pop_list = ["stateId","stateName","statePopulation2020","statePopulation2023","transAdultPop2016"
                  ,"transAdultPercent2016","transAdultPop2022","transAdultPercent2022"
                 ]
religiosity_2014_list = ["stateId","stateName","religionImportantPew2014","worshipWeeklyPew2014"
                         ,"prayDailyPew2014","certainAboutGodPew2014","overallReligiosityPew2014"
                        ]
religiosity_2017_list = ["stateId","stateName","veryReligiousStatista2017","moderatelyReligiousStatista2017"
                         ,"nonreligiousStatista2017"
                        ]
religiosity_2022_list = ["stateId","stateName","relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022"
                         ,"relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022"
                        ]

reedFulldf = pd.read_csv("P:\\UticaMSDS\\DSC680-ResearchPracticum\\dataSets\\stateDatasetReed.csv", usecols=reed_col_list)
reedFulldf = reedFulldf[reedFulldf["stateId"] != 11]
reedIndexdf = pd.read_csv("P:\\UticaMSDS\\DSC680-ResearchPracticum\\dataSets\\stateDatasetReed.csv", usecols=reed_index_list)
reedIndexdf = reedIndexdf[reedIndexdf["stateId"] != 11]
transStatePopdf = pd.read_csv("P:\\UticaMSDS\\DSC680-ResearchPracticum\\dataSets\\stateDatasetReed.csv", usecols=trans_pop_list)
transStatePopdf = transStatePopdf[transStatePopdf["stateId"] != 11]
religious2014df = pd.read_csv("P:\\UticaMSDS\\DSC680-ResearchPracticum\\dataSets\\stateDatasetReed.csv", usecols=religiosity_2014_list)
religious2014df = religious2014df[religious2014df["stateId"] != 11]
religious2017df = pd.read_csv("P:\\UticaMSDS\\DSC680-ResearchPracticum\\dataSets\\stateDatasetReed.csv", usecols=religiosity_2017_list)
religious2017df = religious2017df[religious2017df["stateId"] != 11]
religious2022df = pd.read_csv("P:\\UticaMSDS\\DSC680-ResearchPracticum\\dataSets\\stateDatasetReed.csv", usecols=religiosity_2022_list)
religious2022df = religious2022df[religious2022df["stateId"] != 11]

#print(reedFulldf.head())
In [3]:
#create function to get count of unique values in column and get percentages
def countCol(df, dfCol):
    tempdf = df[dfCol]
    namecount = dfCol + "count"
    namepercent = dfCol + "percent"
    tempdf[namecount] = df[dfCol].value_counts()
    tempdf[namepercent] = df[dfCol].value_counts(normalize=True)*100
    return tempdf
In [4]:
# use describe to get mean and standard deviations of dataframe data
def describeDF(df, dfCol):
    print(df.describe())
    # get mode and variance using built in stats library
    for colName in dfCol:
        if(df[colName].dtypes != object):
            print("Mode of ",colName,": ", mode(df[colName]))
            print("Variance of ",colName,": ", np.var(df[colName], ddof=1))
    print()
In [5]:
def combinedf(df, dfCol, dfName):
    retdf = pd.DataFrame()
    for colName in dfCol:
        namecount = colName + "count"
        namepercent = colName + "percent"
        dfNameColCount = dfName + namecount
        dfNameColPercent = dfName + namepercent
        tempdf = countCol(df,colName)
        retdf[dfNameColCount] = tempdf[namecount]
        retdf[dfNameColPercent] = tempdf[namepercent]
    return retdf
In [6]:
#function to allow grouping gender identity on 3 values
def basicGenMarker(asab, gender):
    if(asab == 1 and gender == 1): 
        return "Cisgender Man"
    elif (asab == 2 and gender == 2):
        return "Cisgender Woman"
    else:
        return "Transgender"
In [7]:
#function to print covariance map based on given columns
def printCovariance(df,dfCol,colLabels,title):
    cols = dfCol
    stdsc = StandardScaler()
    X_std = stdsc.fit_transform(df[cols].iloc[:,range(0,len(dfCol))].values)
    cov_mat = np.cov(X_std.T)
    
    plt.figure(figsize=(7,7))
    sns.set(font_scale=1)
    hm = sns.heatmap(cov_mat, cbar = True, annot = True, square = True, fmt = ".2f", cmap = "vlag",
                    annot_kws={"size":12}, yticklabels = colLabels, xticklabels = colLabels, cbar_kws={"shrink": 0.5})
    #plt.title(title)
    plt.tight_layout()
    plt.show()
In [8]:
#function to build classifier matrix after testing NB model
# Build Naive Bayes Classifer to sort and classify data
# split datasets into training and test sets
def printNBClassifierOutcome(X,y,trainSize,trainState,matrixTitle):
    size = trainSize
    state = trainState
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-size, random_state=state)

    # scale input data for training if necessary for better predictions
    X_train = sc_X.fit_transform(X_train)
    X_test = sc_X.fit_transform(X_test)

    # initiale, train and test the BNB
    bnb = BernoulliNB()
    bnb.fit(X_train, y_train)
    pred = bnb.predict(X_test)

    #check accuracy
    bnb_accuracy = metrics.accuracy_score(pred, y_test)

    printConfusionMatrix(y_test, pred,matrixTitle)
In [9]:
#function to build classifier matrix after testing SVM model
#build SVM
# split datasets into training and test sets
def printSVMClassifierOutcome(X,y,trainSize,trainState,matrixTitle):
    size = trainSize
    state = trainState
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-size, random_state=state)

    # scale input data for training if necessary for better predictions
    X_train = sc_X.fit_transform(X_train)
    X_test = sc_X.fit_transform(X_test)

    # build and train model
    clf = SVC(kernel = "linear")
    clf.fit(X_train, y_train)

    pred = clf.predict(X_test)

    #check accuracy
    clf_accuracy = metrics.accuracy_score(pred, y_test)

    printConfusionMatrix(y_test, pred,matrixTitle)
In [10]:
def printConfusionMatrix(y_test, pred,matrixTitle):
    #confusion matrix
    cfm = metrics.confusion_matrix(y_test, pred)
    fig, ax = plt.subplots(figsize=(6,6))
    ax.matshow(cfm, cmap = plt.cm.Purples, alpha=0.3)
    for i in range(cfm.shape[0]):
        for j in range(cfm.shape[1]):
            ax.text(x=j, y=i,s=cfm[i, j], va='center', 
                    ha='center', size='xx-large')

    plt.xlabel('Predictions', fontsize = 16)
    plt.ylabel('Actuals', fontsize = 16)
    plt.title(matrixTitle, fontsize = 14)
    plt.show()

    print(metrics.classification_report(y_test, pred, zero_division = 0))
In [11]:
nEstimators = 500
decPrecision = 4
maxDepth = 3

def printRFRClassifierOutcome(X,y,trainSize,trainState, featTitle):
    size = trainSize
    state = trainState
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-size, random_state=state)
    sc_X = StandardScaler()
    sc_y = StandardScaler()
    sc_X_train = sc_X.fit_transform(X_train)
    sc_y_train = sc_y.fit_transform(y_train.values.reshape(len(y_train),1))
    sc_y_train = sc_y_train

    rf_regressor = RandomForestRegressor(n_estimators = nEstimators, random_state = state)
    rf_regressor.fit(X_train, y_train)
    rf_y_pred = rf_regressor.predict(X_test)
    np.set_printoptions(precision=decPrecision)

    print("Random Forest (" + str(nEstimators) + " Tree) Regression Accuracy: " + str(round(r2_score(y_test, rf_y_pred), decPrecision)))
    
    #rf_regressor.feature_names_in_
    featureDf = pd.DataFrame({"Features" : rf_regressor.feature_names_in_, "Importance" : rf_regressor.feature_importances_})
    featureDf = featureDf.sort_values(by=["Importance"], ascending=False)
    print(featureDf)
    
    #plot bar chart of importance
    f, ax = plt.subplots(figsize=(20,12))
    sns.barplot(x=featureDf["Features"], y=featureDf["Importance"], palette="flare")
    plt.xlabel('Features', fontsize = 16)
    plt.ylabel('Importance', fontsize = 16)
    plt.title(featTitle, fontsize=16)
    plt.xticks(rotation=45)
    #for val in plt.containers:
        #plt.bar_label(val)
    
    plt.show()
    
    #rfc = RandomForestClassifier(n_estimators=nEstimators, max_depth=maxDepth, random_state=state)
    #rfc.fit(X_train, y_train)
    #features = X.columns.values
    #classes = ['Cisgender man', 'Cisgender Woman', 'Transgender']
    #for estimator in rfc.estimators_:
        #print(estimator)
        #plt.figure(figsize=(20,10))
        #tree.plot_tree(estimator, feature_names=features, class_names=classes, fontsize=10, filled=True, rounded=True)
        #plt.show()
In [12]:
maxIter=1000000000

def LogRegressionOutcome(X,y,trainSize,trainState,featTitle):
    size = trainSize
    state = trainState
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1-size, random_state = state)
    sc_X = StandardScaler()
    sc_y = StandardScaler()
    sc_X_train = sc_X.fit_transform(X_train)
    sc_y_train = sc_y.fit_transform(y_train.values.reshape(len(y_train),1))
    sc_y_train = sc_y_train
    
    log_regression = LogisticRegression(solver="newton-cg", random_state=state, penalty="l2", C=0.01, max_iter=maxIter).fit(X_train,y_train)
    print("Logistic Regression Accuracy: " + str(round(log_regression.score(X_test,y_test), decPrecision)))

    x_train_const = tools.add_constant(X_train)
    
    coefArray = []
    for ind in range(log_regression.coef_.shape[0]):
        if ind == 0:
            featFor = "Cisgender Men"
        elif ind == 1:
            featFor = "Cisgender Women"
        else:
            featFor = "Transgender"
        fullFeatTitle = "Logistic Regression " + featFor + " Feature Important for " + featTitle

        for x in log_regression.coef_[ind]:
            #print(np.exp(x)/(1 + np.exp(x)))
            coefArray.append(np.exp(x)/(1 + np.exp(x)))

        featureDf = pd.DataFrame(zip(clean_feature_list, np.transpose(coefArray)), columns=["Features", "Coefficients"])
        featureDf = featureDf.sort_values(by=["Coefficients"], ascending=False)
        print(featureDf)
    
        #plot bar chart of importance
        f, ax = plt.subplots(figsize=(20,12))
        sns.barplot(x=featureDf["Features"], y=featureDf["Coefficients"], palette="flare")
        plt.xlabel('Features', fontsize = 16)
        plt.ylabel('Coefficients', fontsize = 16)
        plt.title(fullFeatTitle, fontsize=16)
        plt.xticks(rotation=45)
        #for val in plt.containers:
            #plt.bar_label(val)
    
        plt.show()
In [13]:
describeDF(reedIndexdf, reed_index_list)
         stateId  antiTransLegislationRiskIndex32023  \
count  50.000000                            50.00000   
mean   29.320000                             2.08000   
std    15.782243                             1.60153   
min     1.000000                             0.00000   
25%    17.250000                             1.00000   
50%    29.500000                             2.00000   
75%    41.750000                             4.00000   
max    56.000000                             4.00000   

       antiTransLegislationRiskIndex122022  \
count                            50.000000   
mean                              1.860000   
std                               1.340271   
min                               0.000000   
25%                               1.000000   
50%                               2.000000   
75%                               3.000000   
max                               4.000000   

       antiTransLegislationRiskIndex112022  
count                             50.00000  
mean                               1.82000  
std                                1.33539  
min                                0.00000  
25%                                1.00000  
50%                                2.00000  
75%                                3.00000  
max                                4.00000  
Mode of  stateId :  1
Variance of  stateId :  249.0791836734694
Mode of  antiTransLegislationRiskIndex32023 :  4
Variance of  antiTransLegislationRiskIndex32023 :  2.564897959183673
Mode of  antiTransLegislationRiskIndex122022 :  3
Variance of  antiTransLegislationRiskIndex122022 :  1.7963265306122445
Mode of  antiTransLegislationRiskIndex112022 :  1
Variance of  antiTransLegislationRiskIndex112022 :  1.7832653061224486

In [14]:
reed_cov_list = ["stateId","antiTransLegislationRiskIndex32023"
                   ,"antiTransLegislationRiskIndex122022","antiTransLegislationRiskIndex112022"
                  ]
reed_label = ["stateId","RiskIndex32023"
             ,"RiskIndex122022","RiskIndex112022"]
printCovariance(reedIndexdf, reed_cov_list, reed_label, "Anti-Transgender Legislation Risk Index Covariance Matrix")
In [15]:
describeDF(transStatePopdf, trans_pop_list)
         stateId  statePopulation2020  statePopulation2023  transAdultPop2016  \
count  50.000000         5.000000e+01         5.000000e+01           50.00000   
mean   29.320000         6.615242e+06         8.960485e+06        27654.00000   
std    15.782243         7.436124e+06         1.907631e+07        36854.01958   
min     1.000000         5.768510e+05         5.808170e+05         1400.00000   
25%    17.250000         1.869706e+06         1.940934e+06         6375.00000   
50%    29.500000         4.581796e+06         4.625424e+06        19450.00000   
75%    41.750000         7.566836e+06         7.844464e+06        31037.50000   
max    56.000000         3.953822e+07         1.309280e+08       218400.00000   

       transAdultPercent2016  transAdultPop2022  transAdultPercent2022  
count              50.000000          50.000000              50.000000  
mean                0.530400       26638.000000               0.531800  
std                 0.121722       29080.703259               0.126889  
min                 0.300000        2100.000000               0.200000  
25%                 0.432500        7025.000000               0.442500  
50%                 0.535000       16950.000000               0.525000  
75%                 0.610000       33225.000000               0.600000  
max                 0.780000      150100.000000               0.870000  
Mode of  stateId :  1
Variance of  stateId :  249.0791836734694
Mode of  statePopulation2020 :  5024279
Variance of  statePopulation2020 :  55295936980950.49
Mode of  statePopulation2023 :  5097641
Variance of  statePopulation2023 :  363905671623562.75
Mode of  transAdultPop2016 :  2700
Variance of  transAdultPop2016 :  1358218759.1836734
Mode of  transAdultPercent2016 :  0.43
Variance of  transAdultPercent2016 :  0.014816163265306125
Mode of  transAdultPop2022 :  6300
Variance of  transAdultPop2022 :  845687302.0408163
Mode of  transAdultPercent2022 :  0.6
Variance of  transAdultPercent2022 :  0.016100775510204078

In [16]:
pop_cov_list = ["stateId","statePopulation2020","statePopulation2023","transAdultPop2016","transAdultPop2022"]
pop_label = ["stateId","TotalPop2020","TotalPop2023","TransPop2016","TransPop2022"]
printCovariance(transStatePopdf, pop_cov_list, pop_label, "Population by State Covariance Matrix")
In [17]:
describeDF(religious2014df, religiosity_2014_list)
         stateId  religionImportantPew2014  worshipWeeklyPew2014  \
count  50.000000                 50.000000             50.000000   
mean   29.320000                  0.527000              0.359400   
std    15.782243                  0.107499              0.075035   
min     1.000000                  0.320000              0.210000   
25%    17.250000                  0.452500              0.310000   
50%    29.500000                  0.510000              0.355000   
75%    41.750000                  0.597500              0.390000   
max    56.000000                  0.770000              0.530000   

       prayDailyPew2014  certainAboutGodPew2014  overallReligiosityPew2014  
count         50.000000               50.000000                  50.000000  
mean           0.541400                0.633600                   0.547000  
std            0.094286                0.095271                   0.107423  
min            0.330000                0.400000                   0.330000  
25%            0.490000                0.575000                   0.482500  
50%            0.530000                0.630000                   0.540000  
75%            0.607500                0.690000                   0.625000  
max            0.750000                0.820000                   0.770000  
Mode of  stateId :  1
Variance of  stateId :  249.0791836734694
Mode of  religionImportantPew2014 :  0.44
Variance of  religionImportantPew2014 :  0.011556122448979588
Mode of  worshipWeeklyPew2014 :  0.34
Variance of  worshipWeeklyPew2014 :  0.005630244897959184
Mode of  prayDailyPew2014 :  0.51
Variance of  prayDailyPew2014 :  0.008889836734693879
Mode of  certainAboutGodPew2014 :  0.61
Variance of  certainAboutGodPew2014 :  0.009076571428571429
Mode of  overallReligiosityPew2014 :  0.54
Variance of  overallReligiosityPew2014 :  0.011539795918367344

In [18]:
religiosity_cov_2014_list = ["stateId","religionImportantPew2014","worshipWeeklyPew2014"
                         ,"prayDailyPew2014","certainAboutGodPew2014","overallReligiosityPew2014"]
rel_2014_label = ["stateId","VeryImportant","WorshipWeekly","PrayDaily","CertainAboutGod","Overall"]
printCovariance(religious2014df, religiosity_cov_2014_list, rel_2014_label, "Pew 2014 Religiosity Covariance Matrix")
In [19]:
describeDF(religious2017df, religiosity_2017_list)
         stateId  veryReligiousStatista2017  moderatelyReligiousStatista2017  \
count  50.000000                  50.000000                        50.000000   
mean   29.320000                   0.371600                         0.287200   
std    15.782243                   0.090449                         0.030442   
min     1.000000                   0.160000                         0.160000   
25%    17.250000                   0.310000                         0.270000   
50%    29.500000                   0.365000                         0.295000   
75%    41.750000                   0.437500                         0.300000   
max    56.000000                   0.590000                         0.330000   

       nonreligiousStatista2017  
count                 50.000000  
mean                   0.342000  
std                    0.099857  
min                    0.120000  
25%                    0.290000  
50%                    0.340000  
75%                    0.397500  
max                    0.590000  
Mode of  stateId :  1
Variance of  stateId :  249.0791836734694
Mode of  veryReligiousStatista2017 :  0.28
Variance of  veryReligiousStatista2017 :  0.008181061224489796
Mode of  moderatelyReligiousStatista2017 :  0.3
Variance of  moderatelyReligiousStatista2017 :  0.0009266938775510202
Mode of  nonreligiousStatista2017 :  0.33
Variance of  nonreligiousStatista2017 :  0.00997142857142857

In [20]:
religiosity_cov_2017_list = ["stateId","veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"]
rel_2017_label = ["stateId","Very","Moderate","Nonreligious"]
printCovariance(religious2017df, religiosity_cov_2017_list, rel_2017_label, "Gallup 2017 Religiosity Covariance Matrix")
In [21]:
describeDF(religious2022df, religiosity_2022_list)
         stateId  relLibScore2022  relLibVote2022  relLibVax2022  \
count  50.000000        50.000000       50.000000      50.000000   
mean   29.320000         0.393948        0.800000       0.900000   
std    15.782243         0.133298        0.404061       0.303046   
min     1.000000         0.155800        0.000000       0.000000   
25%    17.250000         0.314950        1.000000       1.000000   
50%    29.500000         0.371200        1.000000       1.000000   
75%    41.750000         0.467550        1.000000       1.000000   
max    56.000000         0.818200        1.000000       1.000000   

       relLibHealth2022  relLibHealthMandate2022  relLibMarriage2022  \
count         50.000000                50.000000           50.000000   
mean           6.760000                 0.640000            1.160000   
std            4.023198                 0.484873            1.489555   
min            0.000000                 0.000000            0.000000   
25%            4.250000                 0.000000            0.000000   
50%            5.500000                 1.000000            0.000000   
75%            9.000000                 1.000000            3.000000   
max           20.000000                 1.000000            5.000000   

       relLibRfra2022  
count       50.000000  
mean         0.480000  
std          0.504672  
min          0.000000  
25%          0.000000  
50%          0.000000  
75%          1.000000  
max          1.000000  
Mode of  stateId :  1
Variance of  stateId :  249.0791836734694
Mode of  relLibScore2022 :  0.3377
Variance of  relLibScore2022 :  0.01776847438367347
Mode of  relLibVote2022 :  1.0
Variance of  relLibVote2022 :  0.16326530612244897
Mode of  relLibVax2022 :  1.0
Variance of  relLibVax2022 :  0.09183673469387756
Mode of  relLibHealth2022 :  5.0
Variance of  relLibHealth2022 :  16.186122448979592
Mode of  relLibHealthMandate2022 :  1.0
Variance of  relLibHealthMandate2022 :  0.2351020408163265
Mode of  relLibMarriage2022 :  0.0
Variance of  relLibMarriage2022 :  2.218775510204082
Mode of  relLibRfra2022 :  0.0
Variance of  relLibRfra2022 :  0.25469387755102035

In [22]:
religiosity_cov_2022_list = ["stateId","relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022"
                         ,"relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022"]
rel_2022_label = ["stateId","Score","Vote","Vax","Health","HealthMandate","Marriage","Rfra"]
printCovariance(religious2022df, religiosity_cov_2022_list, rel_2022_label, "Religious Liberty 2022 Covariance Matrix")
In [23]:
# read self-identification Census Pulse Survey data
pulse_col_list = ["SCRAM","WEEK","EST_ST","TBIRTH_YEAR","EEDUC","AEDUC","EGENID_BIRTH","AGENID_BIRTH"
                  ,"GENID_DESCRIBE","SEXUAL_ORIENTATION","INCOME","ENDDATE","EDUCATION","ASSIGNEDGENDER"
                  ,"CHOSENGENDER","SEXUALORIENTATION","INCOMEMIN"]
pulse_num_col_list = ["WEEK","EST_ST","TBIRTH_YEAR","EEDUC","AEDUC","EGENID_BIRTH","AGENID_BIRTH"
                      ,"GENID_DESCRIBE","SEXUAL_ORIENTATION","INCOME"]

pulsedf = pd.read_csv("P:\\UticaMSDS\\DSC680-ResearchPracticum\\dataSets\\pulseModFull.csv", usecols=pulse_col_list)
print("Full count of data pulse data: ", (pulsedf["GENID_DESCRIBE"] > -100).sum())
print()
countDistrictofColumbia = (pulsedf["EST_ST"] == 11).sum()
print("Count of participants in District of Columbia: ", countDistrictofColumbia)
print()
countMissingGender = (pulsedf["GENID_DESCRIBE"] < 0).sum()
print("Count of missing or unreported gender identity: ", countMissingGender)
print()
countMissingSexuality = (pulsedf["SEXUAL_ORIENTATION"] < 0).sum()
print("Count of missing or unreported sexuality: ", countMissingSexuality)
print()
countMissingIncome = (pulsedf["INCOME"] < 0).sum()
print("Count of missing or unreported minimum income: ", countMissingIncome)
print()
#remove DC residents
pulsedf = pulsedf[pulsedf["EST_ST"] != 11]
#remove missing gender identity based on under 10% of total for better results
pulsedf = pulsedf[pulsedf["GENID_DESCRIBE"].isin([1,2,3,4])]
#remove missing sexuality based on under 10% of total for better results
pulsedf = pulsedf[pulsedf["SEXUAL_ORIENTATION"].isin([1,2,3,4,5])]
#remove missing income based on null values for better results
pulsedf = pulsedf[pulsedf["INCOME"].isin([1,2,3,4,5,6,7,8])]

pulsedfCount = len(pulsedf.index)
print("Count after row removal: ", pulsedfCount)
Full count of data pulse data:  1341164

Count of participants in District of Columbia:  17702

Count of missing or unreported gender identity:  17691

Count of missing or unreported sexuality:  24617

Count of missing or unreported minimum income:  263337

Count after row removal:  1048575
In [24]:
describeDF(pulsedf,pulse_num_col_list)
               WEEK        EST_ST   TBIRTH_YEAR         EEDUC         AEDUC  \
count  1.048575e+06  1.048575e+06  1.048575e+06  1.048575e+06  1.048575e+06   
mean   4.391728e+01  2.854206e+01  1.968571e+03  5.367118e+00  1.994822e+00   
std    6.120523e+00  1.640470e+01  1.575978e+01  1.436177e+00  7.176839e-02   
min    3.400000e+01  1.000000e+00  1.933000e+03  1.000000e+00  1.000000e+00   
25%    3.900000e+01  1.300000e+01  1.956000e+03  4.000000e+00  2.000000e+00   
50%    4.300000e+01  2.800000e+01  1.968000e+03  6.000000e+00  2.000000e+00   
75%    4.900000e+01  4.200000e+01  1.981000e+03  7.000000e+00  2.000000e+00   
max    5.400000e+01  5.600000e+01  2.005000e+03  7.000000e+00  2.000000e+00   

       EGENID_BIRTH  AGENID_BIRTH  GENID_DESCRIBE  SEXUAL_ORIENTATION  \
count  1.048575e+06  1.048575e+06    1.048575e+06        1.048575e+06   
mean   1.581580e+00  1.997595e+00    1.609426e+00        2.069715e+00   
std    4.933001e-01  4.898353e-02    5.507229e-01        4.835032e-01   
min    1.000000e+00  1.000000e+00    1.000000e+00        1.000000e+00   
25%    1.000000e+00  2.000000e+00    1.000000e+00        2.000000e+00   
50%    2.000000e+00  2.000000e+00    2.000000e+00        2.000000e+00   
75%    2.000000e+00  2.000000e+00    2.000000e+00        2.000000e+00   
max    2.000000e+00  2.000000e+00    4.000000e+00        5.000000e+00   

             INCOME  
count  1.048575e+06  
mean   4.620572e+00  
std    2.128103e+00  
min    1.000000e+00  
25%    3.000000e+00  
50%    5.000000e+00  
75%    6.000000e+00  
max    8.000000e+00  
Mode of  WEEK :  43
Variance of  WEEK :  37.460803320626816
Mode of  EST_ST :  6
Variance of  EST_ST :  269.1142209085299
Mode of  TBIRTH_YEAR :  1955
Variance of  TBIRTH_YEAR :  248.3706702974734
Mode of  EEDUC :  6
Variance of  EEDUC :  2.0626035639994904
Mode of  AEDUC :  2
Variance of  AEDUC :  0.005150701178258357
Mode of  EGENID_BIRTH :  2
Variance of  EGENID_BIRTH :  0.2433449743390272
Mode of  AGENID_BIRTH :  2
Variance of  AGENID_BIRTH :  0.0023993863704273163
Mode of  GENID_DESCRIBE :  2
Variance of  GENID_DESCRIBE :  0.3032957446696943
Mode of  SEXUAL_ORIENTATION :  2
Variance of  SEXUAL_ORIENTATION :  0.23377538647520063
Mode of  INCOME :  6
Variance of  INCOME :  4.528821091548032

In [25]:
pulse_cov_list = ["WEEK","EST_ST","TBIRTH_YEAR","EEDUC","EGENID_BIRTH","GENID_DESCRIBE","SEXUAL_ORIENTATION","INCOME"]
pulse_label_list = ["WEEK","EST_ST","BIRTH_YEAR","EDUC","SEX_AT_BIRTH","GENDERID","SEXUALITY","INCOME"]
printCovariance(pulsedf, pulse_cov_list, pulse_label_list, "USCB Pulse Survey Covariance Matrix")
In [26]:
# further analysis of pulse data
pulseIncomedf = pd.DataFrame()
pulseIncomedf["INCOMEMIN"] = pulsedf["INCOMEMIN"].astype(float)
print(pulseIncomedf.describe())

print("Mode of INCOMEMIN: ", mode(pulseIncomedf["INCOMEMIN"]))
print("Variance of INCOMEMIN: ", np.var(pulseIncomedf["INCOMEMIN"], ddof=1))

del pulseIncomedf
          INCOMEMIN
count  1.048575e+06
mean   7.935849e+04
std    5.884945e+04
min    0.000000e+00
25%    3.500000e+04
50%    7.500000e+04
75%    1.000000e+05
max    2.000000e+05
Mode of INCOMEMIN:  100000.0
Variance of INCOMEMIN:  3463257810.0526085
In [27]:
# look at income data based on gender
#clean data
pulseIncomeStatsdf = pulsedf
#remove missing values from table for income and force to number
pulseIncomeStatsdf = pulseIncomeStatsdf[pulseIncomeStatsdf["INCOMEMIN"].astype(str).str.isdigit()]
pulseIncomeStatsdf["INCOMEMIN"] = pd.to_numeric(pulseIncomeStatsdf["INCOMEMIN"], errors='coerce')
#remove unreported or missing chosen gender
pulseIncomeStatsdf = pulseIncomeStatsdf[pulseIncomeStatsdf["GENID_DESCRIBE"] > 0]
#print(pulseIncomeStatsdf.head())

pulseIncomeStatsdf = pulseIncomeStatsdf[pulseIncomeStatsdf["SEXUAL_ORIENTATION"] > 0]

pulse_income_col = ["WEEK","EST_ST","TBIRTH_YEAR","EEDUC","EGENID_BIRTH"
                    ,"GENID_DESCRIBE","SEXUAL_ORIENTATION","INCOMEMIN"]
describeDF(pulseIncomeStatsdf, pulse_income_col)
               WEEK        EST_ST   TBIRTH_YEAR         EEDUC         AEDUC  \
count  1.048575e+06  1.048575e+06  1.048575e+06  1.048575e+06  1.048575e+06   
mean   4.391728e+01  2.854206e+01  1.968571e+03  5.367118e+00  1.994822e+00   
std    6.120523e+00  1.640470e+01  1.575978e+01  1.436177e+00  7.176839e-02   
min    3.400000e+01  1.000000e+00  1.933000e+03  1.000000e+00  1.000000e+00   
25%    3.900000e+01  1.300000e+01  1.956000e+03  4.000000e+00  2.000000e+00   
50%    4.300000e+01  2.800000e+01  1.968000e+03  6.000000e+00  2.000000e+00   
75%    4.900000e+01  4.200000e+01  1.981000e+03  7.000000e+00  2.000000e+00   
max    5.400000e+01  5.600000e+01  2.005000e+03  7.000000e+00  2.000000e+00   

       EGENID_BIRTH  AGENID_BIRTH  GENID_DESCRIBE  SEXUAL_ORIENTATION  \
count  1.048575e+06  1.048575e+06    1.048575e+06        1.048575e+06   
mean   1.581580e+00  1.997595e+00    1.609426e+00        2.069715e+00   
std    4.933001e-01  4.898353e-02    5.507229e-01        4.835032e-01   
min    1.000000e+00  1.000000e+00    1.000000e+00        1.000000e+00   
25%    1.000000e+00  2.000000e+00    1.000000e+00        2.000000e+00   
50%    2.000000e+00  2.000000e+00    2.000000e+00        2.000000e+00   
75%    2.000000e+00  2.000000e+00    2.000000e+00        2.000000e+00   
max    2.000000e+00  2.000000e+00    4.000000e+00        5.000000e+00   

             INCOME     INCOMEMIN  
count  1.048575e+06  1.048575e+06  
mean   4.620572e+00  7.935849e+04  
std    2.128103e+00  5.884945e+04  
min    1.000000e+00  0.000000e+00  
25%    3.000000e+00  3.500000e+04  
50%    5.000000e+00  7.500000e+04  
75%    6.000000e+00  1.000000e+05  
max    8.000000e+00  2.000000e+05  
Mode of  WEEK :  43
Variance of  WEEK :  37.460803320626816
Mode of  EST_ST :  6
Variance of  EST_ST :  269.1142209085299
Mode of  TBIRTH_YEAR :  1955
Variance of  TBIRTH_YEAR :  248.3706702974734
Mode of  EEDUC :  6
Variance of  EEDUC :  2.0626035639994904
Mode of  EGENID_BIRTH :  2
Variance of  EGENID_BIRTH :  0.2433449743390272
Mode of  GENID_DESCRIBE :  2
Variance of  GENID_DESCRIBE :  0.3032957446696943
Mode of  SEXUAL_ORIENTATION :  2
Variance of  SEXUAL_ORIENTATION :  0.23377538647520063
Mode of  INCOMEMIN :  100000
Variance of  INCOMEMIN :  3463257810.0526085

In [28]:
#build violin plot
plt.figure(figsize=(12,8))
sns.violinplot(x=pulseIncomeStatsdf["EGENID_BIRTH"],y=pulseIncomeStatsdf["INCOMEMIN"],palette="bright")

ax = plt.gca()
leg = ax.get_legend()
ax.set_xticklabels(["Assigned Male at Birth","Assigned Female at Birth"])
plt.ylabel("Income in Dollars")
plt.xlabel("")
plt.title("Minimum Yearly Reported Income")
plt.show()
In [29]:
#build violin plot
incomeGeniddf = pulseIncomeStatsdf
incomeGeniddf["CUR_GENID"] = incomeGeniddf.apply(lambda x: str(x["EGENID_BIRTH"]) + str(x["GENID_DESCRIBE"]), axis=1)
#print(incomeGeniddf)

plt.figure(figsize=(15,8))
sns.violinplot(x=incomeGeniddf["CUR_GENID"],y=incomeGeniddf["INCOMEMIN"],palette="bright")

ax = plt.gca()
ax.set_xticklabels(["Cisgender Women","Cisgender Men", "Nonbinary AFAB", "Nonbinary AMAB"
           , "Transgender AFAB", "Transgender AMAB", "Transgender FTM", "Transgender MTF"])
plt.ylabel("Income in Dollars")
plt.xlabel("")
plt.title("Minimum Yearly Reported Income by Gender Identity and Sex Assigned at Birth")
plt.show()
In [30]:
#income comparison based on assigned gender at birth
pulseIncomeAMABdf = pulseIncomeStatsdf
pulseIncomeAMABdf = pulseIncomeAMABdf[pulseIncomeAMABdf["EGENID_BIRTH"] == 1]
describeDF(pulseIncomeAMABdf, pulse_income_col)

pulseIncomeAFABdf = pulseIncomeStatsdf
pulseIncomeAFABdf = pulseIncomeAFABdf[pulseIncomeAFABdf["EGENID_BIRTH"] == 2]
describeDF(pulseIncomeAFABdf, pulse_income_col)
                WEEK         EST_ST    TBIRTH_YEAR          EEDUC  \
count  438745.000000  438745.000000  438745.000000  438745.000000   
mean       44.064304      28.533891    1967.503153       5.435818   
std         6.133248      16.498385      16.139392       1.424280   
min        34.000000       1.000000    1933.000000       1.000000   
25%        39.000000      13.000000    1954.000000       4.000000   
50%        44.000000      28.000000    1966.000000       6.000000   
75%        50.000000      44.000000    1981.000000       7.000000   
max        54.000000      56.000000    2005.000000       7.000000   

               AEDUC  EGENID_BIRTH   AGENID_BIRTH  GENID_DESCRIBE  \
count  438745.000000      438745.0  438745.000000   438745.000000   
mean        1.994215           1.0       1.997568        1.034546   
std         0.075837           0.0       0.049255        0.300578   
min         1.000000           1.0       1.000000        1.000000   
25%         2.000000           1.0       2.000000        1.000000   
50%         2.000000           1.0       2.000000        1.000000   
75%         2.000000           1.0       2.000000        1.000000   
max         2.000000           1.0       2.000000        4.000000   

       SEXUAL_ORIENTATION         INCOME      INCOMEMIN  
count       438745.000000  438745.000000  438745.000000  
mean             2.032830       4.966391   88654.389224  
std              0.475219       2.088031   60425.293955  
min              1.000000       1.000000       0.000000  
25%              2.000000       4.000000   50000.000000  
50%              2.000000       5.000000   75000.000000  
75%              2.000000       6.000000  100000.000000  
max              5.000000       8.000000  200000.000000  
Mode of  WEEK :  43
Variance of  WEEK :  37.616725915359496
Mode of  EST_ST :  6
Variance of  EST_ST :  272.19670766553105
Mode of  TBIRTH_YEAR :  1955
Variance of  TBIRTH_YEAR :  260.47997895673205
Mode of  EEDUC :  6
Variance of  EEDUC :  2.0285745651188547
Mode of  EGENID_BIRTH :  1
Variance of  EGENID_BIRTH :  0.0
Mode of  GENID_DESCRIBE :  1
Variance of  GENID_DESCRIBE :  0.0903474059981086
Mode of  SEXUAL_ORIENTATION :  2
Variance of  SEXUAL_ORIENTATION :  0.22583355356615814
Mode of  INCOMEMIN :  100000
Variance of  INCOMEMIN :  3651216149.53128

                WEEK         EST_ST    TBIRTH_YEAR          EEDUC  \
count  609830.000000  609830.000000  609830.000000  609830.000000   
mean       43.811497      28.547931    1969.338557       5.317692   
std         6.109169      16.336979      15.435331       1.442654   
min        34.000000       1.000000    1933.000000       1.000000   
25%        39.000000      13.000000    1957.000000       4.000000   
50%        43.000000      28.000000    1969.000000       6.000000   
75%        49.000000      42.000000    1982.000000       7.000000   
max        54.000000      56.000000    2005.000000       7.000000   

               AEDUC  EGENID_BIRTH   AGENID_BIRTH  GENID_DESCRIBE  \
count  609830.000000      609830.0  609830.000000   609830.000000   
mean        1.995259           2.0       1.997614        2.023026   
std         0.068689           0.0       0.048788        0.218328   
min         1.000000           2.0       1.000000        1.000000   
25%         2.000000           2.0       2.000000        2.000000   
50%         2.000000           2.0       2.000000        2.000000   
75%         2.000000           2.0       2.000000        2.000000   
max         2.000000           2.0       2.000000        4.000000   

       SEXUAL_ORIENTATION         INCOME      INCOMEMIN  
count       609830.000000  609830.000000  609830.000000  
mean             2.096251       4.371771   72670.506535  
std              0.487654       2.121895   56755.052819  
min              1.000000       1.000000       0.000000  
25%              2.000000       3.000000   35000.000000  
50%              2.000000       4.000000   50000.000000  
75%              2.000000       6.000000  100000.000000  
max              5.000000       8.000000  200000.000000  
Mode of  WEEK :  43
Variance of  WEEK :  37.32194368751015
Mode of  EST_ST :  6
Variance of  EST_ST :  266.89687185333827
Mode of  TBIRTH_YEAR :  1955
Variance of  TBIRTH_YEAR :  238.24944848073915
Mode of  EEDUC :  6
Variance of  EEDUC :  2.081250683388053
Mode of  EGENID_BIRTH :  2
Variance of  EGENID_BIRTH :  0.0
Mode of  GENID_DESCRIBE :  2
Variance of  GENID_DESCRIBE :  0.047666915897604446
Mode of  SEXUAL_ORIENTATION :  2
Variance of  SEXUAL_ORIENTATION :  0.23780655025122638
Mode of  INCOMEMIN :  50000
Variance of  INCOMEMIN :  3221136020.4754634

In [31]:
# build datasets based on chosen gender
col_for_counts = ["WEEK"]

pulseStateReduceddf = pulseIncomeStatsdf
#print("Total set week counts:")
pulseStateCountdf = combinedf(pulseStateReduceddf,col_for_counts,"All")
#print(pulseStateCountdf)

pulseStateTotalsdf = pd.DataFrame()
pulseStateTotalsdf["AllWEEKcount"] = pulseStateCountdf["AllWEEKcount"].astype(int)

del pulseStateCountdf
In [32]:
pulseCisMendf = pulseIncomeStatsdf
pulseCisMendf = pulseCisMendf[pulseCisMendf["EGENID_BIRTH"] == 1]
pulseCisMendf = pulseCisMendf[pulseCisMendf["GENID_DESCRIBE"] == 1]
describeDF(pulseCisMendf, pulse_income_col)

#print("Cis men week counts:")
pulseCisMenReducedDf = combinedf(pulseCisMendf,col_for_counts,"CisMen")

pulseStateTotalsdf["CisMenWEEKcount"] = pulseCisMenReducedDf["CisMenWEEKcount"].astype(int)
#print(pulseStateTotalsdf["CisMenWEEKcount"])
                WEEK         EST_ST    TBIRTH_YEAR          EEDUC  \
count  432414.000000  432414.000000  432414.000000  432414.000000   
mean       44.064441      28.527571    1967.402887       5.441038   
std         6.133263      16.496388      16.082041       1.421493   
min        34.000000       1.000000    1933.000000       1.000000   
25%        39.000000      13.000000    1954.000000       4.000000   
50%        44.000000      28.000000    1966.000000       6.000000   
75%        50.000000      44.000000    1981.000000       7.000000   
max        54.000000      56.000000    2005.000000       7.000000   

               AEDUC  EGENID_BIRTH   AGENID_BIRTH  GENID_DESCRIBE  \
count  432414.000000      432414.0  432414.000000        432414.0   
mean        1.994237           1.0       1.998985             1.0   
std         0.075695           0.0       0.031847             0.0   
min         1.000000           1.0       1.000000             1.0   
25%         2.000000           1.0       2.000000             1.0   
50%         2.000000           1.0       2.000000             1.0   
75%         2.000000           1.0       2.000000             1.0   
max         2.000000           1.0       2.000000             1.0   

       SEXUAL_ORIENTATION         INCOME      INCOMEMIN  
count       432414.000000  432414.000000  432414.000000  
mean             2.019733       4.980276   88989.105348  
std              0.440445       2.082161   60375.536150  
min              1.000000       1.000000       0.000000  
25%              2.000000       4.000000   50000.000000  
50%              2.000000       5.000000   75000.000000  
75%              2.000000       7.000000  150000.000000  
max              5.000000       8.000000  200000.000000  
Mode of  WEEK :  43
Variance of  WEEK :  37.6169203141804
Mode of  EST_ST :  6
Variance of  EST_ST :  272.1308119804907
Mode of  TBIRTH_YEAR :  1955
Variance of  TBIRTH_YEAR :  258.63203797251515
Mode of  EEDUC :  6
Variance of  EEDUC :  2.0206427755641716
Mode of  EGENID_BIRTH :  1
Variance of  EGENID_BIRTH :  0.0
Mode of  GENID_DESCRIBE :  1
Variance of  GENID_DESCRIBE :  0.0
Mode of  SEXUAL_ORIENTATION :  2
Variance of  SEXUAL_ORIENTATION :  0.19399188938993783
Mode of  INCOMEMIN :  100000
Variance of  INCOMEMIN :  3645205365.415228

In [33]:
pulseCisWomendf = pulseIncomeStatsdf
pulseCisWomendf = pulseCisWomendf[pulseCisWomendf["EGENID_BIRTH"] == 2]
pulseCisWomendf = pulseCisWomendf[pulseCisWomendf["GENID_DESCRIBE"] == 2]
describeDF(pulseCisWomendf, pulse_income_col)

#print("Cis women week counts:")
pulseCisWomenReducedDf = combinedf(pulseCisWomendf,col_for_counts,"CisWomen")

pulseStateTotalsdf["CisWomenWEEKcount"] = pulseCisWomenReducedDf["CisWomenWEEKcount"].astype(int)
#print(pulseStateTotalsdf["CisWomenWEEKcount"])
                WEEK         EST_ST    TBIRTH_YEAR          EEDUC  \
count  599899.000000  599899.000000  599899.000000  599899.000000   
mean       43.804849      28.541668    1969.179439       5.321482   
std         6.108812      16.333975      15.348571       1.440654   
min        34.000000       1.000000    1933.000000       1.000000   
25%        39.000000      13.000000    1957.000000       4.000000   
50%        43.000000      28.000000    1969.000000       6.000000   
75%        49.000000      42.000000    1982.000000       7.000000   
max        54.000000      56.000000    2005.000000       7.000000   

               AEDUC  EGENID_BIRTH   AGENID_BIRTH  GENID_DESCRIBE  \
count  599899.000000      599899.0  599899.000000        599899.0   
mean        1.995281           2.0       1.998725             2.0   
std         0.068534           0.0       0.035687             0.0   
min         1.000000           2.0       1.000000             2.0   
25%         2.000000           2.0       2.000000             2.0   
50%         2.000000           2.0       2.000000             2.0   
75%         2.000000           2.0       2.000000             2.0   
max         2.000000           2.0       2.000000             2.0   

       SEXUAL_ORIENTATION         INCOME      INCOMEMIN  
count       599899.000000  599899.000000  599899.000000  
mean             2.085014       4.384903   72975.084139  
std              0.461465       2.119228   56763.424571  
min              1.000000       1.000000       0.000000  
25%              2.000000       3.000000   35000.000000  
50%              2.000000       4.000000   50000.000000  
75%              2.000000       6.000000  100000.000000  
max              5.000000       8.000000  200000.000000  
Mode of  WEEK :  43
Variance of  WEEK :  37.317581415112876
Mode of  EST_ST :  6
Variance of  EST_ST :  266.79872778314314
Mode of  TBIRTH_YEAR :  1955
Variance of  TBIRTH_YEAR :  235.5786372658161
Mode of  EEDUC :  6
Variance of  EEDUC :  2.07548426437156
Mode of  EGENID_BIRTH :  2
Variance of  EGENID_BIRTH :  0.0
Mode of  GENID_DESCRIBE :  2
Variance of  GENID_DESCRIBE :  0.0
Mode of  SEXUAL_ORIENTATION :  2
Variance of  SEXUAL_ORIENTATION :  0.2129499850843681
Mode of  INCOMEMIN :  50000
Variance of  INCOMEMIN :  3222086368.9871078

In [34]:
#print("Cisgender week counts:")
cisdf = pd.DataFrame()
cisdf["CisgenderWEEKcount"] = pulseStateTotalsdf.loc[:,["CisMenWEEKcount","CisWomenWEEKcount"]].sum(axis=1)
pulseStateTotalsdf["CisgenderWEEKcount"] = cisdf["CisgenderWEEKcount"].astype(int)
#print(pulseStateTotalsdf)
In [35]:
pulseTranswomendf = pulseIncomeStatsdf
pulseTranswomendf = pulseTranswomendf[pulseTranswomendf["EGENID_BIRTH"] == 1]
pulseTranswomendf = pulseTranswomendf[pulseTranswomendf["GENID_DESCRIBE"].isin([2,3])]
describeDF(pulseTranswomendf, pulse_income_col)

#print("Trans women week counts:")
pulseTranswomenReduceddf = combinedf(pulseTranswomendf,col_for_counts,"TransWomen")
#print(pulseTranswomenReduceddf)

pulseStateTotalsdf["TransWomenWEEKcount"] = pulseTranswomenReduceddf["TransWomenWEEKcount"].astype(int)
              WEEK       EST_ST  TBIRTH_YEAR        EEDUC        AEDUC  \
count  2654.000000  2654.000000  2654.000000  2654.000000  2654.000000   
mean     44.383572    29.207611  1976.794650     4.939337     1.993595   
std       6.134856    16.413115    18.514133     1.537229     0.079792   
min      34.000000     1.000000  1933.000000     1.000000     1.000000   
25%      40.000000    16.000000  1962.000000     4.000000     2.000000   
50%      44.000000    29.000000  1982.000000     5.000000     2.000000   
75%      50.000000    42.000000  1993.000000     6.000000     2.000000   
max      54.000000    56.000000  2005.000000     7.000000     2.000000   

       EGENID_BIRTH  AGENID_BIRTH  GENID_DESCRIBE  SEXUAL_ORIENTATION  \
count        2654.0   2654.000000     2654.000000          2654.00000   
mean            1.0      1.794650        2.554635             2.64318   
std             0.0      0.404034        0.497100             1.16700   
min             1.0      1.000000        2.000000             1.00000   
25%             1.0      2.000000        2.000000             2.00000   
50%             1.0      2.000000        3.000000             3.00000   
75%             1.0      2.000000        3.000000             4.00000   
max             1.0      2.000000        3.000000             5.00000   

            INCOME      INCOMEMIN  
count  2654.000000    2654.000000  
mean      3.669932   56865.109269  
std       2.190414   55358.774687  
min       1.000000       0.000000  
25%       2.000000   25000.000000  
50%       4.000000   50000.000000  
75%       5.000000   75000.000000  
max       8.000000  200000.000000  
Mode of  WEEK :  52
Variance of  WEEK :  37.6364582501901
Mode of  EST_ST :  6
Variance of  EST_ST :  269.39035290414995
Mode of  TBIRTH_YEAR :  1992
Variance of  TBIRTH_YEAR :  342.7731187425988
Mode of  EEDUC :  4
Variance of  EEDUC :  2.363073212535268
Mode of  EGENID_BIRTH :  1
Variance of  EGENID_BIRTH :  0.0
Mode of  GENID_DESCRIBE :  3
Variance of  GENID_DESCRIBE :  0.24710817771523663
Mode of  SEXUAL_ORIENTATION :  3
Variance of  SEXUAL_ORIENTATION :  1.3618890161739803
Mode of  INCOMEMIN :  0
Variance of  INCOMEMIN :  3064593934.8353977

In [36]:
pulseTransmendf = pulseIncomeStatsdf
pulseTransmendf = pulseTransmendf[pulseTransmendf["EGENID_BIRTH"] == 2]
pulseTransmendf = pulseTransmendf[pulseTransmendf["GENID_DESCRIBE"].isin([1,3])]
describeDF(pulseTransmendf, pulse_income_col)

#print("Trans men week counts:")
pulseTransmenReduceddf = combinedf(pulseTransmendf,col_for_counts,"TransMen")
#print(pulseTransmenReduceddf)

pulseStateTotalsdf["TransMenWEEKcount"] = pulseTransmenReduceddf["TransMenWEEKcount"].astype(int)
              WEEK       EST_ST  TBIRTH_YEAR        EEDUC        AEDUC  \
count  3444.000000  3444.000000  3444.000000  3444.000000  3444.000000   
mean     44.622242    29.090012  1982.988095     5.067364     1.994483   
std       6.068434    16.591153    17.827721     1.547615     0.074081   
min      34.000000     1.000000  1933.000000     1.000000     1.000000   
25%      40.000000    15.000000  1974.000000     4.000000     2.000000   
50%      45.000000    29.000000  1990.000000     6.000000     2.000000   
75%      50.000000    42.000000  1996.000000     6.000000     2.000000   
max      54.000000    56.000000  2005.000000     7.000000     2.000000   

       EGENID_BIRTH  AGENID_BIRTH  GENID_DESCRIBE  SEXUAL_ORIENTATION  \
count        3444.0   3444.000000     3444.000000         3444.000000   
mean            2.0      1.828688        2.310105            2.852497   
std             0.0      0.376836        0.950841            1.133170   
min             2.0      1.000000        1.000000            1.000000   
25%             2.0      2.000000        1.000000            2.000000   
50%             2.0      2.000000        3.000000            3.000000   
75%             2.0      2.000000        3.000000            4.000000   
max             2.0      2.000000        3.000000            5.000000   

            INCOME      INCOMEMIN  
count  3444.000000    3444.000000  
mean      3.504355   52778.745645  
std       2.122240   53090.947248  
min       1.000000       0.000000  
25%       2.000000   25000.000000  
50%       3.000000   35000.000000  
75%       5.000000   75000.000000  
max       8.000000  200000.000000  
Mode of  WEEK :  54
Variance of  WEEK :  36.82588913592966
Mode of  EST_ST :  6
Variance of  EST_ST :  275.26636549507276
Mode of  TBIRTH_YEAR :  1996
Variance of  TBIRTH_YEAR :  317.82762471819984
Mode of  EEDUC :  6
Variance of  EEDUC :  2.3951123034735593
Mode of  EGENID_BIRTH :  2
Variance of  EGENID_BIRTH :  0.0
Mode of  GENID_DESCRIBE :  3
Variance of  GENID_DESCRIBE :  0.9040976945597842
Mode of  SEXUAL_ORIENTATION :  3
Variance of  SEXUAL_ORIENTATION :  1.2840745062361207
Mode of  INCOMEMIN :  0
Variance of  INCOMEMIN :  2818648679.692473

In [37]:
pulseNonedf = pulseIncomeStatsdf
pulseNonedf = pulseNonedf[pulseNonedf["GENID_DESCRIBE"] == 4]
describeDF(pulseNonedf, pulse_income_col)

#print("Non-Binary week counts:")
pulseNoneReduceddf = combinedf(pulseNonedf,col_for_counts,"Enby")
#print(pulseNoneReduceddf)

pulseStateTotalsdf["EnbyWEEKcount"] = pulseNoneReduceddf["EnbyWEEKcount"].astype(int)
               WEEK        EST_ST   TBIRTH_YEAR         EEDUC         AEDUC  \
count  10164.000000  10164.000000  10164.000000  10164.000000  10164.000000   
mean      43.931425     28.821822   1975.280500      5.129083      1.993113   
std        6.128902     16.585780     17.485799      1.551461      0.082706   
min       34.000000      1.000000   1933.000000      1.000000      1.000000   
25%       39.000000     13.000000   1962.000000      4.000000      2.000000   
50%       44.000000     29.000000   1978.000000      6.000000      2.000000   
75%       49.000000     44.000000   1990.000000      6.000000      2.000000   
max       54.000000     56.000000   2005.000000      7.000000      2.000000   

       EGENID_BIRTH  AGENID_BIRTH  GENID_DESCRIBE  SEXUAL_ORIENTATION  \
count  10164.000000  10164.000000         10164.0        10164.000000   
mean       1.638233      1.981995             4.0            2.878099   
std        0.480535      0.132975             0.0            1.199313   
min        1.000000      1.000000             4.0            1.000000   
25%        1.000000      2.000000             4.0            2.000000   
50%        2.000000      2.000000             4.0            2.000000   
75%        2.000000      2.000000             4.0            4.000000   
max        2.000000      2.000000             4.0            5.000000   

             INCOME      INCOMEMIN  
count  10164.000000   10164.000000  
mean       3.853503   61277.056277  
std        2.212131   56845.106327  
min        1.000000       0.000000  
25%        2.000000   25000.000000  
50%        4.000000   50000.000000  
75%        6.000000  100000.000000  
max        8.000000  200000.000000  
Mode of  WEEK :  41
Variance of  WEEK :  37.56343628567068
Mode of  EST_ST :  6
Variance of  EST_ST :  275.0880960203078
Mode of  TBIRTH_YEAR :  1993
Variance of  TBIRTH_YEAR :  305.75315311040043
Mode of  EEDUC :  6
Variance of  EEDUC :  2.407029720940252
Mode of  EGENID_BIRTH :  2
Variance of  EGENID_BIRTH :  0.23091436232464746
Mode of  GENID_DESCRIBE :  4
Variance of  GENID_DESCRIBE :  0.0
Mode of  SEXUAL_ORIENTATION :  2
Variance of  SEXUAL_ORIENTATION :  1.4383513604283242
Mode of  INCOMEMIN :  0
Variance of  INCOMEMIN :  3231366113.305501

In [38]:
#print("NonCisgender week counts:")
transdf = pd.DataFrame()
transdf["NonCisgenderWEEKcount"] = pulseStateTotalsdf.loc[:,["TransWomenWEEKcount","TransMenWEEKcount","EnbyWEEKcount"]].sum(axis=1)
pulseStateTotalsdf["NonCisgenderWEEKcount"] = transdf["NonCisgenderWEEKcount"].astype(int)
#print(pulseStateTotalsdf)
del transdf
In [39]:
print("Percentages:")
pulseStateTotalsdf.assign(CisPercent = lambda x: (round(x["CisgenderWEEKcount"]/x["AllWEEKcount"] * 100,2)))
pulseStateTotalsdf.assign(NonCisPercent = lambda x: (round(x["NonCisgenderWEEKcount"]/x["AllWEEKcount"] * 100,2)))
Percentages:
Out[39]:
AllWEEKcount CisMenWEEKcount CisWomenWEEKcount CisgenderWEEKcount TransWomenWEEKcount TransMenWEEKcount EnbyWEEKcount NonCisgenderWEEKcount NonCisPercent
43 64543 25986 37672 63658 131 188 566 885 1.37
41 61390 24939 35525 60464 162 185 579 926 1.51
42 60221 24204 35106 59310 144 198 569 911 1.51
54 60220 25198 34080 59278 153 230 559 942 1.56
52 56169 24487 30771 55258 172 196 543 911 1.62
53 54574 22845 30918 53763 131 190 490 811 1.49
36 53108 21142 31193 52335 108 147 518 773 1.46
35 52930 21095 31041 52136 148 142 504 794 1.50
34 49604 19756 29066 48822 122 138 522 782 1.58
44 49565 20076 28742 48818 112 160 475 747 1.51
37 49090 19691 28689 48380 93 135 482 710 1.45
51 48800 21354 26597 47951 153 173 523 849 1.74
46 48346 19406 28206 47612 113 155 466 734 1.52
40 47908 20526 26667 47193 123 149 443 715 1.49
45 47676 19242 27636 46878 145 169 484 798 1.67
38 45991 18625 26685 45310 101 152 428 681 1.48
47 44593 17972 25899 43871 114 147 461 722 1.62
39 43767 17899 25256 43155 91 106 415 612 1.40
49 40405 17954 21784 39738 128 158 381 667 1.65
48 36405 14994 20714 35708 113 178 406 697 1.91
50 33270 15023 17652 32675 97 148 350 595 1.79
In [40]:
pulseWeekTotalsdf = pulseStateTotalsdf
print(pulseWeekTotalsdf.head())
print()
state_count_col = ["AllWEEKcount","CisgenderWEEKcount","NonCisgenderWEEKcount"
                   ,"CisMenWEEKcount","CisWomenWEEKcount"
                   ,"TransWomenWEEKcount","TransMenWEEKcount","EnbyWEEKcount"]
describeDF(pulseWeekTotalsdf,state_count_col)
    AllWEEKcount  CisMenWEEKcount  CisWomenWEEKcount  CisgenderWEEKcount  \
43         64543            25986              37672               63658   
41         61390            24939              35525               60464   
42         60221            24204              35106               59310   
54         60220            25198              34080               59278   
52         56169            24487              30771               55258   

    TransWomenWEEKcount  TransMenWEEKcount  EnbyWEEKcount  \
43                  131                188            566   
41                  162                185            579   
42                  144                198            569   
54                  153                230            559   
52                  172                196            543   

    NonCisgenderWEEKcount  
43                    885  
41                    926  
42                    911  
54                    942  
52                    911  

       AllWEEKcount  CisMenWEEKcount  CisWomenWEEKcount  CisgenderWEEKcount  \
count     21.000000        21.000000          21.000000           21.000000   
mean   49932.142857     20591.142857       28566.619048        49157.761905   
std     8010.431825      3130.041123        4924.881364         7920.842814   
min    33270.000000     14994.000000       17652.000000        32675.000000   
25%    45991.000000     18625.000000       26597.000000        45310.000000   
50%    49090.000000     20076.000000       28689.000000        48380.000000   
75%    54574.000000     22845.000000       31041.000000        53763.000000   
max    64543.000000     25986.000000       37672.000000        63658.000000   

       TransWomenWEEKcount  TransMenWEEKcount  EnbyWEEKcount  \
count            21.000000          21.000000       21.00000   
mean            126.380952         164.000000      484.00000   
std              23.341971          27.597101       63.95389   
min              91.000000         106.000000      350.00000   
25%             112.000000         147.000000      443.00000   
50%             123.000000         158.000000      484.00000   
75%             145.000000         185.000000      523.00000   
max             172.000000         230.000000      579.00000   

       NonCisgenderWEEKcount  
count              21.000000  
mean              774.380952  
std               101.523631  
min               595.000000  
25%               710.000000  
50%               773.000000  
75%               849.000000  
max               942.000000  
Mode of  AllWEEKcount :  64543
Variance of  AllWEEKcount :  64167018.02857144
Mode of  CisgenderWEEKcount :  63658
Variance of  CisgenderWEEKcount :  62739750.89047618
Mode of  NonCisgenderWEEKcount :  911
Variance of  NonCisgenderWEEKcount :  10307.04761904762
Mode of  CisMenWEEKcount :  25986
Variance of  CisMenWEEKcount :  9797157.42857143
Mode of  CisWomenWEEKcount :  37672
Variance of  CisWomenWEEKcount :  24254456.447619047
Mode of  TransWomenWEEKcount :  131
Variance of  TransWomenWEEKcount :  544.8476190476191
Mode of  TransMenWEEKcount :  147
Variance of  TransMenWEEKcount :  761.6
Mode of  EnbyWEEKcount :  566
Variance of  EnbyWEEKcount :  4090.1

In [41]:
# build datasets based on chosen gender
col_for_counts = ["EST_ST"]

pulseStateReduceddf = pulseIncomeStatsdf
#print("Total set state counts:")
pulseStateCountdf = combinedf(pulseStateReduceddf,col_for_counts,"All")
#print(pulseStateCountdf)

pulseStateTotalsdf = pd.DataFrame()
pulseStateTotalsdf["AllEST_STcount"] = pulseStateCountdf["AllEST_STcount"].astype(int)

del pulseStateCountdf
In [42]:
pulseCisMendf = pulseIncomeStatsdf
pulseCisMendf = pulseCisMendf[pulseCisMendf["EGENID_BIRTH"] == 1]
pulseCisMendf = pulseCisMendf[pulseCisMendf["GENID_DESCRIBE"] == 1]

#print("Cis men state counts:")
pulseCisMenReducedDf = combinedf(pulseCisMendf,col_for_counts,"CisMen")

pulseStateTotalsdf["CisMenEST_STcount"] = pulseCisMenReducedDf["CisMenEST_STcount"].astype(int)
#print(pulseStateTotalsdf["CisMenEST_STcount"])
In [43]:
pulseCisWomendf = pulseIncomeStatsdf
pulseCisWomendf = pulseCisWomendf[pulseCisWomendf["EGENID_BIRTH"] == 2]
pulseCisWomendf = pulseCisWomendf[pulseCisWomendf["GENID_DESCRIBE"] == 2]

#print("Cis men state counts:")
pulseCisWomenReducedDf = combinedf(pulseCisWomendf,col_for_counts,"CisWomen")

pulseStateTotalsdf["CisWomenEST_STcount"] = pulseCisWomenReducedDf["CisWomenEST_STcount"].astype(int)
#print(pulseStateTotalsdf["CisWomenEST_STcount"])
In [44]:
#print("Cisgender state counts:")
cisdf = pd.DataFrame()
cisdf["CisgenderEST_STcount"] = pulseStateTotalsdf.loc[:,["CisMenEST_STcount","CisWomenEST_STcount"]].sum(axis=1)
pulseStateTotalsdf["CisgenderEST_STcount"] = cisdf["CisgenderEST_STcount"].astype(int)
#print(pulseStateTotalsdf)
In [45]:
pulseTranswomendf = pulseIncomeStatsdf
pulseTranswomendf = pulseTranswomendf[pulseTranswomendf["EGENID_BIRTH"] == 1]
pulseTranswomendf = pulseTranswomendf[pulseTranswomendf["GENID_DESCRIBE"].isin([2,3])]

#print("Trans women state counts:")
pulseTranswomenReduceddf = combinedf(pulseTranswomendf,col_for_counts,"TransWomen")
#print(pulseTranswomenReduceddf)

pulseStateTotalsdf["TransWomenEST_STcount"] = pulseTranswomenReduceddf["TransWomenEST_STcount"].astype(int)
In [46]:
pulseTransmendf = pulseIncomeStatsdf
pulseTransmendf = pulseTransmendf[pulseTransmendf["EGENID_BIRTH"] == 2]
pulseTransmendf = pulseTransmendf[pulseTransmendf["GENID_DESCRIBE"].isin([1,3])]

#print("Trans men column counts:")
pulseTransmenReduceddf = combinedf(pulseTransmendf,col_for_counts,"TransMen")
#print(pulseTransmenReduceddf)

pulseStateTotalsdf["TransMenEST_STcount"] = pulseTransmenReduceddf["TransMenEST_STcount"].astype(int)
In [47]:
pulseNonedf = pulseIncomeStatsdf
pulseNonedf = pulseNonedf[pulseNonedf["GENID_DESCRIBE"] == 4]

#print("Non-Binary column counts:")
pulseNoneReduceddf = combinedf(pulseNonedf,col_for_counts,"Enby")
#print(pulseNoneReduceddf)

pulseStateTotalsdf["EnbyEST_STcount"] = pulseNoneReduceddf["EnbyEST_STcount"].astype(int)
In [48]:
print("NonCisgender state counts:")
transdf = pd.DataFrame()
transdf["NonCisgenderEST_STcount"] = pulseStateTotalsdf.loc[:,["TransWomenEST_STcount","TransMenEST_STcount","EnbyEST_STcount"]].sum(axis=1)
pulseStateTotalsdf["NonCisgenderEST_STcount"] = transdf["NonCisgenderEST_STcount"].astype(int)
#print(pulseStateTotalsdf)
del transdf
NonCisgender state counts:
In [49]:
print("Percentages:")
pulseStateTotalsdf.assign(CisPercent = lambda x: (round(x["CisgenderEST_STcount"]/x["AllEST_STcount"] * 100,2)))
pulseStateTotalsdf.assign(NonCisPercent = lambda x: (round(x["NonCisgenderEST_STcount"]/x["AllEST_STcount"] * 100,2)))
Percentages:
Out[49]:
AllEST_STcount CisMenEST_STcount CisWomenEST_STcount CisgenderEST_STcount TransWomenEST_STcount TransMenEST_STcount EnbyEST_STcount NonCisgenderEST_STcount NonCisPercent
6 79763 35075 43358 78433 196 292 842 1330 1.67
48 52968 22829 29432 52261 118 136 453 707 1.33
53 44216 19190 24043 43233 161 216 606 983 2.22
12 36164 15525 20207 35732 69 76 287 432 1.19
25 29233 12153 16581 28734 97 106 296 499 1.71
41 29039 11455 16841 28296 139 188 416 743 2.56
26 28883 11989 16456 28445 80 91 267 438 1.52
49 28771 12504 15855 28359 81 79 252 412 1.43
51 28311 12243 15679 27922 62 71 256 389 1.37
4 28004 11663 15922 27585 63 83 273 419 1.50
8 27907 11805 15629 27434 70 110 293 473 1.69
42 26712 11195 15118 26313 67 95 237 399 1.49
13 24722 9982 14380 24362 58 85 217 360 1.46
24 24421 9971 14084 24055 40 82 244 366 1.50
27 23848 10004 13467 23471 78 80 219 377 1.58
17 22360 9488 12471 21959 80 76 245 401 1.79
20 19673 7807 11564 19371 49 73 180 302 1.54
36 19474 8180 10947 19127 44 82 221 347 1.78
16 19306 7815 11212 19027 42 67 170 279 1.45
35 19302 7508 11472 18980 45 72 205 322 1.67
34 19089 8364 10506 18870 32 34 153 219 1.15
37 18992 7637 11100 18737 33 53 169 255 1.34
18 18836 7441 11123 18564 39 59 174 272 1.44
29 18833 7387 11145 18532 55 60 186 301 1.60
55 18774 7712 10750 18462 50 84 178 312 1.66
47 18081 7053 10763 17816 29 64 172 265 1.47
9 17796 7118 10424 17542 44 53 157 254 1.43
39 17452 7106 10094 17200 46 51 155 252 1.44
19 17153 6676 10241 16917 38 50 148 236 1.38
33 16854 7144 9473 16617 44 50 143 237 1.41
32 16779 7124 9422 16546 36 46 151 233 1.39
2 16617 6761 9546 16307 51 60 199 310 1.87
40 16149 6216 9686 15902 41 50 156 247 1.53
31 16114 6469 9396 15865 31 57 161 249 1.55
45 15821 6122 9491 15613 26 38 144 208 1.31
21 14878 5790 8900 14690 43 39 106 188 1.26
5 13482 5176 8145 13321 28 36 97 161 1.19
1 13238 5261 7807 13068 26 35 109 170 1.28
30 12483 4962 7352 12314 31 33 105 169 1.35
22 12068 4465 7413 11878 33 25 132 190 1.57
50 11646 4572 6880 11452 34 50 110 194 1.67
10 11563 4573 6833 11406 21 28 108 157 1.36
15 11375 4895 6309 11204 30 30 111 171 1.50
23 11128 4338 6605 10943 35 41 109 185 1.66
46 10867 4453 6284 10737 26 18 86 130 1.20
56 10844 4325 6360 10685 28 25 106 159 1.47
54 10685 3943 6606 10549 23 30 83 136 1.27
44 9601 3822 5595 9417 24 47 113 184 1.92
28 9301 3342 5846 9188 14 19 80 113 1.21
38 8999 3786 5086 8872 24 19 84 127 1.41
In [50]:
print(pulseStateTotalsdf.head())
print()
state_count_col = ["AllEST_STcount","CisgenderEST_STcount","NonCisgenderEST_STcount"
                   ,"CisMenEST_STcount","CisWomenEST_STcount"
                   ,"TransWomenEST_STcount","TransMenEST_STcount","EnbyEST_STcount"]
describeDF(pulseStateTotalsdf,state_count_col)
    AllEST_STcount  CisMenEST_STcount  CisWomenEST_STcount  \
6            79763              35075                43358   
48           52968              22829                29432   
53           44216              19190                24043   
12           36164              15525                20207   
25           29233              12153                16581   

    CisgenderEST_STcount  TransWomenEST_STcount  TransMenEST_STcount  \
6                  78433                    196                  292   
48                 52261                    118                  136   
53                 43233                    161                  216   
12                 35732                     69                   76   
25                 28734                     97                  106   

    EnbyEST_STcount  NonCisgenderEST_STcount  
6               842                     1330  
48              453                      707  
53              606                      983  
12              287                      432  
25              296                      499  

       AllEST_STcount  CisMenEST_STcount  CisWomenEST_STcount  \
count       50.000000          50.000000            50.000000   
mean     20971.500000        8648.280000         11997.980000   
std      12222.406664        5449.401873          6575.810556   
min       8999.000000        3342.000000          5086.000000   
25%      13299.000000        5197.250000          7891.500000   
50%      18427.500000        7265.500000         10628.000000   
75%      24646.750000        9998.500000         14306.000000   
max      79763.000000       35075.000000         43358.000000   

       CisgenderEST_STcount  TransWomenEST_STcount  TransMenEST_STcount  \
count             50.000000              50.000000            50.000000   
mean           20646.260000              53.080000            68.880000   
std            12013.928975              36.049762            50.080604   
min             8872.000000              14.000000            18.000000   
25%            13131.250000              31.000000            38.250000   
50%            18139.000000              42.500000            58.000000   
75%            24285.250000              62.750000            81.500000   
max            78433.000000             196.000000           292.000000   

       EnbyEST_STcount  NonCisgenderEST_STcount  
count        50.000000                50.000000  
mean        203.280000               325.240000  
std         135.915946               219.943366  
min          80.000000               113.000000  
25%         111.500000               188.500000  
50%         169.500000               260.000000  
75%         242.250000               386.000000  
max         842.000000              1330.000000  
Mode of  AllEST_STcount :  79763
Variance of  AllEST_STcount :  149387224.66326532
Mode of  CisgenderEST_STcount :  78433
Variance of  CisgenderEST_STcount :  144334489.42081633
Mode of  NonCisgenderEST_STcount :  1330
Variance of  NonCisgenderEST_STcount :  48375.08408163265
Mode of  CisMenEST_STcount :  35075
Variance of  CisMenEST_STcount :  29695980.777142856
Mode of  CisWomenEST_STcount :  43358
Variance of  CisWomenEST_STcount :  43241284.46897959
Mode of  TransWomenEST_STcount :  44
Variance of  TransWomenEST_STcount :  1299.585306122449
Mode of  TransMenEST_STcount :  50
Variance of  TransMenEST_STcount :  2508.0669387755106
Mode of  EnbyEST_STcount :  106
Variance of  EnbyEST_STcount :  18473.14448979592

In [51]:
#add basic gender column
pulseMungedf = pulseIncomeStatsdf
pulseMungedf["CUR_GENID"] = pulseMungedf.apply(lambda x: basicGenMarker(x["EGENID_BIRTH"],x["GENID_DESCRIBE"]), axis=1)
In [52]:
#replace state column name to allow merging
pulseMungedf.rename(columns={"EST_ST":"stateId"}, inplace=True)
print(pulseMungedf.head())
pulseMungedf["stateId"] = pulseMungedf["stateId"].astype(int)
reedFulldf["stateId"] = reedFulldf["stateId"].astype(int)

pulseMungedf = pd.merge(pulseMungedf, reedFulldf, on="stateId", how='inner')
print(pulseMungedf.head())

#list(pulseMungedf.columns)
        SCRAM  WEEK  stateId  TBIRTH_YEAR  EEDUC  AEDUC  EGENID_BIRTH  \
1  V340000002    34        4         1982      7      2             2   
3  V340000004    34       31         1957      4      2             1   
4  V340000005    34       45         1962      5      2             2   
5  V340000006    34        8         1956      7      2             1   
6  V340000007    34       41         1982      7      2             2   

   AGENID_BIRTH  GENID_DESCRIBE  SEXUAL_ORIENTATION  INCOME   ENDDATE  \
1             2               2                   2       7  8/2/2021   
3             2               1                   2       6  8/2/2021   
4             2               2                   2       4  8/2/2021   
5             2               1                   2       7  8/2/2021   
6             2               2                   2       8  8/2/2021   

            EDUCATION ASSIGNEDGENDER CHOSENGENDER SEXUALORIENTATION  \
1     Graduate degree         female       female          straight   
3        some college           male         male          straight   
4  Associate's degree         female       female          straight   
5     Graduate degree           male         male          straight   
6     Graduate degree         female       female          straight   

   INCOMEMIN        CUR_GENID  
1     150000  Cisgender Woman  
3     100000    Cisgender Man  
4      50000  Cisgender Woman  
5     150000    Cisgender Man  
6     200000  Cisgender Woman  
        SCRAM  WEEK  stateId  TBIRTH_YEAR  EEDUC  AEDUC  EGENID_BIRTH  \
0  V340000002    34        4         1982      7      2             2   
1  V340000076    34        4         1986      6      2             1   
2  V340000087    34        4         1945      6      2             1   
3  V340000238    34        4         1966      6      2             2   
4  V340000281    34        4         1973      4      2             2   

   AGENID_BIRTH  GENID_DESCRIBE  SEXUAL_ORIENTATION  ...  \
0             2               2                   2  ...   
1             2               1                   2  ...   
2             2               1                   2  ...   
3             2               2                   2  ...   
4             2               2                   2  ...   

   veryReligiousStatista2017 moderatelyReligiousStatista2017  \
0                       0.31                            0.31   
1                       0.31                            0.31   
2                       0.31                            0.31   
3                       0.31                            0.31   
4                       0.31                            0.31   

  nonreligiousStatista2017 relLibScore2022 relLibVote2022 relLibVax2022  \
0                     0.39          0.4156            1.0           1.0   
1                     0.39          0.4156            1.0           1.0   
2                     0.39          0.4156            1.0           1.0   
3                     0.39          0.4156            1.0           1.0   
4                     0.39          0.4156            1.0           1.0   

   relLibHealth2022 relLibHealthMandate2022 relLibMarriage2022  relLibRfra2022  
0               4.0                     1.0                0.0             1.0  
1               4.0                     1.0                0.0             1.0  
2               4.0                     1.0                0.0             1.0  
3               4.0                     1.0                0.0             1.0  
4               4.0                     1.0                0.0             1.0  

[5 rows x 43 columns]
In [53]:
munge_col_list = ["SCRAM","WEEK","stateId","TBIRTH_YEAR","EEDUC","AEDUC","EGENID_BIRTH","AGENID_BIRTH",
                  "GENID_DESCRIBE","SEXUAL_ORIENTATION","INCOME","ENDDATE","EDUCATION","ASSIGNEDGENDER",
                  "CHOSENGENDER","SEXUALORIENTATION","INCOMEMIN","CUR_GENID",
                  "stateName","statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023",
                  "antiTransLegislationRiskIndex122022","antiTransLegislationRiskIndex112022","transAdultPop2016",
                  "transAdultPercent2016","transAdultPop2022","transAdultPercent2022","religionImportantPew2014",
                  "worshipWeeklyPew2014","prayDailyPew2014","certainAboutGodPew2014","overallReligiosityPew2014",
                  "veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017",
                  "relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022",
                  "relLibMarriage2022","relLibRfra2022"]
In [54]:
#sample from the dataset based on the "CUR_GENID" and stateId columns, 
#looking for (3gender*50state)*113samples = 16,950 rows per run
seed_value = 19
random.seed(seed_value)
rand_int = random.randint(0,1000)
modelSampledf = pulseMungedf.groupby(["CUR_GENID","stateId"]).sample(n=113,random_state=rand_int)

describeDF(modelSampledf,munge_col_list)
               WEEK       stateId   TBIRTH_YEAR         EEDUC         AEDUC  \
count  16950.000000  16950.000000  16950.000000  16950.000000  16950.000000   
mean      44.072153     29.320000   1971.105959      5.254572      1.995162   
std        6.117397     15.624084     16.989544      1.487861      0.069388   
min       34.000000      1.000000   1933.000000      1.000000      1.000000   
25%       39.000000     17.000000   1957.000000      4.000000      2.000000   
50%       44.000000     29.500000   1971.000000      6.000000      2.000000   
75%       50.000000     42.000000   1985.000000      7.000000      2.000000   
max       54.000000     56.000000   2005.000000      7.000000      2.000000   

       EGENID_BIRTH  AGENID_BIRTH  GENID_DESCRIBE  SEXUAL_ORIENTATION  \
count  16950.000000   16950.00000    16950.000000        16950.000000   
mean       1.533510       1.97056        2.136224            2.308024   
std        0.498891       0.16904        1.122674            0.850734   
min        1.000000       1.00000        1.000000            1.000000   
25%        1.000000       2.00000        1.000000            2.000000   
50%        2.000000       2.00000        2.000000            2.000000   
75%        2.000000       2.00000        3.000000            2.000000   
max        2.000000       2.00000        4.000000            5.000000   

             INCOME  ...  veryReligiousStatista2017  \
count  16950.000000  ...               16950.000000   
mean       4.256342  ...                   0.371600   
std        2.169266  ...                   0.089543   
min        1.000000  ...                   0.160000   
25%        2.000000  ...                   0.310000   
50%        4.000000  ...                   0.365000   
75%        6.000000  ...                   0.440000   
max        8.000000  ...                   0.590000   

       moderatelyReligiousStatista2017  nonreligiousStatista2017  \
count                     16950.000000              16950.000000   
mean                          0.287200                  0.342000   
std                           0.030137                  0.098856   
min                           0.160000                  0.120000   
25%                           0.270000                  0.290000   
50%                           0.295000                  0.340000   
75%                           0.300000                  0.400000   
max                           0.330000                  0.590000   

       relLibScore2022  relLibVote2022  relLibVax2022  relLibHealth2022  \
count     16950.000000    16950.000000   16950.000000       16950.00000   
mean          0.393948        0.800000       0.900000           6.76000   
std           0.131963        0.400012       0.300009           3.98288   
min           0.155800        0.000000       0.000000           0.00000   
25%           0.311700        1.000000       1.000000           4.00000   
50%           0.371200        1.000000       1.000000           5.50000   
75%           0.476200        1.000000       1.000000           9.00000   
max           0.818200        1.000000       1.000000          20.00000   

       relLibHealthMandate2022  relLibMarriage2022  relLibRfra2022  
count             16950.000000        16950.000000    16950.000000  
mean                  0.640000            1.160000        0.480000  
std                   0.480014            1.474628        0.499615  
min                   0.000000            0.000000        0.000000  
25%                   0.000000            0.000000        0.000000  
50%                   1.000000            0.000000        0.000000  
75%                   1.000000            3.000000        1.000000  
max                   1.000000            5.000000        1.000000  

[8 rows x 35 columns]
Mode of  WEEK :  54
Variance of  WEEK :  37.422547430596495
Mode of  stateId :  1
Variance of  stateId :  244.11200188801695
Mode of  TBIRTH_YEAR :  1960
Variance of  TBIRTH_YEAR :  288.6446219936923
Mode of  EEDUC :  6
Variance of  EEDUC :  2.2137306418648626
Mode of  AEDUC :  2
Variance of  AEDUC :  0.00481463825799801
Mode of  EGENID_BIRTH :  2
Variance of  EGENID_BIRTH :  0.24889174203157796
Mode of  AGENID_BIRTH :  2
Variance of  AGENID_BIRTH :  0.02857452802620946
Mode of  GENID_DESCRIBE :  1
Variance of  GENID_DESCRIBE :  1.2603963861043481
Mode of  SEXUAL_ORIENTATION :  2
Variance of  SEXUAL_ORIENTATION :  0.7237482323771591
Mode of  INCOME :  4
Variance of  INCOME :  4.705716751155775
Mode of  INCOMEMIN :  50000
Variance of  INCOMEMIN :  3305722837.0866547
Mode of  statePopulation2020 :  5024279
Variance of  statePopulation2020 :  54193215481182.87
Mode of  statePopulation2023 :  5097641
Variance of  statePopulation2023 :  356648599406395.7
Mode of  antiTransLegislationRiskIndex32023 :  4
Variance of  antiTransLegislationRiskIndex32023 :  2.513748303734734
Mode of  antiTransLegislationRiskIndex122022 :  3
Variance of  antiTransLegislationRiskIndex122022 :  1.7605038645347806
Mode of  antiTransLegislationRiskIndex112022 :  1
Variance of  antiTransLegislationRiskIndex112022 :  1.7477031093279838
Mode of  transAdultPop2016 :  2700
Variance of  transAdultPop2016 :  1331132916.9154522
Mode of  transAdultPercent2016 :  0.43
Variance of  transAdultPercent2016 :  0.014520696678270103
Mode of  transAdultPop2022 :  6300
Variance of  transAdultPop2022 :  828822454.0798867
Mode of  transAdultPercent2022 :  0.6
Variance of  transAdultPercent2022 :  0.015779690955218594
Mode of  religionImportantPew2014 :  0.44
Variance of  religionImportantPew2014 :  0.011325668181013627
Mode of  worshipWeeklyPew2014 :  0.34
Variance of  worshipWeeklyPew2014 :  0.005517965543689893
Mode of  prayDailyPew2014 :  0.51
Variance of  prayDailyPew2014 :  0.008712554014986136
Mode of  certainAboutGodPew2014 :  0.61
Variance of  certainAboutGodPew2014 :  0.008895564812083307
Mode of  overallReligiosityPew2014 :  0.54
Variance of  overallReligiosityPew2014 :  0.011309667237005131
Mode of  veryReligiousStatista2017 :  0.28
Variance of  veryReligiousStatista2017 :  0.008017913033217302
Mode of  moderatelyReligiousStatista2017 :  0.3
Variance of  moderatelyReligiousStatista2017 :  0.0009082135819222371
Mode of  nonreligiousStatista2017 :  0.33
Variance of  nonreligiousStatista2017 :  0.009772576553188979
Mode of  relLibScore2022 :  0.3377
Variance of  relLibScore2022 :  0.0174141322784353
Mode of  relLibVote2022 :  1.0
Variance of  relLibVote2022 :  0.16000944008496082
Mode of  relLibVax2022 :  1.0
Variance of  relLibVax2022 :  0.09000531004779044
Mode of  relLibHealth2022 :  5.0
Variance of  relLibHealth2022 :  15.86333589002301
Mode of  relLibHealthMandate2022 :  1.0
Variance of  relLibHealthMandate2022 :  0.2304135937223435
Mode of  relLibMarriage2022 :  0.0
Variance of  relLibMarriage2022 :  2.174528290754617
Mode of  relLibRfra2022 :  0.0
Variance of  relLibRfra2022 :  0.2496147265325388

In [55]:
#convert all values that will be used to model to categories or numbers
col_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","EGENID_BIRTH"
            ,"GENID_DESCRIBE","SEXUAL_ORIENTATION","INCOMEMIN","CUR_GENID"
            ,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
            ,"transAdultPop2022","overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
dfClean = modelSampledf[col_list].copy()

print(dfClean.dtypes)

# convert text columns to category values
dfClean["CUR_GENID"] = dfClean["CUR_GENID"].astype("category")
dfClean["CUR_GENID_CAT"] = dfClean["CUR_GENID"].cat.codes

print(dfClean.describe())
WEEK                                    int64
stateId                                 int32
TBIRTH_YEAR                             int64
EEDUC                                   int64
EGENID_BIRTH                            int64
GENID_DESCRIBE                          int64
SEXUAL_ORIENTATION                      int64
INCOMEMIN                               int64
CUR_GENID                              object
statePopulation2020                     int64
statePopulation2023                     int64
antiTransLegislationRiskIndex32023      int64
transAdultPop2022                       int64
overallReligiosityPew2014             float64
veryReligiousStatista2017             float64
moderatelyReligiousStatista2017       float64
nonreligiousStatista2017              float64
relLibScore2022                       float64
relLibVote2022                        float64
relLibVax2022                         float64
relLibHealth2022                      float64
relLibHealthMandate2022               float64
relLibMarriage2022                    float64
relLibRfra2022                        float64
dtype: object
               WEEK       stateId   TBIRTH_YEAR         EEDUC  EGENID_BIRTH  \
count  16950.000000  16950.000000  16950.000000  16950.000000  16950.000000   
mean      44.072153     29.320000   1971.105959      5.254572      1.533510   
std        6.117397     15.624084     16.989544      1.487861      0.498891   
min       34.000000      1.000000   1933.000000      1.000000      1.000000   
25%       39.000000     17.000000   1957.000000      4.000000      1.000000   
50%       44.000000     29.500000   1971.000000      6.000000      2.000000   
75%       50.000000     42.000000   1985.000000      7.000000      2.000000   
max       54.000000     56.000000   2005.000000      7.000000      2.000000   

       GENID_DESCRIBE  SEXUAL_ORIENTATION      INCOMEMIN  statePopulation2020  \
count    16950.000000        16950.000000   16950.000000         1.695000e+04   
mean         2.136224            2.308024   70282.595870         6.615242e+06   
std          1.122674            0.850734   57495.415792         7.361604e+06   
min          1.000000            1.000000       0.000000         5.768510e+05   
25%          1.000000            2.000000   25000.000000         1.839106e+06   
50%          2.000000            2.000000   50000.000000         4.581796e+06   
75%          3.000000            2.000000  100000.000000         7.705281e+06   
max          4.000000            5.000000  200000.000000         3.953822e+07   

       statePopulation2023  ...  moderatelyReligiousStatista2017  \
count         1.695000e+04  ...                     16950.000000   
mean          8.960485e+06  ...                         0.287200   
std           1.888514e+07  ...                         0.030137   
min           5.808170e+05  ...                         0.160000   
25%           1.920562e+06  ...                         0.270000   
50%           4.625424e+06  ...                         0.295000   
75%           7.999503e+06  ...                         0.300000   
max           1.309280e+08  ...                         0.330000   

       nonreligiousStatista2017  relLibScore2022  relLibVote2022  \
count              16950.000000     16950.000000    16950.000000   
mean                   0.342000         0.393948        0.800000   
std                    0.098856         0.131963        0.400012   
min                    0.120000         0.155800        0.000000   
25%                    0.290000         0.311700        1.000000   
50%                    0.340000         0.371200        1.000000   
75%                    0.400000         0.476200        1.000000   
max                    0.590000         0.818200        1.000000   

       relLibVax2022  relLibHealth2022  relLibHealthMandate2022  \
count   16950.000000       16950.00000             16950.000000   
mean        0.900000           6.76000                 0.640000   
std         0.300009           3.98288                 0.480014   
min         0.000000           0.00000                 0.000000   
25%         1.000000           4.00000                 0.000000   
50%         1.000000           5.50000                 1.000000   
75%         1.000000           9.00000                 1.000000   
max         1.000000          20.00000                 1.000000   

       relLibMarriage2022  relLibRfra2022  CUR_GENID_CAT  
count        16950.000000    16950.000000   16950.000000  
mean             1.160000        0.480000       1.000000  
std              1.474628        0.499615       0.816521  
min              0.000000        0.000000       0.000000  
25%              0.000000        0.000000       0.000000  
50%              0.000000        0.000000       1.000000  
75%              3.000000        1.000000       2.000000  
max              5.000000        1.000000       2.000000  

[8 rows x 24 columns]
In [56]:
# Build kNN Classifier to sort and classify data
# reduce dimensionality based on experimentation and hypothesis criteria

# split dataset into dependent(features) and independent(target) variable
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","EGENID_BIRTH"
            ,"SEXUAL_ORIENTATION","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
            ,"transAdultPop2022","overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
trainSize = 0.3
trainState = 1
In [57]:
# split datasets into training and test sets
size = 0.3
state = 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=size, random_state=state)

# scale input data for training if necessary for better predictions
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)

# prep data
cv_count = 18
max_neighbors = 200 #arbitrary magic number
parameters = {"n_neighbors": np.arange(1, max_neighbors)}
# run regression
knnr = KNeighborsClassifier(n_neighbors=cv_count,weights='distance')

# best neighbor count found in testing at 127
# use gridsearch to test all values for best n_neighbors number and highest accuracy
#knnr_gscv = GridSearchCV(knnr, parameters, cv=cv_count)
#knnr_gscv.fit(X.values, y.values)
#print("Best value for neighbor count found: ",knnr_gscv.best_params_)
#print("Best Average Accuracy found: ",knnr_gscv.best_score_)
In [58]:
# Build the new model
# split dataset into dependent(features) and independent(target) variable
#params = knnr_gscv.best_params_
#n_count = int(params['n_neighbors'])
n_count = 18

# splt datasets into training and test sets
size = 0.3
state = 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=size, random_state=state)

# run regression
knnr = KNeighborsClassifier(n_neighbors=n_count,weights='distance')
knnr.fit(X_train.values, y_train.values)
pred = knnr.predict(X_test.values)

# confusion matrix for visualization is available, but unnecessary for this dataset
cfm = metrics.confusion_matrix(y_test, pred)
fig, ax = plt.subplots(figsize=(6, 6))
ax.matshow(cfm, cmap = plt.cm.Purples, alpha=0.3)
for i in range(cfm.shape[0]):
    for j in range(cfm.shape[1]):
        ax.text(x=j, y=i,s=cfm[i, j], va='center', 
                ha='center', size='xx-large')
 
plt.xlabel('Predictions', fontsize = 16)
plt.ylabel('Actuals', fontsize = 16)
plt.title('kNN Confusion Matrix', fontsize = 16)
plt.show()

print(metrics.classification_report(y_test, pred, zero_division = 0))
              precision    recall  f1-score   support

           0       0.42      0.43      0.43      1721
           1       0.36      0.38      0.37      1650
           2       0.48      0.43      0.45      1714

    accuracy                           0.42      5085
   macro avg       0.42      0.42      0.42      5085
weighted avg       0.42      0.42      0.42      5085

In [59]:
printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix')
              precision    recall  f1-score   support

           0       0.84      0.97      0.90      3976
           1       0.75      0.93      0.83      3896
           2       0.83      0.51      0.63      3993

    accuracy                           0.80     11865
   macro avg       0.81      0.80      0.79     11865
weighted avg       0.81      0.80      0.79     11865

In [60]:
# initialize, train and test the GNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
pred = gnb.predict(X_test)

#check accuracy
gnb_accuracy = metrics.accuracy_score(pred, y_test)

#confusion matrix
cfm = metrics.confusion_matrix(y_test, pred)
fig, ax = plt.subplots(figsize=(6,6))
ax.matshow(cfm, cmap = plt.cm.Purples, alpha=0.3)
for i in range(cfm.shape[0]):
    for j in range(cfm.shape[1]):
        ax.text(x=j, y=i,s=cfm[i, j], va='center', 
                ha='center', size='xx-large')

plt.xlabel('Predictions', fontsize = 16)
plt.ylabel('Actuals', fontsize = 16)
plt.title('Gaussian NB Confusion Matrix', fontsize = 16)
plt.show()

print(metrics.classification_report(y_test, pred, zero_division = 0))
              precision    recall  f1-score   support

           0       0.44      0.41      0.42      1721
           1       0.32      0.15      0.20      1650
           2       0.40      0.63      0.49      1714

    accuracy                           0.40      5085
   macro avg       0.39      0.40      0.37      5085
weighted avg       0.39      0.40      0.38      5085

In [61]:
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix')
              precision    recall  f1-score   support

           0       0.84      0.97      0.90      3976
           1       0.69      0.98      0.81      3896
           2       0.87      0.39      0.54      3993

    accuracy                           0.78     11865
   macro avg       0.80      0.78      0.75     11865
weighted avg       0.80      0.78      0.75     11865

In [62]:
# Setup for A/B testing
# remove "SEXUAL_ORIENTATION" column
# split dataset into dependent(features) and independent(target) variable
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","EGENID_BIRTH","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
            ,"transAdultPop2022","overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
In [63]:
printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Sexuality Removed')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Sexuality Removed')
              precision    recall  f1-score   support

           0       0.74      0.90      0.81      3976
           1       0.65      0.89      0.75      3896
           2       0.52      0.22      0.31      3993

    accuracy                           0.67     11865
   macro avg       0.64      0.67      0.62     11865
weighted avg       0.64      0.67      0.62     11865

              precision    recall  f1-score   support

           0       0.71      1.00      0.83      3976
           1       0.69      0.90      0.78      3896
           2       0.68      0.20      0.31      3993

    accuracy                           0.70     11865
   macro avg       0.69      0.70      0.64     11865
weighted avg       0.69      0.70      0.64     11865

In [64]:
# Setup for A/B testing
# remove "EEDUC","INCOMEMIN" column
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","SEXUAL_ORIENTATION"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"antiTransLegislationRiskIndex32023"
            ,"overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
In [65]:
printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Education and Income Removed')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Education and Income Removed')
              precision    recall  f1-score   support

           0       0.84      0.97      0.90      3976
           1       0.75      0.93      0.83      3896
           2       0.83      0.51      0.63      3993

    accuracy                           0.80     11865
   macro avg       0.81      0.80      0.79     11865
weighted avg       0.81      0.80      0.79     11865

              precision    recall  f1-score   support

           0       0.84      0.97      0.90      3976
           1       0.69      0.98      0.81      3896
           2       0.87      0.39      0.54      3993

    accuracy                           0.78     11865
   macro avg       0.80      0.78      0.75     11865
weighted avg       0.80      0.78      0.75     11865

In [66]:
# Setup for A/B testing
# remove ,"statePopulation2020","statePopulation2023","transAdultPop2022" column
#population table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"antiTransLegislationRiskIndex32023"
            ,"overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
In [67]:
printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Population Removed')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Population Table Removed')
              precision    recall  f1-score   support

           0       0.74      0.90      0.81      3976
           1       0.65      0.90      0.75      3896
           2       0.52      0.21      0.30      3993

    accuracy                           0.67     11865
   macro avg       0.64      0.67      0.62     11865
weighted avg       0.64      0.67      0.62     11865

              precision    recall  f1-score   support

           0       0.71      1.00      0.83      3976
           1       0.69      0.91      0.78      3896
           2       0.69      0.20      0.31      3993

    accuracy                           0.70     11865
   macro avg       0.70      0.70      0.64     11865
weighted avg       0.70      0.70      0.64     11865

In [68]:
# Setup for A/B testing
# remove ,"antiTransLegislationRiskIndex32023" column
#anti-trans legislation table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
In [69]:
printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Anti-Trans Legislation Removed')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Anti-Trans Legislation Table Removed')
              precision    recall  f1-score   support

           0       0.74      0.90      0.81      3976
           1       0.65      0.89      0.75      3896
           2       0.52      0.22      0.31      3993

    accuracy                           0.67     11865
   macro avg       0.64      0.67      0.63     11865
weighted avg       0.64      0.67      0.62     11865

              precision    recall  f1-score   support

           0       0.71      1.00      0.83      3976
           1       0.69      0.90      0.78      3896
           2       0.68      0.20      0.31      3993

    accuracy                           0.70     11865
   macro avg       0.70      0.70      0.64     11865
weighted avg       0.70      0.70      0.64     11865

In [70]:
# Setup for A/B testing
# remove ,"overallReligiosityPew2014" column
#Pew 2014 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"antiTransLegislationRiskIndex32023"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
In [71]:
printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Pew 2014 Removed')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Pew 2014 Table Removed')
              precision    recall  f1-score   support

           0       0.74      0.90      0.81      3976
           1       0.65      0.89      0.75      3896
           2       0.52      0.22      0.31      3993

    accuracy                           0.67     11865
   macro avg       0.64      0.67      0.62     11865
weighted avg       0.64      0.67      0.62     11865

              precision    recall  f1-score   support

           0       0.71      1.00      0.83      3976
           1       0.69      0.91      0.78      3896
           2       0.68      0.20      0.31      3993

    accuracy                           0.70     11865
   macro avg       0.70      0.70      0.64     11865
weighted avg       0.70      0.70      0.64     11865

In [72]:
# Setup for A/B testing
# remove ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017" column
#Statista 2017 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"antiTransLegislationRiskIndex32023"
            ,"overallReligiosityPew2014"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
In [73]:
printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Statista 2017 Removed')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Statista 2017 Table Removed')
              precision    recall  f1-score   support

           0       0.74      0.90      0.81      3976
           1       0.65      0.90      0.75      3896
           2       0.52      0.22      0.31      3993

    accuracy                           0.67     11865
   macro avg       0.64      0.67      0.62     11865
weighted avg       0.64      0.67      0.62     11865

              precision    recall  f1-score   support

           0       0.71      1.00      0.83      3976
           1       0.69      0.90      0.78      3896
           2       0.68      0.20      0.31      3993

    accuracy                           0.70     11865
   macro avg       0.69      0.70      0.64     11865
weighted avg       0.69      0.70      0.64     11865

In [74]:
# Setup for A/B testing
# remove ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014" column
#Statista 2017 and Pew 2014 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"antiTransLegislationRiskIndex32023"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
In [75]:
printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Statista 2017 and Pew 2014 Removed')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Statista 2017 and Pew 2014 Table Removed')
              precision    recall  f1-score   support

           0       0.74      0.90      0.81      3976
           1       0.65      0.90      0.75      3896
           2       0.52      0.22      0.31      3993

    accuracy                           0.67     11865
   macro avg       0.64      0.67      0.62     11865
weighted avg       0.64      0.67      0.62     11865

              precision    recall  f1-score   support

           0       0.71      1.00      0.83      3976
           1       0.69      0.90      0.78      3896
           2       0.68      0.20      0.31      3993

    accuracy                           0.70     11865
   macro avg       0.69      0.70      0.64     11865
weighted avg       0.69      0.70      0.64     11865

In [76]:
# Setup for A/B testing
# remove ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022" column
#Religious Liberty table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"antiTransLegislationRiskIndex32023"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
In [77]:
printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Religious Liberty Removed')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Religious Liberty Table Removed')
              precision    recall  f1-score   support

           0       0.74      0.90      0.81      3976
           1       0.65      0.89      0.75      3896
           2       0.52      0.23      0.32      3993

    accuracy                           0.67     11865
   macro avg       0.64      0.67      0.63     11865
weighted avg       0.64      0.67      0.63     11865

              precision    recall  f1-score   support

           0       0.71      1.00      0.83      3976
           1       0.69      0.91      0.78      3896
           2       0.69      0.20      0.31      3993

    accuracy                           0.70     11865
   macro avg       0.70      0.70      0.64     11865
weighted avg       0.70      0.70      0.64     11865

In [78]:
# Setup for A/B testing
#,"statePopulation2020","statePopulation2023","transAdultPop2022"
#,"antiTransLegislationRiskIndex32023"
#,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014"
# remove ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022" column
#Religious Liberty table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
In [79]:
printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Only Pulse Data')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Only Pulse Data')
              precision    recall  f1-score   support

           0       0.74      0.90      0.81      3976
           1       0.65      0.89      0.76      3896
           2       0.53      0.23      0.32      3993

    accuracy                           0.67     11865
   macro avg       0.64      0.67      0.63     11865
weighted avg       0.64      0.67      0.63     11865

              precision    recall  f1-score   support

           0       0.71      1.00      0.83      3976
           1       0.62      1.00      0.76      3896
           2       0.00      0.00      0.00      3993

    accuracy                           0.66     11865
   macro avg       0.44      0.67      0.53     11865
weighted avg       0.44      0.66      0.53     11865

In [80]:
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","EGENID_BIRTH"
            ,"SEXUAL_ORIENTATION","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
            ,"transAdultPop2022","overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]

# regression pick

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
sc_X = StandardScaler()
sc_y = StandardScaler()
sc_X_train = sc_X.fit_transform(X_train)
sc_y_train = sc_y.fit_transform(y_train.values.reshape(len(y_train),1))
sc_y_train = sc_y_train
In [81]:
log_regression = LogisticRegression(solver="newton-cg", random_state=0).fit(X_train,y_train)
print("Training set score: " + str(round(log_regression.score(X_train,y_train), decPrecision)))
print("Test set score: " + str(round(log_regression.score(X_test,y_test), decPrecision)))

#featTitle = "Logistic Regression Feature Coefficients"
#print(log_regression.coef_)
#print(log_regression.coef_.shape[0])
#print(log_regression.coef_[0])
coefArray = []
for x in log_regression.coef_[0]:
    #print(np.exp(x)/(1 + np.exp(x)))
    coefArray.append(np.exp(x))

featureDf = pd.DataFrame(zip(clean_feature_list, np.transpose(coefArray)), columns=["featureNames", "Coefficients"])
print(featureDf)
Training set score: 0.68
Test set score: 0.6743
         featureNames  Coefficients
0                WEEK      1.002763
1             stateId      1.000906
2         TBIRTH_YEAR      1.003440
3               EEDUC      0.005272
4        EGENID_BIRTH      1.041176
5  SEXUAL_ORIENTATION      1.000001
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:210: ConvergenceWarning: newton-cg failed to converge. Increase the number of iterations.
  warnings.warn(
In [82]:
log_regression = LogisticRegression(solver="sag", random_state=0).fit(X_train,y_train)
print("Training set score: " + str(round(log_regression.score(X_train,y_train), decPrecision)))
print("Test set score: " + str(round(log_regression.score(X_test,y_test), decPrecision)))

#featTitle = "Logistic Regression Feature Coefficients"
#print(log_regression.coef_)
#print(log_regression.coef_.shape[0])
#print(log_regression.coef_[0])
coefArray = []
for x in log_regression.coef_[0]:
    #print(np.exp(x)/(1 + np.exp(x)))
    coefArray.append(np.exp(x)/(1 + np.exp(x)))

featureDf = pd.DataFrame(zip(clean_feature_list, np.transpose(coefArray)), columns=["featureNames", "Coefficients"])
print(featureDf)
Training set score: 0.4144
Test set score: 0.411
         featureNames  Coefficients
0                WEEK      0.500001
1             stateId      0.500001
2         TBIRTH_YEAR      0.499962
3               EEDUC      0.499995
4        EGENID_BIRTH      0.500000
5  SEXUAL_ORIENTATION      0.500001
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
In [83]:
log_regression = LogisticRegression(solver="saga", random_state=0).fit(X_train,y_train)
print("Training set score: " + str(round(log_regression.score(X_train,y_train), decPrecision)))
print("Test set score: " + str(round(log_regression.score(X_test,y_test), decPrecision)))

#featTitle = "Logistic Regression Feature Coefficients"
#print(log_regression.coef_)
#print(log_regression.coef_.shape[0])
#print(log_regression.coef_[0])
coefArray = []
for x in log_regression.coef_[0]:
    #print(np.exp(x)/(1 + np.exp(x)))
    coefArray.append(np.exp(x)/(1 + np.exp(x)))

featureDf = pd.DataFrame(zip(clean_feature_list, np.transpose(coefArray)), columns=["featureNames", "Coefficients"])
print(featureDf)
Training set score: 0.4144
Test set score: 0.411
         featureNames  Coefficients
0                WEEK      0.500000
1             stateId      0.500000
2         TBIRTH_YEAR      0.499962
3               EEDUC      0.499997
4        EGENID_BIRTH      0.500000
5  SEXUAL_ORIENTATION      0.500001
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
In [84]:
log_regression = LogisticRegression(solver="lbfgs", random_state=0).fit(X_train,y_train)
print("Training set score: " + str(round(log_regression.score(X_train,y_train), decPrecision)))
print("Test set score: " + str(round(log_regression.score(X_test,y_test), decPrecision)))

#featTitle = "Logistic Regression Feature Coefficients"
#print(log_regression.coef_)
#print(log_regression.coef_.shape[0])
#print(log_regression.coef_[0])
coefArray = []
for x in log_regression.coef_[0]:
    #print(np.exp(x)/(1 + np.exp(x)))
    coefArray.append(np.exp(x)/(1 + np.exp(x)))

featureDf = pd.DataFrame(zip(clean_feature_list, np.transpose(coefArray)), columns=["featureNames", "Coefficients"])
print(featureDf)
Training set score: 0.4144
Test set score: 0.411
         featureNames  Coefficients
0                WEEK      0.499999
1             stateId      0.499999
2         TBIRTH_YEAR      0.499962
3               EEDUC      0.500000
4        EGENID_BIRTH      0.500000
5  SEXUAL_ORIENTATION      0.500001
In [85]:
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","EGENID_BIRTH"
            ,"SEXUAL_ORIENTATION","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
            ,"transAdultPop2022","overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Full Dataset")
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1-0.3, random_state = 0)
#sc_X = StandardScaler()
#sc_y = StandardScaler()
#sc_X_train = sc_X.fit_transform(X_train)
#sc_y_train = sc_y.fit_transform(y_train.values.reshape(len(y_train),1))
#sc_y_train = sc_y_train

#maxIter=1000000000
#log_regression = LogisticRegression(max_iter=max_iter)
#solvers = ["liblinear","newton-cg","sag","saga","lbfgs"]
#penalty=["l2"]
#cVals=[0.01,0.1,1.0,10.0,100.0]

#grid=dict(solver=solvers,penalty=penalty,C=cVals)
#cv=RepeatedStratifiedKFold(n_splits=10,n_repeats=3,random_state=state)
#grid_search = GridSearchCV(estimator=log_regression, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
#grid_result = grid_search.fit(X,y)
# summarize results
#print("Accuracy rate of Logistic Regression: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
#Accuracy rate of Logistic Regression: 0.797286 using {'C': 0.01, 'penalty': 'l2', 'solver': 'newton-cg'}


#log_regression = LogisticRegression(solver="newton-cg", random_state=state, penalty="l2", C=0.01, max_iter=maxIter).fit(X_train,y_train)
#print(log_regression.coef_)
#print(log_regression.coef_.shape[0])
#print(log_regression.coef_[0])
#print("Training set score: " + str(round(log_regression.score(X_train,y_train), decPrecision)))
#print("Test set score: " + str(round(log_regression.score(X_test,y_test), decPrecision)))

#coefArray = []
#for ind in range(log_regression.coef_.shape[0]):
#    if ind == 0:
#        featFor = "Cisgender Men"
#    elif ind == 1:
#        featFor = "Cisgender Women"
#    else:
#        featFor = "Transgender"
#    featTitle = "Logistic Regression " + featFor + " Feature Coefficients"

#    for x in log_regression.coef_[ind]:
#        #print(np.exp(x)/(1 + np.exp(x)))
#        coefArray.append(np.exp(x)/(1 + np.exp(x)))

#    featureDf = pd.DataFrame(zip(clean_feature_list, np.transpose(coefArray)), columns=["Features", "Coefficients"])
#    featureDf = featureDf.sort_values(by=["Coefficients"], ascending=False)
#    print(featureDf)
    
#    #plot bar chart of importance
#    f, ax = plt.subplots(figsize=(20,12))
#    sns.barplot(x=featureDf["Features"], y=featureDf["Coefficients"], palette="flare")
#    plt.title(featTitle, fontsize=14)
#    plt.xticks(rotation=45)
    
#    plt.show()
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging
  warn(msg, LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
  warnings.warn("Line Search failed")
Logistic Regression Accuracy: 0.791
                              Features  Coefficients
3                                EEDUC      0.510278
11           overallReligiosityPew2014      0.504194
12           veryReligiousStatista2017      0.501816
13     moderatelyReligiousStatista2017      0.501757
21                      relLibRfra2022      0.501642
15                     relLibScore2022      0.500698
20                  relLibMarriage2022      0.500558
2                          TBIRTH_YEAR      0.500448
18                    relLibHealth2022      0.500004
6                            INCOMEMIN      0.500001
7                  statePopulation2020      0.500000
8                  statePopulation2023      0.500000
10                   transAdultPop2022      0.499999
0                                 WEEK      0.499949
1                              stateId      0.499818
19             relLibHealthMandate2022      0.499021
14            nonreligiousStatista2017      0.499015
9   antiTransLegislationRiskIndex32023      0.498912
17                       relLibVax2022      0.497051
16                      relLibVote2022      0.491821
5                   SEXUAL_ORIENTATION      0.406382
4                         EGENID_BIRTH      0.112777
                              Features  Coefficients
3                                EEDUC      0.510278
11           overallReligiosityPew2014      0.504194
12           veryReligiousStatista2017      0.501816
13     moderatelyReligiousStatista2017      0.501757
21                      relLibRfra2022      0.501642
15                     relLibScore2022      0.500698
20                  relLibMarriage2022      0.500558
2                          TBIRTH_YEAR      0.500448
18                    relLibHealth2022      0.500004
6                            INCOMEMIN      0.500001
7                  statePopulation2020      0.500000
8                  statePopulation2023      0.500000
10                   transAdultPop2022      0.499999
0                                 WEEK      0.499949
1                              stateId      0.499818
19             relLibHealthMandate2022      0.499021
14            nonreligiousStatista2017      0.499015
9   antiTransLegislationRiskIndex32023      0.498912
17                       relLibVax2022      0.497051
16                      relLibVote2022      0.491821
5                   SEXUAL_ORIENTATION      0.406382
4                         EGENID_BIRTH      0.112777
                              Features  Coefficients
3                                EEDUC      0.510278
11           overallReligiosityPew2014      0.504194
12           veryReligiousStatista2017      0.501816
13     moderatelyReligiousStatista2017      0.501757
21                      relLibRfra2022      0.501642
15                     relLibScore2022      0.500698
20                  relLibMarriage2022      0.500558
2                          TBIRTH_YEAR      0.500448
18                    relLibHealth2022      0.500004
6                            INCOMEMIN      0.500001
7                  statePopulation2020      0.500000
8                  statePopulation2023      0.500000
10                   transAdultPop2022      0.499999
0                                 WEEK      0.499949
1                              stateId      0.499818
19             relLibHealthMandate2022      0.499021
14            nonreligiousStatista2017      0.499015
9   antiTransLegislationRiskIndex32023      0.498912
17                       relLibVax2022      0.497051
16                      relLibVote2022      0.491821
5                   SEXUAL_ORIENTATION      0.406382
4                         EGENID_BIRTH      0.112777
In [86]:
# Setup for A/B testing
# remove "SEXUAL_ORIENTATION" column
# split dataset into dependent(features) and independent(target) variable
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","EGENID_BIRTH","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
            ,"transAdultPop2022","overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Sexuality Removed")
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging
  warn(msg, LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
  warnings.warn("Line Search failed")
Logistic Regression Accuracy: 0.6689
                              Features  Coefficients
3                                EEDUC      0.511605
20                      relLibRfra2022      0.505645
10           overallReligiosityPew2014      0.504704
12     moderatelyReligiousStatista2017      0.502584
19                  relLibMarriage2022      0.501647
11           veryReligiousStatista2017      0.501373
0                                 WEEK      0.500593
2                          TBIRTH_YEAR      0.500314
14                     relLibScore2022      0.500257
8   antiTransLegislationRiskIndex32023      0.500143
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
9                    transAdultPop2022      0.500000
17                    relLibHealth2022      0.499961
1                              stateId      0.499648
13            nonreligiousStatista2017      0.499627
16                       relLibVax2022      0.497257
18             relLibHealthMandate2022      0.496243
15                      relLibVote2022      0.489497
4                         EGENID_BIRTH      0.112855
                              Features  Coefficients
3                                EEDUC      0.511605
20                      relLibRfra2022      0.505645
10           overallReligiosityPew2014      0.504704
12     moderatelyReligiousStatista2017      0.502584
19                  relLibMarriage2022      0.501647
11           veryReligiousStatista2017      0.501373
0                                 WEEK      0.500593
2                          TBIRTH_YEAR      0.500314
14                     relLibScore2022      0.500257
8   antiTransLegislationRiskIndex32023      0.500143
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
9                    transAdultPop2022      0.500000
17                    relLibHealth2022      0.499961
1                              stateId      0.499648
13            nonreligiousStatista2017      0.499627
16                       relLibVax2022      0.497257
18             relLibHealthMandate2022      0.496243
15                      relLibVote2022      0.489497
4                         EGENID_BIRTH      0.112855
                              Features  Coefficients
3                                EEDUC      0.511605
20                      relLibRfra2022      0.505645
10           overallReligiosityPew2014      0.504704
12     moderatelyReligiousStatista2017      0.502584
19                  relLibMarriage2022      0.501647
11           veryReligiousStatista2017      0.501373
0                                 WEEK      0.500593
2                          TBIRTH_YEAR      0.500314
14                     relLibScore2022      0.500257
8   antiTransLegislationRiskIndex32023      0.500143
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
9                    transAdultPop2022      0.500000
17                    relLibHealth2022      0.499961
1                              stateId      0.499648
13            nonreligiousStatista2017      0.499627
16                       relLibVax2022      0.497257
18             relLibHealthMandate2022      0.496243
15                      relLibVote2022      0.489497
4                         EGENID_BIRTH      0.112855
In [105]:
# Setup for A/B testing
# remove "EGENID_BIRTH", column
# split dataset into dependent(features) and independent(target) variable
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","SEXUAL_ORIENTATION","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
            ,"transAdultPop2022","overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Sexuality Removed")
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging
  warn(msg, LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
  warnings.warn("Line Search failed")
Logistic Regression Accuracy: 0.5102
                              Features  Coefficients
3                                EEDUC      0.510443
10           overallReligiosityPew2014      0.503726
18             relLibHealthMandate2022      0.502791
20                      relLibRfra2022      0.502573
11           veryReligiousStatista2017      0.501265
12     moderatelyReligiousStatista2017      0.501013
8   antiTransLegislationRiskIndex32023      0.500868
0                                 WEEK      0.500678
14                     relLibScore2022      0.500242
2                          TBIRTH_YEAR      0.500045
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
9                    transAdultPop2022      0.499999
1                              stateId      0.499939
13            nonreligiousStatista2017      0.499419
17                    relLibHealth2022      0.498873
19                  relLibMarriage2022      0.497932
16                       relLibVax2022      0.497623
15                      relLibVote2022      0.490879
4                   SEXUAL_ORIENTATION      0.402553
                              Features  Coefficients
3                                EEDUC      0.510443
10           overallReligiosityPew2014      0.503726
18             relLibHealthMandate2022      0.502791
20                      relLibRfra2022      0.502573
11           veryReligiousStatista2017      0.501265
12     moderatelyReligiousStatista2017      0.501013
8   antiTransLegislationRiskIndex32023      0.500868
0                                 WEEK      0.500678
14                     relLibScore2022      0.500242
2                          TBIRTH_YEAR      0.500045
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
9                    transAdultPop2022      0.499999
1                              stateId      0.499939
13            nonreligiousStatista2017      0.499419
17                    relLibHealth2022      0.498873
19                  relLibMarriage2022      0.497932
16                       relLibVax2022      0.497623
15                      relLibVote2022      0.490879
4                   SEXUAL_ORIENTATION      0.402553
                              Features  Coefficients
3                                EEDUC      0.510443
10           overallReligiosityPew2014      0.503726
18             relLibHealthMandate2022      0.502791
20                      relLibRfra2022      0.502573
11           veryReligiousStatista2017      0.501265
12     moderatelyReligiousStatista2017      0.501013
8   antiTransLegislationRiskIndex32023      0.500868
0                                 WEEK      0.500678
14                     relLibScore2022      0.500242
2                          TBIRTH_YEAR      0.500045
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
9                    transAdultPop2022      0.499999
1                              stateId      0.499939
13            nonreligiousStatista2017      0.499419
17                    relLibHealth2022      0.498873
19                  relLibMarriage2022      0.497932
16                       relLibVax2022      0.497623
15                      relLibVote2022      0.490879
4                   SEXUAL_ORIENTATION      0.402553
In [87]:
# Setup for A/B testing
# remove "EEDUC","INCOMEMIN" column
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","SEXUAL_ORIENTATION"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"antiTransLegislationRiskIndex32023"
            ,"overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Education and Income Removed")
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging
  warn(msg, LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
  warnings.warn("Line Search failed")
Logistic Regression Accuracy: 0.7853
                              Features  Coefficients
9            overallReligiosityPew2014      0.503246
11     moderatelyReligiousStatista2017      0.501590
19                      relLibRfra2022      0.501426
10           veryReligiousStatista2017      0.501360
13                     relLibScore2022      0.500738
18                  relLibMarriage2022      0.500612
2                          TBIRTH_YEAR      0.500497
15                       relLibVax2022      0.500336
16                    relLibHealth2022      0.500105
5                  statePopulation2020      0.500000
6                  statePopulation2023      0.500000
7                    transAdultPop2022      0.499999
1                              stateId      0.499872
0                                 WEEK      0.499861
12            nonreligiousStatista2017      0.499604
17             relLibHealthMandate2022      0.499491
8   antiTransLegislationRiskIndex32023      0.496482
14                      relLibVote2022      0.494724
4                   SEXUAL_ORIENTATION      0.403340
3                         EGENID_BIRTH      0.114981
                              Features  Coefficients
9            overallReligiosityPew2014      0.503246
11     moderatelyReligiousStatista2017      0.501590
19                      relLibRfra2022      0.501426
10           veryReligiousStatista2017      0.501360
13                     relLibScore2022      0.500738
18                  relLibMarriage2022      0.500612
2                          TBIRTH_YEAR      0.500497
15                       relLibVax2022      0.500336
16                    relLibHealth2022      0.500105
5                  statePopulation2020      0.500000
6                  statePopulation2023      0.500000
7                    transAdultPop2022      0.499999
1                              stateId      0.499872
0                                 WEEK      0.499861
12            nonreligiousStatista2017      0.499604
17             relLibHealthMandate2022      0.499491
8   antiTransLegislationRiskIndex32023      0.496482
14                      relLibVote2022      0.494724
4                   SEXUAL_ORIENTATION      0.403340
3                         EGENID_BIRTH      0.114981
                              Features  Coefficients
9            overallReligiosityPew2014      0.503246
11     moderatelyReligiousStatista2017      0.501590
19                      relLibRfra2022      0.501426
10           veryReligiousStatista2017      0.501360
13                     relLibScore2022      0.500738
18                  relLibMarriage2022      0.500612
2                          TBIRTH_YEAR      0.500497
15                       relLibVax2022      0.500336
16                    relLibHealth2022      0.500105
5                  statePopulation2020      0.500000
6                  statePopulation2023      0.500000
7                    transAdultPop2022      0.499999
1                              stateId      0.499872
0                                 WEEK      0.499861
12            nonreligiousStatista2017      0.499604
17             relLibHealthMandate2022      0.499491
8   antiTransLegislationRiskIndex32023      0.496482
14                      relLibVote2022      0.494724
4                   SEXUAL_ORIENTATION      0.403340
3                         EGENID_BIRTH      0.114981
In [88]:
# Setup for A/B testing
# remove ,"statePopulation2020","statePopulation2023","transAdultPop2022" column
#population table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"antiTransLegislationRiskIndex32023"
            ,"overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Population Removed")
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
  warnings.warn("Line Search failed")
Logistic Regression Accuracy: 0.7093
                              Features  Coefficients
4                                EEDUC      0.509532
17                      relLibRfra2022      0.505297
16                  relLibMarriage2022      0.502339
7            overallReligiosityPew2014      0.502115
6   antiTransLegislationRiskIndex32023      0.501436
9      moderatelyReligiousStatista2017      0.501296
0                                 WEEK      0.500696
8            veryReligiousStatista2017      0.500224
11                     relLibScore2022      0.500088
5                            INCOMEMIN      0.500001
14                    relLibHealth2022      0.499730
1                              stateId      0.499727
10            nonreligiousStatista2017      0.498659
2                          TBIRTH_YEAR      0.498232
15             relLibHealthMandate2022      0.495723
13                       relLibVax2022      0.493200
12                      relLibVote2022      0.489051
3                         EGENID_BIRTH      0.111752
                              Features  Coefficients
4                                EEDUC      0.509532
17                      relLibRfra2022      0.505297
16                  relLibMarriage2022      0.502339
7            overallReligiosityPew2014      0.502115
6   antiTransLegislationRiskIndex32023      0.501436
9      moderatelyReligiousStatista2017      0.501296
0                                 WEEK      0.500696
8            veryReligiousStatista2017      0.500224
11                     relLibScore2022      0.500088
5                            INCOMEMIN      0.500001
14                    relLibHealth2022      0.499730
1                              stateId      0.499727
10            nonreligiousStatista2017      0.498659
2                          TBIRTH_YEAR      0.498232
15             relLibHealthMandate2022      0.495723
13                       relLibVax2022      0.493200
12                      relLibVote2022      0.489051
3                         EGENID_BIRTH      0.111752
                              Features  Coefficients
4                                EEDUC      0.509532
17                      relLibRfra2022      0.505297
16                  relLibMarriage2022      0.502339
7            overallReligiosityPew2014      0.502115
6   antiTransLegislationRiskIndex32023      0.501436
9      moderatelyReligiousStatista2017      0.501296
0                                 WEEK      0.500696
8            veryReligiousStatista2017      0.500224
11                     relLibScore2022      0.500088
5                            INCOMEMIN      0.500001
14                    relLibHealth2022      0.499730
1                              stateId      0.499727
10            nonreligiousStatista2017      0.498659
2                          TBIRTH_YEAR      0.498232
15             relLibHealthMandate2022      0.495723
13                       relLibVax2022      0.493200
12                      relLibVote2022      0.489051
3                         EGENID_BIRTH      0.111752
In [89]:
# Setup for A/B testing
# remove ,"antiTransLegislationRiskIndex32023" column
#anti-trans legislation table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Anti-Trans Legislation Removed")
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging
  warn(msg, LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
  warnings.warn("Line Search failed")
Logistic Regression Accuracy: 0.6684
                           Features  Coefficients
4                             EEDUC      0.511643
19                   relLibRfra2022      0.505801
9         overallReligiosityPew2014      0.505390
11  moderatelyReligiousStatista2017      0.502597
10        veryReligiousStatista2017      0.502021
18               relLibMarriage2022      0.501604
0                              WEEK      0.500581
13                  relLibScore2022      0.500353
2                       TBIRTH_YEAR      0.500315
5                         INCOMEMIN      0.500001
6               statePopulation2020      0.500000
7               statePopulation2023      0.500000
8                 transAdultPop2022      0.500000
16                 relLibHealth2022      0.499941
1                           stateId      0.499647
12         nonreligiousStatista2017      0.498976
15                    relLibVax2022      0.497316
17          relLibHealthMandate2022      0.496308
14                   relLibVote2022      0.489418
3                      EGENID_BIRTH      0.112749
                           Features  Coefficients
4                             EEDUC      0.511643
19                   relLibRfra2022      0.505801
9         overallReligiosityPew2014      0.505390
11  moderatelyReligiousStatista2017      0.502597
10        veryReligiousStatista2017      0.502021
18               relLibMarriage2022      0.501604
0                              WEEK      0.500581
13                  relLibScore2022      0.500353
2                       TBIRTH_YEAR      0.500315
5                         INCOMEMIN      0.500001
6               statePopulation2020      0.500000
7               statePopulation2023      0.500000
8                 transAdultPop2022      0.500000
16                 relLibHealth2022      0.499941
1                           stateId      0.499647
12         nonreligiousStatista2017      0.498976
15                    relLibVax2022      0.497316
17          relLibHealthMandate2022      0.496308
14                   relLibVote2022      0.489418
3                      EGENID_BIRTH      0.112749
                           Features  Coefficients
4                             EEDUC      0.511643
19                   relLibRfra2022      0.505801
9         overallReligiosityPew2014      0.505390
11  moderatelyReligiousStatista2017      0.502597
10        veryReligiousStatista2017      0.502021
18               relLibMarriage2022      0.501604
0                              WEEK      0.500581
13                  relLibScore2022      0.500353
2                       TBIRTH_YEAR      0.500315
5                         INCOMEMIN      0.500001
6               statePopulation2020      0.500000
7               statePopulation2023      0.500000
8                 transAdultPop2022      0.500000
16                 relLibHealth2022      0.499941
1                           stateId      0.499647
12         nonreligiousStatista2017      0.498976
15                    relLibVax2022      0.497316
17          relLibHealthMandate2022      0.496308
14                   relLibVote2022      0.489418
3                      EGENID_BIRTH      0.112749
In [90]:
# Setup for A/B testing
# remove ,"overallReligiosityPew2014" column
#Pew 2014 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"antiTransLegislationRiskIndex32023"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Pew 2014 Removed")
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging
  warn(msg, LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
  warnings.warn("Line Search failed")
Logistic Regression Accuracy: 0.6694
                              Features  Coefficients
4                                EEDUC      0.511619
19                      relLibRfra2022      0.505808
11     moderatelyReligiousStatista2017      0.502805
18                  relLibMarriage2022      0.501627
10           veryReligiousStatista2017      0.501534
0                                 WEEK      0.500607
13                     relLibScore2022      0.500326
2                          TBIRTH_YEAR      0.500317
9   antiTransLegislationRiskIndex32023      0.500300
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
8                    transAdultPop2022      0.500000
16                    relLibHealth2022      0.499978
1                              stateId      0.499649
12            nonreligiousStatista2017      0.499524
15                       relLibVax2022      0.497192
17             relLibHealthMandate2022      0.496067
14                      relLibVote2022      0.489147
3                         EGENID_BIRTH      0.111512
                              Features  Coefficients
4                                EEDUC      0.511619
19                      relLibRfra2022      0.505808
11     moderatelyReligiousStatista2017      0.502805
18                  relLibMarriage2022      0.501627
10           veryReligiousStatista2017      0.501534
0                                 WEEK      0.500607
13                     relLibScore2022      0.500326
2                          TBIRTH_YEAR      0.500317
9   antiTransLegislationRiskIndex32023      0.500300
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
8                    transAdultPop2022      0.500000
16                    relLibHealth2022      0.499978
1                              stateId      0.499649
12            nonreligiousStatista2017      0.499524
15                       relLibVax2022      0.497192
17             relLibHealthMandate2022      0.496067
14                      relLibVote2022      0.489147
3                         EGENID_BIRTH      0.111512
                              Features  Coefficients
4                                EEDUC      0.511619
19                      relLibRfra2022      0.505808
11     moderatelyReligiousStatista2017      0.502805
18                  relLibMarriage2022      0.501627
10           veryReligiousStatista2017      0.501534
0                                 WEEK      0.500607
13                     relLibScore2022      0.500326
2                          TBIRTH_YEAR      0.500317
9   antiTransLegislationRiskIndex32023      0.500300
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
8                    transAdultPop2022      0.500000
16                    relLibHealth2022      0.499978
1                              stateId      0.499649
12            nonreligiousStatista2017      0.499524
15                       relLibVax2022      0.497192
17             relLibHealthMandate2022      0.496067
14                      relLibVote2022      0.489147
3                         EGENID_BIRTH      0.111512
In [91]:
# Setup for A/B testing
# remove ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017" column
#Statista 2017 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"antiTransLegislationRiskIndex32023"
            ,"overallReligiosityPew2014"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Statista 2017 Removed")
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging
  warn(msg, LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
  warnings.warn("Line Search failed")
Logistic Regression Accuracy: 0.6689
                              Features  Coefficients
4                                EEDUC      0.511610
17                      relLibRfra2022      0.505686
10           overallReligiosityPew2014      0.504812
16                  relLibMarriage2022      0.501656
0                                 WEEK      0.500590
2                          TBIRTH_YEAR      0.500315
11                     relLibScore2022      0.500276
9   antiTransLegislationRiskIndex32023      0.500180
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
8                    transAdultPop2022      0.500000
14                    relLibHealth2022      0.499969
1                              stateId      0.499648
13                       relLibVax2022      0.497284
15             relLibHealthMandate2022      0.496187
12                      relLibVote2022      0.489382
3                         EGENID_BIRTH      0.112520
                              Features  Coefficients
4                                EEDUC      0.511610
17                      relLibRfra2022      0.505686
10           overallReligiosityPew2014      0.504812
16                  relLibMarriage2022      0.501656
0                                 WEEK      0.500590
2                          TBIRTH_YEAR      0.500315
11                     relLibScore2022      0.500276
9   antiTransLegislationRiskIndex32023      0.500180
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
8                    transAdultPop2022      0.500000
14                    relLibHealth2022      0.499969
1                              stateId      0.499648
13                       relLibVax2022      0.497284
15             relLibHealthMandate2022      0.496187
12                      relLibVote2022      0.489382
3                         EGENID_BIRTH      0.112520
                              Features  Coefficients
4                                EEDUC      0.511610
17                      relLibRfra2022      0.505686
10           overallReligiosityPew2014      0.504812
16                  relLibMarriage2022      0.501656
0                                 WEEK      0.500590
2                          TBIRTH_YEAR      0.500315
11                     relLibScore2022      0.500276
9   antiTransLegislationRiskIndex32023      0.500180
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
8                    transAdultPop2022      0.500000
14                    relLibHealth2022      0.499969
1                              stateId      0.499648
13                       relLibVax2022      0.497284
15             relLibHealthMandate2022      0.496187
12                      relLibVote2022      0.489382
3                         EGENID_BIRTH      0.112520
In [92]:
# Setup for A/B testing
# remove ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014" column
#Statista 2017 and Pew 2014 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"antiTransLegislationRiskIndex32023"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Statista 2017 and Pew 2014 Removed")
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging
  warn(msg, LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
  warnings.warn("Line Search failed")
Logistic Regression Accuracy: 0.6802
                              Features  Coefficients
4                                EEDUC      0.510759
16                      relLibRfra2022      0.506450
9   antiTransLegislationRiskIndex32023      0.501529
15                  relLibMarriage2022      0.501230
0                                 WEEK      0.500592
13                    relLibHealth2022      0.500099
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
8                    transAdultPop2022      0.500000
2                          TBIRTH_YEAR      0.499922
1                              stateId      0.499648
14             relLibHealthMandate2022      0.497649
12                       relLibVax2022      0.494921
10                     relLibScore2022      0.491488
11                      relLibVote2022      0.488917
3                         EGENID_BIRTH      0.109164
                              Features  Coefficients
4                                EEDUC      0.510759
16                      relLibRfra2022      0.506450
9   antiTransLegislationRiskIndex32023      0.501529
15                  relLibMarriage2022      0.501230
0                                 WEEK      0.500592
13                    relLibHealth2022      0.500099
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
8                    transAdultPop2022      0.500000
2                          TBIRTH_YEAR      0.499922
1                              stateId      0.499648
14             relLibHealthMandate2022      0.497649
12                       relLibVax2022      0.494921
10                     relLibScore2022      0.491488
11                      relLibVote2022      0.488917
3                         EGENID_BIRTH      0.109164
                              Features  Coefficients
4                                EEDUC      0.510759
16                      relLibRfra2022      0.506450
9   antiTransLegislationRiskIndex32023      0.501529
15                  relLibMarriage2022      0.501230
0                                 WEEK      0.500592
13                    relLibHealth2022      0.500099
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
8                    transAdultPop2022      0.500000
2                          TBIRTH_YEAR      0.499922
1                              stateId      0.499648
14             relLibHealthMandate2022      0.497649
12                       relLibVax2022      0.494921
10                     relLibScore2022      0.491488
11                      relLibVote2022      0.488917
3                         EGENID_BIRTH      0.109164
In [93]:
# Setup for A/B testing
# remove ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022" column
#Religious Liberty table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"antiTransLegislationRiskIndex32023"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Religious Liberty Removed")
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging
  warn(msg, LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
  warnings.warn("Line Search failed")
Logistic Regression Accuracy: 0.6761
                              Features  Coefficients
4                                EEDUC      0.510815
13           overallReligiosityPew2014      0.506299
11     moderatelyReligiousStatista2017      0.504494
10           veryReligiousStatista2017      0.501713
9   antiTransLegislationRiskIndex32023      0.500621
0                                 WEEK      0.500606
12            nonreligiousStatista2017      0.500278
2                          TBIRTH_YEAR      0.500069
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
8                    transAdultPop2022      0.499999
1                              stateId      0.499662
3                         EGENID_BIRTH      0.111579
                              Features  Coefficients
4                                EEDUC      0.510815
13           overallReligiosityPew2014      0.506299
11     moderatelyReligiousStatista2017      0.504494
10           veryReligiousStatista2017      0.501713
9   antiTransLegislationRiskIndex32023      0.500621
0                                 WEEK      0.500606
12            nonreligiousStatista2017      0.500278
2                          TBIRTH_YEAR      0.500069
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
8                    transAdultPop2022      0.499999
1                              stateId      0.499662
3                         EGENID_BIRTH      0.111579
                              Features  Coefficients
4                                EEDUC      0.510815
13           overallReligiosityPew2014      0.506299
11     moderatelyReligiousStatista2017      0.504494
10           veryReligiousStatista2017      0.501713
9   antiTransLegislationRiskIndex32023      0.500621
0                                 WEEK      0.500606
12            nonreligiousStatista2017      0.500278
2                          TBIRTH_YEAR      0.500069
5                            INCOMEMIN      0.500001
6                  statePopulation2020      0.500000
7                  statePopulation2023      0.500000
8                    transAdultPop2022      0.499999
1                              stateId      0.499662
3                         EGENID_BIRTH      0.111579
In [94]:
# Setup for A/B testing
#,"statePopulation2020","statePopulation2023","transAdultPop2022"
#,"antiTransLegislationRiskIndex32023"
#,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014"
# remove ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022" column
#Religious Liberty table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Only Pulse Data")
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
Logistic Regression Accuracy: 0.7099
       Features  Coefficients
4         EEDUC      0.509491
0          WEEK      0.500709
5     INCOMEMIN      0.500002
1       stateId      0.499759
2   TBIRTH_YEAR      0.498227
3  EGENID_BIRTH      0.111782
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging
  warn(msg, LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
  warnings.warn("Line Search failed")
       Features  Coefficients
4         EEDUC      0.509491
0          WEEK      0.500709
5     INCOMEMIN      0.500002
1       stateId      0.499759
2   TBIRTH_YEAR      0.498227
3  EGENID_BIRTH      0.111782
       Features  Coefficients
4         EEDUC      0.509491
0          WEEK      0.500709
5     INCOMEMIN      0.500002
1       stateId      0.499759
2   TBIRTH_YEAR      0.498227
3  EGENID_BIRTH      0.111782
In [95]:
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","EGENID_BIRTH"
            ,"SEXUAL_ORIENTATION","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
            ,"transAdultPop2022","overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState, "Random Forest Feature Importance")
Random Forest (500 Tree) Regression Accuracy: 0.4966
                              Features  Importance
5                   SEXUAL_ORIENTATION    0.283867
4                         EGENID_BIRTH    0.247087
2                          TBIRTH_YEAR    0.115527
0                                 WEEK    0.082819
6                            INCOMEMIN    0.051162
3                                EEDUC    0.041775
1                              stateId    0.024697
8                  statePopulation2023    0.019611
15                     relLibScore2022    0.017565
7                  statePopulation2020    0.014974
11           overallReligiosityPew2014    0.014905
10                   transAdultPop2022    0.014747
18                    relLibHealth2022    0.014445
13     moderatelyReligiousStatista2017    0.011986
12           veryReligiousStatista2017    0.011783
14            nonreligiousStatista2017    0.011340
9   antiTransLegislationRiskIndex32023    0.007486
20                  relLibMarriage2022    0.005403
19             relLibHealthMandate2022    0.002955
21                      relLibRfra2022    0.002738
16                      relLibVote2022    0.002267
17                       relLibVax2022    0.000862
In [96]:
# Setup for A/B testing
# remove "SEXUAL_ORIENTATION" column
# split dataset into dependent(features) and independent(target) variable
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","EGENID_BIRTH","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
            ,"transAdultPop2022","overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Sexuality Removed")
Random Forest (500 Tree) Regression Accuracy: 0.2711
                              Features  Importance
4                         EGENID_BIRTH    0.239197
2                          TBIRTH_YEAR    0.200776
0                                 WEEK    0.127075
5                            INCOMEMIN    0.087728
3                                EEDUC    0.068879
1                              stateId    0.041188
14                     relLibScore2022    0.030655
7                  statePopulation2023    0.025917
9                    transAdultPop2022    0.023027
10           overallReligiosityPew2014    0.022958
6                  statePopulation2020    0.021619
17                    relLibHealth2022    0.021323
12     moderatelyReligiousStatista2017    0.019417
11           veryReligiousStatista2017    0.018438
13            nonreligiousStatista2017    0.017602
8   antiTransLegislationRiskIndex32023    0.011535
19                  relLibMarriage2022    0.009055
20                      relLibRfra2022    0.004458
18             relLibHealthMandate2022    0.004111
15                      relLibVote2022    0.003277
16                       relLibVax2022    0.001768
In [97]:
# Setup for A/B testing
# remove "EEDUC","INCOMEMIN" column
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","SEXUAL_ORIENTATION"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"antiTransLegislationRiskIndex32023"
            ,"overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Education and Income Removed")
Random Forest (500 Tree) Regression Accuracy: 0.4744
                              Features  Importance
4                   SEXUAL_ORIENTATION    0.285457
3                         EGENID_BIRTH    0.248089
2                          TBIRTH_YEAR    0.166847
0                                 WEEK    0.121240
1                              stateId    0.024385
6                  statePopulation2023    0.018840
13                     relLibScore2022    0.017829
16                    relLibHealth2022    0.014760
5                  statePopulation2020    0.014618
7                    transAdultPop2022    0.014434
9            overallReligiosityPew2014    0.014231
10           veryReligiousStatista2017    0.012056
12            nonreligiousStatista2017    0.012010
11     moderatelyReligiousStatista2017    0.011817
8   antiTransLegislationRiskIndex32023    0.008044
18                  relLibMarriage2022    0.005661
17             relLibHealthMandate2022    0.003300
19                      relLibRfra2022    0.002953
14                      relLibVote2022    0.002526
15                       relLibVax2022    0.000904
In [98]:
# Setup for A/B testing
# remove ,"statePopulation2020","statePopulation2023","transAdultPop2022" column
#population table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"antiTransLegislationRiskIndex32023"
            ,"overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Population Removed")
Random Forest (500 Tree) Regression Accuracy: 0.2717
                              Features  Importance
3                         EGENID_BIRTH    0.239197
2                          TBIRTH_YEAR    0.200083
0                                 WEEK    0.127960
5                            INCOMEMIN    0.087268
4                                EEDUC    0.069257
1                              stateId    0.054054
11                     relLibScore2022    0.041764
7            overallReligiosityPew2014    0.030534
14                    relLibHealth2022    0.028542
9      moderatelyReligiousStatista2017    0.025326
8            veryReligiousStatista2017    0.024941
10            nonreligiousStatista2017    0.023795
6   antiTransLegislationRiskIndex32023    0.016182
16                  relLibMarriage2022    0.011925
17                      relLibRfra2022    0.006150
15             relLibHealthMandate2022    0.005858
12                      relLibVote2022    0.004419
13                       relLibVax2022    0.002745
In [99]:
# Setup for A/B testing
# remove ,"antiTransLegislationRiskIndex32023" column
#anti-trans legislation table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"overallReligiosityPew2014"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Anti-Trans Legislation Removed")
Random Forest (500 Tree) Regression Accuracy: 0.271
                           Features  Importance
3                      EGENID_BIRTH    0.239197
2                       TBIRTH_YEAR    0.201075
0                              WEEK    0.127463
5                         INCOMEMIN    0.087390
4                             EEDUC    0.069082
1                           stateId    0.042216
13                  relLibScore2022    0.031651
7               statePopulation2023    0.026780
8                 transAdultPop2022    0.024179
9         overallReligiosityPew2014    0.024081
6               statePopulation2020    0.021919
16                 relLibHealth2022    0.021759
11  moderatelyReligiousStatista2017    0.020444
10        veryReligiousStatista2017    0.019535
12         nonreligiousStatista2017    0.019072
18               relLibMarriage2022    0.009757
19                   relLibRfra2022    0.004788
17          relLibHealthMandate2022    0.004328
14                   relLibVote2022    0.003453
15                    relLibVax2022    0.001831
In [100]:
# Setup for A/B testing
# remove ,"overallReligiosityPew2014" column
#Pew 2014 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"antiTransLegislationRiskIndex32023"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Pew 2014 Removed")
Random Forest (500 Tree) Regression Accuracy: 0.2723
                              Features  Importance
3                         EGENID_BIRTH    0.239197
2                          TBIRTH_YEAR    0.200986
0                                 WEEK    0.127489
5                            INCOMEMIN    0.087619
4                                EEDUC    0.068820
1                              stateId    0.043777
13                     relLibScore2022    0.032480
7                  statePopulation2023    0.027228
8                    transAdultPop2022    0.024436
16                    relLibHealth2022    0.023153
6                  statePopulation2020    0.023089
12            nonreligiousStatista2017    0.022216
10           veryReligiousStatista2017    0.021502
11     moderatelyReligiousStatista2017    0.020735
9   antiTransLegislationRiskIndex32023    0.012801
18                  relLibMarriage2022    0.009710
19                      relLibRfra2022    0.004750
17             relLibHealthMandate2022    0.004410
14                      relLibVote2022    0.003699
15                       relLibVax2022    0.001903
In [101]:
# Setup for A/B testing
# remove ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017" column
#Statista 2017 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"antiTransLegislationRiskIndex32023"
            ,"overallReligiosityPew2014"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Statista 2017 Removed")
Random Forest (500 Tree) Regression Accuracy: 0.2722
                              Features  Importance
3                         EGENID_BIRTH    0.239197
2                          TBIRTH_YEAR    0.201996
0                                 WEEK    0.128081
5                            INCOMEMIN    0.087045
4                                EEDUC    0.068836
1                              stateId    0.048270
11                     relLibScore2022    0.036531
10           overallReligiosityPew2014    0.034922
7                  statePopulation2023    0.031234
8                    transAdultPop2022    0.028197
14                    relLibHealth2022    0.026239
6                  statePopulation2020    0.025789
9   antiTransLegislationRiskIndex32023    0.015637
16                  relLibMarriage2022    0.011671
17                      relLibRfra2022    0.005378
15             relLibHealthMandate2022    0.004677
12                      relLibVote2022    0.004106
13                       relLibVax2022    0.002194
In [102]:
# Setup for A/B testing
# remove ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014" column
#Statista 2017 and Pew 2014 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"antiTransLegislationRiskIndex32023"
            ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
            ,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Statista 2017 and Pew 2014 Removed")
Random Forest (500 Tree) Regression Accuracy: 0.274
                              Features  Importance
3                         EGENID_BIRTH    0.239197
2                          TBIRTH_YEAR    0.202868
0                                 WEEK    0.128257
5                            INCOMEMIN    0.087695
4                                EEDUC    0.068477
1                              stateId    0.055199
10                     relLibScore2022    0.040815
7                  statePopulation2023    0.034473
8                    transAdultPop2022    0.031218
13                    relLibHealth2022    0.030359
6                  statePopulation2020    0.028814
9   antiTransLegislationRiskIndex32023    0.020202
15                  relLibMarriage2022    0.013167
16                      relLibRfra2022    0.006054
11                      relLibVote2022    0.005467
14             relLibHealthMandate2022    0.005383
12                       relLibVax2022    0.002356
In [103]:
# Setup for A/B testing
# remove ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022" column
#Religious Liberty table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
            ,"statePopulation2020","statePopulation2023","transAdultPop2022"
            ,"antiTransLegislationRiskIndex32023"
            ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Religious Liberty Removed")
Random Forest (500 Tree) Regression Accuracy: 0.2721
                              Features  Importance
3                         EGENID_BIRTH    0.239197
2                          TBIRTH_YEAR    0.201343
0                                 WEEK    0.129005
5                            INCOMEMIN    0.087782
4                                EEDUC    0.069254
1                              stateId    0.054400
7                  statePopulation2023    0.034165
13           overallReligiosityPew2014    0.031626
8                    transAdultPop2022    0.031116
6                  statePopulation2020    0.029118
11     moderatelyReligiousStatista2017    0.026989
10           veryReligiousStatista2017    0.025652
12            nonreligiousStatista2017    0.024772
9   antiTransLegislationRiskIndex32023    0.015582
In [104]:
# Setup for A/B testing
#,"statePopulation2020","statePopulation2023","transAdultPop2022"
#,"antiTransLegislationRiskIndex32023"
#,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014"
# remove ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022" column
#Religious Liberty table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Only Pulse Data")
Random Forest (500 Tree) Regression Accuracy: 0.2661
       Features  Importance
3  EGENID_BIRTH    0.239197
2   TBIRTH_YEAR    0.226661
1       stateId    0.201604
0          WEEK    0.152386
5     INCOMEMIN    0.098260
4         EEDUC    0.081893